]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
cirrus: Force pkg update on FreeBSD.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
0de1b425 20#include "netdev-linux-private.h"
d3980822 21
e9e28be3 22#include <errno.h>
8b61709d 23#include <fcntl.h>
b2befd5b
BP
24#include <sys/types.h>
25#include <netinet/in.h>
55bc98d6 26#include <arpa/inet.h>
8b61709d 27#include <inttypes.h>
2f564bb1 28#include <math.h>
32383c3b 29#include <linux/filter.h>
c1c9c9c4 30#include <linux/gen_stats.h>
bb7d0e22 31#include <linux/if_ether.h>
29cf9c1b 32#include <linux/if_packet.h>
8b61709d
BP
33#include <linux/if_tun.h>
34#include <linux/types.h>
35#include <linux/ethtool.h>
63331829 36#include <linux/mii.h>
ef3767f5 37#include <linux/rtnetlink.h>
8b61709d 38#include <linux/sockios.h>
29cf9c1b 39#include <linux/virtio_net.h>
8b61709d
BP
40#include <sys/ioctl.h>
41#include <sys/socket.h>
29cf9c1b 42#include <sys/uio.h>
ac3e3aaa 43#include <sys/utsname.h>
8b61709d
BP
44#include <net/if.h>
45#include <net/if_arp.h>
8b61709d 46#include <net/route.h>
e9e28be3 47#include <poll.h>
8b61709d
BP
48#include <stdlib.h>
49#include <string.h>
50#include <unistd.h>
e9e28be3
BP
51
52#include "coverage.h"
e14deea0 53#include "dp-packet.h"
93451a0a 54#include "dpif-netlink.h"
df1e5a3b 55#include "dpif-netdev.h"
3e8a2ad1 56#include "openvswitch/dynamic-string.h"
8b61709d 57#include "fatal-signal.h"
93b13be8 58#include "hash.h"
ee89ea7b 59#include "openvswitch/hmap.h"
0de1b425 60#include "netdev-afxdp.h"
8b61709d 61#include "netdev-provider.h"
7fbef77a 62#include "netdev-vport.h"
45c8d3a1 63#include "netlink-notifier.h"
2fe27d5a 64#include "netlink-socket.h"
c060c4cf 65#include "netlink.h"
bfda5239 66#include "netnsid.h"
64c96779 67#include "openvswitch/ofpbuf.h"
8b61709d 68#include "openflow/openflow.h"
19c8e9c1 69#include "ovs-atomic.h"
105cf8df 70#include "ovs-numa.h"
8b61709d 71#include "packets.h"
fd016ae3 72#include "openvswitch/poll-loop.h"
7e9dcc0f 73#include "rtnetlink.h"
ee89ea7b 74#include "openvswitch/shash.h"
c060c4cf 75#include "socket-util.h"
19993ef3 76#include "sset.h"
c1c5c723 77#include "tc.h"
1670c579 78#include "timer.h"
c060c4cf 79#include "unaligned.h"
e6211adc 80#include "openvswitch/vlog.h"
29cf9c1b 81#include "userspace-tso.h"
ee89ea7b 82#include "util.h"
5136ce49 83
d98e6007 84VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 85
d76f09ea
BP
86COVERAGE_DEFINE(netdev_set_policing);
87COVERAGE_DEFINE(netdev_arp_lookup);
88COVERAGE_DEFINE(netdev_get_ifindex);
89COVERAGE_DEFINE(netdev_get_hwaddr);
90COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
91COVERAGE_DEFINE(netdev_get_ethtool);
92COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 93
8b61709d 94\f
756819dd
FL
95#ifndef IFLA_IF_NETNSID
96#define IFLA_IF_NETNSID 0x45
97#endif
8b61709d
BP
98/* These were introduced in Linux 2.6.14, so they might be missing if we have
99 * old headers. */
100#ifndef ADVERTISED_Pause
101#define ADVERTISED_Pause (1 << 13)
102#endif
103#ifndef ADVERTISED_Asym_Pause
104#define ADVERTISED_Asym_Pause (1 << 14)
105#endif
106
e47bd51a
JP
107/* These were introduced in Linux 2.6.24, so they might be missing if we
108 * have old headers. */
109#ifndef ETHTOOL_GFLAGS
110#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
111#endif
112#ifndef ETHTOOL_SFLAGS
113#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
114#endif
115
c1c9c9c4
BP
116/* This was introduced in Linux 2.6.25, so it might be missing if we have old
117 * headers. */
118#ifndef TC_RTAB_SIZE
119#define TC_RTAB_SIZE 1024
120#endif
121
e7f6ba22
PJV
122#ifndef TCM_IFINDEX_MAGIC_BLOCK
123#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
124#endif
125
b73c8518
SH
126/* Linux 2.6.21 introduced struct tpacket_auxdata.
127 * Linux 2.6.27 added the tp_vlan_tci member.
128 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
129 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
130 * TP_STATUS_VLAN_TPID_VALID.
131 *
132 * With all this churn it's easiest to unconditionally define a replacement
133 * structure that has everything we want.
134 */
55bc98d6
BP
135#ifndef PACKET_AUXDATA
136#define PACKET_AUXDATA 8
137#endif
b73c8518
SH
138#ifndef TP_STATUS_VLAN_VALID
139#define TP_STATUS_VLAN_VALID (1 << 4)
140#endif
141#ifndef TP_STATUS_VLAN_TPID_VALID
142#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
143#endif
144#undef tpacket_auxdata
145#define tpacket_auxdata rpl_tpacket_auxdata
146struct tpacket_auxdata {
147 uint32_t tp_status;
148 uint32_t tp_len;
149 uint32_t tp_snaplen;
150 uint16_t tp_mac;
151 uint16_t tp_net;
152 uint16_t tp_vlan_tci;
153 uint16_t tp_vlan_tpid;
154};
155
0c615356
SH
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Reassemble the speed in Mbps from its split 16-bit halves.
     *
     * Cast 'speed_hi' to uint32_t before shifting: a __u16 promotes to
     * signed int, and shifting a value >= 0x8000 left by 16 would overflow
     * signed int, which is undefined behavior in C. */
    return ep->speed | ((uint32_t) ep->speed_hi << 16);
}
167
67bed84c
SH
168/* Linux 2.6.30 introduced supported and advertised flags for
169 * 1G base KX, and 10G base KX4, KR and R. */
170#ifndef SUPPORTED_1000baseKX_Full
171#define SUPPORTED_1000baseKX_Full (1 << 17)
172#define SUPPORTED_10000baseKX4_Full (1 << 18)
173#define SUPPORTED_10000baseKR_Full (1 << 19)
174#define SUPPORTED_10000baseR_FEC (1 << 20)
175#define ADVERTISED_1000baseKX_Full (1 << 17)
176#define ADVERTISED_10000baseKX4_Full (1 << 18)
177#define ADVERTISED_10000baseKR_Full (1 << 19)
178#define ADVERTISED_10000baseR_FEC (1 << 20)
179#endif
180
181/* Linux 3.5 introduced supported and advertised flags for
182 * 40G base KR4, CR4, SR4 and LR4. */
183#ifndef SUPPORTED_40000baseKR4_Full
184#define SUPPORTED_40000baseKR4_Full (1 << 23)
185#define SUPPORTED_40000baseCR4_Full (1 << 24)
186#define SUPPORTED_40000baseSR4_Full (1 << 25)
187#define SUPPORTED_40000baseLR4_Full (1 << 26)
188#define ADVERTISED_40000baseKR4_Full (1 << 23)
189#define ADVERTISED_40000baseCR4_Full (1 << 24)
190#define ADVERTISED_40000baseSR4_Full (1 << 25)
191#define ADVERTISED_40000baseLR4_Full (1 << 26)
192#endif
193
fa373af4
BP
194/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
195 *
196 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
197 * 2.6.32-431.29.2.el6.x86_64 (see report at
8a7903c6
JP
198 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
199 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
fa373af4
BP
200 * unconditionally define a replacement. */
201#ifndef IFLA_STATS64
337c9b99 202#define IFLA_STATS64 23
fa373af4
BP
203#endif
204#define rtnl_link_stats64 rpl_rtnl_link_stats64
337c9b99
BP
205struct rtnl_link_stats64 {
206 uint64_t rx_packets;
207 uint64_t tx_packets;
208 uint64_t rx_bytes;
209 uint64_t tx_bytes;
210 uint64_t rx_errors;
211 uint64_t tx_errors;
212 uint64_t rx_dropped;
213 uint64_t tx_dropped;
214 uint64_t multicast;
215 uint64_t collisions;
216
217 uint64_t rx_length_errors;
218 uint64_t rx_over_errors;
219 uint64_t rx_crc_errors;
220 uint64_t rx_frame_errors;
221 uint64_t rx_fifo_errors;
222 uint64_t rx_missed_errors;
223
224 uint64_t tx_aborted_errors;
225 uint64_t tx_carrier_errors;
226 uint64_t tx_fifo_errors;
227 uint64_t tx_heartbeat_errors;
228 uint64_t tx_window_errors;
229
230 uint64_t rx_compressed;
231 uint64_t tx_compressed;
232};
337c9b99 233
8b61709d 234enum {
7fbef77a
JG
235 VALID_IFINDEX = 1 << 0,
236 VALID_ETHERADDR = 1 << 1,
6b6e1329
PS
237 VALID_IN = 1 << 2,
238 VALID_MTU = 1 << 3,
239 VALID_POLICING = 1 << 4,
240 VALID_VPORT_STAT_ERROR = 1 << 5,
241 VALID_DRVINFO = 1 << 6,
242 VALID_FEATURES = 1 << 7,
105cf8df 243 VALID_NUMA_ID = 1 << 8,
8b61709d 244};
29cf9c1b
FL
245
246/* Use one for the packet buffer and another for the aux buffer to receive
247 * TSO packets. */
248#define IOV_STD_SIZE 1
249#define IOV_TSO_SIZE 2
250
251enum {
252 IOV_PACKET = 0,
253 IOV_AUXBUF = 1,
254};
c1c9c9c4 255\f
d22f8927
JH
/* State tracked for one LAG (bond/team) slave interface whose master is a
 * network device in OvS.  Entries live in 'lag_shash', keyed by the slave's
 * interface name. */
struct linux_lag_slave {
    uint32_t block_id;          /* TC ingress block shared with the master. */
    struct shash_node *node;    /* This slave's own entry in 'lag_shash'. */
};
260
261/* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
262static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
263
264/* All slaves whose LAG masters are network devices in OvS. */
265static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
266 = SHASH_INITIALIZER(&lag_shash);
267
c1c9c9c4
BP
268/* Traffic control. */
269
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* The implementation's vtable. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};
c1c9c9c4 281
559eb230
BP
282#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
283
93b13be8
BP
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};
293
294/* A particular kind of traffic control. Each implementation generally maps to
295 * one particular Linux qdisc class.
296 *
297 * The functions below return 0 if successful or a positive errno value on
298 * failure, except where otherwise noted. All of them must be provided, except
299 * where otherwise noted. */
300struct tc_ops {
301 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
302 * This is null for tc_ops_default and tc_ops_other, for which there are no
303 * appropriate values. */
304 const char *linux_name;
305
306 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
307 const char *ovs_name;
308
309 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
310 * queues. The queues are numbered 0 through n_queues - 1. */
311 unsigned int n_queues;
312
313 /* Called to install this TC class on 'netdev'. The implementation should
314 * make the Netlink calls required to set up 'netdev' with the right qdisc
315 * and configure it according to 'details'. The implementation may assume
316 * that the current qdisc is the default; that is, there is no need for it
317 * to delete the current qdisc before installing itself.
318 *
319 * The contents of 'details' should be documented as valid for 'ovs_name'
320 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
321 * (which is built as ovs-vswitchd.conf.db(8)).
322 *
323 * This function must return 0 if and only if it sets 'netdev->tc' to an
324 * initialized 'struct tc'.
325 *
326 * (This function is null for tc_ops_other, which cannot be installed. For
327 * other TC classes it should always be nonnull.) */
79f1cbe9 328 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
329
330 /* Called when the netdev code determines (through a Netlink query) that
331 * this TC class's qdisc is installed on 'netdev', but we didn't install
332 * it ourselves and so don't know any of the details.
333 *
334 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
335 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
336 * implementation should parse the other attributes of 'nlmsg' as
337 * necessary to determine its configuration. If necessary it should also
338 * use Netlink queries to determine the configuration of queues on
339 * 'netdev'.
340 *
341 * This function must return 0 if and only if it sets 'netdev->tc' to an
342 * initialized 'struct tc'. */
343 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
344
345 /* Destroys the data structures allocated by the implementation as part of
346 * 'tc'. (This includes destroying 'tc->queues' by calling
347 * tc_destroy(tc).
348 *
349 * The implementation should not need to perform any Netlink calls. If
350 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
351 * (But it may not be desirable.)
352 *
353 * This function may be null if 'tc' is trivial. */
354 void (*tc_destroy)(struct tc *tc);
355
356 /* Retrieves details of 'netdev->tc' configuration into 'details'.
357 *
358 * The implementation should not need to perform any Netlink calls, because
359 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
360 * cached the configuration.
361 *
362 * The contents of 'details' should be documented as valid for 'ovs_name'
363 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
364 * (which is built as ovs-vswitchd.conf.db(8)).
365 *
366 * This function may be null if 'tc' is not configurable.
367 */
79f1cbe9 368 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
369
370 /* Reconfigures 'netdev->tc' according to 'details', performing any
371 * required Netlink calls to complete the reconfiguration.
372 *
373 * The contents of 'details' should be documented as valid for 'ovs_name'
374 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
375 * (which is built as ovs-vswitchd.conf.db(8)).
376 *
377 * This function may be null if 'tc' is not configurable.
378 */
79f1cbe9 379 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 380
93b13be8
BP
381 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
382 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
383 *
384 * The contents of 'details' should be documented as valid for 'ovs_name'
385 * in the "other_config" column in the "Queue" table in
386 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
387 *
388 * The implementation should not need to perform any Netlink calls, because
389 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
390 * cached the queue configuration.
391 *
392 * This function may be null if 'tc' does not have queues ('n_queues' is
393 * 0). */
93b13be8 394 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 395 struct smap *details);
c1c9c9c4
BP
396
397 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
398 * 'details', perfoming any required Netlink calls to complete the
399 * reconfiguration. The caller ensures that 'queue_id' is less than
400 * 'n_queues'.
401 *
402 * The contents of 'details' should be documented as valid for 'ovs_name'
403 * in the "other_config" column in the "Queue" table in
404 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
405 *
406 * This function may be null if 'tc' does not have queues or its queues are
407 * not configurable. */
408 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 409 const struct smap *details);
c1c9c9c4 410
93b13be8
BP
411 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
412 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
413 *
414 * This function may be null if 'tc' does not have queues or its queues
415 * cannot be deleted. */
93b13be8 416 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 417
93b13be8
BP
418 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
419 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
420 *
421 * On success, initializes '*stats'.
422 *
423 * This function may be null if 'tc' does not have queues or if it cannot
424 * report queue statistics. */
93b13be8
BP
425 int (*class_get_stats)(const struct netdev *netdev,
426 const struct tc_queue *queue,
c1c9c9c4
BP
427 struct netdev_queue_stats *stats);
428
429 /* Extracts queue stats from 'nlmsg', which is a response to a
430 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
431 *
432 * This function may be null if 'tc' does not have queues or if it cannot
433 * report queue statistics. */
434 int (*class_dump_stats)(const struct netdev *netdev,
435 const struct ofpbuf *nlmsg,
436 netdev_dump_queue_stats_cb *cb, void *aux);
437};
438
439static void
440tc_init(struct tc *tc, const struct tc_ops *ops)
441{
442 tc->ops = ops;
93b13be8 443 hmap_init(&tc->queues);
c1c9c9c4
BP
444}
445
446static void
447tc_destroy(struct tc *tc)
448{
93b13be8 449 hmap_destroy(&tc->queues);
c1c9c9c4
BP
450}
451
452static const struct tc_ops tc_ops_htb;
a339aa81 453static const struct tc_ops tc_ops_hfsc;
677d9158
JV
454static const struct tc_ops tc_ops_codel;
455static const struct tc_ops tc_ops_fqcodel;
456static const struct tc_ops tc_ops_sfq;
2f564bb1 457static const struct tc_ops tc_ops_netem;
c1c9c9c4 458static const struct tc_ops tc_ops_default;
6cf888b8 459static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
460static const struct tc_ops tc_ops_other;
461
559eb230 462static const struct tc_ops *const tcs[] = {
c1c9c9c4 463 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 464 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
465 &tc_ops_codel, /* Controlled delay */
466 &tc_ops_fqcodel, /* Fair queue controlled delay */
467 &tc_ops_sfq, /* Stochastic fair queueing */
2f564bb1 468 &tc_ops_netem, /* Network Emulator */
6cf888b8 469 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
470 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
471 &tc_ops_other, /* Some other qdisc. */
472 NULL
473};
149f577a 474
c1c9c9c4
BP
475static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
476static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
477static unsigned int tc_buffer_per_jiffy(unsigned int rate);
2f564bb1 478static uint32_t tc_time_to_ticks(uint32_t time);
c1c9c9c4 479
7874bdff
RD
480static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
481 int type,
482 unsigned int flags,
483 struct ofpbuf *);
c7952afb
BP
484static int tc_add_policer(struct netdev *,
485 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
486
487static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
488 struct nlattr **options);
489static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
490 struct nlattr **options,
491 struct netdev_queue_stats *);
492static int tc_query_class(const struct netdev *,
493 unsigned int handle, unsigned int parent,
494 struct ofpbuf **replyp);
495static int tc_delete_class(const struct netdev *, unsigned int handle);
496
497static int tc_del_qdisc(struct netdev *netdev);
498static int tc_query_qdisc(const struct netdev *netdev);
499
e7f6ba22
PJV
500void
501tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
c1c9c9c4
BP
502static int tc_calc_cell_log(unsigned int mtu);
503static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
c1c9c9c4
BP
504static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
505\f
8b61709d 506
8b61709d
BP
507/* This is set pretty low because we probably won't learn anything from the
508 * additional log messages. */
509static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
510
19c8e9c1
JS
511/* Polling miimon status for all ports causes performance degradation when
512 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
513 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
514 *
515 * Readers do not depend on this variable synchronizing with the related
516 * changes in the device miimon status, so we can use atomic_count. */
517static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 518
29cf9c1b
FL
519static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
520static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
0b0544d7 521static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 522 int cmd, const char *cmd_name);
b5d57fc8 523static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 524static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
525static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
526 enum netdev_flags on, enum netdev_flags *old_flagsp)
527 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
528static int get_ifindex(const struct netdev *, int *ifindexp);
529static int do_set_addr(struct netdev *netdev,
530 int ioctl_nr, const char *ioctl_name,
531 struct in_addr addr);
74ff3298
JR
532static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
533static int set_etheraddr(const char *netdev_name, const struct eth_addr);
488d734d 534static int af_packet_sock(void);
19c8e9c1 535static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
536static void netdev_linux_miimon_run(void);
537static void netdev_linux_miimon_wait(void);
df1e5a3b 538static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 539
796223f5
BP
540static bool
541is_tap_netdev(const struct netdev *netdev)
542{
b5d57fc8 543 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577 544}
ff4ed3c9 545\f
bfda5239
FL
546static int
547netdev_linux_netnsid_update__(struct netdev_linux *netdev)
548{
549 struct dpif_netlink_vport reply;
550 struct ofpbuf *buf;
551 int error;
552
553 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
554 if (error) {
629e1476
FL
555 if (error == ENOENT) {
556 /* Assume it is local if there is no API (e.g. if the openvswitch
557 * kernel module is not loaded). */
558 netnsid_set_local(&netdev->netnsid);
559 } else {
560 netnsid_unset(&netdev->netnsid);
561 }
bfda5239
FL
562 return error;
563 }
564
565 netnsid_set(&netdev->netnsid, reply.netnsid);
566 ofpbuf_delete(buf);
567 return 0;
568}
569
570static int
571netdev_linux_netnsid_update(struct netdev_linux *netdev)
572{
573 if (netnsid_is_unset(netdev->netnsid)) {
3dbcbfe4
FL
574 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
575 netnsid_set_local(&netdev->netnsid);
576 } else {
577 return netdev_linux_netnsid_update__(netdev);
578 }
bfda5239
FL
579 }
580
581 return 0;
582}
583
584static bool
585netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
586{
587 netdev_linux_netnsid_update(netdev);
588 return netnsid_eq(netdev->netnsid, nsid);
589}
590
756819dd
FL
591static bool
592netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
593{
594 netdev_linux_netnsid_update(netdev);
595 return netnsid_is_remote(netdev->netnsid);
596}
597
598static int netdev_linux_update_via_netlink(struct netdev_linux *);
bfda5239 599static void netdev_linux_update(struct netdev_linux *netdev, int,
7e9dcc0f 600 const struct rtnetlink_change *)
86383816 601 OVS_REQUIRES(netdev->mutex);
cee87338 602static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
603 unsigned int ifi_flags, unsigned int mask)
604 OVS_REQUIRES(netdev->mutex);
cee87338 605
d6384a3a
AW
606/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
607 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
608 * if no such socket could be created. */
609static struct nl_sock *
610netdev_linux_notify_sock(void)
611{
612 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
613 static struct nl_sock *sock;
989d7135
PS
614 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
615 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
616
617 if (ovsthread_once_start(&once)) {
618 int error;
619
620 error = nl_sock_create(NETLINK_ROUTE, &sock);
621 if (!error) {
d6384a3a
AW
622 size_t i;
623
624 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
625 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
626 if (error) {
627 nl_sock_destroy(sock);
628 sock = NULL;
629 break;
630 }
cee87338
BP
631 }
632 }
cf114a7f 633 nl_sock_listen_all_nsid(sock, true);
cee87338
BP
634 ovsthread_once_done(&once);
635 }
636
637 return sock;
638}
639
19c8e9c1
JS
640static bool
641netdev_linux_miimon_enabled(void)
642{
812c272c 643 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
644}
645
3d9c99ab
JH
/* Returns true if rtnetlink device kind 'kind' names a Linux link-aggregation
 * master type ("bond" or "team"), false otherwise. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
655
d22f8927
JH
656static void
657netdev_linux_update_lag(struct rtnetlink_change *change)
658 OVS_REQUIRES(lag_mutex)
659{
660 struct linux_lag_slave *lag;
661
662 if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
663 return;
664 }
665
666 if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
667 lag = shash_find_data(&lag_shash, change->ifname);
668
669 if (!lag) {
670 struct netdev *master_netdev;
671 char master_name[IFNAMSIZ];
672 uint32_t block_id;
673 int error = 0;
674
675 if_indextoname(change->master_ifindex, master_name);
676 master_netdev = netdev_from_name(master_name);
e3b5d7c5
TL
677 if (!master_netdev) {
678 return;
679 }
d22f8927
JH
680
681 if (is_netdev_linux_class(master_netdev->netdev_class)) {
682 block_id = netdev_get_block_id(master_netdev);
683 if (!block_id) {
e3b5d7c5
TL
684 netdev_close(master_netdev);
685 return;
d22f8927
JH
686 }
687
688 lag = xmalloc(sizeof *lag);
689 lag->block_id = block_id;
690 lag->node = shash_add(&lag_shash, change->ifname, lag);
691
cae64353 692 /* delete ingress block in case it exists */
95255018 693 tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
d22f8927 694 /* LAG master is linux netdev so add slave to same block. */
95255018
JH
695 error = tc_add_del_qdisc(change->if_index, true, block_id,
696 TC_INGRESS);
d22f8927 697 if (error) {
cae64353
RD
698 VLOG_WARN("failed to bind LAG slave %s to master's block",
699 change->ifname);
d22f8927
JH
700 shash_delete(&lag_shash, lag->node);
701 free(lag);
702 }
703 }
e3b5d7c5
TL
704
705 netdev_close(master_netdev);
d22f8927
JH
706 }
707 } else if (change->master_ifindex == 0) {
708 /* Check if this was a lag slave that has been freed. */
709 lag = shash_find_data(&lag_shash, change->ifname);
710
711 if (lag) {
95255018
JH
712 tc_add_del_qdisc(change->if_index, false, lag->block_id,
713 TC_INGRESS);
d22f8927
JH
714 shash_delete(&lag_shash, lag->node);
715 free(lag);
716 }
717 }
718}
719
0de1b425 720void
1c33f0c3 721netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 722{
cee87338
BP
723 struct nl_sock *sock;
724 int error;
725
19c8e9c1
JS
726 if (netdev_linux_miimon_enabled()) {
727 netdev_linux_miimon_run();
728 }
cee87338
BP
729
730 sock = netdev_linux_notify_sock();
731 if (!sock) {
732 return;
733 }
734
735 do {
cee87338 736 uint64_t buf_stub[4096 / 8];
bfda5239 737 int nsid;
cee87338
BP
738 struct ofpbuf buf;
739
740 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
bfda5239 741 error = nl_sock_recv(sock, &buf, &nsid, false);
cee87338 742 if (!error) {
7e9dcc0f 743 struct rtnetlink_change change;
cee87338 744
7e9dcc0f 745 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
746 struct netdev *netdev_ = NULL;
747 char dev_name[IFNAMSIZ];
748
749 if (!change.ifname) {
750 change.ifname = if_indextoname(change.if_index, dev_name);
751 }
752
753 if (change.ifname) {
754 netdev_ = netdev_from_name(change.ifname);
755 }
cee87338
BP
756 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
758
759 ovs_mutex_lock(&netdev->mutex);
bfda5239 760 netdev_linux_update(netdev, nsid, &change);
86383816 761 ovs_mutex_unlock(&netdev->mutex);
cee87338 762 }
d22f8927
JH
763 else if (!netdev_ && change.ifname) {
764 /* Netdev is not present in OvS but its master could be. */
765 ovs_mutex_lock(&lag_mutex);
766 netdev_linux_update_lag(&change);
767 ovs_mutex_unlock(&lag_mutex);
768 }
38e0065b 769 netdev_close(netdev_);
cee87338
BP
770 }
771 } else if (error == ENOBUFS) {
772 struct shash device_shash;
773 struct shash_node *node;
774
775 nl_sock_drain(sock);
776
777 shash_init(&device_shash);
778 netdev_get_devices(&netdev_linux_class, &device_shash);
779 SHASH_FOR_EACH (node, &device_shash) {
780 struct netdev *netdev_ = node->data;
781 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
782 unsigned int flags;
783
86383816 784 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
785 get_flags(netdev_, &flags);
786 netdev_linux_changed(netdev, flags, 0);
86383816
BP
787 ovs_mutex_unlock(&netdev->mutex);
788
cee87338
BP
789 netdev_close(netdev_);
790 }
791 shash_destroy(&device_shash);
792 } else if (error != EAGAIN) {
7ed58d4a
JP
793 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
794 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
cee87338
BP
795 ovs_strerror(error));
796 }
797 ofpbuf_uninit(&buf);
798 } while (!error);
8b61709d
BP
799}
800
801static void
1c33f0c3 802netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 803{
cee87338
BP
804 struct nl_sock *sock;
805
19c8e9c1
JS
806 if (netdev_linux_miimon_enabled()) {
807 netdev_linux_miimon_wait();
808 }
cee87338
BP
809 sock = netdev_linux_notify_sock();
810 if (sock) {
811 nl_sock_wait(sock, POLLIN);
812 }
8b61709d
BP
813}
814
ac4d3bcb 815static void
b5d57fc8
BP
816netdev_linux_changed(struct netdev_linux *dev,
817 unsigned int ifi_flags, unsigned int mask)
86383816 818 OVS_REQUIRES(dev->mutex)
ac4d3bcb 819{
3e912ffc 820 netdev_change_seq_changed(&dev->up);
8aa77183
BP
821
822 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
823 dev->carrier_resets++;
824 }
825 dev->ifi_flags = ifi_flags;
826
4f925bd3 827 dev->cache_valid &= mask;
6b6e1329 828 if (!(mask & VALID_IN)) {
a8704b50
PS
829 netdev_get_addrs_list_flush();
830 }
4f925bd3
PS
831}
832
/* Applies the rtnetlink message 'change' to 'dev': refreshes the cached
 * attributes the message carries and invalidates the rest via
 * netdev_linux_changed().
 *
 * - RTM_NEWLINK: keeps drv-info, IP addresses and NUMA id valid, then
 *   refreshes MTU, MAC, LAG-master flag and ifindex from 'change'.
 * - RTM_DELLINK (the other link-group type): invalidates everything,
 *   marks the device absent and forgets its network namespace id.
 * - Address-group messages: invalidate only the cached in4/in6 addresses.
 *
 * The caller must hold 'dev->mutex'. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->master && netdev_linux_kind_is_lag(change->master)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME: presumably RTM_DELLINK; invalidate every cached
             * attribute (mask of 0) and mark the device gone. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
881
bfda5239
FL
882static void
883netdev_linux_update(struct netdev_linux *dev, int nsid,
884 const struct rtnetlink_change *change)
885 OVS_REQUIRES(dev->mutex)
886{
887 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
888 netdev_linux_update__(dev, change);
889 }
890}
891
9dc63482
BP
892static struct netdev *
893netdev_linux_alloc(void)
894{
895 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
896 return &netdev->up;
897}
898
48c6733c
WT
899static int
900netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 901{
48c6733c
WT
902 /* Prevent any attempt to create (or open) a network device named "default"
903 * or "all". These device names are effectively reserved on Linux because
904 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
905 * itself this wouldn't call for any special treatment, but in practice if
906 * a program tries to create devices with these names, it causes the kernel
907 * to fire a "new device" notification event even though creation failed,
908 * and in turn that causes OVS to wake up and try to create them again,
909 * which ends up as a 100% CPU loop. */
910 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
911 const char *name = netdev_->name;
912 if (!strcmp(name, "default") || !strcmp(name, "all")) {
7ed58d4a
JP
913 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
914 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
48c6733c
WT
915 name);
916 return EINVAL;
917 }
918
bfda5239
FL
919 /* The device could be in the same network namespace or in another one. */
920 netnsid_unset(&netdev->netnsid);
834d6caf 921 ovs_mutex_init(&netdev->mutex);
29cf9c1b
FL
922
923 if (userspace_tso_enabled()) {
924 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
925 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
8c5163fe 926 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
35b5586b 927 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
29cf9c1b
FL
928 netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
929 }
930
48c6733c 931 return 0;
9dc63482
BP
932}
933
1f6e0fbd 934/* Creates system and internal devices. */
f627cf1d 935int
9dc63482 936netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 937{
9dc63482 938 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
939 int error = netdev_linux_common_construct(netdev_);
940 if (error) {
941 return error;
942 }
1f6e0fbd 943
b5d57fc8
BP
944 error = get_flags(&netdev->up, &netdev->ifi_flags);
945 if (error == ENODEV) {
9dc63482 946 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 947 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
948 return ENODEV;
949 } else {
950 /* "Internal" netdevs have to be created as netdev objects before
951 * they exist in the kernel, because creating them in the kernel
952 * happens by passing a netdev object to dpif_port_add().
953 * Therefore, ignore the error. */
954 }
955 }
46415c90 956
a740f0de
JG
957 return 0;
958}
959
5b7448ed
JG
960/* For most types of netdevs we open the device for each call of
961 * netdev_open(). However, this is not the case with tap devices,
962 * since it is only possible to open the device once. In this
963 * situation we share a single file descriptor, and consequently
964 * buffers, across all readers. Therefore once data is read it will
965 * be unavailable to other reads for tap devices. */
a740f0de 966static int
9dc63482 967netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 968{
9dc63482 969 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 970 static const char tap_dev[] = "/dev/net/tun";
9dc63482 971 const char *name = netdev_->name;
a740f0de 972 struct ifreq ifr;
a740f0de 973
48c6733c
WT
974 int error = netdev_linux_common_construct(netdev_);
975 if (error) {
976 return error;
977 }
1f6e0fbd 978
6c88d577 979 /* Open tap device. */
d0d08f8a
BP
980 netdev->tap_fd = open(tap_dev, O_RDWR);
981 if (netdev->tap_fd < 0) {
6c88d577 982 error = errno;
10a89ef0 983 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 984 return error;
6c88d577
JP
985 }
986
987 /* Create tap device. */
61b9d078 988 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 989 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
29cf9c1b
FL
990 if (userspace_tso_enabled()) {
991 ifr.ifr_flags |= IFF_VNET_HDR;
992 }
993
71d7c22f 994 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 995 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 996 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 997 ovs_strerror(errno));
6c88d577 998 error = errno;
f61d8d29 999 goto error_close;
6c88d577
JP
1000 }
1001
1002 /* Make non-blocking. */
d0d08f8a 1003 error = set_nonblocking(netdev->tap_fd);
a740f0de 1004 if (error) {
f61d8d29 1005 goto error_close;
a740f0de
JG
1006 }
1007
0f28164b
FL
1008 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1009 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1010 ovs_strerror(errno));
1011 error = errno;
1012 goto error_close;
1013 }
1014
6211ad57
FL
1015 if (userspace_tso_enabled()) {
1016 /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
1017 * available, it will return EINVAL when a flag is unknown.
1018 * Therefore, try enabling offload with no flags to check
1019 * if TUNSETOFFLOAD support is available or not. */
1020 if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) {
1021 unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
1022
1023 if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) {
1024 VLOG_WARN("%s: enabling tap offloading failed: %s", name,
1025 ovs_strerror(errno));
1026 error = errno;
1027 goto error_close;
1028 }
1029 }
1030 }
1031
19aac14a 1032 netdev->present = true;
a740f0de
JG
1033 return 0;
1034
f61d8d29 1035error_close:
d0d08f8a 1036 close(netdev->tap_fd);
a740f0de
JG
1037 return error;
1038}
1039
/* netdev_class 'destruct' callback.  Releases per-device resources: the QoS
 * traffic-control state, a tap device's persistent fd, the global miimon
 * polling counter, and the device mutex. */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        /* Clear TUNSETPERSIST first so the kernel deletes the tap interface
         * when the fd is closed. */
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    /* Keep the global count of miimon users consistent with the interval
     * this device had configured. */
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
1062
9dc63482
BP
/* netdev_class 'dealloc' callback: frees the container that
 * netdev_linux_alloc() allocated. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1069
f7791740
PS
1070static struct netdev_rxq *
1071netdev_linux_rxq_alloc(void)
9dc63482 1072{
f7791740 1073 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
1074 return &rx->up;
1075}
1076
7b6b0ef4 1077static int
f7791740 1078netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 1079{
f7791740 1080 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 1081 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 1082 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 1083 int error;
7b6b0ef4 1084
86383816 1085 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
1086 rx->is_tap = is_tap_netdev(netdev_);
1087 if (rx->is_tap) {
1088 rx->fd = netdev->tap_fd;
796223f5
BP
1089 } else {
1090 struct sockaddr_ll sll;
b73c8518 1091 int ifindex, val;
32383c3b 1092 /* Result of tcpdump -dd inbound */
259e0b1a 1093 static const struct sock_filter filt[] = {
32383c3b
MM
1094 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1095 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1096 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1097 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1098 };
259e0b1a
BP
1099 static const struct sock_fprog fprog = {
1100 ARRAY_SIZE(filt), (struct sock_filter *) filt
1101 };
7b6b0ef4 1102
796223f5 1103 /* Create file descriptor. */
9dc63482
BP
1104 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1105 if (rx->fd < 0) {
796223f5 1106 error = errno;
10a89ef0 1107 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
1108 goto error;
1109 }
33d82a56 1110
b73c8518
SH
1111 val = 1;
1112 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1113 error = errno;
1114 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1115 netdev_get_name(netdev_), ovs_strerror(error));
1116 goto error;
1117 }
1118
29cf9c1b
FL
1119 if (userspace_tso_enabled()
1120 && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
1121 sizeof val)) {
1122 error = errno;
1123 VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
1124 netdev_get_name(netdev_), ovs_strerror(errno));
1125 goto error;
1126 }
1127
796223f5 1128 /* Set non-blocking mode. */
9dc63482 1129 error = set_nonblocking(rx->fd);
796223f5
BP
1130 if (error) {
1131 goto error;
1132 }
7b6b0ef4 1133
796223f5 1134 /* Get ethernet device index. */
180c6d0b 1135 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
1136 if (error) {
1137 goto error;
1138 }
7b6b0ef4 1139
796223f5
BP
1140 /* Bind to specific ethernet device. */
1141 memset(&sll, 0, sizeof sll);
1142 sll.sll_family = AF_PACKET;
1143 sll.sll_ifindex = ifindex;
b73c8518 1144 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 1145 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
1146 error = errno;
1147 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 1148 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
1149 goto error;
1150 }
32383c3b
MM
1151
1152 /* Filter for only inbound packets. */
9dc63482 1153 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
1154 sizeof fprog);
1155 if (error) {
1156 error = errno;
259e0b1a 1157 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 1158 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
1159 goto error;
1160 }
7b6b0ef4 1161 }
86383816 1162 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 1163
7b6b0ef4
BP
1164 return 0;
1165
1166error:
9dc63482
BP
1167 if (rx->fd >= 0) {
1168 close(rx->fd);
7b6b0ef4 1169 }
86383816 1170 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1171 return error;
1172}
1173
796223f5 1174static void
f7791740 1175netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1176{
f7791740 1177 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
29cf9c1b 1178 int i;
8b61709d 1179
796223f5
BP
1180 if (!rx->is_tap) {
1181 close(rx->fd);
8b61709d 1182 }
29cf9c1b
FL
1183
1184 for (i = 0; i < NETDEV_MAX_BURST; i++) {
73858f9d 1185 dp_packet_delete(rx->aux_bufs[i]);
29cf9c1b 1186 }
9dc63482
BP
1187}
1188
/* netdev_class 'rxq_dealloc' callback: frees the rxq container allocated by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1196
b73c8518 1197static ovs_be16
1ebdc7eb 1198auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1199{
1200 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1201 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1202 } else if (double_tagged) {
1203 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1204 } else {
1ebdc7eb 1205 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1206 }
1207}
1208
/* Returns true if 'aux' carries VLAN tag information: either a nonzero TCI,
 * or the kernel flag stating that the (possibly all-zero) TCI is valid. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }

    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1214
2109841b
YY
/*
 * Receive packets from raw socket in batch process for better performance,
 * it can receive NETDEV_MAX_BURST packets at most once, the received
 * packets are added into *batch. The return value is 0 or errno.
 *
 * It also used recvmmsg to reduce multiple syscalls overhead;
 *
 * With userspace TSO enabled, each message gets two iovecs: the packet's
 * own MTU-sized buffer plus the rxq's preallocated aux_buf for the extra
 * TSO payload; a virtio_net_hdr is then expected in front of each frame.
 */
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        /* Nothing arrived (or a real error): free every buffer prepared
         * above and report the errno value. */
        retval = errno;
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_len < ETH_HEADER_LEN) {
            /* Runt frame: count it as dropped and keep going. */
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: less than ether hdr size",
                         netdev_get_name(netdev_));
            continue;
        }

        /* NOTE(review): with MSG_TRUNC, msg_len is the frame's full length
         * even if it was truncated.  If TSO is disabled, aux_bufs[i] is not
         * refilled, so an oversized frame reaching this branch would use a
         * stale or NULL aux_buf — confirm oversized frames cannot occur
         * here with TSO off. */
        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        /* Re-insert a VLAN tag that the kernel stripped, using the TCI and
         * TPID delivered through the PACKET_AUXDATA control message. */
        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
             cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                   CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                eth = dp_packet_data(pkt);
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}
1364
2109841b
YY
/*
 * Receive packets from tap by batch process for better performance,
 * it can receive NETDEV_MAX_BURST packets at most once, the received
 * packets are added into *batch. The return value is 0 or errno.
 *
 * Unlike the raw-socket path there is no recvmmsg() equivalent for a tap
 * fd, so this issues one readv() per packet.  With userspace TSO enabled,
 * each readv() scatters into the packet's own MTU-sized buffer plus the
 * rxq's preallocated aux_buf for extra TSO payload.
 */
static int
netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                                struct dp_packet_batch *batch)
{
    int virtio_net_hdr_size;
    ssize_t retval;
    size_t std_len;
    int iovlen;
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        struct dp_packet *buffer;
        struct dp_packet *pkt;
        struct iovec iov[IOV_TSO_SIZE];

        /* Assume Ethernet port. No need to set packet_type. */
        buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
        iov[IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        do {
            retval = readv(rx->fd, iov, iovlen);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            /* No more packets queued (or a real error): stop the batch. */
            dp_packet_delete(buffer);
            break;
        }

        /* NOTE(review): 'retval' (ssize_t) vs 'std_len' (size_t) compares
         * as unsigned; safe here only because retval >= 0 at this point. */
        if (retval > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, retval - std_len);
            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffer);
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
            pkt = buffer;
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        dp_packet_batch_add(batch, pkt);
    }

    /* Report the errno only if the very first read failed; a partial batch
     * is a success. */
    if ((i == 0) && (retval < 0)) {
        return errno;
    }

    return 0;
}
1456
1457static int
8492adc2
JS
1458netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1459 int *qfill)
b73c8518 1460{
f7791740 1461 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1462 struct netdev *netdev = rx->up.netdev;
df1e5a3b
PS
1463 ssize_t retval;
1464 int mtu;
1465
1466 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1467 mtu = ETH_PAYLOAD_MAX;
1468 }
1469
73858f9d
FL
1470 if (userspace_tso_enabled()) {
1471 /* Allocate TSO packets. The packet has enough headroom to store
1472 * a full non-TSO packet. When a TSO packet is received, the data
1473 * from non-TSO buffer (std_len) is prepended to the TSO packet
1474 * (aux_buf). */
1475 size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
1476 + DP_NETDEV_HEADROOM + mtu;
1477 size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
1478 for (int i = 0; i < NETDEV_MAX_BURST; i++) {
1479 if (rx->aux_bufs[i]) {
1480 continue;
1481 }
1482
1483 rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
1484 }
1485 }
1486
2109841b 1487 dp_packet_batch_init(batch);
b73c8518 1488 retval = (rx->is_tap
29cf9c1b
FL
1489 ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
1490 : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
df1e5a3b
PS
1491
1492 if (retval) {
1493 if (retval != EAGAIN && retval != EMSGSIZE) {
1494 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1495 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1496 }
b73c8518
SH
1497 }
1498
8492adc2
JS
1499 if (qfill) {
1500 *qfill = -ENOTSUP;
1501 }
1502
b73c8518 1503 return retval;
8b61709d
BP
1504}
1505
8b61709d 1506static void
f7791740 1507netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1508{
f7791740 1509 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1510 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1511}
1512
8b61709d 1513static int
f7791740 1514netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1515{
f7791740 1516 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1517 if (rx->is_tap) {
8b61709d 1518 struct ifreq ifr;
f7791740 1519 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1520 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1521 if (error) {
1522 return error;
1523 }
796223f5 1524 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1525 return 0;
1526 } else {
796223f5 1527 return drain_rcvbuf(rx->fd);
8b61709d
BP
1528 }
1529}
1530
/* Sends every packet in 'batch' on the AF_PACKET raw socket 'sock', bound
 * to the interface with index 'ifindex', using a single sendmmsg() call per
 * kernel round-trip.  When 'tso' is set, a virtio_net_hdr sized for 'mtu'
 * is prepended to each packet first.  Returns 0 on success, otherwise a
 * positive errno value.  Does not free 'batch'; the caller owns it. */
static int
netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
                             struct dp_packet_batch *batch)
{
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);

    /* One single-iovec message per packet; all share the same 'sll'. */
    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (tso) {
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        iov[i].iov_base = dp_packet_data(packet);
        iov[i].iov_len = dp_packet_size(packet);
        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                            .msg_namelen = sizeof sll,
                                            .msg_iov = &iov[i],
                                            .msg_iovlen = 1 };
    }

    /* sendmmsg() may send fewer than requested; keep retrying from the
     * first unsent message until everything went out or an error occurs. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < size; ) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1575
/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device. This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success, otherwise a positive errno value.  Does not free
 * 'batch'; the caller owns it. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size;
        ssize_t retval;
        int error;

        if (tso) {
            /* Prepend the virtio header expected on an IFF_VNET_HDR tap. */
            netdev_linux_prepend_vnet_hdr(packet, mtu);
        }

        size = dp_packet_size(packet);
        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up. From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* Short write: treat like an oversized packet. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1629
105cf8df
WT
/* Returns the NUMA node id of 'netdev''s underlying PCI device, read from
 * /sys/class/net/<name>/device/numa_node, caching the result under
 * VALID_NUMA_ID.  Falls back to node 0 on single-NUMA systems, for names
 * that could escape the sysfs path, for virtual devices without the sysfs
 * attribute, and for unparsable or invalid node values.
 *
 * The caller must hold 'netdev->mutex'. */
static int
netdev_linux_get_numa_id__(struct netdev_linux *netdev)
    OVS_REQUIRES(netdev->mutex)
{
    char *numa_node_path;
    const char *name;
    int node_id;
    FILE *stream;

    if (netdev->cache_valid & VALID_NUMA_ID) {
        return netdev->numa_id;
    }

    /* Cache the default up front so every early return below is covered. */
    netdev->numa_id = 0;
    netdev->cache_valid |= VALID_NUMA_ID;

    if (ovs_numa_get_n_numas() < 2) {
        /* No need to check on system with a single NUMA node. */
        return 0;
    }

    name = netdev_get_name(&netdev->up);
    if (strpbrk(name, "/\\")) {
        /* Refuse names that would change the sysfs path being opened. */
        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
                    "A valid name must not include '/' or '\\'."
                    "Using numa_id 0", name);
        return 0;
    }

    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);

    stream = fopen(numa_node_path, "r");
    if (!stream) {
        /* Virtual device does not have this info. */
        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
                     name, numa_node_path, ovs_strerror(errno));
        free(numa_node_path);
        return 0;
    }

    if (fscanf(stream, "%d", &node_id) != 1
        || !ovs_numa_numa_id_is_valid(node_id)) {
        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
        node_id = 0;
    }

    netdev->numa_id = node_id;
    fclose(stream);
    free(numa_node_path);
    return node_id;
}
1681
1682static int OVS_UNUSED
1683netdev_linux_get_numa_id(const struct netdev *netdev_)
1684{
1685 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1686 int numa_id;
1687
1688 ovs_mutex_lock(&netdev->mutex);
1689 numa_id = netdev_linux_get_numa_id__(netdev);
1690 ovs_mutex_unlock(&netdev->mutex);
1691
1692 return numa_id;
1693}
1694
/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets.
 *
 * Always consumes 'batch', whether transmission succeeds or fails. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    bool tso = userspace_tso_enabled();
    int mtu = ETH_PAYLOAD_MAX;
    int error = 0;
    int sock = 0;

    if (tso) {
        /* The MTU sizes the virtio_net_hdr prepended to each packet. */
        netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
    }

    if (!is_tap_netdev(netdev_)) {
        /* Raw-socket transmit only works in the local namespace. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        sock = af_packet_sock();
        if (sock < 0) {
            error = -sock;
            goto free_batch;
        }

        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
    } else {
        /* Tap devices must be written through their fd; see
         * netdev_linux_tap_batch_send(). */
        error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    dp_packet_delete_batch(batch, true);
    return error;
}
1754
1755/* Registers with the poll loop to wake up from the next call to poll_block()
1756 * when the packet transmission queue has sufficient room to transmit a packet
1757 * with netdev_send().
1758 *
1759 * The kernel maintains a packet transmission queue, so the client is not
1760 * expected to do additional queuing of packets. Thus, this function is
1761 * unlikely to ever be used. It is included for completeness. */
1762static void
f00fa8cb 1763netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1764{
796223f5 1765 if (is_tap_netdev(netdev)) {
8b61709d
BP
1766 /* TAP device always accepts packets.*/
1767 poll_immediate_wake();
1768 }
1769}
1770
/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
 * otherwise a positive errno value.
 *
 * Refuses devices living in another network namespace (EOPNOTSUPP).  Uses
 * the cached address to skip a no-op change, and updates the cache (and its
 * error) after the attempt. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            /* Earlier failure still stands, or the address is already set:
             * nothing to do. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the outcome; ENODEV is remembered so later calls fail fast
         * for a vanished device. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring a tap device back up only if it was up before. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1815
/* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 * Returns 0 on success, otherwise a positive errno value (which is also
 * cached so later calls fail fast). */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Preferred path: refresh the whole cached state over netlink. */
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Fall back to ioctl if netlink fails */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1843
8b61709d 1844static int
73371c09 1845netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1846{
86383816
BP
1847 int error;
1848
b5d57fc8 1849 if (!(netdev->cache_valid & VALID_MTU)) {
756819dd
FL
1850 netdev_linux_update_via_netlink(netdev);
1851 }
1852
1853 if (!(netdev->cache_valid & VALID_MTU)) {
1854 /* Fall back to ioctl if netlink fails */
8b61709d 1855 struct ifreq ifr;
90a6637d 1856
86383816 1857 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1858 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1859 netdev->mtu = ifr.ifr_mtu;
1860 netdev->cache_valid |= VALID_MTU;
8b61709d 1861 }
90a6637d 1862
86383816
BP
1863 error = netdev->netdev_mtu_error;
1864 if (!error) {
b5d57fc8 1865 *mtup = netdev->mtu;
90a6637d 1866 }
73371c09
BP
1867
1868 return error;
1869}
1870
1871/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1872 * in bytes, not including the hardware header; thus, this is typically 1500
1873 * bytes for Ethernet devices. */
1874static int
1875netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1876{
1877 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1878 int error;
1879
1880 ovs_mutex_lock(&netdev->mutex);
1881 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1882 ovs_mutex_unlock(&netdev->mutex);
1883
1884 return error;
8b61709d
BP
1885}
1886
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 on success, EOPNOTSUPP for devices in a remote network
 * namespace, otherwise a positive errno value.  The result is cached in
 * 'netdev' so that setting the same MTU twice is a no-op. */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

#ifdef HAVE_AF_XDP
    /* AF_XDP sockets have an upper bound on frame size, so reject MTUs the
     * XDP datapath cannot handle before touching the kernel. */
    if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
        error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
        if (error) {
            goto exit;
        }
    }
#endif

    if (netdev->cache_valid & VALID_MTU) {
        /* Skip the ioctl when the cached attempt failed or the MTU is
         * already the requested value. */
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }
    ifr.ifr_mtu = mtu;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        /* Cache the outcome (ENODEV too: the device is gone). */
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1931
9ab3d9a3
BP
1932/* Returns the ifindex of 'netdev', if successful, as a positive number.
1933 * On failure, returns a negative errno value. */
1934static int
86383816 1935netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1936{
86383816 1937 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1938 int ifindex, error;
1939
86383816 1940 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1941 if (netdev_linux_netnsid_is_remote(netdev)) {
1942 error = EOPNOTSUPP;
1943 goto exit;
1944 }
86383816 1945 error = get_ifindex(netdev_, &ifindex);
86383816 1946
e0e2410d
FL
1947exit:
1948 ovs_mutex_unlock(&netdev->mutex);
9ab3d9a3
BP
1949 return error ? -error : ifindex;
1950}
1951
8b61709d
BP
1952static int
1953netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1954{
b5d57fc8 1955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1956
86383816 1957 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1958 if (netdev->miimon_interval > 0) {
1959 *carrier = netdev->miimon;
3a183124 1960 } else {
b5d57fc8 1961 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1962 }
86383816 1963 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1964
3a183124 1965 return 0;
8b61709d
BP
1966}
1967
65c3058c 1968static long long int
86383816 1969netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1970{
86383816
BP
1971 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1972 long long int carrier_resets;
1973
1974 ovs_mutex_lock(&netdev->mutex);
1975 carrier_resets = netdev->carrier_resets;
1976 ovs_mutex_unlock(&netdev->mutex);
1977
1978 return carrier_resets;
65c3058c
EJ
1979}
1980
63331829 1981static int
1670c579
EJ
1982netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1983 struct mii_ioctl_data *data)
63331829 1984{
63331829 1985 struct ifreq ifr;
782e6111 1986 int error;
63331829 1987
63331829 1988 memset(&ifr, 0, sizeof ifr);
782e6111 1989 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1990 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1991 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1992
782e6111
EJ
1993 return error;
1994}
1995
/* Queries link status for interface 'name' into '*miimon'.  Tries the MII
 * registers first; if the device has no MII support, falls back to the
 * ethtool ETHTOOL_GLINK query.  Returns 0 on success, otherwise a positive
 * errno value ('*miimon' stays false in that case). */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS is the link-up bit of the basic mode status
             * register. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with a struct ethtool_value laid over
             * the same buffer; copy it out rather than casting. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
2038
1670c579
EJ
2039static int
2040netdev_linux_set_miimon_interval(struct netdev *netdev_,
2041 long long int interval)
2042{
b5d57fc8 2043 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 2044
86383816 2045 ovs_mutex_lock(&netdev->mutex);
1670c579 2046 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 2047 if (netdev->miimon_interval != interval) {
19c8e9c1 2048 if (interval && !netdev->miimon_interval) {
812c272c 2049 atomic_count_inc(&miimon_cnt);
19c8e9c1 2050 } else if (!interval && netdev->miimon_interval) {
812c272c 2051 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
2052 }
2053
b5d57fc8
BP
2054 netdev->miimon_interval = interval;
2055 timer_set_expired(&netdev->miimon_timer);
1670c579 2056 }
86383816 2057 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
2058
2059 return 0;
2060}
2061
/* Polls miimon link state for every netdev-linux device whose miimon timer
 * has expired, recording changes and rearming each timer.  Intended to be
 * called from the class's run() hook. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                /* Link state changed; notify so carrier counters and
                 * sequence numbers get updated. */
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
2091
2092static void
2093netdev_linux_miimon_wait(void)
2094{
2095 struct shash device_shash;
2096 struct shash_node *node;
2097
2098 shash_init(&device_shash);
b5d57fc8 2099 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 2100 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
2101 struct netdev *netdev = node->data;
2102 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 2103
86383816 2104 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
2105 if (dev->miimon_interval > 0) {
2106 timer_wait(&dev->miimon_timer);
2107 }
86383816 2108 ovs_mutex_unlock(&dev->mutex);
2f980d74 2109 netdev_close(netdev);
1670c579
EJ
2110 }
2111 shash_destroy(&device_shash);
2112}
2113
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *b;

    *b = *a;
    *a = saved;
}
2121
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned.
 *
 * The vport stats only carry packet/byte/error/drop counters; every other
 * field of 'dst' that this function knows about is explicitly zeroed (note
 * that fields of 'dst' not listed here are deliberately left untouched). */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    /* get_32aligned_u64() reads the 64-bit counters safely even when 'src'
     * is only 32-bit aligned. */
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
2151
2152static int
2153get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
2154{
93451a0a 2155 struct dpif_netlink_vport reply;
c060c4cf
EJ
2156 struct ofpbuf *buf;
2157 int error;
2158
93451a0a 2159 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
2160 if (error) {
2161 return error;
2162 } else if (!reply.stats) {
2163 ofpbuf_delete(buf);
2164 return EOPNOTSUPP;
2165 }
2166
2167 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
2168
2169 ofpbuf_delete(buf);
2170
2171 return 0;
2172}
2173
/* Fetches vport-layer statistics for 'netdev_' into 'stats' and caches the
 * resulting error code in 'netdev->vport_stats_error'.  Once a failure has
 * been cached, the vport is not queried again until the cache is
 * invalidated. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Retry while the last attempt succeeded, or when we have no cached
     * result yet. */
    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean this netdev is not a datapath vport,
             * which is normal; anything else is worth logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
8b61709d 2194
f613a0d7
PS
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Merges two sources: the datapath vport counters and the kernel netdev
 * counters from netlink.  When both are available, kernel packet/byte counts
 * win (they reflect on-the-wire totals) and error/drop counters are summed.
 * When only one source is available, it is used alone. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; if the vport stats succeeded, 'stats' is already
         * populated and we can report success. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Use kernel netdev's packet and byte counts since vport's counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO are
         * enabled. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2245
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Like netdev_linux_get_stats(), but from the switch's point of view a tap
 * or internal device's rx and tx are reversed, so the kernel counters are
 * swapped before being merged. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; fall back to the vport stats if they worked. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* Detailed error breakdowns do not survive the rx/tx swap
         * meaningfully, so they are cleared. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped += dev_stats.tx_dropped;
        stats->tx_dropped += dev_stats.rx_dropped;

        stats->rx_errors += dev_stats.tx_errors;
        stats->tx_errors += dev_stats.rx_errors;

        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
    }
    /* Add drops accounted locally in userspace (tap send/receive paths). */
    stats->tx_dropped += netdev->tx_dropped;
    stats->rx_dropped += netdev->rx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2309
bba1e6f3
PS
2310static int
2311netdev_internal_get_stats(const struct netdev *netdev_,
2312 struct netdev_stats *stats)
2313{
b5d57fc8 2314 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2315 int error;
bba1e6f3 2316
86383816 2317 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 2318 get_stats_via_vport(netdev_, stats);
86383816
BP
2319 error = netdev->vport_stats_error;
2320 ovs_mutex_unlock(&netdev->mutex);
2321
2322 return error;
bba1e6f3
PS
2323}
2324
/* Populates 'netdev''s supported/advertised/current feature sets from the
 * ethtool ETHTOOL_GSET reply, caching the result (including any error) under
 * VALID_FEATURES so the ioctl runs at most once per cache invalidation.
 *
 * Caller must hold 'netdev->mutex'. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    uint32_t speed;
    int error;

    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half) {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    /* OpenFlow has one feature bit per speed/duplex, so the various media
     * variants (T/KX/KR/CR/SR/LR) collapse onto the same bit. */
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    speed = ethtool_cmd_speed(&ecmd);
    if (speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (speed == 40000) {
        /* Raw Mb/s values used here; the SPEED_* macros for these rates are
         * not available on all supported kernel headers. */
        netdev->current = NETDEV_F_40GB_FD;
    } else if (speed == 100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (speed == 1000000) {
        netdev->current = NETDEV_F_1TB_FD;
    } else {
        netdev->current = 0;
    }

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the error too, so repeated calls do not hammer a failing
     * device. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2476
887ed8b2
BP
2477/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2478 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2479 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2480static int
2481netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2482 enum netdev_features *current,
2483 enum netdev_features *advertised,
2484 enum netdev_features *supported,
2485 enum netdev_features *peer)
51f87458 2486{
b5d57fc8 2487 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2488 int error;
51f87458 2489
86383816 2490 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2491 if (netdev_linux_netnsid_is_remote(netdev)) {
2492 error = EOPNOTSUPP;
2493 goto exit;
2494 }
2495
b5d57fc8 2496 netdev_linux_read_features(netdev);
b5d57fc8
BP
2497 if (!netdev->get_features_error) {
2498 *current = netdev->current;
2499 *advertised = netdev->advertised;
2500 *supported = netdev->supported;
887ed8b2 2501 *peer = 0; /* XXX */
51f87458 2502 }
86383816 2503 error = netdev->get_features_error;
86383816 2504
e0e2410d
FL
2505exit:
2506 ovs_mutex_unlock(&netdev->mutex);
86383816 2507 return error;
8b61709d
BP
2508}
2509
/* Set the features advertised by 'netdev' to 'advertise'.
 *
 * Implemented as a get-modify-set ethtool sequence: the current settings are
 * read with ETHTOOL_GSET, only the 'advertising' mask is replaced, and the
 * result is written back with ETHTOOL_SSET.  Returns 0 on success, otherwise
 * a positive errno value (EOPNOTSUPP for remote-namespace devices). */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);

    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Rebuild the advertising mask from scratch out of the NETDEV_F_* bits
     * the caller requested. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2580
e7f6ba22
PJV
2581static struct tc_police
2582tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2583{
2584 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2585 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2586 struct tc_police police;
2587 struct tc_ratespec rate;
2588 int mtu = 65535;
2589
2590 memset(&rate, 0, sizeof rate);
2591 rate.rate = bps;
2592 rate.cell_log = tc_calc_cell_log(mtu);
2593 rate.mpu = ETH_TOTAL_MIN;
2594
2595 memset(&police, 0, sizeof police);
2596 police.burst = tc_bytes_to_ticks(bps, bsize);
2597 police.action = TC_POLICE_SHOT;
2598 police.rate = rate;
2599 police.mtu = mtu;
2600
2601 return police;
2602}
2603
/* Appends a "police" tc action describing 'police' to the netlink message in
 * 'request'.  Attribute order matters: it is the on-the-wire format the
 * kernel parses. */
static void
nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
{
    size_t offset;

    nl_msg_put_string(request, TCA_ACT_KIND, "police");
    offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
    nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
    /* The kernel needs the precomputed rate table that goes with the
     * ratespec embedded in 'police'. */
    tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
    nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
    nl_msg_end_nested(request, offset);
}
2616
/* Installs an ingress matchall filter with a police action on 'netdev',
 * rate-limiting all incoming traffic to 'kbits_rate' / 'kbits_burst'.
 * Returns 0 on success, otherwise a positive errno value. */
static int
tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
                        uint32_t kbits_burst)
{
    uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
    size_t basic_offset, action_offset, inner_offset;
    uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
    int ifindex, index, err = 0;
    struct tc_police pol_act;
    uint32_t block_id = 0;
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct tcmsg *tcmsg;
    uint32_t handle = 1;

    err = get_ifindex(netdev, &ifindex);
    if (err) {
        return err;
    }

    /* block_id is currently always 0, so the filter is attached directly to
     * the device's ingress qdisc rather than a shared block. */
    index = block_id ? TCM_IFINDEX_MAGIC_BLOCK : ifindex;
    tcmsg = tc_make_request(index, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
                            &request);
    tcmsg->tcm_parent = block_id ? : TC_INGRESS_PARENT;
    tcmsg->tcm_info = tc_make_handle(prio, eth_type);
    tcmsg->tcm_handle = handle;

    /* matchall filter -> action list -> action #1 -> police action. */
    pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
    nl_msg_put_string(&request, TCA_KIND, "matchall");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
    inner_offset = nl_msg_start_nested(&request, 1);
    nl_msg_put_act_police(&request, pol_act);
    nl_msg_end_nested(&request, inner_offset);
    nl_msg_end_nested(&request, action_offset);
    nl_msg_end_nested(&request, basic_offset);

    err = tc_transact(&request, &reply);
    if (!err) {
        /* The assertion validates that the reply is large enough to hold a
         * tcmsg; its contents are otherwise unused. */
        struct tcmsg *tc =
            ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
        ofpbuf_delete(reply);
    }

    return err;
}
2663
2664static int
2665tc_del_matchall_policer(struct netdev *netdev)
2666{
acdd544c 2667 int prio = TC_RESERVED_PRIORITY_POLICE;
e7f6ba22 2668 uint32_t block_id = 0;
acdd544c 2669 struct tcf_id id;
e7f6ba22
PJV
2670 int ifindex;
2671 int err;
2672
2673 err = get_ifindex(netdev, &ifindex);
2674 if (err) {
2675 return err;
2676 }
2677
acdd544c
PB
2678 id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
2679 err = tc_del_filter(&id);
e7f6ba22
PJV
2680 if (err) {
2681 return err;
2682 }
2683
2684 return 0;
2685}
2686
f8500004
JP
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
netdev_linux_set_policing(struct netdev *netdev_,
                          uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    /* Devices in a remote network namespace cannot be configured here. */
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kbits_burst == kbits_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        /* Parameters changed: invalidate and reconfigure below. */
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);

    /* Use matchall for policing when offloading OVS with tc-flower, so the
     * policer does not conflict with the flower ingress filters.  Note that
     * the delete's return value is deliberately overwritten by the add. */
    if (netdev_is_flow_api_enabled()) {
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
        }
        ovs_mutex_unlock(&netdev->mutex);
        return error;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate) {
        /* Recreate the ingress qdisc, then attach the policing action. */
        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }
    }

    netdev->kbits_rate = kbits_rate;
    netdev->kbits_burst = kbits_burst;

out:
    /* Cache the outcome (including ENODEV for vanished devices) so repeated
     * identical requests are no-ops. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2770
c1c9c9c4
BP
2771static int
2772netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2773 struct sset *types)
c1c9c9c4 2774{
559eb230 2775 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2776 for (opsp = tcs; *opsp != NULL; opsp++) {
2777 const struct tc_ops *ops = *opsp;
2778 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2779 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2780 }
2781 }
2782 return 0;
2783}
2784
2785static const struct tc_ops *
2786tc_lookup_ovs_name(const char *name)
2787{
559eb230 2788 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2789
2790 for (opsp = tcs; *opsp != NULL; opsp++) {
2791 const struct tc_ops *ops = *opsp;
2792 if (!strcmp(name, ops->ovs_name)) {
2793 return ops;
2794 }
2795 }
2796 return NULL;
2797}
2798
2799static const struct tc_ops *
2800tc_lookup_linux_name(const char *name)
2801{
559eb230 2802 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2803
2804 for (opsp = tcs; *opsp != NULL; opsp++) {
2805 const struct tc_ops *ops = *opsp;
2806 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2807 return ops;
2808 }
2809 }
2810 return NULL;
2811}
2812
93b13be8 2813static struct tc_queue *
b5d57fc8 2814tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2815 size_t hash)
2816{
b5d57fc8 2817 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2818 struct tc_queue *queue;
2819
b5d57fc8 2820 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2821 if (queue->queue_id == queue_id) {
2822 return queue;
2823 }
2824 }
2825 return NULL;
2826}
2827
2828static struct tc_queue *
2829tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2830{
2831 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2832}
2833
c1c9c9c4
BP
2834static int
2835netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2836 const char *type,
2837 struct netdev_qos_capabilities *caps)
2838{
2839 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2840 if (!ops) {
2841 return EOPNOTSUPP;
2842 }
2843 caps->n_queues = ops->n_queues;
2844 return 0;
2845}
2846
2847static int
b5d57fc8 2848netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2849 const char **typep, struct smap *details)
c1c9c9c4 2850{
b5d57fc8 2851 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2852 int error;
2853
86383816 2854 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2855 if (netdev_linux_netnsid_is_remote(netdev)) {
2856 error = EOPNOTSUPP;
2857 goto exit;
2858 }
2859
b5d57fc8 2860 error = tc_query_qdisc(netdev_);
86383816
BP
2861 if (!error) {
2862 *typep = netdev->tc->ops->ovs_name;
2863 error = (netdev->tc->ops->qdisc_get
2864 ? netdev->tc->ops->qdisc_get(netdev_, details)
2865 : 0);
c1c9c9c4
BP
2866 }
2867
e0e2410d
FL
2868exit:
2869 ovs_mutex_unlock(&netdev->mutex);
86383816 2870 return error;
c1c9c9c4
BP
2871}
2872
/* Installs QoS type 'type' with configuration 'details' on 'netdev_'.
 * If the same type is already installed, just reconfigures it; otherwise
 * deletes the current qdisc and installs the new one.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The "noop" implementation touches no kernel state, so it needs no
     * locking or qdisc query. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Make sure netdev->tc reflects the kernel's current qdisc. */
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            goto exit;
        }
        /* tc_del_qdisc() clears netdev->tc on success. */
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc.  On success the implementation sets
         * netdev->tc; on failure it must leave it NULL. */
        error = new_ops->tc_install(netdev_, details);
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2920
2921static int
b5d57fc8 2922netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2923 unsigned int queue_id, struct smap *details)
c1c9c9c4 2924{
b5d57fc8 2925 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2926 int error;
2927
86383816 2928 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2929 if (netdev_linux_netnsid_is_remote(netdev)) {
2930 error = EOPNOTSUPP;
2931 goto exit;
2932 }
2933
b5d57fc8 2934 error = tc_query_qdisc(netdev_);
86383816 2935 if (!error) {
b5d57fc8 2936 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2937 error = (queue
b5d57fc8 2938 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2939 : ENOENT);
c1c9c9c4 2940 }
86383816 2941
e0e2410d
FL
2942exit:
2943 ovs_mutex_unlock(&netdev->mutex);
86383816 2944 return error;
c1c9c9c4
BP
2945}
2946
2947static int
b5d57fc8 2948netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2949 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2950{
b5d57fc8 2951 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2952 int error;
2953
86383816 2954 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2955 if (netdev_linux_netnsid_is_remote(netdev)) {
2956 error = EOPNOTSUPP;
2957 goto exit;
2958 }
2959
b5d57fc8 2960 error = tc_query_qdisc(netdev_);
86383816
BP
2961 if (!error) {
2962 error = (queue_id < netdev->tc->ops->n_queues
2963 && netdev->tc->ops->class_set
2964 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2965 : EINVAL);
c1c9c9c4
BP
2966 }
2967
e0e2410d
FL
2968exit:
2969 ovs_mutex_unlock(&netdev->mutex);
86383816 2970 return error;
c1c9c9c4
BP
2971}
2972
2973static int
b5d57fc8 2974netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2975{
b5d57fc8 2976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2977 int error;
2978
86383816 2979 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2980 if (netdev_linux_netnsid_is_remote(netdev)) {
2981 error = EOPNOTSUPP;
2982 goto exit;
2983 }
2984
b5d57fc8 2985 error = tc_query_qdisc(netdev_);
86383816
BP
2986 if (!error) {
2987 if (netdev->tc->ops->class_delete) {
2988 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2989 error = (queue
2990 ? netdev->tc->ops->class_delete(netdev_, queue)
2991 : ENOENT);
2992 } else {
2993 error = EINVAL;
2994 }
c1c9c9c4 2995 }
86383816 2996
e0e2410d
FL
2997exit:
2998 ovs_mutex_unlock(&netdev->mutex);
86383816 2999 return error;
c1c9c9c4
BP
3000}
3001
3002static int
b5d57fc8 3003netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
3004 unsigned int queue_id,
3005 struct netdev_queue_stats *stats)
3006{
b5d57fc8 3007 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3008 int error;
3009
86383816 3010 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3011 if (netdev_linux_netnsid_is_remote(netdev)) {
3012 error = EOPNOTSUPP;
3013 goto exit;
3014 }
3015
b5d57fc8 3016 error = tc_query_qdisc(netdev_);
86383816
BP
3017 if (!error) {
3018 if (netdev->tc->ops->class_get_stats) {
3019 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3020 if (queue) {
3021 stats->created = queue->created;
3022 error = netdev->tc->ops->class_get_stats(netdev_, queue,
3023 stats);
3024 } else {
3025 error = ENOENT;
3026 }
3027 } else {
3028 error = EOPNOTSUPP;
6dc34a0d 3029 }
c1c9c9c4 3030 }
86383816 3031
e0e2410d
FL
3032exit:
3033 ovs_mutex_unlock(&netdev->mutex);
86383816 3034 return error;
c1c9c9c4
BP
3035}
3036
d57695d7
JS
/* State carried across a netlink dump of TC classes (see start_queue_dump()
 * and finish_queue_dump()). */
struct queue_dump_state {
    struct nl_dump dump;    /* In-progress NETLINK_ROUTE dump. */
    struct ofpbuf buf;      /* Reusable receive buffer for nl_dump_next(). */
};
3041
23a98ffe 3042static bool
d57695d7 3043start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
3044{
3045 struct ofpbuf request;
3046 struct tcmsg *tcmsg;
3047
7874bdff 3048 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
3049 if (!tcmsg) {
3050 return false;
3051 }
3c4de644 3052 tcmsg->tcm_parent = 0;
d57695d7 3053 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 3054 ofpbuf_uninit(&request);
d57695d7
JS
3055
3056 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 3057 return true;
c1c9c9c4
BP
3058}
3059
d57695d7
JS
3060static int
3061finish_queue_dump(struct queue_dump_state *state)
3062{
3063 ofpbuf_uninit(&state->buf);
3064 return nl_dump_done(&state->dump);
3065}
3066
89454bf4
BP
/* Iterator state for netdev_linux_queue_dump_{start,next,done}().  Holds a
 * snapshot of the queue ids taken under the device mutex, so iteration is
 * not invalidated by concurrent queue changes. */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Snapshot array of queue ids (malloc'ed). */
    size_t cur_queue;       /* Index of the next id to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
3072
c1c9c9c4 3073static int
89454bf4 3074netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 3075{
e0e2410d 3076 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3077 int error;
3078
86383816 3079 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3080 if (netdev_linux_netnsid_is_remote(netdev)) {
3081 error = EOPNOTSUPP;
3082 goto exit;
3083 }
3084
b5d57fc8 3085 error = tc_query_qdisc(netdev_);
86383816
BP
3086 if (!error) {
3087 if (netdev->tc->ops->class_get) {
89454bf4
BP
3088 struct netdev_linux_queue_state *state;
3089 struct tc_queue *queue;
3090 size_t i;
3091
3092 *statep = state = xmalloc(sizeof *state);
3093 state->n_queues = hmap_count(&netdev->tc->queues);
3094 state->cur_queue = 0;
3095 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
3096
3097 i = 0;
3098 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
3099 state->queues[i++] = queue->queue_id;
86383816 3100 }
c1c9c9c4 3101 } else {
86383816 3102 error = EOPNOTSUPP;
c1c9c9c4
BP
3103 }
3104 }
c1c9c9c4 3105
e0e2410d
FL
3106exit:
3107 ovs_mutex_unlock(&netdev->mutex);
86383816 3108 return error;
c1c9c9c4
BP
3109}
3110
89454bf4
BP
3111static int
3112netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
3113 unsigned int *queue_idp, struct smap *details)
3114{
e0e2410d 3115 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
89454bf4
BP
3116 struct netdev_linux_queue_state *state = state_;
3117 int error = EOF;
3118
3119 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3120 if (netdev_linux_netnsid_is_remote(netdev)) {
3121 error = EOPNOTSUPP;
3122 goto exit;
3123 }
3124
89454bf4
BP
3125 while (state->cur_queue < state->n_queues) {
3126 unsigned int queue_id = state->queues[state->cur_queue++];
3127 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3128
3129 if (queue) {
3130 *queue_idp = queue_id;
3131 error = netdev->tc->ops->class_get(netdev_, queue, details);
3132 break;
3133 }
3134 }
89454bf4 3135
e0e2410d
FL
3136exit:
3137 ovs_mutex_unlock(&netdev->mutex);
89454bf4
BP
3138 return error;
3139}
3140
3141static int
3142netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
3143 void *state_)
3144{
3145 struct netdev_linux_queue_state *state = state_;
3146
3147 free(state->queues);
3148 free(state);
3149 return 0;
3150}
3151
c1c9c9c4 3152static int
b5d57fc8 3153netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
3154 netdev_dump_queue_stats_cb *cb, void *aux)
3155{
b5d57fc8 3156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3157 int error;
3158
86383816 3159 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3160 if (netdev_linux_netnsid_is_remote(netdev)) {
3161 error = EOPNOTSUPP;
3162 goto exit;
3163 }
3164
b5d57fc8 3165 error = tc_query_qdisc(netdev_);
86383816 3166 if (!error) {
d57695d7 3167 struct queue_dump_state state;
c1c9c9c4 3168
86383816
BP
3169 if (!netdev->tc->ops->class_dump_stats) {
3170 error = EOPNOTSUPP;
d57695d7 3171 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
3172 error = ENODEV;
3173 } else {
3174 struct ofpbuf msg;
3175 int retval;
3176
d57695d7 3177 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
3178 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
3179 cb, aux);
3180 if (retval) {
3181 error = retval;
3182 }
3183 }
3184
d57695d7 3185 retval = finish_queue_dump(&state);
86383816
BP
3186 if (retval) {
3187 error = retval;
3188 }
c1c9c9c4
BP
3189 }
3190 }
3191
e0e2410d
FL
3192exit:
3193 ovs_mutex_unlock(&netdev->mutex);
86383816 3194 return error;
c1c9c9c4
BP
3195}
3196
8b61709d 3197static int
f1acd62b
BP
3198netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3199 struct in_addr netmask)
8b61709d 3200{
b5d57fc8 3201 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
3202 int error;
3203
86383816 3204 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3205 if (netdev_linux_netnsid_is_remote(netdev)) {
3206 error = EOPNOTSUPP;
3207 goto exit;
3208 }
3209
f1acd62b 3210 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 3211 if (!error) {
f1acd62b 3212 if (address.s_addr != INADDR_ANY) {
8b61709d 3213 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 3214 "SIOCSIFNETMASK", netmask);
8b61709d
BP
3215 }
3216 }
49af9a3d 3217
e0e2410d 3218exit:
86383816 3219 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
3220 return error;
3221}
3222
7df6932e
AW
/* Retrieves all IPv4/IPv6 addresses currently assigned to 'netdev_' into
 * malloc'ed arrays '*addr' and '*mask' with '*n_cnt' elements (caller
 * frees).  Returns 0 on success, otherwise a positive errno value.
 * (NOTE(review): the previous comment described an older single-IPv6
 * getter; this reflects the actual netdev_get_addrs() contract.) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3245
/* Fills '*sa' with an AF_INET sockaddr holding 'addr' and port 0, zeroing
 * any trailing bytes of the (larger) generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3258
3259static int
3260do_set_addr(struct netdev *netdev,
3261 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3262{
3263 struct ifreq ifr;
149f577a 3264
259e0b1a
BP
3265 make_in4_sockaddr(&ifr.ifr_addr, addr);
3266 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3267 ioctl_name);
8b61709d
BP
3268}
3269
3270/* Adds 'router' as a default IP gateway. */
3271static int
67a4917b 3272netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
3273{
3274 struct in_addr any = { INADDR_ANY };
3275 struct rtentry rt;
3276 int error;
3277
3278 memset(&rt, 0, sizeof rt);
3279 make_in4_sockaddr(&rt.rt_dst, any);
3280 make_in4_sockaddr(&rt.rt_gateway, router);
3281 make_in4_sockaddr(&rt.rt_genmask, any);
3282 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 3283 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 3284 if (error) {
10a89ef0 3285 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
3286 }
3287 return error;
3288}
3289
f1acd62b
BP
3290static int
3291netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
3292 char **netdev_name)
3293{
3294 static const char fn[] = "/proc/net/route";
3295 FILE *stream;
3296 char line[256];
3297 int ln;
3298
3299 *netdev_name = NULL;
3300 stream = fopen(fn, "r");
3301 if (stream == NULL) {
10a89ef0 3302 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
3303 return errno;
3304 }
3305
3306 ln = 0;
3307 while (fgets(line, sizeof line, stream)) {
3308 if (++ln >= 2) {
3309 char iface[17];
dbba996b 3310 ovs_be32 dest, gateway, mask;
f1acd62b
BP
3311 int refcnt, metric, mtu;
3312 unsigned int flags, use, window, irtt;
3313
c2c28dfd
BP
3314 if (!ovs_scan(line,
3315 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
3316 " %d %u %u\n",
3317 iface, &dest, &gateway, &flags, &refcnt,
3318 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 3319 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
3320 fn, ln, line);
3321 continue;
3322 }
3323 if (!(flags & RTF_UP)) {
3324 /* Skip routes that aren't up. */
3325 continue;
3326 }
3327
3328 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 3329 * network byte order, so we don't need need any endian
f1acd62b
BP
3330 * conversions here. */
3331 if ((dest & mask) == (host->s_addr & mask)) {
3332 if (!gateway) {
3333 /* The host is directly reachable. */
3334 next_hop->s_addr = 0;
3335 } else {
3336 /* To reach the host, we must go through a gateway. */
3337 next_hop->s_addr = gateway;
3338 }
3339 *netdev_name = xstrdup(iface);
3340 fclose(stream);
3341 return 0;
3342 }
3343 }
3344 }
3345
3346 fclose(stream);
3347 return ENXIO;
3348}
3349
e210037e 3350static int
b5d57fc8 3351netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 3352{
b5d57fc8 3353 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
3354 int error = 0;
3355
86383816 3356 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
3357 if (!(netdev->cache_valid & VALID_DRVINFO)) {
3358 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
3359
3360 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
3361 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
3362 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
3363 cmd,
3364 ETHTOOL_GDRVINFO,
3365 "ETHTOOL_GDRVINFO");
3366 if (!error) {
b5d57fc8 3367 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
3368 }
3369 }
e210037e 3370
e210037e 3371 if (!error) {
b5d57fc8
BP
3372 smap_add(smap, "driver_name", netdev->drvinfo.driver);
3373 smap_add(smap, "driver_version", netdev->drvinfo.version);
3374 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 3375 }
86383816
BP
3376 ovs_mutex_unlock(&netdev->mutex);
3377
e210037e
AE
3378 return error;
3379}
3380
4f925bd3 3381static int
275707c3
EJ
3382netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3383 struct smap *smap)
4f925bd3 3384{
79f1cbe9 3385 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
3386 return 0;
3387}
3388
25db83be
JH
/* Returns the TC block id to use for 'netdev_', or 0 if it has none.
 * LAG masters use their own ifindex as the block id so that slave filters
 * can share one block. */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are LAG masters. */
    if (netdev->is_lag_master) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3409
8b61709d
BP
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address first... */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    /* ...then the hardware address family and the device name. */
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry", which is not worth a warning. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3442
b24751ff 3443static unsigned int
8b61709d
BP
3444nd_to_iff_flags(enum netdev_flags nd)
3445{
b24751ff 3446 unsigned int iff = 0;
8b61709d
BP
3447 if (nd & NETDEV_UP) {
3448 iff |= IFF_UP;
3449 }
3450 if (nd & NETDEV_PROMISC) {
3451 iff |= IFF_PROMISC;
3452 }
7ba19d41
AC
3453 if (nd & NETDEV_LOOPBACK) {
3454 iff |= IFF_LOOPBACK;
3455 }
8b61709d
BP
3456 return iff;
3457}
3458
3459static int
b24751ff 3460iff_to_nd_flags(unsigned int iff)
8b61709d
BP
3461{
3462 enum netdev_flags nd = 0;
3463 if (iff & IFF_UP) {
3464 nd |= NETDEV_UP;
3465 }
3466 if (iff & IFF_PROMISC) {
3467 nd |= NETDEV_PROMISC;
3468 }
7ba19d41
AC
3469 if (iff & IFF_LOOPBACK) {
3470 nd |= NETDEV_LOOPBACK;
3471 }
8b61709d
BP
3472 return nd;
3473}
3474
/* Applies flag changes ('off' cleared, then 'on' set) to 'netdev', storing
 * the previous flags in '*old_flagsp'.  Caller must hold netdev->mutex. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    unsigned int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read the flags regardless of set_flags()'s outcome, so the
         * cached value reflects what the kernel actually holds. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3493
/* netdev_class flag hook: clears 'off', sets 'on', and reports the previous
 * flags in '*old_flagsp'.  With no changes requested, this is a pure read. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3522
89c09c1c
BP
/* netdev_class members shared by all Linux-backed classes ("system", "tap",
 * "internal", and "afxdp").  Class-specific members (construct/destruct,
 * stats, send/receive, ...) are supplied by each definition below. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3561
/* "system" ports: ordinary kernel network devices. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "system",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3577
/* "tap" ports: kernel TAP devices created by OVS (tap-specific construct
 * and stats; tx/rx stats are swapped relative to "system" devices). */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .is_pmd = false,
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3592
/* "internal" ports: devices implemented by the Open vSwitch datapath. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
0de1b425
WT
3606
#ifdef HAVE_AF_XDP
/* "afxdp" ports: AF_XDP userspace datapath devices; polled by PMD threads
 * (is_pmd = true), with AF_XDP-specific configuration and I/O hooks. */
const struct netdev_class netdev_afxdp_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "afxdp",
    .is_pmd = true,
    .init = netdev_afxdp_init,
    .construct = netdev_afxdp_construct,
    .destruct = netdev_afxdp_destruct,
    .get_stats = netdev_afxdp_get_stats,
    .get_custom_stats = netdev_afxdp_get_custom_stats,
    .get_status = netdev_linux_get_status,
    .set_config = netdev_afxdp_set_config,
    .get_config = netdev_afxdp_get_config,
    .reconfigure = netdev_afxdp_reconfigure,
    .get_numa_id = netdev_linux_get_numa_id,
    .send = netdev_afxdp_batch_send,
    .rxq_construct = netdev_afxdp_rxq_construct,
    .rxq_destruct = netdev_afxdp_rxq_destruct,
    .rxq_recv = netdev_afxdp_rxq_recv,
};
#endif
8b61709d 3628\f
677d9158
JV
3629
3630#define CODEL_N_QUEUES 0x0000
3631
2f4298ce
BP
3632/* In sufficiently new kernel headers these are defined as enums in
3633 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3634 * kernels. (This overrides any enum definition in the header file but that's
3635 * harmless.) */
3636#define TCA_CODEL_TARGET 1
3637#define TCA_CODEL_LIMIT 2
3638#define TCA_CODEL_INTERVAL 3
3639
677d9158
JV
/* State for a CoDel (Controlled Delay) root qdisc managed by OVS.  The
 * embedded 'tc' lets CONTAINER_OF recover this struct from netdev->tc. */
struct codel {
    struct tc tc;
    uint32_t target;    /* Target delay; presumably usec — TODO confirm. */
    uint32_t limit;     /* Queue limit; presumably packets — TODO confirm. */
    uint32_t interval;  /* Estimation window; presumably usec — TODO confirm. */
};
3646
3647static struct codel *
3648codel_get__(const struct netdev *netdev_)
3649{
3650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3651 return CONTAINER_OF(netdev->tc, struct codel, tc);
3652}
3653
3654static void
3655codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3656 uint32_t interval)
3657{
3658 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3659 struct codel *codel;
3660
3661 codel = xmalloc(sizeof *codel);
3662 tc_init(&codel->tc, &tc_ops_codel);
3663 codel->target = target;
3664 codel->limit = limit;
3665 codel->interval = interval;
3666
3667 netdev->tc = &codel->tc;
3668}
3669
/* Replaces 'netdev''s root qdisc with a codel qdisc configured from the
 * given parameters (0 selects the default for each).  Returns 0 on
 * success, otherwise a positive errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Clear whatever root qdisc is currently installed. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Fall back to defaults for unspecified (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    /* Message layout: TCA_KIND, then the codel knobs nested in
     * TCA_OPTIONS. */
    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
3711
3712static void
3713codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3714 const struct smap *details, struct codel *codel)
3715{
13c1637f
BP
3716 codel->target = smap_get_ullong(details, "target", 0);
3717 codel->limit = smap_get_ullong(details, "limit", 0);
3718 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3719
3720 if (!codel->target) {
3721 codel->target = 5000;
3722 }
3723 if (!codel->limit) {
3724 codel->limit = 10240;
3725 }
3726 if (!codel->interval) {
3727 codel->interval = 100000;
3728 }
3729}
3730
3731static int
3732codel_tc_install(struct netdev *netdev, const struct smap *details)
3733{
3734 int error;
3735 struct codel codel;
3736
3737 codel_parse_qdisc_details__(netdev, details, &codel);
3738 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3739 codel.interval);
3740 if (!error) {
3741 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3742 }
3743 return error;
3744}
3745
3746static int
3747codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3748{
3749 static const struct nl_policy tca_codel_policy[] = {
3750 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3751 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3752 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3753 };
3754
3755 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3756
3757 if (!nl_parse_nested(nl_options, tca_codel_policy,
3758 attrs, ARRAY_SIZE(tca_codel_policy))) {
3759 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3760 return EPROTO;
3761 }
3762
3763 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3764 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3765 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3766 return 0;
3767}
3768
3769static int
3770codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3771{
3772 struct nlattr *nlattr;
3773 const char * kind;
3774 int error;
3775 struct codel codel;
3776
3777 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3778 if (error != 0) {
3779 return error;
3780 }
3781
3782 error = codel_parse_tca_options__(nlattr, &codel);
3783 if (error != 0) {
3784 return error;
3785 }
3786
3787 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3788 return 0;
3789}
3790
3791
3792static void
3793codel_tc_destroy(struct tc *tc)
3794{
3795 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3796 tc_destroy(tc);
3797 free(codel);
3798}
3799
3800static int
3801codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3802{
3803 const struct codel *codel = codel_get__(netdev);
3804 smap_add_format(details, "target", "%u", codel->target);
3805 smap_add_format(details, "limit", "%u", codel->limit);
3806 smap_add_format(details, "interval", "%u", codel->interval);
3807 return 0;
3808}
3809
3810static int
3811codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3812{
3813 struct codel codel;
3814
3815 codel_parse_qdisc_details__(netdev, details, &codel);
3816 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3817 codel_get__(netdev)->target = codel.target;
3818 codel_get__(netdev)->limit = codel.limit;
3819 codel_get__(netdev)->interval = codel.interval;
3820 return 0;
3821}
3822
/* Operations table for the "linux-codel" QoS type.  CoDel is classless, so
 * no class/queue hooks are provided (n_queues is 0). */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
3833\f
/* FQ-CoDel traffic control class. */

/* FQ-CoDel is classless, so it exposes no configurable queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* FQ-CoDel qdisc state, embedded as the netdev's 'tc' via CONTAINER_OF.
 * Defaults applied elsewhere in this file: target 5000, limit 10240,
 * flows 1024, quantum 1514; note the interval default differs between
 * fqcodel_parse_qdisc_details__ (1000000) and fqcodel_setup_qdisc__
 * (100000). */
struct fqcodel {
    struct tc tc;
    uint32_t target;
    uint32_t limit;
    uint32_t interval;
    uint32_t flows;
    uint32_t quantum;
};
3857
3858static struct fqcodel *
3859fqcodel_get__(const struct netdev *netdev_)
3860{
3861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3862 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3863}
3864
3865static void
3866fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3867 uint32_t interval, uint32_t flows, uint32_t quantum)
3868{
3869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3870 struct fqcodel *fqcodel;
3871
3872 fqcodel = xmalloc(sizeof *fqcodel);
3873 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3874 fqcodel->target = target;
3875 fqcodel->limit = limit;
3876 fqcodel->interval = interval;
3877 fqcodel->flows = flows;
3878 fqcodel->quantum = quantum;
3879
3880 netdev->tc = &fqcodel->tc;
3881}
3882
3883static int
3884fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3885 uint32_t interval, uint32_t flows, uint32_t quantum)
3886{
3887 size_t opt_offset;
3888 struct ofpbuf request;
3889 struct tcmsg *tcmsg;
3890 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3891 int error;
3892
3893 tc_del_qdisc(netdev);
3894
7874bdff
RD
3895 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3896 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3897 if (!tcmsg) {
3898 return ENODEV;
3899 }
3900 tcmsg->tcm_handle = tc_make_handle(1, 0);
3901 tcmsg->tcm_parent = TC_H_ROOT;
3902
3903 otarget = target ? target : 5000;
3904 olimit = limit ? limit : 10240;
3905 ointerval = interval ? interval : 100000;
3906 oflows = flows ? flows : 1024;
3907 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3908 not mtu */
3909
3910 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3911 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3912 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3913 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3914 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3915 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3916 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3917 nl_msg_end_nested(&request, opt_offset);
3918
3919 error = tc_transact(&request, NULL);
3920 if (error) {
3921 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3922 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3923 netdev_get_name(netdev),
3924 otarget, olimit, ointerval, oflows, oquantum,
3925 error, ovs_strerror(error));
3926 }
3927 return error;
3928}
3929
3930static void
3931fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3932 const struct smap *details, struct fqcodel *fqcodel)
3933{
13c1637f
BP
3934 fqcodel->target = smap_get_ullong(details, "target", 0);
3935 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3936 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3937 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3938 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3939
677d9158
JV
3940 if (!fqcodel->target) {
3941 fqcodel->target = 5000;
3942 }
3943 if (!fqcodel->limit) {
3944 fqcodel->limit = 10240;
3945 }
3946 if (!fqcodel->interval) {
3947 fqcodel->interval = 1000000;
3948 }
3949 if (!fqcodel->flows) {
3950 fqcodel->flows = 1024;
3951 }
3952 if (!fqcodel->quantum) {
3953 fqcodel->quantum = 1514;
3954 }
3955}
3956
3957static int
3958fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3959{
3960 int error;
3961 struct fqcodel fqcodel;
3962
3963 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3964 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3965 fqcodel.interval, fqcodel.flows,
3966 fqcodel.quantum);
3967 if (!error) {
3968 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3969 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3970 }
3971 return error;
3972}
3973
3974static int
3975fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3976{
3977 static const struct nl_policy tca_fqcodel_policy[] = {
3978 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3979 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3980 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3981 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3982 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3983 };
3984
3985 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3986
3987 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3988 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3989 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3990 return EPROTO;
3991 }
3992
3993 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3994 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3995 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3996 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3997 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3998 return 0;
3999}
4000
4001static int
4002fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4003{
4004 struct nlattr *nlattr;
4005 const char * kind;
4006 int error;
4007 struct fqcodel fqcodel;
4008
4009 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4010 if (error != 0) {
4011 return error;
4012 }
4013
4014 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4015 if (error != 0) {
4016 return error;
4017 }
4018
4019 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4020 fqcodel.flows, fqcodel.quantum);
4021 return 0;
4022}
4023
4024static void
4025fqcodel_tc_destroy(struct tc *tc)
4026{
4027 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4028 tc_destroy(tc);
4029 free(fqcodel);
4030}
4031
4032static int
4033fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4034{
4035 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4036 smap_add_format(details, "target", "%u", fqcodel->target);
4037 smap_add_format(details, "limit", "%u", fqcodel->limit);
4038 smap_add_format(details, "interval", "%u", fqcodel->interval);
4039 smap_add_format(details, "flows", "%u", fqcodel->flows);
4040 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4041 return 0;
4042}
4043
4044static int
4045fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
4046{
4047 struct fqcodel fqcodel;
4048
4049 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4050 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4051 fqcodel.flows, fqcodel.quantum);
4052 fqcodel_get__(netdev)->target = fqcodel.target;
4053 fqcodel_get__(netdev)->limit = fqcodel.limit;
4054 fqcodel_get__(netdev)->interval = fqcodel.interval;
4055 fqcodel_get__(netdev)->flows = fqcodel.flows;
4056 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
4057 return 0;
4058}
4059
/* Operations table for the "linux-fq_codel" QoS type.  Classless, so no
 * class/queue hooks are provided. */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
4070\f
/* SFQ traffic control class. */

/* SFQ is classless, so it exposes no configurable queues. */
#define SFQ_N_QUEUES 0x0000

/* SFQ qdisc state, embedded as the netdev's 'tc' via CONTAINER_OF.
 * 'quantum' defaults to the device MTU and 'perturb' to 10 (seconds,
 * presumably, per tc_sfq_qopt's perturb_period -- TODO confirm). */
struct sfq {
    struct tc tc;
    uint32_t quantum;
    uint32_t perturb;
};
4080
4081static struct sfq *
4082sfq_get__(const struct netdev *netdev_)
4083{
4084 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4085 return CONTAINER_OF(netdev->tc, struct sfq, tc);
4086}
4087
4088static void
4089sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4090{
4091 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4092 struct sfq *sfq;
4093
4094 sfq = xmalloc(sizeof *sfq);
4095 tc_init(&sfq->tc, &tc_ops_sfq);
4096 sfq->perturb = perturb;
4097 sfq->quantum = quantum;
4098
4099 netdev->tc = &sfq->tc;
4100}
4101
4102static int
4103sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
4104{
4105 struct tc_sfq_qopt opt;
4106 struct ofpbuf request;
4107 struct tcmsg *tcmsg;
4108 int mtu;
4109 int mtu_error, error;
4110 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
4111
4112 tc_del_qdisc(netdev);
4113
7874bdff
RD
4114 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4115 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
4116 if (!tcmsg) {
4117 return ENODEV;
4118 }
4119 tcmsg->tcm_handle = tc_make_handle(1, 0);
4120 tcmsg->tcm_parent = TC_H_ROOT;
4121
4122 memset(&opt, 0, sizeof opt);
4123 if (!quantum) {
4124 if (!mtu_error) {
4125 opt.quantum = mtu; /* if we cannot find mtu, use default */
4126 }
4127 } else {
4128 opt.quantum = quantum;
4129 }
4130
4131 if (!perturb) {
4132 opt.perturb_period = 10;
4133 } else {
4134 opt.perturb_period = perturb;
4135 }
4136
4137 nl_msg_put_string(&request, TCA_KIND, "sfq");
4138 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4139
4140 error = tc_transact(&request, NULL);
4141 if (error) {
4142 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4143 "quantum %u, perturb %u error %d(%s)",
4144 netdev_get_name(netdev),
4145 opt.quantum, opt.perturb_period,
4146 error, ovs_strerror(error));
4147 }
4148 return error;
4149}
4150
4151static void
4152sfq_parse_qdisc_details__(struct netdev *netdev,
4153 const struct smap *details, struct sfq *sfq)
4154{
13c1637f
BP
4155 sfq->perturb = smap_get_ullong(details, "perturb", 0);
4156 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 4157
677d9158
JV
4158 if (!sfq->perturb) {
4159 sfq->perturb = 10;
4160 }
4161
4162 if (!sfq->quantum) {
13c1637f
BP
4163 int mtu;
4164 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
4165 sfq->quantum = mtu;
4166 } else {
4167 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
4168 "device without mtu");
677d9158
JV
4169 }
4170 }
4171}
4172
4173static int
4174sfq_tc_install(struct netdev *netdev, const struct smap *details)
4175{
4176 int error;
4177 struct sfq sfq;
4178
4179 sfq_parse_qdisc_details__(netdev, details, &sfq);
4180 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4181 if (!error) {
4182 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4183 }
4184 return error;
4185}
4186
4187static int
4188sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4189{
4190 const struct tc_sfq_qopt *sfq;
4191 struct nlattr *nlattr;
4192 const char * kind;
4193 int error;
4194
4195 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4196 if (error == 0) {
4197 sfq = nl_attr_get(nlattr);
61265c03 4198 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
677d9158
JV
4199 return 0;
4200 }
4201
4202 return error;
4203}
4204
4205static void
4206sfq_tc_destroy(struct tc *tc)
4207{
4208 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4209 tc_destroy(tc);
4210 free(sfq);
4211}
4212
4213static int
4214sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4215{
4216 const struct sfq *sfq = sfq_get__(netdev);
4217 smap_add_format(details, "quantum", "%u", sfq->quantum);
4218 smap_add_format(details, "perturb", "%u", sfq->perturb);
4219 return 0;
4220}
4221
4222static int
4223sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
4224{
4225 struct sfq sfq;
4226
4227 sfq_parse_qdisc_details__(netdev, details, &sfq);
4228 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4229 sfq_get__(netdev)->quantum = sfq.quantum;
4230 sfq_get__(netdev)->perturb = sfq.perturb;
4231 return 0;
4232}
4233
/* Operations table for the "linux-sfq" QoS type.  Classless, so no
 * class/queue hooks are provided. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
4244\f
2f564bb1
S
/* netem traffic control class. */

/* netem qdisc state, embedded as the netdev's 'tc' via CONTAINER_OF.
 * 'latency' is converted with tc_time_to_ticks() before being sent to the
 * kernel; 'limit' defaults to 1000; 'loss' is a percentage (0-100). */
struct netem {
    struct tc tc;
    uint32_t latency;
    uint32_t limit;
    uint32_t loss;
};
4253
4254static struct netem *
4255netem_get__(const struct netdev *netdev_)
4256{
4257 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4258 return CONTAINER_OF(netdev->tc, struct netem, tc);
4259}
4260
4261static void
4262netem_install__(struct netdev *netdev_, uint32_t latency,
4263 uint32_t limit, uint32_t loss)
4264{
4265 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4266 struct netem *netem;
4267
4268 netem = xmalloc(sizeof *netem);
4269 tc_init(&netem->tc, &tc_ops_netem);
4270 netem->latency = latency;
4271 netem->limit = limit;
4272 netem->loss = loss;
4273
4274 netdev->tc = &netem->tc;
4275}
4276
4277static int
4278netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
4279 uint32_t limit, uint32_t loss)
4280{
4281 struct tc_netem_qopt opt;
4282 struct ofpbuf request;
4283 struct tcmsg *tcmsg;
4284 int error;
4285
4286 tc_del_qdisc(netdev);
4287
4288 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4289 NLM_F_EXCL | NLM_F_CREATE, &request);
4290 if (!tcmsg) {
4291 return ENODEV;
4292 }
4293 tcmsg->tcm_handle = tc_make_handle(1, 0);
4294 tcmsg->tcm_parent = TC_H_ROOT;
4295
4296 memset(&opt, 0, sizeof opt);
4297
4298 if (!limit) {
4299 opt.limit = 1000;
4300 } else {
4301 opt.limit = limit;
4302 }
4303
4304 if (loss) {
4305 if (loss > 100) {
4306 VLOG_WARN_RL(&rl,
4307 "loss should be a percentage value between 0 to 100, "
4308 "loss was %u", loss);
4309 return EINVAL;
4310 }
4311 opt.loss = floor(UINT32_MAX * (loss / 100.0));
4312 }
4313
4314 opt.latency = tc_time_to_ticks(latency);
4315
4316 nl_msg_put_string(&request, TCA_KIND, "netem");
4317 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4318
4319 error = tc_transact(&request, NULL);
4320 if (error) {
4321 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4322 "latency %u, limit %u, loss %u error %d(%s)",
4323 netdev_get_name(netdev),
4324 opt.latency, opt.limit, opt.loss,
4325 error, ovs_strerror(error));
4326 }
4327 return error;
4328}
4329
4330static void
4331netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4332 const struct smap *details, struct netem *netem)
4333{
4334 netem->latency = smap_get_ullong(details, "latency", 0);
4335 netem->limit = smap_get_ullong(details, "limit", 0);
4336 netem->loss = smap_get_ullong(details, "loss", 0);
4337
4338 if (!netem->limit) {
4339 netem->limit = 1000;
4340 }
4341}
4342
4343static int
4344netem_tc_install(struct netdev *netdev, const struct smap *details)
4345{
4346 int error;
4347 struct netem netem;
4348
4349 netem_parse_qdisc_details__(netdev, details, &netem);
4350 error = netem_setup_qdisc__(netdev, netem.latency,
4351 netem.limit, netem.loss);
4352 if (!error) {
4353 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4354 }
4355 return error;
4356}
4357
4358static int
4359netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4360{
4361 const struct tc_netem_qopt *netem;
4362 struct nlattr *nlattr;
4363 const char *kind;
4364 int error;
4365
4366 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4367 if (error == 0) {
4368 netem = nl_attr_get(nlattr);
4369 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4370 return 0;
4371 }
4372
4373 return error;
4374}
4375
4376static void
4377netem_tc_destroy(struct tc *tc)
4378{
4379 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4380 tc_destroy(tc);
4381 free(netem);
4382}
4383
4384static int
4385netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4386{
4387 const struct netem *netem = netem_get__(netdev);
4388 smap_add_format(details, "latency", "%u", netem->latency);
4389 smap_add_format(details, "limit", "%u", netem->limit);
4390 smap_add_format(details, "loss", "%u", netem->loss);
4391 return 0;
4392}
4393
4394static int
4395netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4396{
4397 struct netem netem;
4398
4399 netem_parse_qdisc_details__(netdev, details, &netem);
4400 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4401 netem_get__(netdev)->latency = netem.latency;
4402 netem_get__(netdev)->limit = netem.limit;
4403 netem_get__(netdev)->loss = netem.loss;
4404 return 0;
4405}
4406
/* Operations table for the "linux-netem" QoS type.  Classless, so no
 * class/queue hooks are provided. */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",
    .ovs_name = "linux-netem",
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4417\f
/* HTB traffic control class. */

/* Maximum number of HTB queues (classes) supported per netdev. */
#define HTB_N_QUEUES 0xf000
/* Value for tc_htb_glob.rate2quantum ("r2q"); a class whose
 * min_rate / HTB_RATE2QUANTUM falls below the MTU gets an explicit
 * MTU-sized quantum instead (see htb_setup_class__). */
#define HTB_RATE2QUANTUM 10

/* HTB qdisc state, embedded as the netdev's 'tc' via CONTAINER_OF. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Per-queue HTB class state, linked into the tc's queue hmap via
 * 'tc_queue'. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
8b61709d 4435
c1c9c9c4 4436static struct htb *
b5d57fc8 4437htb_get__(const struct netdev *netdev_)
c1c9c9c4 4438{
b5d57fc8
BP
4439 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4440 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
4441}
4442
24045e35 4443static void
b5d57fc8 4444htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 4445{
b5d57fc8 4446 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4447 struct htb *htb;
4448
4449 htb = xmalloc(sizeof *htb);
4450 tc_init(&htb->tc, &tc_ops_htb);
4451 htb->max_rate = max_rate;
4452
b5d57fc8 4453 netdev->tc = &htb->tc;
c1c9c9c4
BP
4454}
4455
4456/* Create an HTB qdisc.
4457 *
a339aa81 4458 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
4459static int
4460htb_setup_qdisc__(struct netdev *netdev)
4461{
4462 size_t opt_offset;
4463 struct tc_htb_glob opt;
4464 struct ofpbuf request;
4465 struct tcmsg *tcmsg;
4466
4467 tc_del_qdisc(netdev);
4468
7874bdff
RD
4469 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4470 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
4471 if (!tcmsg) {
4472 return ENODEV;
4473 }
c1c9c9c4
BP
4474 tcmsg->tcm_handle = tc_make_handle(1, 0);
4475 tcmsg->tcm_parent = TC_H_ROOT;
4476
4477 nl_msg_put_string(&request, TCA_KIND, "htb");
4478
4479 memset(&opt, 0, sizeof opt);
4f631ccd 4480 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 4481 opt.version = 3;
4ecf12d5 4482 opt.defcls = 1;
c1c9c9c4
BP
4483
4484 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4485 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
4486 nl_msg_end_nested(&request, opt_offset);
4487
4488 return tc_transact(&request, NULL);
4489}
4490
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Requires the device MTU: the rate tables and quantum are derived from it,
 * so a device without an MTU is rejected with the MTU lookup's error.
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    /* Rate tables for the committed rate and the ceiling. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4550
4551/* Parses Netlink attributes in 'options' for HTB parameters and stores a
4552 * description of them into 'details'. The description complies with the
4553 * specification given in the vswitch database documentation for linux-htb
4554 * queue details. */
4555static int
4556htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
4557{
4558 static const struct nl_policy tca_htb_policy[] = {
4559 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
4560 .min_len = sizeof(struct tc_htb_opt) },
4561 };
4562
4563 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
4564 const struct tc_htb_opt *htb;
4565
4566 if (!nl_parse_nested(nl_options, tca_htb_policy,
4567 attrs, ARRAY_SIZE(tca_htb_policy))) {
4568 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
4569 return EPROTO;
4570 }
4571
4572 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
4573 class->min_rate = htb->rate.rate;
4574 class->max_rate = htb->ceil.rate;
4575 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
4576 class->priority = htb->prio;
4577 return 0;
4578}
4579
4580static int
4581htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4582 struct htb_class *options,
4583 struct netdev_queue_stats *stats)
4584{
4585 struct nlattr *nl_options;
4586 unsigned int handle;
4587 int error;
4588
4589 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4590 if (!error && queue_id) {
17ee3c1f
BP
4591 unsigned int major = tc_get_major(handle);
4592 unsigned int minor = tc_get_minor(handle);
4593 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4594 *queue_id = minor - 1;
c1c9c9c4
BP
4595 } else {
4596 error = EPROTO;
4597 }
4598 }
4599 if (!error && options) {
4600 error = htb_parse_tca_options__(nl_options, options);
4601 }
4602 return error;
4603}
4604
4605static void
73371c09 4606htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 4607 const struct smap *details, struct htb_class *hc)
c1c9c9c4 4608{
73371c09 4609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 4610
13c1637f 4611 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 4612 if (!hc->max_rate) {
a00ca915 4613 enum netdev_features current;
c1c9c9c4 4614
73371c09
BP
4615 netdev_linux_read_features(netdev);
4616 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4617 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
4618 }
4619 hc->min_rate = hc->max_rate;
4620 hc->burst = 0;
4621 hc->priority = 0;
4622}
4623
/* Fills '*hc' from the "min-rate", "max-rate", "burst", and "priority" keys
 * in 'details'.  Rates and burst are given in bits and converted to bytes.
 * min-rate is clamped to [MTU, qdisc max_rate]; max-rate is clamped to
 * [min-rate, qdisc max_rate]; burst is at least MTU + 64.  Requires the
 * device MTU; returns the MTU lookup's error if it is unavailable,
 * otherwise 0. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
4668
4669static int
4670htb_query_class__(const struct netdev *netdev, unsigned int handle,
4671 unsigned int parent, struct htb_class *options,
4672 struct netdev_queue_stats *stats)
4673{
4674 struct ofpbuf *reply;
4675 int error;
4676
4677 error = tc_query_class(netdev, handle, parent, &reply);
4678 if (!error) {
4679 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4680 ofpbuf_delete(reply);
4681 }
4682 return error;
4683}
4684
4685static int
79f1cbe9 4686htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4687{
4688 int error;
4689
4690 error = htb_setup_qdisc__(netdev);
4691 if (!error) {
4692 struct htb_class hc;
4693
4694 htb_parse_qdisc_details__(netdev, details, &hc);
4695 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4696 tc_make_handle(1, 0), &hc);
4697 if (!error) {
4698 htb_install__(netdev, hc.max_rate);
4699 }
4700 }
4701 return error;
4702}
4703
93b13be8
BP
4704static struct htb_class *
4705htb_class_cast__(const struct tc_queue *queue)
4706{
4707 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4708}
4709
/* Creates or updates the cached htb_class for 'queue_id' on 'netdev' with
 * the parameters in '*hc'.  A new class is inserted into the tc's queue
 * hmap, keyed by hash_int(queue_id, 0) -- the same hash tc_find_queue__
 * uses for lookup. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        /* Existing queue: reuse its htb_class in place. */
        hcp = htb_class_cast__(queue);
    } else {
        /* New queue: allocate, stamp its creation time, and index it. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
4735
/* Loads the existing kernel HTB configuration for 'netdev' into local tc
 * state: first the root class's maximum rate, then every class (queue)
 * returned by a netlink class dump. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options.  A query failure is deliberately ignored;
     * 'hc.max_rate' then stays 0. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that fail to parse are skipped, not fatal. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
4763
/* Frees all local state for the HTB qdisc rooted at 'tc'. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc;

    /* HMAP_FOR_EACH_POP removes each node from the hmap as it iterates, so
     * no separate hmap_remove() is needed before freeing. */
    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}
4776
4777static int
79f1cbe9 4778htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
4779{
4780 const struct htb *htb = htb_get__(netdev);
79f1cbe9 4781 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
4782 return 0;
4783}
4784
4785static int
79f1cbe9 4786htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4787{
4788 struct htb_class hc;
4789 int error;
4790
4791 htb_parse_qdisc_details__(netdev, details, &hc);
4792 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4793 tc_make_handle(1, 0), &hc);
4794 if (!error) {
4795 htb_get__(netdev)->max_rate = hc.max_rate;
4796 }
4797 return error;
4798}
4799
4800static int
93b13be8 4801htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4802 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 4803{
93b13be8 4804 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4805
79f1cbe9 4806 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 4807 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4808 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 4809 }
79f1cbe9 4810 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 4811 if (hc->priority) {
79f1cbe9 4812 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
4813 }
4814 return 0;
4815}
4816
4817static int
4818htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4819 const struct smap *details)
c1c9c9c4
BP
4820{
4821 struct htb_class hc;
4822 int error;
4823
4824 error = htb_parse_class_details__(netdev, details, &hc);
4825 if (error) {
4826 return error;
4827 }
4828
17ee3c1f 4829 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
4830 tc_make_handle(1, 0xfffe), &hc);
4831 if (error) {
4832 return error;
4833 }
4834
4835 htb_update_queue__(netdev, queue_id, &hc);
4836 return 0;
4837}
4838
4839static int
93b13be8 4840htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 4841{
93b13be8 4842 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4843 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4844 int error;
4845
93b13be8 4846 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4847 if (!error) {
93b13be8 4848 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4849 free(hc);
c1c9c9c4
BP
4850 }
4851 return error;
4852}
4853
4854static int
93b13be8 4855htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4856 struct netdev_queue_stats *stats)
4857{
93b13be8 4858 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4859 tc_make_handle(1, 0xfffe), NULL, stats);
4860}
4861
4862static int
4863htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4864 const struct ofpbuf *nlmsg,
4865 netdev_dump_queue_stats_cb *cb, void *aux)
4866{
4867 struct netdev_queue_stats stats;
17ee3c1f 4868 unsigned int handle, major, minor;
c1c9c9c4
BP
4869 int error;
4870
4871 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4872 if (error) {
4873 return error;
4874 }
4875
17ee3c1f
BP
4876 major = tc_get_major(handle);
4877 minor = tc_get_minor(handle);
4878 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4879 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4880 }
4881 return 0;
4882}
4883
/* Operations for the "linux-htb" (Hierarchy Token Bucket) QoS type. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4899\f
a339aa81
EJ
4900/* "linux-hfsc" traffic control class. */
4901
/* Maximum number of queues supported by the "linux-hfsc" QoS type. */
#define HFSC_N_QUEUES 0xf000

/* Qdisc-level state for "linux-hfsc". */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s (reported externally in bits/s). */
};

/* Per-queue (per-class) state for "linux-hfsc". */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
4914
/* Returns the 'struct hfsc' embedding 'netdev_''s tc, which must be an HFSC
 * qdisc installed by hfsc_install__(). */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}
4921
/* Returns the 'struct hfsc_class' that embeds 'queue'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
4927
24045e35 4928static void
b5d57fc8 4929hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4930{
b5d57fc8 4931 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4932 struct hfsc *hfsc;
4933
a339aa81
EJ
4934 hfsc = xmalloc(sizeof *hfsc);
4935 tc_init(&hfsc->tc, &tc_ops_hfsc);
4936 hfsc->max_rate = max_rate;
b5d57fc8 4937 netdev->tc = &hfsc->tc;
a339aa81
EJ
4938}
4939
4940static void
4941hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4942 const struct hfsc_class *hc)
4943{
4944 size_t hash;
4945 struct hfsc *hfsc;
4946 struct hfsc_class *hcp;
4947 struct tc_queue *queue;
4948
4949 hfsc = hfsc_get__(netdev);
4950 hash = hash_int(queue_id, 0);
4951
4952 queue = tc_find_queue__(netdev, queue_id, hash);
4953 if (queue) {
4954 hcp = hfsc_class_cast__(queue);
4955 } else {
4956 hcp = xmalloc(sizeof *hcp);
4957 queue = &hcp->tc_queue;
4958 queue->queue_id = queue_id;
6dc34a0d 4959 queue->created = time_msec();
a339aa81
EJ
4960 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4961 }
4962
4963 hcp->min_rate = hc->min_rate;
4964 hcp->max_rate = hc->max_rate;
4965}
4966
/* Decodes the nested TCA_OPTIONS attribute 'nl_options' of an HFSC class
 * into 'class'.  Only the linear service curves that OVS itself configures
 * are accepted; anything else yields EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type = NL_A_UNSPEC,
            .optional = false,
            .min_len = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A nonzero m1 or d means a two-piece (non-linear) curve, which OVS
     * never configures and does not support reading back. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* OVS always writes RSC and FSC with identical slopes (see
     * hfsc_setup_class__()), so a mismatch means foreign configuration. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
5025
/* Parses a netlink class message 'tcmsg' describing an HFSC class.  Extracts
 * the OVS queue number into '*queue_id', the class options into '*options',
 * and statistics into '*stats', any of which may be null.  Returns 0 on
 * success, a positive errno value on failure. */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        /* OVS queue N maps to class handle 1:N+1; anything outside that
         * range was not created by OVS. */
        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
5058
5059static int
5060hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5061 unsigned int parent, struct hfsc_class *options,
5062 struct netdev_queue_stats *stats)
5063{
5064 int error;
5065 struct ofpbuf *reply;
5066
5067 error = tc_query_class(netdev, handle, parent, &reply);
5068 if (error) {
5069 return error;
5070 }
5071
5072 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5073 ofpbuf_delete(reply);
5074 return error;
5075}
5076
/* Extracts the qdisc-level configuration from 'details' into 'class'.  The
 * "max-rate" key is in bits/s; it is converted to bytes/s here.  When it is
 * absent or zero, the link's negotiated speed is used instead, defaulting to
 * 100 Mbps when the speed cannot be determined. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
                           struct hfsc_class *class)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }

    /* The qdisc's default class gets the full link rate. */
    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
5095
/* Parses per-queue 'details' ("min-rate" and "max-rate", in bits/s) into
 * 'class' in bytes/s, clamping both rates into [1, qdisc max-rate] and
 * keeping max-rate >= min-rate.  Always returns 0. */
static int
hfsc_parse_class_details__(struct netdev *netdev,
                           const struct smap *details,
                           struct hfsc_class * class)
{
    const struct hfsc *hfsc;
    uint32_t min_rate, max_rate;

    hfsc = hfsc_get__(netdev);

    min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    min_rate = MAX(min_rate, 1);
    min_rate = MIN(min_rate, hfsc->max_rate);

    /* NOTE(review): 'hfsc->max_rate * 8' is 32-bit arithmetic and can wrap
     * for rates of 512 MB/s or more -- confirm whether that range matters. */
    max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
    max_rate = MAX(max_rate, min_rate);
    max_rate = MIN(max_rate, hfsc->max_rate);

    class->min_rate = min_rate;
    class->max_rate = max_rate;

    return 0;
}
5119
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing qdisc first; errors are ignored. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    /* Root qdisc with handle 1:0. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Unclassified traffic goes to class 1:1 ("default 1"). */
    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
5150
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>" */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves only: m1 and d stay 0, m2 is the rate.  (This is
     * the shape hfsc_parse_tca_options__() expects when reading back.) */
    min.m1 = 0;
    min.d = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    /* RSC and FSC both carry the min-rate curve; USC carries the max-rate
     * upper limit. */
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
5202
5203static int
79f1cbe9 5204hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
5205{
5206 int error;
5207 struct hfsc_class class;
5208
5209 error = hfsc_setup_qdisc__(netdev);
5210
5211 if (error) {
5212 return error;
5213 }
5214
5215 hfsc_parse_qdisc_details__(netdev, details, &class);
5216 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5217 tc_make_handle(1, 0), &class);
5218
5219 if (error) {
5220 return error;
5221 }
5222
5223 hfsc_install__(netdev, class.max_rate);
5224 return 0;
5225}
5226
/* Loads the existing kernel HFSC configuration for 'netdev' into local tc
 * state: the root class's maximum rate first, then every class (queue). */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* Query failure is ignored; 'hc.max_rate' then stays 0. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes that fail to parse are skipped, not fatal. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
5253
5254static void
5255hfsc_tc_destroy(struct tc *tc)
5256{
5257 struct hfsc *hfsc;
5258 struct hfsc_class *hc, *next;
5259
5260 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5261
5262 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
5263 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5264 free(hc);
5265 }
5266
5267 tc_destroy(tc);
5268 free(hfsc);
5269}
5270
5271static int
79f1cbe9 5272hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
5273{
5274 const struct hfsc *hfsc;
5275 hfsc = hfsc_get__(netdev);
79f1cbe9 5276 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
5277 return 0;
5278}
5279
5280static int
79f1cbe9 5281hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
5282{
5283 int error;
5284 struct hfsc_class class;
5285
5286 hfsc_parse_qdisc_details__(netdev, details, &class);
5287 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5288 tc_make_handle(1, 0), &class);
5289
5290 if (!error) {
5291 hfsc_get__(netdev)->max_rate = class.max_rate;
5292 }
5293
5294 return error;
5295}
5296
5297static int
5298hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 5299 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
5300{
5301 const struct hfsc_class *hc;
5302
5303 hc = hfsc_class_cast__(queue);
79f1cbe9 5304 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 5305 if (hc->min_rate != hc->max_rate) {
79f1cbe9 5306 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
5307 }
5308 return 0;
5309}
5310
5311static int
5312hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 5313 const struct smap *details)
a339aa81
EJ
5314{
5315 int error;
5316 struct hfsc_class class;
5317
5318 error = hfsc_parse_class_details__(netdev, details, &class);
5319 if (error) {
5320 return error;
5321 }
5322
5323 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5324 tc_make_handle(1, 0xfffe), &class);
5325 if (error) {
5326 return error;
5327 }
5328
5329 hfsc_update_queue__(netdev, queue_id, &class);
5330 return 0;
5331}
5332
5333static int
5334hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5335{
5336 int error;
5337 struct hfsc *hfsc;
5338 struct hfsc_class *hc;
5339
5340 hc = hfsc_class_cast__(queue);
5341 hfsc = hfsc_get__(netdev);
5342
5343 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5344 if (!error) {
5345 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5346 free(hc);
5347 }
5348 return error;
5349}
5350
5351static int
5352hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5353 struct netdev_queue_stats *stats)
5354{
5355 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5356 tc_make_handle(1, 0xfffe), NULL, stats);
5357}
5358
5359static int
5360hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5361 const struct ofpbuf *nlmsg,
5362 netdev_dump_queue_stats_cb *cb, void *aux)
5363{
5364 struct netdev_queue_stats stats;
5365 unsigned int handle, major, minor;
5366 int error;
5367
5368 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5369 if (error) {
5370 return error;
5371 }
5372
5373 major = tc_get_major(handle);
5374 minor = tc_get_minor(handle);
5375 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5376 (*cb)(minor - 1, &stats, aux);
5377 }
5378 return 0;
5379}
5380
/* Operations for the "linux-hfsc" (Hierarchical Fair Service Curve) QoS
 * type. */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5396\f
6cf888b8
BS
5397/* "linux-noop" traffic control class. */
5398
5399static void
5400noop_install__(struct netdev *netdev_)
5401{
5402 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5403 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
5404
5405 netdev->tc = CONST_CAST(struct tc *, &tc);
5406}
5407
/* Installs the "linux-noop" QoS type: OVS records the choice but does not
 * touch the device's qdisc ('details' is ignored). */
static int
noop_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5415
/* Loads "linux-noop" state; there is no kernel configuration to read. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5422
/* Operations for the "linux-noop" QoS type.  No '.linux_name' is set because
 * this type does not correspond to any particular kernel qdisc; installing it
 * leaves the device's qdisc configuration alone. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",               /* ovs_name */
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5428\f
c1c9c9c4
BP
5429/* "linux-default" traffic control class.
5430 *
5431 * This class represents the default, unnamed Linux qdisc. It corresponds to
5432 * the "" (empty string) QoS type in the OVS database. */
5433
/* Points 'netdev_' at the shared, immutable "linux-default" tc state. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5444
/* Installs the default (unnamed) qdisc state; 'details' is ignored since the
 * default qdisc has nothing to configure. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5452
/* Loads "linux-default" state; there is no kernel configuration to read. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5459
/* Operations for the default, unnamed Linux qdisc, the "" QoS type in the
 * OVS database. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",                         /* ovs_name */
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5465\f
5466/* "linux-other" traffic control class.
5467 *
5468 * */
5469
/* Records that 'netdev_' carries a qdisc of a type OVS does not understand
 * ("linux-other").  OVS cannot configure or query it further. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
5481
/* Operations for qdiscs OVS does not recognize.  Only '.tc_load' is set:
 * such a qdisc can be observed but not installed or reconfigured by OVS. */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5486\f
5487/* Traffic control. */
5488
5489/* Number of kernel "tc" ticks per second. */
5490static double ticks_per_s;
5491
5492/* Number of kernel "jiffies" per second. This is used for the purpose of
5493 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5494 * one jiffy's worth of data.
5495 *
5496 * There are two possibilities here:
5497 *
5498 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5499 * approximate range of 100 to 1024. That means that we really need to
5500 * make sure that the qdisc can buffer that much data.
5501 *
5502 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5503 * has finely granular timers and there's no need to fudge additional room
5504 * for buffers. (There's no extra effort needed to implement that: the
5505 * large 'buffer_hz' is used as a divisor, so practically any number will
5506 * come out as 0 in the division. Small integer results in the case of
5507 * really high dividends won't have any real effect anyhow.)
5508 */
5509static unsigned int buffer_hz;
5510
7874bdff
RD
5511static struct tcmsg *
5512netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5513 unsigned int flags, struct ofpbuf *request)
5514{
5515 int ifindex;
5516 int error;
5517
5518 error = get_ifindex(netdev, &ifindex);
5519 if (error) {
5520 return NULL;
5521 }
5522
5523 return tc_make_request(ifindex, type, flags, request);
5524}
5525
f8500004
JP
5526/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5527 * of 'kbits_burst'.
5528 *
5529 * This function is equivalent to running:
5530 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5531 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5532 * mtu 65535 drop
5533 *
5534 * The configuration and stats may be seen with the following command:
c7952afb 5535 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
5536 *
5537 * Returns 0 if successful, otherwise a positive errno value.
5538 */
5539static int
c7952afb
BP
5540tc_add_policer(struct netdev *netdev,
5541 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
5542{
5543 struct tc_police tc_police;
5544 struct ofpbuf request;
5545 struct tcmsg *tcmsg;
5546 size_t basic_offset;
5547 size_t police_offset;
5548 int error;
5549 int mtu = 65535;
5550
5551 memset(&tc_police, 0, sizeof tc_police);
5552 tc_police.action = TC_POLICE_SHOT;
5553 tc_police.mtu = mtu;
1aca400c 5554 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 5555
79abacc8
MAA
5556 /* The following appears wrong in one way: In networking a kilobit is
5557 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
5558 *
5559 * However if you "fix" those problems then "tc filter show ..." shows
5560 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5561 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5562 * tc's point of view. Whatever. */
5563 tc_police.burst = tc_bytes_to_ticks(
79abacc8 5564 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 5565
7874bdff
RD
5566 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5567 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
5568 if (!tcmsg) {
5569 return ENODEV;
5570 }
5571 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5572 tcmsg->tcm_info = tc_make_handle(49,
5573 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5574
5575 nl_msg_put_string(&request, TCA_KIND, "basic");
5576 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5577 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5578 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5579 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5580 nl_msg_end_nested(&request, police_offset);
5581 nl_msg_end_nested(&request, basic_offset);
5582
5583 error = tc_transact(&request, NULL);
5584 if (error) {
5585 return error;
5586 }
5587
5588 return 0;
5589}
5590
c1c9c9c4
BP
/* Reads /proc/net/psched once per process and derives 'ticks_per_s' and
 * 'buffer_hz' from it.  Safe to call repeatedly and from multiple threads;
 * only the first caller does the work. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                          /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d  ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ------------ -------------
     * [1] 819,200 1,000,000  1,000,000           100      819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000      976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000      976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100      976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000   15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249   15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Fallback values, kept if the file is missing or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
5673
5674/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5675 * rate of 'rate' bytes per second. */
5676static unsigned int
5677tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5678{
23882115 5679 read_psched();
c1c9c9c4
BP
5680 return (rate * ticks) / ticks_per_s;
5681}
5682
5683/* Returns the number of ticks that it would take to transmit 'size' bytes at a
5684 * rate of 'rate' bytes per second. */
5685static unsigned int
5686tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5687{
23882115 5688 read_psched();
015c93a4 5689 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
5690}
5691
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.  When 'buffer_hz' is one of
 * the absurdly large kernel values (see its comment), the division yields 0,
 * which is the intended "no extra buffering" result. */
static unsigned int
tc_buffer_per_jiffy(unsigned int rate)
{
    read_psched();
    return rate / buffer_hz;
}
5700
2f564bb1
S
5701static uint32_t
5702tc_time_to_ticks(uint32_t time) {
5703 read_psched();
5704 return time * (ticks_per_s / 1000000);
5705}
5706
c1c9c9c4
BP
5707/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5708 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5709 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5710 * stores NULL into it if it is absent.
5711 *
5712 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5713 * 'msg'.
5714 *
5715 * Returns 0 if successful, otherwise a positive errno value. */
5716static int
5717tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5718 struct nlattr **options)
5719{
5720 static const struct nl_policy tca_policy[] = {
5721 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5722 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5723 };
5724 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5725
5726 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5727 tca_policy, ta, ARRAY_SIZE(ta))) {
5728 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5729 goto error;
5730 }
5731
5732 if (kind) {
5733 *kind = nl_attr_get_string(ta[TCA_KIND]);
5734 }
5735
5736 if (options) {
5737 *options = ta[TCA_OPTIONS];
5738 }
5739
5740 return 0;
5741
5742error:
5743 if (kind) {
5744 *kind = NULL;
5745 }
5746 if (options) {
5747 *options = NULL;
5748 }
5749 return EPROTO;
5750}
5751
5752/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5753 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5754 * into '*options', and its queue statistics into '*stats'. Any of the output
5755 * arguments may be null.
5756 *
5757 * Returns 0 if successful, otherwise a positive errno value. */
5758static int
5759tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5760 struct nlattr **options, struct netdev_queue_stats *stats)
5761{
5762 static const struct nl_policy tca_policy[] = {
5763 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5764 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5765 };
5766 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5767
5768 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5769 tca_policy, ta, ARRAY_SIZE(ta))) {
5770 VLOG_WARN_RL(&rl, "failed to parse class message");
5771 goto error;
5772 }
5773
5774 if (handlep) {
5775 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5776 *handlep = tc->tcm_handle;
5777 }
5778
5779 if (options) {
5780 *options = ta[TCA_OPTIONS];
5781 }
5782
5783 if (stats) {
5784 const struct gnet_stats_queue *gsq;
5785 struct gnet_stats_basic gsb;
5786
5787 static const struct nl_policy stats_policy[] = {
5788 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5789 .min_len = sizeof gsb },
5790 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5791 .min_len = sizeof *gsq },
5792 };
5793 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5794
5795 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5796 sa, ARRAY_SIZE(sa))) {
5797 VLOG_WARN_RL(&rl, "failed to parse class stats");
5798 goto error;
5799 }
5800
5801 /* Alignment issues screw up the length of struct gnet_stats_basic on
5802 * some arch/bitsize combinations. Newer versions of Linux have a
5803 * struct gnet_stats_basic_packed, but we can't depend on that. The
5804 * easiest thing to do is just to make a copy. */
5805 memset(&gsb, 0, sizeof gsb);
5806 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5807 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5808 stats->tx_bytes = gsb.bytes;
5809 stats->tx_packets = gsb.packets;
5810
5811 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5812 stats->tx_errors = gsq->drops;
5813 }
5814
5815 return 0;
5816
5817error:
5818 if (options) {
5819 *options = NULL;
5820 }
5821 if (stats) {
5822 memset(stats, 0, sizeof *stats);
5823 }
5824 return EPROTO;
5825}
5826
5827/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5828 * on 'netdev'. */
5829static int
5830tc_query_class(const struct netdev *netdev,
5831 unsigned int handle, unsigned int parent,
5832 struct ofpbuf **replyp)
5833{
5834 struct ofpbuf request;
5835 struct tcmsg *tcmsg;
5836 int error;
5837
7874bdff
RD
5838 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5839 &request);
23a98ffe
BP
5840 if (!tcmsg) {
5841 return ENODEV;
5842 }
c1c9c9c4
BP
5843 tcmsg->tcm_handle = handle;
5844 tcmsg->tcm_parent = parent;
5845
5846 error = tc_transact(&request, replyp);
5847 if (error) {
5848 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5849 netdev_get_name(netdev),
5850 tc_get_major(handle), tc_get_minor(handle),
5851 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5852 ovs_strerror(error));
c1c9c9c4
BP
5853 }
5854 return error;
5855}
5856
5857/* Equivalent to "tc class del dev <name> handle <handle>". */
5858static int
5859tc_delete_class(const struct netdev *netdev, unsigned int handle)
5860{
5861 struct ofpbuf request;
5862 struct tcmsg *tcmsg;
5863 int error;
5864
7874bdff 5865 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5866 if (!tcmsg) {
5867 return ENODEV;
5868 }
c1c9c9c4
BP
5869 tcmsg->tcm_handle = handle;
5870 tcmsg->tcm_parent = 0;
5871
5872 error = tc_transact(&request, NULL);
5873 if (error) {
5874 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5875 netdev_get_name(netdev),
5876 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5877 ovs_strerror(error));
c1c9c9c4
BP
5878 }
5879 return error;
5880}
5881
5882/* Equivalent to "tc qdisc del dev <name> root". */
5883static int
b5d57fc8 5884tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5885{
b5d57fc8 5886 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5887 struct ofpbuf request;
5888 struct tcmsg *tcmsg;
5889 int error;
5890
7874bdff 5891 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5892 if (!tcmsg) {
5893 return ENODEV;
5894 }
c1c9c9c4
BP
5895 tcmsg->tcm_handle = tc_make_handle(1, 0);
5896 tcmsg->tcm_parent = TC_H_ROOT;
5897
5898 error = tc_transact(&request, NULL);
5899 if (error == EINVAL) {
5900 /* EINVAL probably means that the default qdisc was in use, in which
5901 * case we've accomplished our purpose. */
5902 error = 0;
5903 }
b5d57fc8
BP
5904 if (!error && netdev->tc) {
5905 if (netdev->tc->ops->tc_destroy) {
5906 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5907 }
b5d57fc8 5908 netdev->tc = NULL;
c1c9c9c4
BP
5909 }
5910 return error;
5911}
5912
ac3e3aaa
BP
5913static bool
5914getqdisc_is_safe(void)
5915{
5916 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5917 static bool safe = false;
5918
5919 if (ovsthread_once_start(&once)) {
5920 struct utsname utsname;
5921 int major, minor;
5922
5923 if (uname(&utsname) == -1) {
5924 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5925 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5926 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5927 } else if (major < 2 || (major == 2 && minor < 35)) {
5928 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5929 utsname.release);
5930 } else {
5931 safe = true;
5932 }
5933 ovsthread_once_done(&once);
5934 }
5935 return safe;
5936}
5937
c1c9c9c4
BP
5938/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5939 * kernel to determine what they are. Returns 0 if successful, otherwise a
5940 * positive errno value. */
5941static int
b5d57fc8 5942tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5943{
b5d57fc8 5944 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5945 struct ofpbuf request, *qdisc;
5946 const struct tc_ops *ops;
5947 struct tcmsg *tcmsg;
5948 int load_error;
5949 int error;
5950
b5d57fc8 5951 if (netdev->tc) {
c1c9c9c4
BP
5952 return 0;
5953 }
5954
5955 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5956 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5957 * 2.6.35 without that fix backported to it.
5958 *
5959 * To avoid the OOPS, we must not make a request that would attempt to dump
5960 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5961 * few others. There are a few ways that I can see to do this, but most of
5962 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5963 * technique chosen here is to assume that any non-default qdisc that we
5964 * create will have a class with handle 1:0. The built-in qdiscs only have
5965 * a class with handle 0:0.
5966 *
ac3e3aaa
BP
5967 * On Linux 2.6.35+ we use the straightforward method because it allows us
5968 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5969 * in such a case we get no response at all from the kernel (!) if a
5970 * builtin qdisc is in use (which is later caught by "!error &&
5971 * !qdisc->size"). */
7874bdff
RD
5972 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5973 &request);
23a98ffe
BP
5974 if (!tcmsg) {
5975 return ENODEV;
5976 }
ac3e3aaa
BP
5977 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5978 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5979
5980 /* Figure out what tc class to instantiate. */
5981 error = tc_transact(&request, &qdisc);
ac3e3aaa 5982 if (!error && qdisc->size) {
c1c9c9c4
BP
5983 const char *kind;
5984
5985 error = tc_parse_qdisc(qdisc, &kind, NULL);
5986 if (error) {
5987 ops = &tc_ops_other;
5988 } else {
5989 ops = tc_lookup_linux_name(kind);
5990 if (!ops) {
5991 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5992 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5993
5994 ops = &tc_ops_other;
5995 }
5996 }
ac3e3aaa
BP
5997 } else if ((!error && !qdisc->size) || error == ENOENT) {
5998 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5999 * set up by some other entity that doesn't have a handle 1:0. We will
6000 * assume that it's the system default qdisc. */
c1c9c9c4
BP
6001 ops = &tc_ops_default;
6002 error = 0;
6003 } else {
6004 /* Who knows? Maybe the device got deleted. */
6005 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 6006 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
6007 ops = &tc_ops_other;
6008 }
6009
6010 /* Instantiate it. */
b5d57fc8
BP
6011 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
6012 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
6013 ofpbuf_delete(qdisc);
6014
6015 return error ? error : load_error;
6016}
6017
6018/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6019 approximate the time to transmit packets of various lengths. For an MTU of
6020 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6021 represents two possible packet lengths; for a MTU of 513 through 1024, four
6022 possible lengths; and so on.
6023
6024 Returns, for the specified 'mtu', the number of bits that packet lengths
6025 need to be shifted right to fit within such a 256-entry table. */
6026static int
6027tc_calc_cell_log(unsigned int mtu)
6028{
6029 int cell_log;
6030
6031 if (!mtu) {
6032 mtu = ETH_PAYLOAD_MAX;
6033 }
6034 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6035
6036 for (cell_log = 0; mtu >= 256; cell_log++) {
6037 mtu >>= 1;
6038 }
6039
6040 return cell_log;
6041}
6042
6043/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6044 * of 'mtu'. */
6045static void
6046tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
6047{
6048 memset(rate, 0, sizeof *rate);
6049 rate->cell_log = tc_calc_cell_log(mtu);
6050 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
6051 /* rate->cell_align = 0; */ /* distro headers. */
6052 rate->mpu = ETH_TOTAL_MIN;
6053 rate->rate = Bps;
6054}
6055
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    unsigned int i;
    uint32_t *rtab;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Each slot covers packets of size (i + 1) << cell_log, clamped
         * below to the minimum packet unit. */
        unsigned int pkt_len = (i + 1) << rate->cell_log;

        rtab[i] = tc_bytes_to_ticks(rate->rate, MAX(pkt_len, rate->mpu));
    }
}
6075
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never let the burst drop below one jiffy's worth of traffic plus one
     * full packet, or the shaper cannot sustain 'Bps'. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 6086\f
aaf2fb1a
BP
6087/* Linux-only functions declared in netdev-linux.h */
6088
6089/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
6090 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
6091int
6092netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
6093 const char *flag_name, bool enable)
6094{
6095 const char *netdev_name = netdev_get_name(netdev);
6096 struct ethtool_value evalue;
6097 uint32_t new_flags;
6098 int error;
6099
ab985a77 6100 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
6101 memset(&evalue, 0, sizeof evalue);
6102 error = netdev_linux_do_ethtool(netdev_name,
6103 (struct ethtool_cmd *)&evalue,
6104 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
6105 if (error) {
6106 return error;
6107 }
6108
ab985a77 6109 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
6110 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
6111 if (new_flags == evalue.data) {
6112 return 0;
6113 }
6114 evalue.data = new_flags;
aaf2fb1a
BP
6115 error = netdev_linux_do_ethtool(netdev_name,
6116 (struct ethtool_cmd *)&evalue,
6117 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
6118 if (error) {
6119 return error;
6120 }
6121
ab985a77 6122 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
6123 memset(&evalue, 0, sizeof evalue);
6124 error = netdev_linux_do_ethtool(netdev_name,
6125 (struct ethtool_cmd *)&evalue,
6126 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
6127 if (error) {
6128 return error;
6129 }
6130
6131 if (new_flags != evalue.data) {
6132 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
6133 "device %s failed", enable ? "enable" : "disable",
6134 flag_name, netdev_name);
6135 return EOPNOTSUPP;
6136 }
6137
6138 return 0;
6139}
6140\f
6141/* Utility functions. */
6142
d3980822 6143/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 6144static void
d3980822
BP
6145netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
6146 const struct rtnl_link_stats *src)
6147{
f613a0d7
PS
6148 dst->rx_packets = src->rx_packets;
6149 dst->tx_packets = src->tx_packets;
6150 dst->rx_bytes = src->rx_bytes;
6151 dst->tx_bytes = src->tx_bytes;
6152 dst->rx_errors = src->rx_errors;
6153 dst->tx_errors = src->tx_errors;
6154 dst->rx_dropped = src->rx_dropped;
6155 dst->tx_dropped = src->tx_dropped;
6156 dst->multicast = src->multicast;
6157 dst->collisions = src->collisions;
6158 dst->rx_length_errors = src->rx_length_errors;
6159 dst->rx_over_errors = src->rx_over_errors;
6160 dst->rx_crc_errors = src->rx_crc_errors;
6161 dst->rx_frame_errors = src->rx_frame_errors;
6162 dst->rx_fifo_errors = src->rx_fifo_errors;
6163 dst->rx_missed_errors = src->rx_missed_errors;
6164 dst->tx_aborted_errors = src->tx_aborted_errors;
6165 dst->tx_carrier_errors = src->tx_carrier_errors;
6166 dst->tx_fifo_errors = src->tx_fifo_errors;
6167 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
6168 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
6169}
6170
337c9b99
BP
6171/* Copies 'src' into 'dst', performing format conversion in the process. */
6172static void
6173netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
6174 const struct rtnl_link_stats64 *src)
6175{
6176 dst->rx_packets = src->rx_packets;
6177 dst->tx_packets = src->tx_packets;
6178 dst->rx_bytes = src->rx_bytes;
6179 dst->tx_bytes = src->tx_bytes;
6180 dst->rx_errors = src->rx_errors;
6181 dst->tx_errors = src->tx_errors;
6182 dst->rx_dropped = src->rx_dropped;
6183 dst->tx_dropped = src->tx_dropped;
6184 dst->multicast = src->multicast;
6185 dst->collisions = src->collisions;
6186 dst->rx_length_errors = src->rx_length_errors;
6187 dst->rx_over_errors = src->rx_over_errors;
6188 dst->rx_crc_errors = src->rx_crc_errors;
6189 dst->rx_frame_errors = src->rx_frame_errors;
6190 dst->rx_fifo_errors = src->rx_fifo_errors;
6191 dst->rx_missed_errors = src->rx_missed_errors;
6192 dst->tx_aborted_errors = src->tx_aborted_errors;
6193 dst->tx_carrier_errors = src->tx_carrier_errors;
6194 dst->tx_fifo_errors = src->tx_fifo_errors;
6195 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
6196 dst->tx_window_errors = src->tx_window_errors;
6197}
6198
0de1b425 6199int
35eef899 6200get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 6201{
c1c9c9c4
BP
6202 struct ofpbuf request;
6203 struct ofpbuf *reply;
c1c9c9c4
BP
6204 int error;
6205
d6e3feb5 6206 /* Filtering all counters by default */
6207 memset(stats, 0xFF, sizeof(struct netdev_stats));
6208
c1c9c9c4 6209 ofpbuf_init(&request, 0);
13a24df8
BP
6210 nl_msg_put_nlmsghdr(&request,
6211 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
6212 RTM_GETLINK, NLM_F_REQUEST);
6213 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6214 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 6215 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
6216 ofpbuf_uninit(&request);
6217 if (error) {
6218 return error;
6219 }
6220
13a24df8 6221 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
6222 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
6223 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
6224 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
6225 error = 0;
6226 } else {
71f21279 6227 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
6228 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
6229 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
6230 error = 0;
6231 } else {
6232 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
6233 error = EPROTO;
6234 }
13a24df8
BP
6235 }
6236 } else {
6237 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
6238 error = EPROTO;
c1c9c9c4 6239 }
8b61709d 6240
8b61709d 6241
576e26d7 6242 ofpbuf_delete(reply);
35eef899 6243 return error;
8b61709d 6244}
c1c9c9c4 6245
3a183124 6246static int
b5d57fc8 6247get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
6248{
6249 struct ifreq ifr;
6250 int error;
6251
755be9ea 6252 *flags = 0;
259e0b1a 6253 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
6254 if (!error) {
6255 *flags = ifr.ifr_flags;
6256 }
8b61709d
BP
6257 return error;
6258}
6259
6260static int
4b609110 6261set_flags(const char *name, unsigned int flags)
8b61709d
BP
6262{
6263 struct ifreq ifr;
6264
6265 ifr.ifr_flags = flags;
259e0b1a 6266 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
6267}
6268
01b25786
PB
6269int
6270linux_get_ifindex(const char *netdev_name)
8b61709d
BP
6271{
6272 struct ifreq ifr;
259e0b1a 6273 int error;
8b61709d 6274
71d7c22f 6275 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 6276 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
6277
6278 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6279 if (error) {
580e1152
RD
6280 /* ENODEV probably means that a vif disappeared asynchronously and
6281 * hasn't been removed from the database yet, so reduce the log level
6282 * to INFO for that case. */
6283 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6284 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6285 netdev_name, ovs_strerror(error));
259e0b1a 6286 return -error;
8b61709d
BP
6287 }
6288 return ifr.ifr_ifindex;
6289}
6290
6291static int
6292get_ifindex(const struct netdev *netdev_, int *ifindexp)
6293{
b5d57fc8 6294 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 6295
b5d57fc8 6296 if (!(netdev->cache_valid & VALID_IFINDEX)) {
756819dd
FL
6297 netdev_linux_update_via_netlink(netdev);
6298 }
6299
6300 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6301 /* Fall back to ioctl if netlink fails */
01b25786 6302 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 6303
8b61709d 6304 if (ifindex < 0) {
b5d57fc8
BP
6305 netdev->get_ifindex_error = -ifindex;
6306 netdev->ifindex = 0;
c7b1b0a5 6307 } else {
b5d57fc8
BP
6308 netdev->get_ifindex_error = 0;
6309 netdev->ifindex = ifindex;
8b61709d 6310 }
b5d57fc8 6311 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 6312 }
c7b1b0a5 6313
b5d57fc8
BP
6314 *ifindexp = netdev->ifindex;
6315 return netdev->get_ifindex_error;
8b61709d
BP
6316}
6317
6318static int
756819dd
FL
6319netdev_linux_update_via_netlink(struct netdev_linux *netdev)
6320{
6321 struct ofpbuf request;
6322 struct ofpbuf *reply;
6323 struct rtnetlink_change chg;
6324 struct rtnetlink_change *change = &chg;
6325 int error;
6326
6327 ofpbuf_init(&request, 0);
6328 nl_msg_put_nlmsghdr(&request,
b43762a5
FL
6329 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
6330 NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
756819dd
FL
6331 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6332
6333 /* The correct identifiers for a Linux device are netnsid and ifindex,
6334 * but ifindex changes as the port is moved to another network namespace
6335 * and the interface name statically stored in ovsdb. */
6336 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
6337 if (netdev_linux_netnsid_is_remote(netdev)) {
23fa50f6 6338 nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
756819dd
FL
6339 }
6340 error = nl_transact(NETLINK_ROUTE, &request, &reply);
6341 ofpbuf_uninit(&request);
6342 if (error) {
6343 ofpbuf_delete(reply);
6344 return error;
6345 }
6346
6347 if (rtnetlink_parse(reply, change)
6348 && change->nlmsg_type == RTM_NEWLINK) {
6349 bool changed = false;
6350 error = 0;
6351
6352 /* Update netdev from rtnl msg and increment its seq if needed. */
6353 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
6354 netdev->carrier_resets++;
6355 changed = true;
6356 }
6357 if (change->ifi_flags != netdev->ifi_flags) {
6358 netdev->ifi_flags = change->ifi_flags;
6359 changed = true;
6360 }
6361 if (change->mtu && change->mtu != netdev->mtu) {
6362 netdev->mtu = change->mtu;
6363 netdev->cache_valid |= VALID_MTU;
6364 netdev->netdev_mtu_error = 0;
6365 changed = true;
6366 }
6367 if (!eth_addr_is_zero(change->mac)
6368 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
6369 netdev->etheraddr = change->mac;
6370 netdev->cache_valid |= VALID_ETHERADDR;
6371 netdev->ether_addr_error = 0;
6372 changed = true;
6373 }
6374 if (change->if_index != netdev->ifindex) {
6375 netdev->ifindex = change->if_index;
6376 netdev->cache_valid |= VALID_IFINDEX;
6377 netdev->get_ifindex_error = 0;
6378 changed = true;
6379 }
3d9c99ab
JH
6380 if (change->master && netdev_linux_kind_is_lag(change->master)) {
6381 netdev->is_lag_master = true;
6382 }
756819dd
FL
6383 if (changed) {
6384 netdev_change_seq_changed(&netdev->up);
6385 }
6386 } else {
6387 error = EINVAL;
6388 }
6389
6390 ofpbuf_delete(reply);
6391 return error;
6392}
6393
6394static int
74ff3298 6395get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
6396{
6397 struct ifreq ifr;
6398 int hwaddr_family;
259e0b1a 6399 int error;
8b61709d
BP
6400
6401 memset(&ifr, 0, sizeof ifr);
71d7c22f 6402 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 6403 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
6404 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6405 if (error) {
78857dfb
BP
6406 /* ENODEV probably means that a vif disappeared asynchronously and
6407 * hasn't been removed from the database yet, so reduce the log level
6408 * to INFO for that case. */
259e0b1a 6409 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 6410 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
6411 netdev_name, ovs_strerror(error));
6412 return error;
8b61709d
BP
6413 }
6414 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
6415 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6416 hwaddr_family != ARPHRD_NONE) {
c9697f35 6417 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 6418 netdev_name, hwaddr_family);
c9697f35 6419 return EINVAL;
8b61709d
BP
6420 }
6421 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6422 return 0;
6423}
6424
6425static int
74ff3298 6426set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
6427{
6428 struct ifreq ifr;
259e0b1a 6429 int error;
8b61709d
BP
6430
6431 memset(&ifr, 0, sizeof ifr);
71d7c22f 6432 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 6433 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 6434 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 6435 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
6436 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6437 if (error) {
8b61709d 6438 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 6439 netdev_name, ovs_strerror(error));
8b61709d 6440 }
259e0b1a 6441 return error;
8b61709d
BP
6442}
6443
6444static int
0b0544d7 6445netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
6446 int cmd, const char *cmd_name)
6447{
6448 struct ifreq ifr;
259e0b1a 6449 int error;
8b61709d
BP
6450
6451 memset(&ifr, 0, sizeof ifr);
71d7c22f 6452 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
6453 ifr.ifr_data = (caddr_t) ecmd;
6454
6455 ecmd->cmd = cmd;
259e0b1a
BP
6456 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6457 if (error) {
6458 if (error != EOPNOTSUPP) {
8b61709d 6459 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 6460 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
6461 } else {
6462 /* The device doesn't support this operation. That's pretty
6463 * common, so there's no point in logging anything. */
6464 }
8b61709d 6465 }
259e0b1a 6466 return error;
8b61709d 6467}
f1acd62b 6468
488d734d
BP
6469/* Returns an AF_PACKET raw socket or a negative errno value. */
6470static int
6471af_packet_sock(void)
6472{
23882115
BP
6473 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6474 static int sock;
488d734d 6475
23882115 6476 if (ovsthread_once_start(&once)) {
488d734d
BP
6477 sock = socket(AF_PACKET, SOCK_RAW, 0);
6478 if (sock >= 0) {
8450059e
BP
6479 int error = set_nonblocking(sock);
6480 if (error) {
6481 close(sock);
6482 sock = -error;
29cf9c1b
FL
6483 } else if (userspace_tso_enabled()) {
6484 int val = 1;
6485 error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
6486 sizeof val);
6487 if (error) {
6488 error = errno;
6489 VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
6490 ovs_strerror(errno));
6491 close(sock);
6492 sock = -error;
6493 }
8450059e 6494 }
488d734d
BP
6495 } else {
6496 sock = -errno;
10a89ef0
BP
6497 VLOG_ERR("failed to create packet socket: %s",
6498 ovs_strerror(errno));
488d734d 6499 }
23882115 6500 ovsthread_once_done(&once);
488d734d
BP
6501 }
6502
6503 return sock;
6504}
29cf9c1b
FL
6505
6506static int
6507netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
6508{
6509 struct eth_header *eth_hdr;
6510 ovs_be16 eth_type;
6511 int l2_len;
6512
6513 eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
6514 if (!eth_hdr) {
6515 return -EINVAL;
6516 }
6517
6518 l2_len = ETH_HEADER_LEN;
6519 eth_type = eth_hdr->eth_type;
6520 if (eth_type_vlan(eth_type)) {
6521 struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
6522
6523 if (!vlan) {
6524 return -EINVAL;
6525 }
6526
6527 eth_type = vlan->vlan_next_type;
6528 l2_len += VLAN_HEADER_LEN;
6529 }
6530
6531 if (eth_type == htons(ETH_TYPE_IP)) {
6532 struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
6533
6534 if (!ip_hdr) {
6535 return -EINVAL;
6536 }
6537
6538 *l4proto = ip_hdr->ip_proto;
6539 dp_packet_hwol_set_tx_ipv4(b);
6540 } else if (eth_type == htons(ETH_TYPE_IPV6)) {
6541 struct ovs_16aligned_ip6_hdr *nh6;
6542
6543 nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
6544 if (!nh6) {
6545 return -EINVAL;
6546 }
6547
6548 *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
6549 dp_packet_hwol_set_tx_ipv6(b);
6550 }
6551
6552 return 0;
6553}
6554
/* Strips the virtio-net header from the front of 'b' and translates it into
 * OVS hardware-offload metadata (checksum requests, TCP segmentation).
 * Returns 0 on success or -EINVAL if 'b' is too short or its L2/L3 headers
 * cannot be parsed. */
static int
netdev_linux_parse_vnet_hdr(struct dp_packet *b)
{
    struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
    uint16_t l4proto = 0;

    if (OVS_UNLIKELY(!vnet)) {
        return -EINVAL;
    }

    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
        return 0;               /* Nothing offloaded; fast path. */
    }

    if (netdev_linux_parse_l2(b, &l4proto)) {
        return -EINVAL;
    }

    /* 'flags' is a bitmask, so test the NEEDS_CSUM bit rather than comparing
     * the whole field for equality: the kernel may set other bits (e.g.
     * VIRTIO_NET_HDR_F_DATA_VALID) at the same time, and an equality test
     * would silently drop the checksum request in that case. */
    if (vnet->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        if (l4proto == IPPROTO_TCP) {
            dp_packet_hwol_set_csum_tcp(b);
        } else if (l4proto == IPPROTO_UDP) {
            dp_packet_hwol_set_csum_udp(b);
        } else if (l4proto == IPPROTO_SCTP) {
            dp_packet_hwol_set_csum_sctp(b);
        }
    }

    if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
        uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
                                | VIRTIO_NET_HDR_GSO_TCPV6
                                | VIRTIO_NET_HDR_GSO_UDP;
        uint8_t type = vnet->gso_type & allowed_mask;

        if (type == VIRTIO_NET_HDR_GSO_TCPV4
            || type == VIRTIO_NET_HDR_GSO_TCPV6) {
            dp_packet_hwol_set_tcp_seg(b);
        }
    }

    return 0;
}
6597
6598static void
6599netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
6600{
6601 struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
6602
6603 if (dp_packet_hwol_is_tso(b)) {
6604 uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b))
6605 + TCP_HEADER_LEN;
6606
6607 vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
6608 vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
6609 if (dp_packet_hwol_is_ipv4(b)) {
6610 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
6611 } else {
6612 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
6613 }
6614
6615 } else {
6616 vnet->flags = VIRTIO_NET_HDR_GSO_NONE;
6617 }
6618
6619 if (dp_packet_hwol_l4_mask(b)) {
6620 vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
6621 vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b)
6622 - (char *)dp_packet_eth(b));
6623
6624 if (dp_packet_hwol_l4_is_tcp(b)) {
6625 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6626 struct tcp_header, tcp_csum);
6627 } else if (dp_packet_hwol_l4_is_udp(b)) {
6628 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6629 struct udp_header, udp_csum);
6630 } else if (dp_packet_hwol_l4_is_sctp(b)) {
6631 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6632 struct sctp_header, sctp_csum);
6633 } else {
6634 VLOG_WARN_RL(&rl, "Unsupported L4 protocol");
6635 }
6636 }
6637}