]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
ovs-thread: Add pthread spin lock support.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
b2befd5b
BP
23#include <sys/types.h>
24#include <netinet/in.h>
55bc98d6 25#include <arpa/inet.h>
8b61709d 26#include <inttypes.h>
2f564bb1 27#include <math.h>
32383c3b 28#include <linux/filter.h>
c1c9c9c4 29#include <linux/gen_stats.h>
bb7d0e22 30#include <linux/if_ether.h>
8b61709d
BP
31#include <linux/if_tun.h>
32#include <linux/types.h>
33#include <linux/ethtool.h>
63331829 34#include <linux/mii.h>
ef3767f5 35#include <linux/rtnetlink.h>
8b61709d 36#include <linux/sockios.h>
8b61709d
BP
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
8b61709d 43#include <net/route.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
e14deea0 50#include "dp-packet.h"
93451a0a 51#include "dpif-netlink.h"
df1e5a3b 52#include "dpif-netdev.h"
3e8a2ad1 53#include "openvswitch/dynamic-string.h"
8b61709d 54#include "fatal-signal.h"
93b13be8 55#include "hash.h"
ee89ea7b 56#include "openvswitch/hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
45c8d3a1 59#include "netlink-notifier.h"
2fe27d5a 60#include "netlink-socket.h"
c060c4cf 61#include "netlink.h"
bfda5239 62#include "netnsid.h"
64c96779 63#include "openvswitch/ofpbuf.h"
8b61709d 64#include "openflow/openflow.h"
19c8e9c1 65#include "ovs-atomic.h"
8b61709d 66#include "packets.h"
fd016ae3 67#include "openvswitch/poll-loop.h"
7e9dcc0f 68#include "rtnetlink.h"
ee89ea7b 69#include "openvswitch/shash.h"
c060c4cf 70#include "socket-util.h"
19993ef3 71#include "sset.h"
c1c5c723 72#include "tc.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
ee89ea7b 76#include "util.h"
5136ce49 77
/* Declares the vlog module ("netdev_linux") used by this file. */
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters defined by this file. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 87
8b61709d 88\f
756819dd
FL
/* Fallback in case the installed kernel headers lack IFLA_IF_NETNSID. */
#ifndef IFLA_IF_NETNSID
#define IFLA_IF_NETNSID 0x45
#endif

/* Introduced in Linux 2.6.14, so old headers might not provide them. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* Introduced in Linux 2.6.24, so old headers might not provide them. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* Introduced in Linux 2.6.25, so old headers might not provide it. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif

/* Fallback in case the installed headers lack TCM_IFINDEX_MAGIC_BLOCK. */
#ifndef TCM_IFINDEX_MAGIC_BLOCK
#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
#endif
119
b73c8518
SH
/* History of struct tpacket_auxdata in the kernel headers:
 *
 *   - Linux 2.6.21 introduced the structure.
 *   - Linux 2.6.27 added the tp_vlan_tci member.
 *   - Linux 3.0 defined TP_STATUS_VLAN_VALID.
 *   - Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 *     TP_STATUS_VLAN_TPID_VALID.
 *
 * Given all this churn, the simplest approach is to unconditionally define a
 * replacement structure that has every member we want.
 */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA                   8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID            (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
#endif

/* From here on, "struct tpacket_auxdata" means our replacement. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
149
0c615356
SH
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed.
 *
 * Returns the 32-bit link speed stored in 'ep', combining 'ep->speed' (low 16
 * bits) with 'ep->speed_hi' (high 16 bits). */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Widen 'speed_hi' to an unsigned 32-bit type before shifting.  Without
     * the cast the 16-bit field is promoted to (signed) int, and shifting a
     * value >= 0x8000 left by 16 overflows int, which is undefined
     * behavior. */
    return ((uint32_t) ep->speed_hi << 16) | ep->speed;
}
161
67bed84c
SH
/* Supported and advertised flags for 1G base KX, and 10G base KX4, KR and R
 * first appeared in Linux 2.6.30. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full      (1 << 17)
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
#define SUPPORTED_10000baseKR_Full     (1 << 19)
#define SUPPORTED_10000baseR_FEC       (1 << 20)
#define ADVERTISED_1000baseKX_Full     (1 << 17)
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
#define ADVERTISED_10000baseKR_Full    (1 << 19)
#define ADVERTISED_10000baseR_FEC      (1 << 20)
#endif

/* Supported and advertised flags for 40G base KR4, CR4, SR4 and LR4 first
 * appeared in Linux 3.5. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
#endif
187
fa373af4
BP
/* IFLA_STATS64 and rtnl_link_stats64 first appeared in Linux 2.6.35.
 *
 * Tests for rtnl_link_stats64 don't seem to work consistently, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
 * Maybe if_link.h is not self-contained on those kernels.  The easiest
 * solution is to define a replacement unconditionally. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif

/* From here on, "struct rtnl_link_stats64" means our replacement. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 227
/* Bits for 'cache_valid' in struct netdev_linux.  A set bit means that the
 * corresponding cached member is currently valid. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN                = 1 << 2,
    VALID_MTU               = 1 << 3,
    VALID_POLICING          = 1 << 4,
    VALID_VPORT_STAT_ERROR  = 1 << 5,
    VALID_DRVINFO           = 1 << 6,
    VALID_FEATURES          = 1 << 7,
};
c1c9c9c4 238\f
d22f8927
JH
239struct linux_lag_slave {
240 uint32_t block_id;
241 struct shash_node *node;
242};
243
244/* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
245static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
246
247/* All slaves whose LAG masters are network devices in OvS. */
248static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
249 = SHASH_INITIALIZER(&lag_shash);
250
c1c9c9c4
BP
251/* Traffic control. */
252
253/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
254 * network device.
255 *
256 * Each TC implementation subclasses this with whatever additional data it
257 * needs. */
c1c9c9c4
BP
258struct tc {
259 const struct tc_ops *ops;
93b13be8
BP
260 struct hmap queues; /* Contains "struct tc_queue"s.
261 * Read by generic TC layer.
262 * Written only by TC implementation. */
263};
c1c9c9c4 264
559eb230
BP
265#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
266
93b13be8
BP
267/* One traffic control queue.
268 *
269 * Each TC implementation subclasses this with whatever additional data it
270 * needs. */
271struct tc_queue {
272 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
273 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 274 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
275};
276
277/* A particular kind of traffic control. Each implementation generally maps to
278 * one particular Linux qdisc class.
279 *
280 * The functions below return 0 if successful or a positive errno value on
281 * failure, except where otherwise noted. All of them must be provided, except
282 * where otherwise noted. */
283struct tc_ops {
284 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
285 * This is null for tc_ops_default and tc_ops_other, for which there are no
286 * appropriate values. */
287 const char *linux_name;
288
289 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
290 const char *ovs_name;
291
292 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
293 * queues. The queues are numbered 0 through n_queues - 1. */
294 unsigned int n_queues;
295
296 /* Called to install this TC class on 'netdev'. The implementation should
297 * make the Netlink calls required to set up 'netdev' with the right qdisc
298 * and configure it according to 'details'. The implementation may assume
299 * that the current qdisc is the default; that is, there is no need for it
300 * to delete the current qdisc before installing itself.
301 *
302 * The contents of 'details' should be documented as valid for 'ovs_name'
303 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
304 * (which is built as ovs-vswitchd.conf.db(8)).
305 *
306 * This function must return 0 if and only if it sets 'netdev->tc' to an
307 * initialized 'struct tc'.
308 *
309 * (This function is null for tc_ops_other, which cannot be installed. For
310 * other TC classes it should always be nonnull.) */
79f1cbe9 311 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
312
313 /* Called when the netdev code determines (through a Netlink query) that
314 * this TC class's qdisc is installed on 'netdev', but we didn't install
315 * it ourselves and so don't know any of the details.
316 *
317 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
318 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
319 * implementation should parse the other attributes of 'nlmsg' as
320 * necessary to determine its configuration. If necessary it should also
321 * use Netlink queries to determine the configuration of queues on
322 * 'netdev'.
323 *
324 * This function must return 0 if and only if it sets 'netdev->tc' to an
325 * initialized 'struct tc'. */
326 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
327
328 /* Destroys the data structures allocated by the implementation as part of
329 * 'tc'. (This includes destroying 'tc->queues' by calling
330 * tc_destroy(tc).
331 *
332 * The implementation should not need to perform any Netlink calls. If
333 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
334 * (But it may not be desirable.)
335 *
336 * This function may be null if 'tc' is trivial. */
337 void (*tc_destroy)(struct tc *tc);
338
339 /* Retrieves details of 'netdev->tc' configuration into 'details'.
340 *
341 * The implementation should not need to perform any Netlink calls, because
342 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
343 * cached the configuration.
344 *
345 * The contents of 'details' should be documented as valid for 'ovs_name'
346 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
347 * (which is built as ovs-vswitchd.conf.db(8)).
348 *
349 * This function may be null if 'tc' is not configurable.
350 */
79f1cbe9 351 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
352
353 /* Reconfigures 'netdev->tc' according to 'details', performing any
354 * required Netlink calls to complete the reconfiguration.
355 *
356 * The contents of 'details' should be documented as valid for 'ovs_name'
357 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
358 * (which is built as ovs-vswitchd.conf.db(8)).
359 *
360 * This function may be null if 'tc' is not configurable.
361 */
79f1cbe9 362 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 363
93b13be8
BP
364 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
365 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
366 *
367 * The contents of 'details' should be documented as valid for 'ovs_name'
368 * in the "other_config" column in the "Queue" table in
369 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
370 *
371 * The implementation should not need to perform any Netlink calls, because
372 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
373 * cached the queue configuration.
374 *
375 * This function may be null if 'tc' does not have queues ('n_queues' is
376 * 0). */
93b13be8 377 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 378 struct smap *details);
c1c9c9c4
BP
379
380 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
381 * 'details', perfoming any required Netlink calls to complete the
382 * reconfiguration. The caller ensures that 'queue_id' is less than
383 * 'n_queues'.
384 *
385 * The contents of 'details' should be documented as valid for 'ovs_name'
386 * in the "other_config" column in the "Queue" table in
387 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
388 *
389 * This function may be null if 'tc' does not have queues or its queues are
390 * not configurable. */
391 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 392 const struct smap *details);
c1c9c9c4 393
93b13be8
BP
394 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
395 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
396 *
397 * This function may be null if 'tc' does not have queues or its queues
398 * cannot be deleted. */
93b13be8 399 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 400
93b13be8
BP
401 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
402 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
403 *
404 * On success, initializes '*stats'.
405 *
406 * This function may be null if 'tc' does not have queues or if it cannot
407 * report queue statistics. */
93b13be8
BP
408 int (*class_get_stats)(const struct netdev *netdev,
409 const struct tc_queue *queue,
c1c9c9c4
BP
410 struct netdev_queue_stats *stats);
411
412 /* Extracts queue stats from 'nlmsg', which is a response to a
413 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
414 *
415 * This function may be null if 'tc' does not have queues or if it cannot
416 * report queue statistics. */
417 int (*class_dump_stats)(const struct netdev *netdev,
418 const struct ofpbuf *nlmsg,
419 netdev_dump_queue_stats_cb *cb, void *aux);
420};
421
422static void
423tc_init(struct tc *tc, const struct tc_ops *ops)
424{
425 tc->ops = ops;
93b13be8 426 hmap_init(&tc->queues);
c1c9c9c4
BP
427}
428
429static void
430tc_destroy(struct tc *tc)
431{
93b13be8 432 hmap_destroy(&tc->queues);
c1c9c9c4
BP
433}
434
435static const struct tc_ops tc_ops_htb;
a339aa81 436static const struct tc_ops tc_ops_hfsc;
677d9158
JV
437static const struct tc_ops tc_ops_codel;
438static const struct tc_ops tc_ops_fqcodel;
439static const struct tc_ops tc_ops_sfq;
2f564bb1 440static const struct tc_ops tc_ops_netem;
c1c9c9c4 441static const struct tc_ops tc_ops_default;
6cf888b8 442static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
443static const struct tc_ops tc_ops_other;
444
559eb230 445static const struct tc_ops *const tcs[] = {
c1c9c9c4 446 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 447 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
448 &tc_ops_codel, /* Controlled delay */
449 &tc_ops_fqcodel, /* Fair queue controlled delay */
450 &tc_ops_sfq, /* Stochastic fair queueing */
2f564bb1 451 &tc_ops_netem, /* Network Emulator */
6cf888b8 452 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
453 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
454 &tc_ops_other, /* Some other qdisc. */
455 NULL
456};
149f577a 457
c1c9c9c4
BP
/* Unit conversion helpers. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);
static uint32_t tc_time_to_ticks(uint32_t time);

/* Request construction and policing. */
static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
                                                  int type,
                                                  unsigned int flags,
                                                  struct ofpbuf *);
static int tc_add_policer(struct netdev *,
                          uint32_t kbits_rate, uint32_t kbits_burst);

/* Parsing, querying and deleting qdiscs and classes. */
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate specification helpers.  Note that tc_put_rtab() has external
 * linkage, unlike the other helpers declared here. */
void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
488\f
b5d57fc8
BP
489struct netdev_linux {
490 struct netdev up;
149f577a 491
86383816
BP
492 /* Protects all members below. */
493 struct ovs_mutex mutex;
494
149f577a 495 unsigned int cache_valid;
8b61709d 496
1670c579
EJ
497 bool miimon; /* Link status of last poll. */
498 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
499 struct timer miimon_timer;
500
bfda5239 501 int netnsid; /* Network namespace ID. */
8722022c
BP
502 /* The following are figured out "on demand" only. They are only valid
503 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 504 int ifindex;
74ff3298 505 struct eth_addr etheraddr;
8b61709d 506 int mtu;
059e5f4f 507 unsigned int ifi_flags;
65c3058c 508 long long int carrier_resets;
80a86fbe
BP
509 uint32_t kbits_rate; /* Policing data. */
510 uint32_t kbits_burst;
bba1e6f3
PS
511 int vport_stats_error; /* Cached error code from vport_get_stats().
512 0 or an errno value. */
90a6637d 513 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 514 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 515 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 516 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 517 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 518
a00ca915
EJ
519 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
520 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
521 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 522
4f925bd3 523 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 524 struct tc *tc;
149f577a 525
d0d08f8a
BP
526 /* For devices of class netdev_tap_class only. */
527 int tap_fd;
22dcb534
FL
528 bool present; /* If the device is present in the namespace */
529 uint64_t tx_dropped; /* tap device can drop if the iface is down */
3d9c99ab
JH
530
531 /* LAG information. */
532 bool is_lag_master; /* True if the netdev is a LAG master. */
8b61709d
BP
533};
534
f7791740
PS
535struct netdev_rxq_linux {
536 struct netdev_rxq up;
796223f5 537 bool is_tap;
5b7448ed 538 int fd;
149f577a 539};
8b61709d 540
8b61709d
BP
541/* This is set pretty low because we probably won't learn anything from the
542 * additional log messages. */
543static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
544
19c8e9c1
JS
545/* Polling miimon status for all ports causes performance degradation when
546 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
547 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
548 *
549 * Readers do not depend on this variable synchronizing with the related
550 * changes in the device miimon status, so we can use atomic_count. */
551static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 552
1c33f0c3 553static void netdev_linux_run(const struct netdev_class *);
6f643e49 554
0b0544d7 555static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 556 int cmd, const char *cmd_name);
b5d57fc8 557static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 558static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
559static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
560 enum netdev_flags on, enum netdev_flags *old_flagsp)
561 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
562static int get_ifindex(const struct netdev *, int *ifindexp);
563static int do_set_addr(struct netdev *netdev,
564 int ioctl_nr, const char *ioctl_name,
565 struct in_addr addr);
74ff3298
JR
566static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
567static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 568static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 569static int af_packet_sock(void);
19c8e9c1 570static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
571static void netdev_linux_miimon_run(void);
572static void netdev_linux_miimon_wait(void);
df1e5a3b 573static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 574
15b3596a
JG
575static bool
576is_netdev_linux_class(const struct netdev_class *netdev_class)
577{
259e0b1a 578 return netdev_class->run == netdev_linux_run;
15b3596a
JG
579}
580
796223f5
BP
581static bool
582is_tap_netdev(const struct netdev *netdev)
583{
b5d57fc8 584 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
585}
586
8b61709d
BP
587static struct netdev_linux *
588netdev_linux_cast(const struct netdev *netdev)
589{
b5d57fc8 590 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 591
180c6d0b 592 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 593}
796223f5 594
f7791740
PS
595static struct netdev_rxq_linux *
596netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 597{
9dc63482 598 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 599 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 600}
ff4ed3c9 601\f
bfda5239
FL
602static int
603netdev_linux_netnsid_update__(struct netdev_linux *netdev)
604{
605 struct dpif_netlink_vport reply;
606 struct ofpbuf *buf;
607 int error;
608
609 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
610 if (error) {
629e1476
FL
611 if (error == ENOENT) {
612 /* Assume it is local if there is no API (e.g. if the openvswitch
613 * kernel module is not loaded). */
614 netnsid_set_local(&netdev->netnsid);
615 } else {
616 netnsid_unset(&netdev->netnsid);
617 }
bfda5239
FL
618 return error;
619 }
620
621 netnsid_set(&netdev->netnsid, reply.netnsid);
622 ofpbuf_delete(buf);
623 return 0;
624}
625
626static int
627netdev_linux_netnsid_update(struct netdev_linux *netdev)
628{
629 if (netnsid_is_unset(netdev->netnsid)) {
3dbcbfe4
FL
630 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
631 netnsid_set_local(&netdev->netnsid);
632 } else {
633 return netdev_linux_netnsid_update__(netdev);
634 }
bfda5239
FL
635 }
636
637 return 0;
638}
639
640static bool
641netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
642{
643 netdev_linux_netnsid_update(netdev);
644 return netnsid_eq(netdev->netnsid, nsid);
645}
646
756819dd
FL
647static bool
648netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
649{
650 netdev_linux_netnsid_update(netdev);
651 return netnsid_is_remote(netdev->netnsid);
652}
653
654static int netdev_linux_update_via_netlink(struct netdev_linux *);
bfda5239 655static void netdev_linux_update(struct netdev_linux *netdev, int,
7e9dcc0f 656 const struct rtnetlink_change *)
86383816 657 OVS_REQUIRES(netdev->mutex);
cee87338 658static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
659 unsigned int ifi_flags, unsigned int mask)
660 OVS_REQUIRES(netdev->mutex);
cee87338 661
d6384a3a
AW
662/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
663 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
664 * if no such socket could be created. */
665static struct nl_sock *
666netdev_linux_notify_sock(void)
667{
668 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
669 static struct nl_sock *sock;
989d7135
PS
670 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
671 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
672
673 if (ovsthread_once_start(&once)) {
674 int error;
675
676 error = nl_sock_create(NETLINK_ROUTE, &sock);
677 if (!error) {
d6384a3a
AW
678 size_t i;
679
680 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
681 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
682 if (error) {
683 nl_sock_destroy(sock);
684 sock = NULL;
685 break;
686 }
cee87338
BP
687 }
688 }
cf114a7f 689 nl_sock_listen_all_nsid(sock, true);
cee87338
BP
690 ovsthread_once_done(&once);
691 }
692
693 return sock;
694}
695
19c8e9c1
JS
696static bool
697netdev_linux_miimon_enabled(void)
698{
812c272c 699 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
700}
701
3d9c99ab
JH
/* Returns true if link kind 'kind' names a LAG device, that is, if it is
 * "bond" or "team".  'kind' must be nonnull. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return strcmp(kind, "bond") == 0 || strcmp(kind, "team") == 0;
}
711
d22f8927
JH
712static void
713netdev_linux_update_lag(struct rtnetlink_change *change)
714 OVS_REQUIRES(lag_mutex)
715{
716 struct linux_lag_slave *lag;
717
718 if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
719 return;
720 }
721
722 if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
723 lag = shash_find_data(&lag_shash, change->ifname);
724
725 if (!lag) {
726 struct netdev *master_netdev;
727 char master_name[IFNAMSIZ];
728 uint32_t block_id;
729 int error = 0;
730
731 if_indextoname(change->master_ifindex, master_name);
732 master_netdev = netdev_from_name(master_name);
e3b5d7c5
TL
733 if (!master_netdev) {
734 return;
735 }
d22f8927
JH
736
737 if (is_netdev_linux_class(master_netdev->netdev_class)) {
738 block_id = netdev_get_block_id(master_netdev);
739 if (!block_id) {
e3b5d7c5
TL
740 netdev_close(master_netdev);
741 return;
d22f8927
JH
742 }
743
744 lag = xmalloc(sizeof *lag);
745 lag->block_id = block_id;
746 lag->node = shash_add(&lag_shash, change->ifname, lag);
747
cae64353 748 /* delete ingress block in case it exists */
95255018 749 tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
d22f8927 750 /* LAG master is linux netdev so add slave to same block. */
95255018
JH
751 error = tc_add_del_qdisc(change->if_index, true, block_id,
752 TC_INGRESS);
d22f8927 753 if (error) {
cae64353
RD
754 VLOG_WARN("failed to bind LAG slave %s to master's block",
755 change->ifname);
d22f8927
JH
756 shash_delete(&lag_shash, lag->node);
757 free(lag);
758 }
759 }
e3b5d7c5
TL
760
761 netdev_close(master_netdev);
d22f8927
JH
762 }
763 } else if (change->master_ifindex == 0) {
764 /* Check if this was a lag slave that has been freed. */
765 lag = shash_find_data(&lag_shash, change->ifname);
766
767 if (lag) {
95255018
JH
768 tc_add_del_qdisc(change->if_index, false, lag->block_id,
769 TC_INGRESS);
d22f8927
JH
770 shash_delete(&lag_shash, lag->node);
771 free(lag);
772 }
773 }
774}
775
8b61709d 776static void
1c33f0c3 777netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 778{
cee87338
BP
779 struct nl_sock *sock;
780 int error;
781
19c8e9c1
JS
782 if (netdev_linux_miimon_enabled()) {
783 netdev_linux_miimon_run();
784 }
cee87338
BP
785
786 sock = netdev_linux_notify_sock();
787 if (!sock) {
788 return;
789 }
790
791 do {
cee87338 792 uint64_t buf_stub[4096 / 8];
bfda5239 793 int nsid;
cee87338
BP
794 struct ofpbuf buf;
795
796 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
bfda5239 797 error = nl_sock_recv(sock, &buf, &nsid, false);
cee87338 798 if (!error) {
7e9dcc0f 799 struct rtnetlink_change change;
cee87338 800
7e9dcc0f 801 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
802 struct netdev *netdev_ = NULL;
803 char dev_name[IFNAMSIZ];
804
805 if (!change.ifname) {
806 change.ifname = if_indextoname(change.if_index, dev_name);
807 }
808
809 if (change.ifname) {
810 netdev_ = netdev_from_name(change.ifname);
811 }
cee87338
BP
812 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
814
815 ovs_mutex_lock(&netdev->mutex);
bfda5239 816 netdev_linux_update(netdev, nsid, &change);
86383816 817 ovs_mutex_unlock(&netdev->mutex);
cee87338 818 }
d22f8927
JH
819 else if (!netdev_ && change.ifname) {
820 /* Netdev is not present in OvS but its master could be. */
821 ovs_mutex_lock(&lag_mutex);
822 netdev_linux_update_lag(&change);
823 ovs_mutex_unlock(&lag_mutex);
824 }
38e0065b 825 netdev_close(netdev_);
cee87338
BP
826 }
827 } else if (error == ENOBUFS) {
828 struct shash device_shash;
829 struct shash_node *node;
830
831 nl_sock_drain(sock);
832
833 shash_init(&device_shash);
834 netdev_get_devices(&netdev_linux_class, &device_shash);
835 SHASH_FOR_EACH (node, &device_shash) {
836 struct netdev *netdev_ = node->data;
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
838 unsigned int flags;
839
86383816 840 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
841 get_flags(netdev_, &flags);
842 netdev_linux_changed(netdev, flags, 0);
86383816
BP
843 ovs_mutex_unlock(&netdev->mutex);
844
cee87338
BP
845 netdev_close(netdev_);
846 }
847 shash_destroy(&device_shash);
848 } else if (error != EAGAIN) {
7ed58d4a
JP
849 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
850 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
cee87338
BP
851 ovs_strerror(error));
852 }
853 ofpbuf_uninit(&buf);
854 } while (!error);
8b61709d
BP
855}
856
857static void
1c33f0c3 858netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 859{
cee87338
BP
860 struct nl_sock *sock;
861
19c8e9c1
JS
862 if (netdev_linux_miimon_enabled()) {
863 netdev_linux_miimon_wait();
864 }
cee87338
BP
865 sock = netdev_linux_notify_sock();
866 if (sock) {
867 nl_sock_wait(sock, POLLIN);
868 }
8b61709d
BP
869}
870
ac4d3bcb 871static void
b5d57fc8
BP
872netdev_linux_changed(struct netdev_linux *dev,
873 unsigned int ifi_flags, unsigned int mask)
86383816 874 OVS_REQUIRES(dev->mutex)
ac4d3bcb 875{
3e912ffc 876 netdev_change_seq_changed(&dev->up);
8aa77183
BP
877
878 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
879 dev->carrier_resets++;
880 }
881 dev->ifi_flags = ifi_flags;
882
4f925bd3 883 dev->cache_valid &= mask;
6b6e1329 884 if (!(mask & VALID_IN)) {
a8704b50
PS
885 netdev_get_addrs_list_flush();
886 }
4f925bd3
PS
887}
888
889static void
bfda5239
FL
890netdev_linux_update__(struct netdev_linux *dev,
891 const struct rtnetlink_change *change)
86383816 892 OVS_REQUIRES(dev->mutex)
4f925bd3 893{
bfda5239 894 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
d6384a3a 895 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 896 /* Keep drv-info, and ip addresses. */
d6384a3a 897 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 898 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
899
900 /* Update netdev from rtnl-change msg. */
901 if (change->mtu) {
902 dev->mtu = change->mtu;
903 dev->cache_valid |= VALID_MTU;
904 dev->netdev_mtu_error = 0;
905 }
90a6637d 906
74ff3298
JR
907 if (!eth_addr_is_zero(change->mac)) {
908 dev->etheraddr = change->mac;
d6384a3a
AW
909 dev->cache_valid |= VALID_ETHERADDR;
910 dev->ether_addr_error = 0;
e8e1a409
TZ
911
912 /* The mac addr has been changed, report it now. */
913 rtnetlink_report_link();
d6384a3a 914 }
44445cac 915
3d9c99ab
JH
916 if (change->master && netdev_linux_kind_is_lag(change->master)) {
917 dev->is_lag_master = true;
918 }
919
d6384a3a
AW
920 dev->ifindex = change->if_index;
921 dev->cache_valid |= VALID_IFINDEX;
922 dev->get_ifindex_error = 0;
22dcb534 923 dev->present = true;
d6384a3a 924 } else {
bfda5239 925 /* FIXME */
d6384a3a 926 netdev_linux_changed(dev, change->ifi_flags, 0);
22dcb534 927 dev->present = false;
bfda5239 928 netnsid_unset(&dev->netnsid);
d6384a3a
AW
929 }
930 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
931 /* Invalidates in4, in6. */
6b6e1329 932 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 933 } else {
d6384a3a 934 OVS_NOT_REACHED();
4f925bd3 935 }
ac4d3bcb
EJ
936}
937
bfda5239
FL
938static void
939netdev_linux_update(struct netdev_linux *dev, int nsid,
940 const struct rtnetlink_change *change)
941 OVS_REQUIRES(dev->mutex)
942{
943 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
944 netdev_linux_update__(dev, change);
945 }
946}
947
9dc63482
BP
948static struct netdev *
949netdev_linux_alloc(void)
950{
951 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
952 return &netdev->up;
953}
954
48c6733c
WT
955static int
956netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 957{
48c6733c
WT
958 /* Prevent any attempt to create (or open) a network device named "default"
959 * or "all". These device names are effectively reserved on Linux because
960 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
961 * itself this wouldn't call for any special treatment, but in practice if
962 * a program tries to create devices with these names, it causes the kernel
963 * to fire a "new device" notification event even though creation failed,
964 * and in turn that causes OVS to wake up and try to create them again,
965 * which ends up as a 100% CPU loop. */
966 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
967 const char *name = netdev_->name;
968 if (!strcmp(name, "default") || !strcmp(name, "all")) {
7ed58d4a
JP
969 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
970 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
48c6733c
WT
971 name);
972 return EINVAL;
973 }
974
bfda5239
FL
975 /* The device could be in the same network namespace or in another one. */
976 netnsid_unset(&netdev->netnsid);
834d6caf 977 ovs_mutex_init(&netdev->mutex);
48c6733c 978 return 0;
9dc63482
BP
979}
980
1f6e0fbd
BP
981/* Creates system and internal devices. */
982static int
9dc63482 983netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 984{
9dc63482 985 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
986 int error = netdev_linux_common_construct(netdev_);
987 if (error) {
988 return error;
989 }
1f6e0fbd 990
b5d57fc8
BP
991 error = get_flags(&netdev->up, &netdev->ifi_flags);
992 if (error == ENODEV) {
9dc63482 993 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 994 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
995 return ENODEV;
996 } else {
997 /* "Internal" netdevs have to be created as netdev objects before
998 * they exist in the kernel, because creating them in the kernel
999 * happens by passing a netdev object to dpif_port_add().
1000 * Therefore, ignore the error. */
1001 }
1002 }
46415c90 1003
a740f0de
JG
1004 return 0;
1005}
1006
5b7448ed
JG
1007/* For most types of netdevs we open the device for each call of
1008 * netdev_open(). However, this is not the case with tap devices,
1009 * since it is only possible to open the device once. In this
1010 * situation we share a single file descriptor, and consequently
1011 * buffers, across all readers. Therefore once data is read it will
1012 * be unavailable to other reads for tap devices. */
a740f0de 1013static int
9dc63482 1014netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 1015{
9dc63482 1016 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 1017 static const char tap_dev[] = "/dev/net/tun";
9dc63482 1018 const char *name = netdev_->name;
a740f0de 1019 struct ifreq ifr;
a740f0de 1020
48c6733c
WT
1021 int error = netdev_linux_common_construct(netdev_);
1022 if (error) {
1023 return error;
1024 }
1f6e0fbd 1025
6c88d577 1026 /* Open tap device. */
d0d08f8a
BP
1027 netdev->tap_fd = open(tap_dev, O_RDWR);
1028 if (netdev->tap_fd < 0) {
6c88d577 1029 error = errno;
10a89ef0 1030 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 1031 return error;
6c88d577
JP
1032 }
1033
1034 /* Create tap device. */
61b9d078 1035 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 1036 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 1037 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 1038 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 1039 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 1040 ovs_strerror(errno));
6c88d577 1041 error = errno;
f61d8d29 1042 goto error_close;
6c88d577
JP
1043 }
1044
1045 /* Make non-blocking. */
d0d08f8a 1046 error = set_nonblocking(netdev->tap_fd);
a740f0de 1047 if (error) {
f61d8d29 1048 goto error_close;
a740f0de
JG
1049 }
1050
0f28164b
FL
1051 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1052 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1053 ovs_strerror(errno));
1054 error = errno;
1055 goto error_close;
1056 }
1057
19aac14a 1058 netdev->present = true;
a740f0de
JG
1059 return 0;
1060
f61d8d29 1061error_close:
d0d08f8a 1062 close(netdev->tap_fd);
a740f0de
JG
1063 return error;
1064}
1065
6c88d577 1066static void
9dc63482 1067netdev_linux_destruct(struct netdev *netdev_)
6c88d577 1068{
b5d57fc8 1069 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 1070
b5d57fc8
BP
1071 if (netdev->tc && netdev->tc->ops->tc_destroy) {
1072 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
1073 }
1074
d0d08f8a
BP
1075 if (netdev_get_class(netdev_) == &netdev_tap_class
1076 && netdev->tap_fd >= 0)
1077 {
0f28164b 1078 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
d0d08f8a 1079 close(netdev->tap_fd);
6c88d577 1080 }
86383816 1081
19c8e9c1 1082 if (netdev->miimon_interval > 0) {
812c272c 1083 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1084 }
1085
86383816 1086 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
1087}
1088
9dc63482
BP
/* Frees the 'struct netdev_linux' that embeds 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1095
f7791740
PS
1096static struct netdev_rxq *
1097netdev_linux_rxq_alloc(void)
9dc63482 1098{
f7791740 1099 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
1100 return &rx->up;
1101}
1102
7b6b0ef4 1103static int
f7791740 1104netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 1105{
f7791740 1106 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 1107 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 1108 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 1109 int error;
7b6b0ef4 1110
86383816 1111 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
1112 rx->is_tap = is_tap_netdev(netdev_);
1113 if (rx->is_tap) {
1114 rx->fd = netdev->tap_fd;
796223f5
BP
1115 } else {
1116 struct sockaddr_ll sll;
b73c8518 1117 int ifindex, val;
32383c3b 1118 /* Result of tcpdump -dd inbound */
259e0b1a 1119 static const struct sock_filter filt[] = {
32383c3b
MM
1120 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1121 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1122 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1123 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1124 };
259e0b1a
BP
1125 static const struct sock_fprog fprog = {
1126 ARRAY_SIZE(filt), (struct sock_filter *) filt
1127 };
7b6b0ef4 1128
796223f5 1129 /* Create file descriptor. */
9dc63482
BP
1130 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1131 if (rx->fd < 0) {
796223f5 1132 error = errno;
10a89ef0 1133 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
1134 goto error;
1135 }
33d82a56 1136
b73c8518
SH
1137 val = 1;
1138 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1139 error = errno;
1140 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1141 netdev_get_name(netdev_), ovs_strerror(error));
1142 goto error;
1143 }
1144
796223f5 1145 /* Set non-blocking mode. */
9dc63482 1146 error = set_nonblocking(rx->fd);
796223f5
BP
1147 if (error) {
1148 goto error;
1149 }
7b6b0ef4 1150
796223f5 1151 /* Get ethernet device index. */
180c6d0b 1152 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
1153 if (error) {
1154 goto error;
1155 }
7b6b0ef4 1156
796223f5
BP
1157 /* Bind to specific ethernet device. */
1158 memset(&sll, 0, sizeof sll);
1159 sll.sll_family = AF_PACKET;
1160 sll.sll_ifindex = ifindex;
b73c8518 1161 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 1162 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
1163 error = errno;
1164 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 1165 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
1166 goto error;
1167 }
32383c3b
MM
1168
1169 /* Filter for only inbound packets. */
9dc63482 1170 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
1171 sizeof fprog);
1172 if (error) {
1173 error = errno;
259e0b1a 1174 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 1175 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
1176 goto error;
1177 }
7b6b0ef4 1178 }
86383816 1179 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 1180
7b6b0ef4
BP
1181 return 0;
1182
1183error:
9dc63482
BP
1184 if (rx->fd >= 0) {
1185 close(rx->fd);
7b6b0ef4 1186 }
86383816 1187 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1188 return error;
1189}
1190
796223f5 1191static void
f7791740 1192netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1193{
f7791740 1194 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1195
796223f5
BP
1196 if (!rx->is_tap) {
1197 close(rx->fd);
8b61709d 1198 }
9dc63482
BP
1199}
1200
/* Frees the 'struct netdev_rxq_linux' that embeds 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1208
b73c8518 1209static ovs_be16
1ebdc7eb 1210auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1211{
1212 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1213 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1214 } else if (double_tagged) {
1215 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1216 } else {
1ebdc7eb 1217 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1218 }
1219}
1220
/* Returns true if 'aux' carries a VLAN TCI: either the TCI field itself is
 * nonzero or the status word has TP_STATUS_VLAN_VALID set. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1226
796223f5 1227static int
cf62fa4c 1228netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1229{
b73c8518 1230 size_t size;
796223f5 1231 ssize_t retval;
b73c8518
SH
1232 struct iovec iov;
1233 struct cmsghdr *cmsg;
1234 union {
1235 struct cmsghdr cmsg;
1236 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1237 } cmsg_buffer;
1238 struct msghdr msgh;
1239
1240 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1241 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1242 size = dp_packet_tailroom(buffer);
b73c8518 1243
cf62fa4c 1244 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1245 iov.iov_len = size;
1246 msgh.msg_name = NULL;
1247 msgh.msg_namelen = 0;
1248 msgh.msg_iov = &iov;
1249 msgh.msg_iovlen = 1;
1250 msgh.msg_control = &cmsg_buffer;
1251 msgh.msg_controllen = sizeof cmsg_buffer;
1252 msgh.msg_flags = 0;
8e8cddf7 1253
796223f5 1254 do {
b73c8518 1255 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1256 } while (retval < 0 && errno == EINTR);
1257
bfd3367b 1258 if (retval < 0) {
b73c8518
SH
1259 return errno;
1260 } else if (retval > size) {
1261 return EMSGSIZE;
1262 }
1263
cf62fa4c 1264 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1265
1266 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1267 const struct tpacket_auxdata *aux;
1268
1269 if (cmsg->cmsg_level != SOL_PACKET
1270 || cmsg->cmsg_type != PACKET_AUXDATA
1271 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1272 continue;
8b61709d 1273 }
b73c8518
SH
1274
1275 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1276 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1277 struct eth_header *eth;
1278 bool double_tagged;
1279
b73c8518
SH
1280 if (retval < ETH_HEADER_LEN) {
1281 return EINVAL;
1282 }
1283
1ebdc7eb
EG
1284 eth = dp_packet_data(buffer);
1285 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1286
1287 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1288 htons(aux->tp_vlan_tci));
1289 break;
1290 }
1291 }
1292
1293 return 0;
1294}
1295
/* Reads one packet from the tap descriptor 'fd' into 'buffer', using at most
 * the tailroom of 'buffer' and retrying on EINTR.  Returns 0 on success and
 * grows the packet size by the number of bytes read; otherwise returns the
 * read() errno. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    const size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n_read < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1313
1314static int
8492adc2
JS
1315netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1316 int *qfill)
b73c8518 1317{
f7791740 1318 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1319 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1320 struct dp_packet *buffer;
df1e5a3b
PS
1321 ssize_t retval;
1322 int mtu;
1323
1324 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1325 mtu = ETH_PAYLOAD_MAX;
1326 }
1327
2482b0b0 1328 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1329 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1330 DP_NETDEV_HEADROOM);
b73c8518 1331 retval = (rx->is_tap
f7791740
PS
1332 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1333 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1334
1335 if (retval) {
1336 if (retval != EAGAIN && retval != EMSGSIZE) {
1337 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1338 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1339 }
cf62fa4c 1340 dp_packet_delete(buffer);
df1e5a3b 1341 } else {
72c84bc2 1342 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1343 }
1344
8492adc2
JS
1345 if (qfill) {
1346 *qfill = -ENOTSUP;
1347 }
1348
b73c8518 1349 return retval;
8b61709d
BP
1350}
1351
8b61709d 1352static void
f7791740 1353netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1354{
f7791740 1355 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1356 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1357}
1358
8b61709d 1359static int
f7791740 1360netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1361{
f7791740 1362 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1363 if (rx->is_tap) {
8b61709d 1364 struct ifreq ifr;
f7791740 1365 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1366 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1367 if (error) {
1368 return error;
1369 }
796223f5 1370 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1371 return 0;
1372 } else {
796223f5 1373 return drain_rcvbuf(rx->fd);
8b61709d
BP
1374 }
1375}
1376
d19cf8bb
ZG
1377static int
1378netdev_linux_sock_batch_send(int sock, int ifindex,
1379 struct dp_packet_batch *batch)
1380{
e0a00cee 1381 const size_t size = dp_packet_batch_size(batch);
d19cf8bb
ZG
1382 /* We don't bother setting most fields in sockaddr_ll because the
1383 * kernel ignores them for SOCK_RAW. */
1384 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1385 .sll_ifindex = ifindex };
1386
e0a00cee
BB
1387 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1388 struct iovec *iov = xmalloc(sizeof(*iov) * size);
d19cf8bb 1389
e0a00cee 1390 struct dp_packet *packet;
e883448e 1391 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
d19cf8bb 1392 iov[i].iov_base = dp_packet_data(packet);
ad8b0b4f 1393 iov[i].iov_len = dp_packet_size(packet);
d19cf8bb
ZG
1394 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1395 .msg_namelen = sizeof sll,
1396 .msg_iov = &iov[i],
1397 .msg_iovlen = 1 };
1398 }
1399
1400 int error = 0;
e0a00cee 1401 for (uint32_t ofs = 0; ofs < size; ) {
d19cf8bb
ZG
1402 ssize_t retval;
1403 do {
e0a00cee 1404 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
d19cf8bb
ZG
1405 error = retval < 0 ? errno : 0;
1406 } while (error == EINTR);
1407 if (error) {
1408 break;
1409 }
1410 ofs += retval;
1411 }
1412
1413 free(mmsg);
1414 free(iov);
1415 return error;
1416}
1417
1418/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1419 * essential, because packets sent to a tap device with an AF_PACKET socket
1420 * will loop back to be *received* again on the tap device. This doesn't occur
1421 * on other interface types because we attach a socket filter to the rx
1422 * socket. */
1423static int
1424netdev_linux_tap_batch_send(struct netdev *netdev_,
1425 struct dp_packet_batch *batch)
1426{
1427 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
13708b21 1428 struct dp_packet *packet;
22dcb534
FL
1429
1430 /* The Linux tap driver returns EIO if the device is not up,
1431 * so if the device is not up, don't waste time sending it.
1432 * However, if the device is in another network namespace
1433 * then OVS can't retrieve the state. In that case, send the
1434 * packets anyway. */
1435 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1436 netdev->tx_dropped += dp_packet_batch_size(batch);
1437 return 0;
1438 }
1439
e883448e 1440 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
ad8b0b4f 1441 size_t size = dp_packet_size(packet);
d19cf8bb
ZG
1442 ssize_t retval;
1443 int error;
1444
1445 do {
1446 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1447 error = retval < 0 ? errno : 0;
1448 } while (error == EINTR);
1449
1450 if (error) {
1451 /* The Linux tap driver returns EIO if the device is not up. From
1452 * the OVS side this is not an error, so we ignore it; otherwise,
1453 * return the erro. */
1454 if (error != EIO) {
1455 return error;
1456 }
1457 } else if (retval != size) {
1458 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1459 "bytes of %"PRIuSIZE") on %s",
1460 retval, size, netdev_get_name(netdev_));
1461 return EMSGSIZE;
1462 }
1463 }
1464 return 0;
1465}
1466
1467/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
8b61709d
BP
1468 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1469 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1470 * the packet is too big or too small to transmit on the device.
1471 *
8b61709d
BP
1472 * The kernel maintains a packet transmission queue, so the caller is not
1473 * expected to do additional queuing of packets. */
1474static int
f00fa8cb 1475netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
b30896c9 1476 struct dp_packet_batch *batch,
324c8374 1477 bool concurrent_txq OVS_UNUSED)
8b61709d 1478{
f4fd623c 1479 int error = 0;
0a62ae2c
ZG
1480 int sock = 0;
1481
0a62ae2c 1482 if (!is_tap_netdev(netdev_)) {
e0e2410d
FL
1483 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
1484 error = EOPNOTSUPP;
1485 goto free_batch;
1486 }
1487
0a62ae2c
ZG
1488 sock = af_packet_sock();
1489 if (sock < 0) {
1490 error = -sock;
1491 goto free_batch;
1492 }
1493
1494 int ifindex = netdev_get_ifindex(netdev_);
1495 if (ifindex < 0) {
1496 error = -ifindex;
1497 goto free_batch;
1498 }
1499
d19cf8bb
ZG
1500 error = netdev_linux_sock_batch_send(sock, ifindex, batch);
1501 } else {
1502 error = netdev_linux_tap_batch_send(netdev_, batch);
0a62ae2c 1503 }
d19cf8bb
ZG
1504 if (error) {
1505 if (error == ENOBUFS) {
1506 /* The Linux AF_PACKET implementation never blocks waiting
1507 * for room for packets, instead returning ENOBUFS.
1508 * Translate this into EAGAIN for the caller. */
1509 error = EAGAIN;
f23347ea 1510 } else {
f4fd623c
DDP
1511 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1512 netdev_get_name(netdev_), ovs_strerror(error));
d19cf8bb 1513 }
f4fd623c
DDP
1514 }
1515
0a62ae2c 1516free_batch:
b30896c9 1517 dp_packet_delete_batch(batch, true);
f4fd623c 1518 return error;
8b61709d
BP
1519}
1520
1521/* Registers with the poll loop to wake up from the next call to poll_block()
1522 * when the packet transmission queue has sufficient room to transmit a packet
1523 * with netdev_send().
1524 *
1525 * The kernel maintains a packet transmission queue, so the client is not
1526 * expected to do additional queuing of packets. Thus, this function is
1527 * unlikely to ever be used. It is included for completeness. */
1528static void
f00fa8cb 1529netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1530{
796223f5 1531 if (is_tap_netdev(netdev)) {
8b61709d
BP
1532 /* TAP device always accepts packets.*/
1533 poll_immediate_wake();
1534 }
1535}
1536
1537/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1538 * otherwise a positive errno value. */
1539static int
74ff3298 1540netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1541{
b5d57fc8 1542 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1543 enum netdev_flags old_flags = 0;
eb395f2e
BP
1544 int error;
1545
86383816 1546 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1547 if (netdev_linux_netnsid_is_remote(netdev)) {
1548 error = EOPNOTSUPP;
1549 goto exit;
1550 }
86383816 1551
b5d57fc8 1552 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1553 error = netdev->ether_addr_error;
1554 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1555 goto exit;
44445cac 1556 }
b5d57fc8 1557 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1558 }
1559
7eb1bd81 1560 /* Tap devices must be brought down before setting the address. */
796223f5 1561 if (is_tap_netdev(netdev_)) {
4f9f3f21 1562 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1563 }
44445cac
PS
1564 error = set_etheraddr(netdev_get_name(netdev_), mac);
1565 if (!error || error == ENODEV) {
b5d57fc8
BP
1566 netdev->ether_addr_error = error;
1567 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1568 if (!error) {
74ff3298 1569 netdev->etheraddr = mac;
eb395f2e 1570 }
8b61709d 1571 }
44445cac 1572
4f9f3f21
BP
1573 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1574 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1575 }
7eb1bd81 1576
86383816
BP
1577exit:
1578 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1579 return error;
1580}
1581
44445cac 1582/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1583static int
74ff3298 1584netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1585{
b5d57fc8 1586 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1587 int error;
44445cac 1588
86383816 1589 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1590 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
756819dd
FL
1591 netdev_linux_update_via_netlink(netdev);
1592 }
1593
1594 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1595 /* Fall back to ioctl if netlink fails */
86383816 1596 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1597 &netdev->etheraddr);
b5d57fc8 1598 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1599 }
44445cac 1600
86383816
BP
1601 error = netdev->ether_addr_error;
1602 if (!error) {
74ff3298 1603 *mac = netdev->etheraddr;
44445cac 1604 }
86383816 1605 ovs_mutex_unlock(&netdev->mutex);
44445cac 1606
86383816 1607 return error;
8b61709d
BP
1608}
1609
8b61709d 1610static int
73371c09 1611netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1612{
86383816
BP
1613 int error;
1614
b5d57fc8 1615 if (!(netdev->cache_valid & VALID_MTU)) {
756819dd
FL
1616 netdev_linux_update_via_netlink(netdev);
1617 }
1618
1619 if (!(netdev->cache_valid & VALID_MTU)) {
1620 /* Fall back to ioctl if netlink fails */
8b61709d 1621 struct ifreq ifr;
90a6637d 1622
86383816 1623 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1624 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1625 netdev->mtu = ifr.ifr_mtu;
1626 netdev->cache_valid |= VALID_MTU;
8b61709d 1627 }
90a6637d 1628
86383816
BP
1629 error = netdev->netdev_mtu_error;
1630 if (!error) {
b5d57fc8 1631 *mtup = netdev->mtu;
90a6637d 1632 }
73371c09
BP
1633
1634 return error;
1635}
1636
1637/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1638 * in bytes, not including the hardware header; thus, this is typically 1500
1639 * bytes for Ethernet devices. */
1640static int
1641netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1642{
1643 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1644 int error;
1645
1646 ovs_mutex_lock(&netdev->mutex);
1647 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1648 ovs_mutex_unlock(&netdev->mutex);
1649
1650 return error;
8b61709d
BP
1651}
1652
9b020780
PS
1653/* Sets the maximum size of transmitted (MTU) for given device using linux
1654 * networking ioctl interface.
1655 */
1656static int
4124cb12 1657netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1658{
b5d57fc8 1659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1660 struct ifreq ifr;
1661 int error;
1662
86383816 1663 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1664 if (netdev_linux_netnsid_is_remote(netdev)) {
1665 error = EOPNOTSUPP;
1666 goto exit;
1667 }
1668
b5d57fc8 1669 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1670 error = netdev->netdev_mtu_error;
1671 if (error || netdev->mtu == mtu) {
1672 goto exit;
90a6637d 1673 }
b5d57fc8 1674 netdev->cache_valid &= ~VALID_MTU;
153e5481 1675 }
9b020780 1676 ifr.ifr_mtu = mtu;
259e0b1a
BP
1677 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1678 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1679 if (!error || error == ENODEV) {
b5d57fc8
BP
1680 netdev->netdev_mtu_error = error;
1681 netdev->mtu = ifr.ifr_mtu;
1682 netdev->cache_valid |= VALID_MTU;
9b020780 1683 }
86383816
BP
1684exit:
1685 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1686 return error;
9b020780
PS
1687}
1688
9ab3d9a3
BP
1689/* Returns the ifindex of 'netdev', if successful, as a positive number.
1690 * On failure, returns a negative errno value. */
1691static int
86383816 1692netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1693{
86383816 1694 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1695 int ifindex, error;
1696
86383816 1697 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1698 if (netdev_linux_netnsid_is_remote(netdev)) {
1699 error = EOPNOTSUPP;
1700 goto exit;
1701 }
86383816 1702 error = get_ifindex(netdev_, &ifindex);
86383816 1703
e0e2410d
FL
1704exit:
1705 ovs_mutex_unlock(&netdev->mutex);
9ab3d9a3
BP
1706 return error ? -error : ifindex;
1707}
1708
8b61709d
BP
1709static int
1710netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1711{
b5d57fc8 1712 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1713
86383816 1714 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1715 if (netdev->miimon_interval > 0) {
1716 *carrier = netdev->miimon;
3a183124 1717 } else {
b5d57fc8 1718 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1719 }
86383816 1720 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1721
3a183124 1722 return 0;
8b61709d
BP
1723}
1724
65c3058c 1725static long long int
86383816 1726netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1727{
86383816
BP
1728 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1729 long long int carrier_resets;
1730
1731 ovs_mutex_lock(&netdev->mutex);
1732 carrier_resets = netdev->carrier_resets;
1733 ovs_mutex_unlock(&netdev->mutex);
1734
1735 return carrier_resets;
65c3058c
EJ
1736}
1737
63331829 1738static int
1670c579
EJ
1739netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1740 struct mii_ioctl_data *data)
63331829 1741{
63331829 1742 struct ifreq ifr;
782e6111 1743 int error;
63331829 1744
63331829 1745 memset(&ifr, 0, sizeof ifr);
782e6111 1746 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1747 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1748 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1749
782e6111
EJ
1750 return error;
1751}
1752
1753static int
1670c579 1754netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1755{
782e6111
EJ
1756 struct mii_ioctl_data data;
1757 int error;
63331829 1758
782e6111
EJ
1759 *miimon = false;
1760
1761 memset(&data, 0, sizeof data);
1670c579 1762 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1763 if (!error) {
1764 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1765 data.reg_num = MII_BMSR;
1670c579 1766 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1767 &data);
63331829
EJ
1768
1769 if (!error) {
782e6111 1770 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1771 }
9120cfc0
DH
1772 }
1773 if (error) {
63331829 1774 struct ethtool_cmd ecmd;
63331829
EJ
1775
1776 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1777 name);
1778
ab985a77 1779 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1780 memset(&ecmd, 0, sizeof ecmd);
1781 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1782 "ETHTOOL_GLINK");
1783 if (!error) {
782e6111
EJ
1784 struct ethtool_value eval;
1785
1786 memcpy(&eval, &ecmd, sizeof eval);
1787 *miimon = !!eval.data;
63331829
EJ
1788 } else {
1789 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1790 }
1791 }
1792
1793 return error;
1794}
1795
1670c579
EJ
1796static int
1797netdev_linux_set_miimon_interval(struct netdev *netdev_,
1798 long long int interval)
1799{
b5d57fc8 1800 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1801
86383816 1802 ovs_mutex_lock(&netdev->mutex);
1670c579 1803 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1804 if (netdev->miimon_interval != interval) {
19c8e9c1 1805 if (interval && !netdev->miimon_interval) {
812c272c 1806 atomic_count_inc(&miimon_cnt);
19c8e9c1 1807 } else if (!interval && netdev->miimon_interval) {
812c272c 1808 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1809 }
1810
b5d57fc8
BP
1811 netdev->miimon_interval = interval;
1812 timer_set_expired(&netdev->miimon_timer);
1670c579 1813 }
86383816 1814 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1815
1816 return 0;
1817}
1818
1819static void
1820netdev_linux_miimon_run(void)
1821{
1822 struct shash device_shash;
1823 struct shash_node *node;
1824
1825 shash_init(&device_shash);
b5d57fc8 1826 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1827 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1828 struct netdev *netdev = node->data;
1829 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1830 bool miimon;
1831
86383816
BP
1832 ovs_mutex_lock(&dev->mutex);
1833 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1834 netdev_linux_get_miimon(dev->up.name, &miimon);
1835 if (miimon != dev->miimon) {
1836 dev->miimon = miimon;
1837 netdev_linux_changed(dev, dev->ifi_flags, 0);
1838 }
1670c579 1839
86383816 1840 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1841 }
86383816 1842 ovs_mutex_unlock(&dev->mutex);
2f980d74 1843 netdev_close(netdev);
1670c579
EJ
1844 }
1845
1846 shash_destroy(&device_shash);
1847}
1848
1849static void
1850netdev_linux_miimon_wait(void)
1851{
1852 struct shash device_shash;
1853 struct shash_node *node;
1854
1855 shash_init(&device_shash);
b5d57fc8 1856 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1857 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1858 struct netdev *netdev = node->data;
1859 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1860
86383816 1861 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1862 if (dev->miimon_interval > 0) {
1863 timer_wait(&dev->miimon_timer);
1864 }
86383816 1865 ovs_mutex_unlock(&dev->mutex);
2f980d74 1866 netdev_close(netdev);
1670c579
EJ
1867 }
1868 shash_destroy(&device_shash);
1869}
1870
92df599c
JG
/* Exchanges the 64-bit values stored at 'a' and 'b'.  Passing the same address
 * for both leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1878
c060c4cf
EJ
1879/* Copies 'src' into 'dst', performing format conversion in the process.
1880 *
1881 * 'src' is allowed to be misaligned. */
1882static void
1883netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1884 const struct ovs_vport_stats *src)
1885{
6a54dedc
BP
1886 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1887 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1888 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1889 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1890 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1891 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1892 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1893 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1894 dst->multicast = 0;
1895 dst->collisions = 0;
1896 dst->rx_length_errors = 0;
1897 dst->rx_over_errors = 0;
1898 dst->rx_crc_errors = 0;
1899 dst->rx_frame_errors = 0;
1900 dst->rx_fifo_errors = 0;
1901 dst->rx_missed_errors = 0;
1902 dst->tx_aborted_errors = 0;
1903 dst->tx_carrier_errors = 0;
1904 dst->tx_fifo_errors = 0;
1905 dst->tx_heartbeat_errors = 0;
1906 dst->tx_window_errors = 0;
1907}
1908
1909static int
1910get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1911{
93451a0a 1912 struct dpif_netlink_vport reply;
c060c4cf
EJ
1913 struct ofpbuf *buf;
1914 int error;
1915
93451a0a 1916 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1917 if (error) {
1918 return error;
1919 } else if (!reply.stats) {
1920 ofpbuf_delete(buf);
1921 return EOPNOTSUPP;
1922 }
1923
1924 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1925
1926 ofpbuf_delete(buf);
1927
1928 return 0;
1929}
1930
f613a0d7
PS
1931static void
1932get_stats_via_vport(const struct netdev *netdev_,
1933 struct netdev_stats *stats)
8b61709d 1934{
b5d57fc8 1935 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1936
b5d57fc8
BP
1937 if (!netdev->vport_stats_error ||
1938 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1939 int error;
7fbef77a 1940
c060c4cf 1941 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1942 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1943 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1944 "(%s)",
1945 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1946 }
b5d57fc8
BP
1947 netdev->vport_stats_error = error;
1948 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1949 }
f613a0d7 1950}
8b61709d 1951
f613a0d7
PS
1952/* Retrieves current device stats for 'netdev-linux'. */
1953static int
1954netdev_linux_get_stats(const struct netdev *netdev_,
1955 struct netdev_stats *stats)
1956{
b5d57fc8 1957 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1958 struct netdev_stats dev_stats;
1959 int error;
1960
86383816 1961 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1962 get_stats_via_vport(netdev_, stats);
35eef899 1963 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1964 if (error) {
86383816
BP
1965 if (!netdev->vport_stats_error) {
1966 error = 0;
f613a0d7 1967 }
86383816 1968 } else if (netdev->vport_stats_error) {
04c881eb 1969 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1970 *stats = dev_stats;
1971 } else {
04c881eb
AZ
1972 /* Use kernel netdev's packet and byte counts since vport's counters
1973 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1974 * enabled. */
1975 stats->rx_packets = dev_stats.rx_packets;
1976 stats->rx_bytes = dev_stats.rx_bytes;
1977 stats->tx_packets = dev_stats.tx_packets;
1978 stats->tx_bytes = dev_stats.tx_bytes;
1979
f613a0d7
PS
1980 stats->rx_errors += dev_stats.rx_errors;
1981 stats->tx_errors += dev_stats.tx_errors;
1982 stats->rx_dropped += dev_stats.rx_dropped;
1983 stats->tx_dropped += dev_stats.tx_dropped;
1984 stats->multicast += dev_stats.multicast;
1985 stats->collisions += dev_stats.collisions;
1986 stats->rx_length_errors += dev_stats.rx_length_errors;
1987 stats->rx_over_errors += dev_stats.rx_over_errors;
1988 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1989 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1990 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1991 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1992 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1993 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1994 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1995 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1996 stats->tx_window_errors += dev_stats.tx_window_errors;
1997 }
86383816
BP
1998 ovs_mutex_unlock(&netdev->mutex);
1999
2000 return error;
f613a0d7
PS
2001}
2002
2003/* Retrieves current device stats for 'netdev-tap' netdev or
2004 * netdev-internal. */
2005static int
15aee116 2006netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 2007{
b5d57fc8 2008 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
2009 struct netdev_stats dev_stats;
2010 int error;
2011
86383816 2012 ovs_mutex_lock(&netdev->mutex);
f613a0d7 2013 get_stats_via_vport(netdev_, stats);
35eef899 2014 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 2015 if (error) {
86383816
BP
2016 if (!netdev->vport_stats_error) {
2017 error = 0;
8b61709d 2018 }
86383816
BP
2019 } else if (netdev->vport_stats_error) {
2020 /* Transmit and receive stats will appear to be swapped relative to the
2021 * other ports since we are the one sending the data, not a remote
2022 * computer. For consistency, we swap them back here. This does not
2023 * apply if we are getting stats from the vport layer because it always
2024 * tracks stats from the perspective of the switch. */
fe6b0e03 2025
f613a0d7 2026 *stats = dev_stats;
92df599c
JG
2027 swap_uint64(&stats->rx_packets, &stats->tx_packets);
2028 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
2029 swap_uint64(&stats->rx_errors, &stats->tx_errors);
2030 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
2031 stats->rx_length_errors = 0;
2032 stats->rx_over_errors = 0;
2033 stats->rx_crc_errors = 0;
2034 stats->rx_frame_errors = 0;
2035 stats->rx_fifo_errors = 0;
2036 stats->rx_missed_errors = 0;
2037 stats->tx_aborted_errors = 0;
2038 stats->tx_carrier_errors = 0;
2039 stats->tx_fifo_errors = 0;
2040 stats->tx_heartbeat_errors = 0;
2041 stats->tx_window_errors = 0;
f613a0d7 2042 } else {
04c881eb
AZ
2043 /* Use kernel netdev's packet and byte counts since vport counters
2044 * do not reflect packet counts on the wire when GSO, TSO or GRO
2045 * are enabled. */
2046 stats->rx_packets = dev_stats.tx_packets;
2047 stats->rx_bytes = dev_stats.tx_bytes;
2048 stats->tx_packets = dev_stats.rx_packets;
2049 stats->tx_bytes = dev_stats.rx_bytes;
2050
f613a0d7
PS
2051 stats->rx_dropped += dev_stats.tx_dropped;
2052 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 2053
f613a0d7
PS
2054 stats->rx_errors += dev_stats.tx_errors;
2055 stats->tx_errors += dev_stats.rx_errors;
2056
2057 stats->multicast += dev_stats.multicast;
2058 stats->collisions += dev_stats.collisions;
2059 }
22dcb534 2060 stats->tx_dropped += netdev->tx_dropped;
86383816
BP
2061 ovs_mutex_unlock(&netdev->mutex);
2062
2063 return error;
8b61709d
BP
2064}
2065
bba1e6f3
PS
2066static int
2067netdev_internal_get_stats(const struct netdev *netdev_,
2068 struct netdev_stats *stats)
2069{
b5d57fc8 2070 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2071 int error;
bba1e6f3 2072
86383816 2073 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 2074 get_stats_via_vport(netdev_, stats);
86383816
BP
2075 error = netdev->vport_stats_error;
2076 ovs_mutex_unlock(&netdev->mutex);
2077
2078 return error;
bba1e6f3
PS
2079}
2080
51f87458 2081static void
b5d57fc8 2082netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
2083{
2084 struct ethtool_cmd ecmd;
6c038611 2085 uint32_t speed;
8b61709d
BP
2086 int error;
2087
b5d57fc8 2088 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
2089 return;
2090 }
2091
ab985a77 2092 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2093 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 2094 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
2095 ETHTOOL_GSET, "ETHTOOL_GSET");
2096 if (error) {
51f87458 2097 goto out;
8b61709d
BP
2098 }
2099
2100 /* Supported features. */
b5d57fc8 2101 netdev->supported = 0;
8b61709d 2102 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 2103 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
2104 }
2105 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 2106 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
2107 }
2108 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 2109 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
2110 }
2111 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 2112 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
2113 }
2114 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 2115 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 2116 }
67bed84c
SH
2117 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
2118 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 2119 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 2120 }
67bed84c
SH
2121 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
2122 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
2123 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
2124 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 2125 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 2126 }
67bed84c
SH
2127 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
2128 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
2129 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
2130 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
2131 netdev->supported |= NETDEV_F_40GB_FD;
2132 }
8b61709d 2133 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 2134 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
2135 }
2136 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 2137 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
2138 }
2139 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 2140 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
2141 }
2142 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 2143 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
2144 }
2145 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 2146 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2147 }
2148
2149 /* Advertised features. */
b5d57fc8 2150 netdev->advertised = 0;
8b61709d 2151 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 2152 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
2153 }
2154 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 2155 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
2156 }
2157 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 2158 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
2159 }
2160 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 2161 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
2162 }
2163 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 2164 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 2165 }
67bed84c
SH
2166 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2167 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 2168 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 2169 }
67bed84c
SH
2170 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2171 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2172 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2173 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 2174 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 2175 }
67bed84c
SH
2176 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2177 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2178 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2179 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2180 netdev->advertised |= NETDEV_F_40GB_FD;
2181 }
8b61709d 2182 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 2183 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
2184 }
2185 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 2186 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
2187 }
2188 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 2189 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
2190 }
2191 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 2192 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
2193 }
2194 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 2195 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2196 }
2197
2198 /* Current settings. */
0c615356 2199 speed = ethtool_cmd_speed(&ecmd);
6c038611 2200 if (speed == SPEED_10) {
b5d57fc8 2201 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 2202 } else if (speed == SPEED_100) {
b5d57fc8 2203 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 2204 } else if (speed == SPEED_1000) {
b5d57fc8 2205 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 2206 } else if (speed == SPEED_10000) {
b5d57fc8 2207 netdev->current = NETDEV_F_10GB_FD;
6c038611 2208 } else if (speed == 40000) {
b5d57fc8 2209 netdev->current = NETDEV_F_40GB_FD;
6c038611 2210 } else if (speed == 100000) {
b5d57fc8 2211 netdev->current = NETDEV_F_100GB_FD;
6c038611 2212 } else if (speed == 1000000) {
b5d57fc8 2213 netdev->current = NETDEV_F_1TB_FD;
8b61709d 2214 } else {
b5d57fc8 2215 netdev->current = 0;
8b61709d
BP
2216 }
2217
2218 if (ecmd.port == PORT_TP) {
b5d57fc8 2219 netdev->current |= NETDEV_F_COPPER;
8b61709d 2220 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 2221 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
2222 }
2223
2224 if (ecmd.autoneg) {
b5d57fc8 2225 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
2226 }
2227
51f87458 2228out:
b5d57fc8
BP
2229 netdev->cache_valid |= VALID_FEATURES;
2230 netdev->get_features_error = error;
51f87458
PS
2231}
2232
887ed8b2
BP
2233/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2234 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2235 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2236static int
2237netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2238 enum netdev_features *current,
2239 enum netdev_features *advertised,
2240 enum netdev_features *supported,
2241 enum netdev_features *peer)
51f87458 2242{
b5d57fc8 2243 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2244 int error;
51f87458 2245
86383816 2246 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2247 if (netdev_linux_netnsid_is_remote(netdev)) {
2248 error = EOPNOTSUPP;
2249 goto exit;
2250 }
2251
b5d57fc8 2252 netdev_linux_read_features(netdev);
b5d57fc8
BP
2253 if (!netdev->get_features_error) {
2254 *current = netdev->current;
2255 *advertised = netdev->advertised;
2256 *supported = netdev->supported;
887ed8b2 2257 *peer = 0; /* XXX */
51f87458 2258 }
86383816 2259 error = netdev->get_features_error;
86383816 2260
e0e2410d
FL
2261exit:
2262 ovs_mutex_unlock(&netdev->mutex);
86383816 2263 return error;
8b61709d
BP
2264}
2265
2266/* Set the features advertised by 'netdev' to 'advertise'. */
2267static int
86383816 2268netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2269 enum netdev_features advertise)
8b61709d 2270{
86383816 2271 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2272 struct ethtool_cmd ecmd;
2273 int error;
2274
86383816
BP
2275 ovs_mutex_lock(&netdev->mutex);
2276
ab985a77 2277 COVERAGE_INC(netdev_get_ethtool);
e0e2410d
FL
2278
2279 if (netdev_linux_netnsid_is_remote(netdev)) {
2280 error = EOPNOTSUPP;
2281 goto exit;
2282 }
2283
8b61709d 2284 memset(&ecmd, 0, sizeof ecmd);
86383816 2285 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2286 ETHTOOL_GSET, "ETHTOOL_GSET");
2287 if (error) {
86383816 2288 goto exit;
8b61709d
BP
2289 }
2290
2291 ecmd.advertising = 0;
6c038611 2292 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2293 ecmd.advertising |= ADVERTISED_10baseT_Half;
2294 }
6c038611 2295 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2296 ecmd.advertising |= ADVERTISED_10baseT_Full;
2297 }
6c038611 2298 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2299 ecmd.advertising |= ADVERTISED_100baseT_Half;
2300 }
6c038611 2301 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2302 ecmd.advertising |= ADVERTISED_100baseT_Full;
2303 }
6c038611 2304 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2305 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2306 }
6c038611 2307 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2308 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2309 }
6c038611 2310 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2311 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2312 }
6c038611 2313 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2314 ecmd.advertising |= ADVERTISED_TP;
2315 }
6c038611 2316 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2317 ecmd.advertising |= ADVERTISED_FIBRE;
2318 }
6c038611 2319 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2320 ecmd.advertising |= ADVERTISED_Autoneg;
2321 }
6c038611 2322 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2323 ecmd.advertising |= ADVERTISED_Pause;
2324 }
6c038611 2325 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2326 ecmd.advertising |= ADVERTISED_Asym_Pause;
2327 }
ab985a77 2328 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2329 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2330 ETHTOOL_SSET, "ETHTOOL_SSET");
2331
2332exit:
2333 ovs_mutex_unlock(&netdev->mutex);
2334 return error;
8b61709d
BP
2335}
2336
e7f6ba22
PJV
2337static struct tc_police
2338tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2339{
2340 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2341 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2342 struct tc_police police;
2343 struct tc_ratespec rate;
2344 int mtu = 65535;
2345
2346 memset(&rate, 0, sizeof rate);
2347 rate.rate = bps;
2348 rate.cell_log = tc_calc_cell_log(mtu);
2349 rate.mpu = ETH_TOTAL_MIN;
2350
2351 memset(&police, 0, sizeof police);
2352 police.burst = tc_bytes_to_ticks(bps, bsize);
2353 police.action = TC_POLICE_SHOT;
2354 police.rate = rate;
2355 police.mtu = mtu;
2356
2357 return police;
2358}
2359
2360static void
2361nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
2362{
2363 size_t offset;
2364
2365 nl_msg_put_string(request, TCA_ACT_KIND, "police");
2366 offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
2367 nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
2368 tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
2369 nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
2370 nl_msg_end_nested(request, offset);
2371}
2372
2373static int
2374tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
2375 uint32_t kbits_burst)
2376{
2377 uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
2378 size_t basic_offset, action_offset, inner_offset;
2379 uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
2380 int ifindex, index, err = 0;
2381 struct tc_police pol_act;
2382 uint32_t block_id = 0;
2383 struct ofpbuf request;
2384 struct ofpbuf *reply;
2385 struct tcmsg *tcmsg;
2386 uint32_t handle = 1;
2387
2388 err = get_ifindex(netdev, &ifindex);
2389 if (err) {
2390 return err;
2391 }
2392
2393 index = block_id ? TCM_IFINDEX_MAGIC_BLOCK : ifindex;
2394 tcmsg = tc_make_request(index, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
2395 &request);
2396 tcmsg->tcm_parent = block_id ? : TC_INGRESS_PARENT;
2397 tcmsg->tcm_info = tc_make_handle(prio, eth_type);
2398 tcmsg->tcm_handle = handle;
2399
2400 pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
2401 nl_msg_put_string(&request, TCA_KIND, "matchall");
2402 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2403 action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
2404 inner_offset = nl_msg_start_nested(&request, 1);
2405 nl_msg_put_act_police(&request, pol_act);
2406 nl_msg_end_nested(&request, inner_offset);
2407 nl_msg_end_nested(&request, action_offset);
2408 nl_msg_end_nested(&request, basic_offset);
2409
2410 err = tc_transact(&request, &reply);
2411 if (!err) {
2412 struct tcmsg *tc =
2413 ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
2414 ofpbuf_delete(reply);
2415 }
2416
2417 return err;
2418}
2419
2420static int
2421tc_del_matchall_policer(struct netdev *netdev)
2422{
2423 uint32_t block_id = 0;
2424 int ifindex;
2425 int err;
2426
2427 err = get_ifindex(netdev, &ifindex);
2428 if (err) {
2429 return err;
2430 }
2431
95255018
JH
2432 err = tc_del_filter(ifindex, TC_RESERVED_PRIORITY_POLICE, 1, block_id,
2433 TC_INGRESS);
e7f6ba22
PJV
2434 if (err) {
2435 return err;
2436 }
2437
2438 return 0;
2439}
2440
f8500004
JP
2441/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2442 * successful, otherwise a positive errno value. */
8b61709d 2443static int
b5d57fc8 2444netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2445 uint32_t kbits_rate, uint32_t kbits_burst)
2446{
b5d57fc8
BP
2447 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2448 const char *netdev_name = netdev_get_name(netdev_);
7874bdff 2449 int ifindex;
f8500004 2450 int error;
8b61709d 2451
80a86fbe 2452 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2453 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2454 : kbits_burst); /* Stick with user-specified value. */
2455
86383816 2456 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2457 if (netdev_linux_netnsid_is_remote(netdev)) {
2458 error = EOPNOTSUPP;
2459 goto out;
2460 }
2461
b5d57fc8 2462 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2463 error = netdev->netdev_policing_error;
2464 if (error || (netdev->kbits_rate == kbits_rate &&
2465 netdev->kbits_burst == kbits_burst)) {
c9f71668 2466 /* Assume that settings haven't changed since we last set them. */
86383816 2467 goto out;
c9f71668 2468 }
b5d57fc8 2469 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2470 }
2471
718be50d 2472 COVERAGE_INC(netdev_set_policing);
7874bdff 2473
e7f6ba22
PJV
2474 /* Use matchall for policing when offloadling ovs with tc-flower. */
2475 if (netdev_is_flow_api_enabled()) {
2476 error = tc_del_matchall_policer(netdev_);
2477 if (kbits_rate) {
2478 error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
2479 }
2480 ovs_mutex_unlock(&netdev->mutex);
2481 return error;
2482 }
2483
718be50d
TZ
2484 error = get_ifindex(netdev_, &ifindex);
2485 if (error) {
2486 goto out;
2487 }
2488
f8500004 2489 /* Remove any existing ingress qdisc. */
95255018 2490 error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
f8500004
JP
2491 if (error) {
2492 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2493 netdev_name, ovs_strerror(error));
c9f71668 2494 goto out;
f8500004
JP
2495 }
2496
8b61709d 2497 if (kbits_rate) {
95255018 2498 error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
f8500004
JP
2499 if (error) {
2500 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2501 netdev_name, ovs_strerror(error));
c9f71668 2502 goto out;
8b61709d
BP
2503 }
2504
b5d57fc8 2505 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2506 if (error){
2507 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2508 netdev_name, ovs_strerror(error));
c9f71668 2509 goto out;
8b61709d 2510 }
8b61709d
BP
2511 }
2512
b5d57fc8
BP
2513 netdev->kbits_rate = kbits_rate;
2514 netdev->kbits_burst = kbits_burst;
f8500004 2515
c9f71668
PS
2516out:
2517 if (!error || error == ENODEV) {
b5d57fc8
BP
2518 netdev->netdev_policing_error = error;
2519 netdev->cache_valid |= VALID_POLICING;
c9f71668 2520 }
86383816 2521 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2522 return error;
8b61709d
BP
2523}
2524
c1c9c9c4
BP
2525static int
2526netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2527 struct sset *types)
c1c9c9c4 2528{
559eb230 2529 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2530 for (opsp = tcs; *opsp != NULL; opsp++) {
2531 const struct tc_ops *ops = *opsp;
2532 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2533 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2534 }
2535 }
2536 return 0;
2537}
2538
2539static const struct tc_ops *
2540tc_lookup_ovs_name(const char *name)
2541{
559eb230 2542 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2543
2544 for (opsp = tcs; *opsp != NULL; opsp++) {
2545 const struct tc_ops *ops = *opsp;
2546 if (!strcmp(name, ops->ovs_name)) {
2547 return ops;
2548 }
2549 }
2550 return NULL;
2551}
2552
2553static const struct tc_ops *
2554tc_lookup_linux_name(const char *name)
2555{
559eb230 2556 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2557
2558 for (opsp = tcs; *opsp != NULL; opsp++) {
2559 const struct tc_ops *ops = *opsp;
2560 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2561 return ops;
2562 }
2563 }
2564 return NULL;
2565}
2566
93b13be8 2567static struct tc_queue *
b5d57fc8 2568tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2569 size_t hash)
2570{
b5d57fc8 2571 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2572 struct tc_queue *queue;
2573
b5d57fc8 2574 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2575 if (queue->queue_id == queue_id) {
2576 return queue;
2577 }
2578 }
2579 return NULL;
2580}
2581
/* Returns the queue of 'netdev' with the given 'queue_id', or NULL if there
 * is none.  The hash is computed as hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2587
c1c9c9c4
BP
2588static int
2589netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2590 const char *type,
2591 struct netdev_qos_capabilities *caps)
2592{
2593 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2594 if (!ops) {
2595 return EOPNOTSUPP;
2596 }
2597 caps->n_queues = ops->n_queues;
2598 return 0;
2599}
2600
2601static int
b5d57fc8 2602netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2603 const char **typep, struct smap *details)
c1c9c9c4 2604{
b5d57fc8 2605 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2606 int error;
2607
86383816 2608 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2609 if (netdev_linux_netnsid_is_remote(netdev)) {
2610 error = EOPNOTSUPP;
2611 goto exit;
2612 }
2613
b5d57fc8 2614 error = tc_query_qdisc(netdev_);
86383816
BP
2615 if (!error) {
2616 *typep = netdev->tc->ops->ovs_name;
2617 error = (netdev->tc->ops->qdisc_get
2618 ? netdev->tc->ops->qdisc_get(netdev_, details)
2619 : 0);
c1c9c9c4
BP
2620 }
2621
e0e2410d
FL
2622exit:
2623 ovs_mutex_unlock(&netdev->mutex);
86383816 2624 return error;
c1c9c9c4
BP
2625}
2626
2627static int
b5d57fc8 2628netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2629 const char *type, const struct smap *details)
c1c9c9c4 2630{
b5d57fc8 2631 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2632 const struct tc_ops *new_ops;
2633 int error;
2634
2635 new_ops = tc_lookup_ovs_name(type);
2636 if (!new_ops || !new_ops->tc_install) {
2637 return EOPNOTSUPP;
2638 }
2639
6cf888b8
BS
2640 if (new_ops == &tc_ops_noop) {
2641 return new_ops->tc_install(netdev_, details);
2642 }
2643
86383816 2644 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2645 if (netdev_linux_netnsid_is_remote(netdev)) {
2646 error = EOPNOTSUPP;
2647 goto exit;
2648 }
2649
b5d57fc8 2650 error = tc_query_qdisc(netdev_);
c1c9c9c4 2651 if (error) {
86383816 2652 goto exit;
c1c9c9c4
BP
2653 }
2654
b5d57fc8 2655 if (new_ops == netdev->tc->ops) {
86383816 2656 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2657 } else {
2658 /* Delete existing qdisc. */
b5d57fc8 2659 error = tc_del_qdisc(netdev_);
c1c9c9c4 2660 if (error) {
86383816 2661 goto exit;
c1c9c9c4 2662 }
b5d57fc8 2663 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2664
2665 /* Install new qdisc. */
b5d57fc8
BP
2666 error = new_ops->tc_install(netdev_, details);
2667 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2668 }
86383816
BP
2669
2670exit:
2671 ovs_mutex_unlock(&netdev->mutex);
2672 return error;
c1c9c9c4
BP
2673}
2674
2675static int
b5d57fc8 2676netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2677 unsigned int queue_id, struct smap *details)
c1c9c9c4 2678{
b5d57fc8 2679 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2680 int error;
2681
86383816 2682 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2683 if (netdev_linux_netnsid_is_remote(netdev)) {
2684 error = EOPNOTSUPP;
2685 goto exit;
2686 }
2687
b5d57fc8 2688 error = tc_query_qdisc(netdev_);
86383816 2689 if (!error) {
b5d57fc8 2690 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2691 error = (queue
b5d57fc8 2692 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2693 : ENOENT);
c1c9c9c4 2694 }
86383816 2695
e0e2410d
FL
2696exit:
2697 ovs_mutex_unlock(&netdev->mutex);
86383816 2698 return error;
c1c9c9c4
BP
2699}
2700
2701static int
b5d57fc8 2702netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2703 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2704{
b5d57fc8 2705 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2706 int error;
2707
86383816 2708 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2709 if (netdev_linux_netnsid_is_remote(netdev)) {
2710 error = EOPNOTSUPP;
2711 goto exit;
2712 }
2713
b5d57fc8 2714 error = tc_query_qdisc(netdev_);
86383816
BP
2715 if (!error) {
2716 error = (queue_id < netdev->tc->ops->n_queues
2717 && netdev->tc->ops->class_set
2718 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2719 : EINVAL);
c1c9c9c4
BP
2720 }
2721
e0e2410d
FL
2722exit:
2723 ovs_mutex_unlock(&netdev->mutex);
86383816 2724 return error;
c1c9c9c4
BP
2725}
2726
2727static int
b5d57fc8 2728netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2729{
b5d57fc8 2730 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2731 int error;
2732
86383816 2733 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2734 if (netdev_linux_netnsid_is_remote(netdev)) {
2735 error = EOPNOTSUPP;
2736 goto exit;
2737 }
2738
b5d57fc8 2739 error = tc_query_qdisc(netdev_);
86383816
BP
2740 if (!error) {
2741 if (netdev->tc->ops->class_delete) {
2742 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2743 error = (queue
2744 ? netdev->tc->ops->class_delete(netdev_, queue)
2745 : ENOENT);
2746 } else {
2747 error = EINVAL;
2748 }
c1c9c9c4 2749 }
86383816 2750
e0e2410d
FL
2751exit:
2752 ovs_mutex_unlock(&netdev->mutex);
86383816 2753 return error;
c1c9c9c4
BP
2754}
2755
2756static int
b5d57fc8 2757netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2758 unsigned int queue_id,
2759 struct netdev_queue_stats *stats)
2760{
b5d57fc8 2761 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2762 int error;
2763
86383816 2764 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2765 if (netdev_linux_netnsid_is_remote(netdev)) {
2766 error = EOPNOTSUPP;
2767 goto exit;
2768 }
2769
b5d57fc8 2770 error = tc_query_qdisc(netdev_);
86383816
BP
2771 if (!error) {
2772 if (netdev->tc->ops->class_get_stats) {
2773 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2774 if (queue) {
2775 stats->created = queue->created;
2776 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2777 stats);
2778 } else {
2779 error = ENOENT;
2780 }
2781 } else {
2782 error = EOPNOTSUPP;
6dc34a0d 2783 }
c1c9c9c4 2784 }
86383816 2785
e0e2410d
FL
2786exit:
2787 ovs_mutex_unlock(&netdev->mutex);
86383816 2788 return error;
c1c9c9c4
BP
2789}
2790
d57695d7
JS
2791struct queue_dump_state {
2792 struct nl_dump dump;
2793 struct ofpbuf buf;
2794};
2795
23a98ffe 2796static bool
d57695d7 2797start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2798{
2799 struct ofpbuf request;
2800 struct tcmsg *tcmsg;
2801
7874bdff 2802 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2803 if (!tcmsg) {
2804 return false;
2805 }
3c4de644 2806 tcmsg->tcm_parent = 0;
d57695d7 2807 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2808 ofpbuf_uninit(&request);
d57695d7
JS
2809
2810 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2811 return true;
c1c9c9c4
BP
2812}
2813
d57695d7
JS
2814static int
2815finish_queue_dump(struct queue_dump_state *state)
2816{
2817 ofpbuf_uninit(&state->buf);
2818 return nl_dump_done(&state->dump);
2819}
2820
89454bf4
BP
/* Iteration state shared by netdev_linux_queue_dump_start(), _next() and
 * _done(): a snapshot of the queue IDs that existed when the dump began. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* xmalloc()'d array of queue IDs. */
    size_t cur_queue;           /* Index in 'queues' of the next ID to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2826
c1c9c9c4 2827static int
89454bf4 2828netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2829{
e0e2410d 2830 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2831 int error;
2832
86383816 2833 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2834 if (netdev_linux_netnsid_is_remote(netdev)) {
2835 error = EOPNOTSUPP;
2836 goto exit;
2837 }
2838
b5d57fc8 2839 error = tc_query_qdisc(netdev_);
86383816
BP
2840 if (!error) {
2841 if (netdev->tc->ops->class_get) {
89454bf4
BP
2842 struct netdev_linux_queue_state *state;
2843 struct tc_queue *queue;
2844 size_t i;
2845
2846 *statep = state = xmalloc(sizeof *state);
2847 state->n_queues = hmap_count(&netdev->tc->queues);
2848 state->cur_queue = 0;
2849 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2850
2851 i = 0;
2852 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2853 state->queues[i++] = queue->queue_id;
86383816 2854 }
c1c9c9c4 2855 } else {
86383816 2856 error = EOPNOTSUPP;
c1c9c9c4
BP
2857 }
2858 }
c1c9c9c4 2859
e0e2410d
FL
2860exit:
2861 ovs_mutex_unlock(&netdev->mutex);
86383816 2862 return error;
c1c9c9c4
BP
2863}
2864
89454bf4
BP
2865static int
2866netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2867 unsigned int *queue_idp, struct smap *details)
2868{
e0e2410d 2869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
89454bf4
BP
2870 struct netdev_linux_queue_state *state = state_;
2871 int error = EOF;
2872
2873 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2874 if (netdev_linux_netnsid_is_remote(netdev)) {
2875 error = EOPNOTSUPP;
2876 goto exit;
2877 }
2878
89454bf4
BP
2879 while (state->cur_queue < state->n_queues) {
2880 unsigned int queue_id = state->queues[state->cur_queue++];
2881 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2882
2883 if (queue) {
2884 *queue_idp = queue_id;
2885 error = netdev->tc->ops->class_get(netdev_, queue, details);
2886 break;
2887 }
2888 }
89454bf4 2889
e0e2410d
FL
2890exit:
2891 ovs_mutex_unlock(&netdev->mutex);
89454bf4
BP
2892 return error;
2893}
2894
2895static int
2896netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2897 void *state_)
2898{
2899 struct netdev_linux_queue_state *state = state_;
2900
2901 free(state->queues);
2902 free(state);
2903 return 0;
2904}
2905
c1c9c9c4 2906static int
b5d57fc8 2907netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2908 netdev_dump_queue_stats_cb *cb, void *aux)
2909{
b5d57fc8 2910 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2911 int error;
2912
86383816 2913 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2914 if (netdev_linux_netnsid_is_remote(netdev)) {
2915 error = EOPNOTSUPP;
2916 goto exit;
2917 }
2918
b5d57fc8 2919 error = tc_query_qdisc(netdev_);
86383816 2920 if (!error) {
d57695d7 2921 struct queue_dump_state state;
c1c9c9c4 2922
86383816
BP
2923 if (!netdev->tc->ops->class_dump_stats) {
2924 error = EOPNOTSUPP;
d57695d7 2925 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2926 error = ENODEV;
2927 } else {
2928 struct ofpbuf msg;
2929 int retval;
2930
d57695d7 2931 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2932 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2933 cb, aux);
2934 if (retval) {
2935 error = retval;
2936 }
2937 }
2938
d57695d7 2939 retval = finish_queue_dump(&state);
86383816
BP
2940 if (retval) {
2941 error = retval;
2942 }
c1c9c9c4
BP
2943 }
2944 }
2945
e0e2410d
FL
2946exit:
2947 ovs_mutex_unlock(&netdev->mutex);
86383816 2948 return error;
c1c9c9c4
BP
2949}
2950
8b61709d 2951static int
f1acd62b
BP
2952netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2953 struct in_addr netmask)
8b61709d 2954{
b5d57fc8 2955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2956 int error;
2957
86383816 2958 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2959 if (netdev_linux_netnsid_is_remote(netdev)) {
2960 error = EOPNOTSUPP;
2961 goto exit;
2962 }
2963
f1acd62b 2964 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2965 if (!error) {
f1acd62b 2966 if (address.s_addr != INADDR_ANY) {
8b61709d 2967 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2968 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2969 }
2970 }
49af9a3d 2971
e0e2410d 2972exit:
86383816 2973 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
2974 return error;
2975}
2976
7df6932e
AW
2977/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2978 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2979 * error. */
8b61709d 2980static int
a8704b50
PS
2981netdev_linux_get_addr_list(const struct netdev *netdev_,
2982 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2983{
b5d57fc8 2984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2985 int error;
86383816
BP
2986
2987 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2988 if (netdev_linux_netnsid_is_remote(netdev)) {
2989 error = EOPNOTSUPP;
2990 goto exit;
2991 }
2992
a8704b50 2993 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816 2994
e0e2410d
FL
2995exit:
2996 ovs_mutex_unlock(&netdev->mutex);
7df6932e 2997 return error;
8b61709d
BP
2998}
2999
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.
 *
 * The address is built in a zeroed struct sockaddr_in and then copied over a
 * zeroed '*sa', so every byte not covered by a set field ends up 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
3012
3013static int
3014do_set_addr(struct netdev *netdev,
3015 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3016{
3017 struct ifreq ifr;
149f577a 3018
259e0b1a
BP
3019 make_in4_sockaddr(&ifr.ifr_addr, addr);
3020 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3021 ioctl_name);
8b61709d
BP
3022}
3023
3024/* Adds 'router' as a default IP gateway. */
3025static int
67a4917b 3026netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
3027{
3028 struct in_addr any = { INADDR_ANY };
3029 struct rtentry rt;
3030 int error;
3031
3032 memset(&rt, 0, sizeof rt);
3033 make_in4_sockaddr(&rt.rt_dst, any);
3034 make_in4_sockaddr(&rt.rt_gateway, router);
3035 make_in4_sockaddr(&rt.rt_genmask, any);
3036 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 3037 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 3038 if (error) {
10a89ef0 3039 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
3040 }
3041 return error;
3042}
3043
f1acd62b
BP
3044static int
3045netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
3046 char **netdev_name)
3047{
3048 static const char fn[] = "/proc/net/route";
3049 FILE *stream;
3050 char line[256];
3051 int ln;
3052
3053 *netdev_name = NULL;
3054 stream = fopen(fn, "r");
3055 if (stream == NULL) {
10a89ef0 3056 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
3057 return errno;
3058 }
3059
3060 ln = 0;
3061 while (fgets(line, sizeof line, stream)) {
3062 if (++ln >= 2) {
3063 char iface[17];
dbba996b 3064 ovs_be32 dest, gateway, mask;
f1acd62b
BP
3065 int refcnt, metric, mtu;
3066 unsigned int flags, use, window, irtt;
3067
c2c28dfd
BP
3068 if (!ovs_scan(line,
3069 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
3070 " %d %u %u\n",
3071 iface, &dest, &gateway, &flags, &refcnt,
3072 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 3073 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
3074 fn, ln, line);
3075 continue;
3076 }
3077 if (!(flags & RTF_UP)) {
3078 /* Skip routes that aren't up. */
3079 continue;
3080 }
3081
3082 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 3083 * network byte order, so we don't need need any endian
f1acd62b
BP
3084 * conversions here. */
3085 if ((dest & mask) == (host->s_addr & mask)) {
3086 if (!gateway) {
3087 /* The host is directly reachable. */
3088 next_hop->s_addr = 0;
3089 } else {
3090 /* To reach the host, we must go through a gateway. */
3091 next_hop->s_addr = gateway;
3092 }
3093 *netdev_name = xstrdup(iface);
3094 fclose(stream);
3095 return 0;
3096 }
3097 }
3098 }
3099
3100 fclose(stream);
3101 return ENXIO;
3102}
3103
e210037e 3104static int
b5d57fc8 3105netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 3106{
b5d57fc8 3107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
3108 int error = 0;
3109
86383816 3110 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
3111 if (!(netdev->cache_valid & VALID_DRVINFO)) {
3112 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
3113
3114 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
3115 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
3116 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
3117 cmd,
3118 ETHTOOL_GDRVINFO,
3119 "ETHTOOL_GDRVINFO");
3120 if (!error) {
b5d57fc8 3121 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
3122 }
3123 }
e210037e 3124
e210037e 3125 if (!error) {
b5d57fc8
BP
3126 smap_add(smap, "driver_name", netdev->drvinfo.driver);
3127 smap_add(smap, "driver_version", netdev->drvinfo.version);
3128 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 3129 }
86383816
BP
3130 ovs_mutex_unlock(&netdev->mutex);
3131
e210037e
AE
3132 return error;
3133}
3134
4f925bd3 3135static int
275707c3
EJ
3136netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3137 struct smap *smap)
4f925bd3 3138{
79f1cbe9 3139 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
3140 return 0;
3141}
3142
25db83be
JH
3143static uint32_t
3144netdev_linux_get_block_id(struct netdev *netdev_)
3145{
3146 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3147 uint32_t block_id = 0;
3148
3149 ovs_mutex_lock(&netdev->mutex);
3150 /* Ensure the linux netdev has had its fields populated. */
3151 if (!(netdev->cache_valid & VALID_IFINDEX)) {
3152 netdev_linux_update_via_netlink(netdev);
3153 }
3154
3155 /* Only assigning block ids to linux netdevs that are LAG masters. */
3156 if (netdev->is_lag_master) {
3157 block_id = netdev->ifindex;
3158 }
3159 ovs_mutex_unlock(&netdev->mutex);
3160
3161 return block_id;
3162}
3163
8b61709d
BP
3164/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3165 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3166 * returns 0. Otherwise, it returns a positive errno value; in particular,
3167 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3168static int
3169netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 3170 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
3171{
3172 struct arpreq r;
c100e025 3173 struct sockaddr_in sin;
8b61709d
BP
3174 int retval;
3175
3176 memset(&r, 0, sizeof r);
f2cc621b 3177 memset(&sin, 0, sizeof sin);
c100e025
BP
3178 sin.sin_family = AF_INET;
3179 sin.sin_addr.s_addr = ip;
3180 sin.sin_port = 0;
3181 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
3182 r.arp_ha.sa_family = ARPHRD_ETHER;
3183 r.arp_flags = 0;
71d7c22f 3184 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 3185 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 3186 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
3187 if (!retval) {
3188 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
3189 } else if (retval != ENXIO) {
3190 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
3191 netdev_get_name(netdev), IP_ARGS(ip),
3192 ovs_strerror(retval));
8b61709d
BP
3193 }
3194 return retval;
3195}
3196
b24751ff 3197static unsigned int
8b61709d
BP
3198nd_to_iff_flags(enum netdev_flags nd)
3199{
b24751ff 3200 unsigned int iff = 0;
8b61709d
BP
3201 if (nd & NETDEV_UP) {
3202 iff |= IFF_UP;
3203 }
3204 if (nd & NETDEV_PROMISC) {
3205 iff |= IFF_PROMISC;
3206 }
7ba19d41
AC
3207 if (nd & NETDEV_LOOPBACK) {
3208 iff |= IFF_LOOPBACK;
3209 }
8b61709d
BP
3210 return iff;
3211}
3212
3213static int
b24751ff 3214iff_to_nd_flags(unsigned int iff)
8b61709d
BP
3215{
3216 enum netdev_flags nd = 0;
3217 if (iff & IFF_UP) {
3218 nd |= NETDEV_UP;
3219 }
3220 if (iff & IFF_PROMISC) {
3221 nd |= NETDEV_PROMISC;
3222 }
7ba19d41
AC
3223 if (iff & IFF_LOOPBACK) {
3224 nd |= NETDEV_LOOPBACK;
3225 }
8b61709d
BP
3226 return nd;
3227}
3228
3229static int
4f9f3f21
BP
3230update_flags(struct netdev_linux *netdev, enum netdev_flags off,
3231 enum netdev_flags on, enum netdev_flags *old_flagsp)
3232 OVS_REQUIRES(netdev->mutex)
8b61709d 3233{
b24751ff 3234 unsigned int old_flags, new_flags;
c37d4da4
EJ
3235 int error = 0;
3236
b5d57fc8 3237 old_flags = netdev->ifi_flags;
c37d4da4
EJ
3238 *old_flagsp = iff_to_nd_flags(old_flags);
3239 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3240 if (new_flags != old_flags) {
4f9f3f21
BP
3241 error = set_flags(netdev_get_name(&netdev->up), new_flags);
3242 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 3243 }
4f9f3f21
BP
3244
3245 return error;
3246}
3247
3248static int
3249netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
3250 enum netdev_flags on, enum netdev_flags *old_flagsp)
3251{
3252 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756819dd 3253 int error = 0;
4f9f3f21
BP
3254
3255 ovs_mutex_lock(&netdev->mutex);
756819dd
FL
3256 if (on || off) {
3257 /* Changing flags over netlink isn't support yet. */
e0e2410d
FL
3258 if (netdev_linux_netnsid_is_remote(netdev)) {
3259 error = EOPNOTSUPP;
3260 goto exit;
3261 }
756819dd
FL
3262 error = update_flags(netdev, off, on, old_flagsp);
3263 } else {
3264 /* Try reading flags over netlink, or fall back to ioctl. */
3265 if (!netdev_linux_update_via_netlink(netdev)) {
3266 *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
3267 } else {
3268 error = update_flags(netdev, off, on, old_flagsp);
3269 }
3270 }
e0e2410d
FL
3271
3272exit:
86383816 3273 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
3274 return error;
3275}
3276
89c09c1c
BP
/* Designated initializers for the struct netdev_class members that
 * netdev_linux_class, netdev_tap_class and netdev_internal_class all share.
 * Each class adds its own 'type', 'construct' and other per-class
 * callbacks after this list. */
#define NETDEV_LINUX_CLASS_COMMON                                   \
    .run = netdev_linux_run,                                        \
    .wait = netdev_linux_wait,                                      \
    .alloc = netdev_linux_alloc,                                    \
    .destruct = netdev_linux_destruct,                              \
    .dealloc = netdev_linux_dealloc,                                \
    .send = netdev_linux_send,                                      \
    .send_wait = netdev_linux_send_wait,                            \
    .set_etheraddr = netdev_linux_set_etheraddr,                    \
    .get_etheraddr = netdev_linux_get_etheraddr,                    \
    .get_mtu = netdev_linux_get_mtu,                                \
    .set_mtu = netdev_linux_set_mtu,                                \
    .get_ifindex = netdev_linux_get_ifindex,                        \
    .get_carrier = netdev_linux_get_carrier,                        \
    .get_carrier_resets = netdev_linux_get_carrier_resets,          \
    .set_miimon_interval = netdev_linux_set_miimon_interval,        \
    .set_advertisements = netdev_linux_set_advertisements,          \
    .set_policing = netdev_linux_set_policing,                      \
    .get_qos_types = netdev_linux_get_qos_types,                    \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,      \
    .get_qos = netdev_linux_get_qos,                                \
    .set_qos = netdev_linux_set_qos,                                \
    .get_queue = netdev_linux_get_queue,                            \
    .set_queue = netdev_linux_set_queue,                            \
    .delete_queue = netdev_linux_delete_queue,                      \
    .get_queue_stats = netdev_linux_get_queue_stats,                \
    .queue_dump_start = netdev_linux_queue_dump_start,              \
    .queue_dump_next = netdev_linux_queue_dump_next,                \
    .queue_dump_done = netdev_linux_queue_dump_done,                \
    .dump_queue_stats = netdev_linux_dump_queue_stats,              \
    .set_in4 = netdev_linux_set_in4,                                \
    .get_addr_list = netdev_linux_get_addr_list,                    \
    .add_router = netdev_linux_add_router,                          \
    .get_next_hop = netdev_linux_get_next_hop,                      \
    .arp_lookup = netdev_linux_arp_lookup,                          \
    .update_flags = netdev_linux_update_flags,                      \
    .rxq_alloc = netdev_linux_rxq_alloc,                            \
    .rxq_construct = netdev_linux_rxq_construct,                    \
    .rxq_destruct = netdev_linux_rxq_destruct,                      \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                        \
    .rxq_recv = netdev_linux_rxq_recv,                              \
    .rxq_wait = netdev_linux_rxq_wait,                              \
    .rxq_drain = netdev_linux_rxq_drain
3320
3321const struct netdev_class netdev_linux_class = {
3322 NETDEV_LINUX_CLASS_COMMON,
89c09c1c
BP
3323 .type = "system",
3324 .construct = netdev_linux_construct,
3325 .get_stats = netdev_linux_get_stats,
3326 .get_features = netdev_linux_get_features,
3327 .get_status = netdev_linux_get_status,
3328 .get_block_id = netdev_linux_get_block_id
3329};
3330
3331const struct netdev_class netdev_tap_class = {
3332 NETDEV_LINUX_CLASS_COMMON,
3333 .type = "tap",
3334 .construct = netdev_linux_construct_tap,
3335 .get_stats = netdev_tap_get_stats,
3336 .get_features = netdev_linux_get_features,
3337 .get_status = netdev_linux_get_status,
3338};
3339
3340const struct netdev_class netdev_internal_class = {
3341 NETDEV_LINUX_CLASS_COMMON,
3342 .type = "internal",
3343 .construct = netdev_linux_construct,
3344 .get_stats = netdev_internal_get_stats,
3345 .get_status = netdev_internal_get_status,
3346};
8b61709d 3347\f
677d9158
JV
3348
/* CoDel traffic control class. */

/* Value of 'n_queues' in tc_ops_codel. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3
3358
677d9158
JV
3359struct codel {
3360 struct tc tc;
3361 uint32_t target;
3362 uint32_t limit;
3363 uint32_t interval;
3364};
3365
3366static struct codel *
3367codel_get__(const struct netdev *netdev_)
3368{
3369 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3370 return CONTAINER_OF(netdev->tc, struct codel, tc);
3371}
3372
3373static void
3374codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3375 uint32_t interval)
3376{
3377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3378 struct codel *codel;
3379
3380 codel = xmalloc(sizeof *codel);
3381 tc_init(&codel->tc, &tc_ops_codel);
3382 codel->target = target;
3383 codel->limit = limit;
3384 codel->interval = interval;
3385
3386 netdev->tc = &codel->tc;
3387}
3388
3389static int
3390codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3391 uint32_t interval)
3392{
3393 size_t opt_offset;
3394 struct ofpbuf request;
3395 struct tcmsg *tcmsg;
3396 uint32_t otarget, olimit, ointerval;
3397 int error;
3398
3399 tc_del_qdisc(netdev);
3400
7874bdff
RD
3401 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3402 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3403 if (!tcmsg) {
3404 return ENODEV;
3405 }
3406 tcmsg->tcm_handle = tc_make_handle(1, 0);
3407 tcmsg->tcm_parent = TC_H_ROOT;
3408
3409 otarget = target ? target : 5000;
3410 olimit = limit ? limit : 10240;
3411 ointerval = interval ? interval : 100000;
3412
3413 nl_msg_put_string(&request, TCA_KIND, "codel");
3414 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3415 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3416 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3417 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3418 nl_msg_end_nested(&request, opt_offset);
3419
3420 error = tc_transact(&request, NULL);
3421 if (error) {
3422 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3423 "target %u, limit %u, interval %u error %d(%s)",
3424 netdev_get_name(netdev),
3425 otarget, olimit, ointerval,
3426 error, ovs_strerror(error));
3427 }
3428 return error;
3429}
3430
3431static void
3432codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3433 const struct smap *details, struct codel *codel)
3434{
13c1637f
BP
3435 codel->target = smap_get_ullong(details, "target", 0);
3436 codel->limit = smap_get_ullong(details, "limit", 0);
3437 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3438
3439 if (!codel->target) {
3440 codel->target = 5000;
3441 }
3442 if (!codel->limit) {
3443 codel->limit = 10240;
3444 }
3445 if (!codel->interval) {
3446 codel->interval = 100000;
3447 }
3448}
3449
3450static int
3451codel_tc_install(struct netdev *netdev, const struct smap *details)
3452{
3453 int error;
3454 struct codel codel;
3455
3456 codel_parse_qdisc_details__(netdev, details, &codel);
3457 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3458 codel.interval);
3459 if (!error) {
3460 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3461 }
3462 return error;
3463}
3464
3465static int
3466codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3467{
3468 static const struct nl_policy tca_codel_policy[] = {
3469 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3470 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3471 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3472 };
3473
3474 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3475
3476 if (!nl_parse_nested(nl_options, tca_codel_policy,
3477 attrs, ARRAY_SIZE(tca_codel_policy))) {
3478 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3479 return EPROTO;
3480 }
3481
3482 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3483 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3484 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3485 return 0;
3486}
3487
3488static int
3489codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3490{
3491 struct nlattr *nlattr;
3492 const char * kind;
3493 int error;
3494 struct codel codel;
3495
3496 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3497 if (error != 0) {
3498 return error;
3499 }
3500
3501 error = codel_parse_tca_options__(nlattr, &codel);
3502 if (error != 0) {
3503 return error;
3504 }
3505
3506 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3507 return 0;
3508}
3509
3510
3511static void
3512codel_tc_destroy(struct tc *tc)
3513{
3514 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3515 tc_destroy(tc);
3516 free(codel);
3517}
3518
3519static int
3520codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3521{
3522 const struct codel *codel = codel_get__(netdev);
3523 smap_add_format(details, "target", "%u", codel->target);
3524 smap_add_format(details, "limit", "%u", codel->limit);
3525 smap_add_format(details, "interval", "%u", codel->interval);
3526 return 0;
3527}
3528
3529static int
3530codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3531{
3532 struct codel codel;
3533
3534 codel_parse_qdisc_details__(netdev, details, &codel);
3535 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3536 codel_get__(netdev)->target = codel.target;
3537 codel_get__(netdev)->limit = codel.limit;
3538 codel_get__(netdev)->interval = codel.interval;
3539 return 0;
3540}
3541
3542static const struct tc_ops tc_ops_codel = {
89c09c1c
BP
3543 .linux_name = "codel",
3544 .ovs_name = "linux-codel",
3545 .n_queues = CODEL_N_QUEUES,
3546 .tc_install = codel_tc_install,
3547 .tc_load = codel_tc_load,
3548 .tc_destroy = codel_tc_destroy,
3549 .qdisc_get = codel_qdisc_get,
3550 .qdisc_set = codel_qdisc_set,
677d9158
JV
3551};
3552\f
3553/* FQ-CoDel traffic control class. */
3554
/* Value of 'n_queues' in tc_ops_fqcodel. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6
3567
677d9158
JV
3568struct fqcodel {
3569 struct tc tc;
3570 uint32_t target;
3571 uint32_t limit;
3572 uint32_t interval;
3573 uint32_t flows;
3574 uint32_t quantum;
3575};
3576
3577static struct fqcodel *
3578fqcodel_get__(const struct netdev *netdev_)
3579{
3580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3581 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3582}
3583
3584static void
3585fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3586 uint32_t interval, uint32_t flows, uint32_t quantum)
3587{
3588 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3589 struct fqcodel *fqcodel;
3590
3591 fqcodel = xmalloc(sizeof *fqcodel);
3592 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3593 fqcodel->target = target;
3594 fqcodel->limit = limit;
3595 fqcodel->interval = interval;
3596 fqcodel->flows = flows;
3597 fqcodel->quantum = quantum;
3598
3599 netdev->tc = &fqcodel->tc;
3600}
3601
3602static int
3603fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3604 uint32_t interval, uint32_t flows, uint32_t quantum)
3605{
3606 size_t opt_offset;
3607 struct ofpbuf request;
3608 struct tcmsg *tcmsg;
3609 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3610 int error;
3611
3612 tc_del_qdisc(netdev);
3613
7874bdff
RD
3614 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3615 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3616 if (!tcmsg) {
3617 return ENODEV;
3618 }
3619 tcmsg->tcm_handle = tc_make_handle(1, 0);
3620 tcmsg->tcm_parent = TC_H_ROOT;
3621
3622 otarget = target ? target : 5000;
3623 olimit = limit ? limit : 10240;
3624 ointerval = interval ? interval : 100000;
3625 oflows = flows ? flows : 1024;
3626 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3627 not mtu */
3628
3629 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3630 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3631 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3632 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3633 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3634 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3635 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3636 nl_msg_end_nested(&request, opt_offset);
3637
3638 error = tc_transact(&request, NULL);
3639 if (error) {
3640 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3641 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3642 netdev_get_name(netdev),
3643 otarget, olimit, ointerval, oflows, oquantum,
3644 error, ovs_strerror(error));
3645 }
3646 return error;
3647}
3648
3649static void
3650fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3651 const struct smap *details, struct fqcodel *fqcodel)
3652{
13c1637f
BP
3653 fqcodel->target = smap_get_ullong(details, "target", 0);
3654 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3655 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3656 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3657 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3658
677d9158
JV
3659 if (!fqcodel->target) {
3660 fqcodel->target = 5000;
3661 }
3662 if (!fqcodel->limit) {
3663 fqcodel->limit = 10240;
3664 }
3665 if (!fqcodel->interval) {
3666 fqcodel->interval = 1000000;
3667 }
3668 if (!fqcodel->flows) {
3669 fqcodel->flows = 1024;
3670 }
3671 if (!fqcodel->quantum) {
3672 fqcodel->quantum = 1514;
3673 }
3674}
3675
3676static int
3677fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3678{
3679 int error;
3680 struct fqcodel fqcodel;
3681
3682 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3683 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3684 fqcodel.interval, fqcodel.flows,
3685 fqcodel.quantum);
3686 if (!error) {
3687 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3688 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3689 }
3690 return error;
3691}
3692
3693static int
3694fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3695{
3696 static const struct nl_policy tca_fqcodel_policy[] = {
3697 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3698 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3699 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3700 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3701 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3702 };
3703
3704 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3705
3706 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3707 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3708 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3709 return EPROTO;
3710 }
3711
3712 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3713 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3714 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3715 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3716 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3717 return 0;
3718}
3719
3720static int
3721fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3722{
3723 struct nlattr *nlattr;
3724 const char * kind;
3725 int error;
3726 struct fqcodel fqcodel;
3727
3728 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3729 if (error != 0) {
3730 return error;
3731 }
3732
3733 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3734 if (error != 0) {
3735 return error;
3736 }
3737
3738 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3739 fqcodel.flows, fqcodel.quantum);
3740 return 0;
3741}
3742
3743static void
3744fqcodel_tc_destroy(struct tc *tc)
3745{
3746 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3747 tc_destroy(tc);
3748 free(fqcodel);
3749}
3750
3751static int
3752fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3753{
3754 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3755 smap_add_format(details, "target", "%u", fqcodel->target);
3756 smap_add_format(details, "limit", "%u", fqcodel->limit);
3757 smap_add_format(details, "interval", "%u", fqcodel->interval);
3758 smap_add_format(details, "flows", "%u", fqcodel->flows);
3759 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3760 return 0;
3761}
3762
3763static int
3764fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3765{
3766 struct fqcodel fqcodel;
3767
3768 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3769 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3770 fqcodel.flows, fqcodel.quantum);
3771 fqcodel_get__(netdev)->target = fqcodel.target;
3772 fqcodel_get__(netdev)->limit = fqcodel.limit;
3773 fqcodel_get__(netdev)->interval = fqcodel.interval;
3774 fqcodel_get__(netdev)->flows = fqcodel.flows;
3775 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3776 return 0;
3777}
3778
3779static const struct tc_ops tc_ops_fqcodel = {
89c09c1c
BP
3780 .linux_name = "fq_codel",
3781 .ovs_name = "linux-fq_codel",
3782 .n_queues = FQCODEL_N_QUEUES,
3783 .tc_install = fqcodel_tc_install,
3784 .tc_load = fqcodel_tc_load,
3785 .tc_destroy = fqcodel_tc_destroy,
3786 .qdisc_get = fqcodel_qdisc_get,
3787 .qdisc_set = fqcodel_qdisc_set,
677d9158
JV
3788};
3789\f
3790/* SFQ traffic control class. */
3791
3792#define SFQ_N_QUEUES 0x0000
3793
3794struct sfq {
3795 struct tc tc;
3796 uint32_t quantum;
3797 uint32_t perturb;
3798};
3799
3800static struct sfq *
3801sfq_get__(const struct netdev *netdev_)
3802{
3803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3804 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3805}
3806
3807static void
3808sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3809{
3810 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3811 struct sfq *sfq;
3812
3813 sfq = xmalloc(sizeof *sfq);
3814 tc_init(&sfq->tc, &tc_ops_sfq);
3815 sfq->perturb = perturb;
3816 sfq->quantum = quantum;
3817
3818 netdev->tc = &sfq->tc;
3819}
3820
3821static int
3822sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3823{
3824 struct tc_sfq_qopt opt;
3825 struct ofpbuf request;
3826 struct tcmsg *tcmsg;
3827 int mtu;
3828 int mtu_error, error;
3829 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3830
3831 tc_del_qdisc(netdev);
3832
7874bdff
RD
3833 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3834 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3835 if (!tcmsg) {
3836 return ENODEV;
3837 }
3838 tcmsg->tcm_handle = tc_make_handle(1, 0);
3839 tcmsg->tcm_parent = TC_H_ROOT;
3840
3841 memset(&opt, 0, sizeof opt);
3842 if (!quantum) {
3843 if (!mtu_error) {
3844 opt.quantum = mtu; /* if we cannot find mtu, use default */
3845 }
3846 } else {
3847 opt.quantum = quantum;
3848 }
3849
3850 if (!perturb) {
3851 opt.perturb_period = 10;
3852 } else {
3853 opt.perturb_period = perturb;
3854 }
3855
3856 nl_msg_put_string(&request, TCA_KIND, "sfq");
3857 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3858
3859 error = tc_transact(&request, NULL);
3860 if (error) {
3861 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3862 "quantum %u, perturb %u error %d(%s)",
3863 netdev_get_name(netdev),
3864 opt.quantum, opt.perturb_period,
3865 error, ovs_strerror(error));
3866 }
3867 return error;
3868}
3869
3870static void
3871sfq_parse_qdisc_details__(struct netdev *netdev,
3872 const struct smap *details, struct sfq *sfq)
3873{
13c1637f
BP
3874 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3875 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3876
677d9158
JV
3877 if (!sfq->perturb) {
3878 sfq->perturb = 10;
3879 }
3880
3881 if (!sfq->quantum) {
13c1637f
BP
3882 int mtu;
3883 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3884 sfq->quantum = mtu;
3885 } else {
3886 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3887 "device without mtu");
677d9158
JV
3888 }
3889 }
3890}
3891
3892static int
3893sfq_tc_install(struct netdev *netdev, const struct smap *details)
3894{
3895 int error;
3896 struct sfq sfq;
3897
3898 sfq_parse_qdisc_details__(netdev, details, &sfq);
3899 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3900 if (!error) {
3901 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3902 }
3903 return error;
3904}
3905
3906static int
3907sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3908{
3909 const struct tc_sfq_qopt *sfq;
3910 struct nlattr *nlattr;
3911 const char * kind;
3912 int error;
3913
3914 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3915 if (error == 0) {
3916 sfq = nl_attr_get(nlattr);
61265c03 3917 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
677d9158
JV
3918 return 0;
3919 }
3920
3921 return error;
3922}
3923
3924static void
3925sfq_tc_destroy(struct tc *tc)
3926{
3927 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3928 tc_destroy(tc);
3929 free(sfq);
3930}
3931
3932static int
3933sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3934{
3935 const struct sfq *sfq = sfq_get__(netdev);
3936 smap_add_format(details, "quantum", "%u", sfq->quantum);
3937 smap_add_format(details, "perturb", "%u", sfq->perturb);
3938 return 0;
3939}
3940
3941static int
3942sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3943{
3944 struct sfq sfq;
3945
3946 sfq_parse_qdisc_details__(netdev, details, &sfq);
3947 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3948 sfq_get__(netdev)->quantum = sfq.quantum;
3949 sfq_get__(netdev)->perturb = sfq.perturb;
3950 return 0;
3951}
3952
3953static const struct tc_ops tc_ops_sfq = {
89c09c1c
BP
3954 .linux_name = "sfq",
3955 .ovs_name = "linux-sfq",
3956 .n_queues = SFQ_N_QUEUES,
3957 .tc_install = sfq_tc_install,
3958 .tc_load = sfq_tc_load,
3959 .tc_destroy = sfq_tc_destroy,
3960 .qdisc_get = sfq_qdisc_get,
3961 .qdisc_set = sfq_qdisc_set,
677d9158
JV
3962};
3963\f
2f564bb1
S
3964/* netem traffic control class. */
3965
3966struct netem {
3967 struct tc tc;
3968 uint32_t latency;
3969 uint32_t limit;
3970 uint32_t loss;
3971};
3972
3973static struct netem *
3974netem_get__(const struct netdev *netdev_)
3975{
3976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3977 return CONTAINER_OF(netdev->tc, struct netem, tc);
3978}
3979
3980static void
3981netem_install__(struct netdev *netdev_, uint32_t latency,
3982 uint32_t limit, uint32_t loss)
3983{
3984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3985 struct netem *netem;
3986
3987 netem = xmalloc(sizeof *netem);
3988 tc_init(&netem->tc, &tc_ops_netem);
3989 netem->latency = latency;
3990 netem->limit = limit;
3991 netem->loss = loss;
3992
3993 netdev->tc = &netem->tc;
3994}
3995
3996static int
3997netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
3998 uint32_t limit, uint32_t loss)
3999{
4000 struct tc_netem_qopt opt;
4001 struct ofpbuf request;
4002 struct tcmsg *tcmsg;
4003 int error;
4004
4005 tc_del_qdisc(netdev);
4006
4007 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4008 NLM_F_EXCL | NLM_F_CREATE, &request);
4009 if (!tcmsg) {
4010 return ENODEV;
4011 }
4012 tcmsg->tcm_handle = tc_make_handle(1, 0);
4013 tcmsg->tcm_parent = TC_H_ROOT;
4014
4015 memset(&opt, 0, sizeof opt);
4016
4017 if (!limit) {
4018 opt.limit = 1000;
4019 } else {
4020 opt.limit = limit;
4021 }
4022
4023 if (loss) {
4024 if (loss > 100) {
4025 VLOG_WARN_RL(&rl,
4026 "loss should be a percentage value between 0 to 100, "
4027 "loss was %u", loss);
4028 return EINVAL;
4029 }
4030 opt.loss = floor(UINT32_MAX * (loss / 100.0));
4031 }
4032
4033 opt.latency = tc_time_to_ticks(latency);
4034
4035 nl_msg_put_string(&request, TCA_KIND, "netem");
4036 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4037
4038 error = tc_transact(&request, NULL);
4039 if (error) {
4040 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4041 "latency %u, limit %u, loss %u error %d(%s)",
4042 netdev_get_name(netdev),
4043 opt.latency, opt.limit, opt.loss,
4044 error, ovs_strerror(error));
4045 }
4046 return error;
4047}
4048
4049static void
4050netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4051 const struct smap *details, struct netem *netem)
4052{
4053 netem->latency = smap_get_ullong(details, "latency", 0);
4054 netem->limit = smap_get_ullong(details, "limit", 0);
4055 netem->loss = smap_get_ullong(details, "loss", 0);
4056
4057 if (!netem->limit) {
4058 netem->limit = 1000;
4059 }
4060}
4061
4062static int
4063netem_tc_install(struct netdev *netdev, const struct smap *details)
4064{
4065 int error;
4066 struct netem netem;
4067
4068 netem_parse_qdisc_details__(netdev, details, &netem);
4069 error = netem_setup_qdisc__(netdev, netem.latency,
4070 netem.limit, netem.loss);
4071 if (!error) {
4072 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4073 }
4074 return error;
4075}
4076
4077static int
4078netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4079{
4080 const struct tc_netem_qopt *netem;
4081 struct nlattr *nlattr;
4082 const char *kind;
4083 int error;
4084
4085 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4086 if (error == 0) {
4087 netem = nl_attr_get(nlattr);
4088 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4089 return 0;
4090 }
4091
4092 return error;
4093}
4094
4095static void
4096netem_tc_destroy(struct tc *tc)
4097{
4098 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4099 tc_destroy(tc);
4100 free(netem);
4101}
4102
4103static int
4104netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4105{
4106 const struct netem *netem = netem_get__(netdev);
4107 smap_add_format(details, "latency", "%u", netem->latency);
4108 smap_add_format(details, "limit", "%u", netem->limit);
4109 smap_add_format(details, "loss", "%u", netem->loss);
4110 return 0;
4111}
4112
4113static int
4114netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4115{
4116 struct netem netem;
4117
4118 netem_parse_qdisc_details__(netdev, details, &netem);
4119 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4120 netem_get__(netdev)->latency = netem.latency;
4121 netem_get__(netdev)->limit = netem.limit;
4122 netem_get__(netdev)->loss = netem.loss;
4123 return 0;
4124}
4125
4126static const struct tc_ops tc_ops_netem = {
4127 .linux_name = "netem",
4128 .ovs_name = "linux-netem",
4129 .n_queues = 0,
4130 .tc_install = netem_tc_install,
4131 .tc_load = netem_tc_load,
4132 .tc_destroy = netem_tc_destroy,
4133 .qdisc_get = netem_qdisc_get,
4134 .qdisc_set = netem_qdisc_set,
4135};
4136\f
c1c9c9c4 4137/* HTB traffic control class. */
559843ed 4138
c1c9c9c4 4139#define HTB_N_QUEUES 0xf000
4f631ccd 4140#define HTB_RATE2QUANTUM 10
8b61709d 4141
c1c9c9c4
BP
4142struct htb {
4143 struct tc tc;
4144 unsigned int max_rate; /* In bytes/s. */
4145};
8b61709d 4146
c1c9c9c4 4147struct htb_class {
93b13be8 4148 struct tc_queue tc_queue;
c1c9c9c4
BP
4149 unsigned int min_rate; /* In bytes/s. */
4150 unsigned int max_rate; /* In bytes/s. */
4151 unsigned int burst; /* In bytes. */
4152 unsigned int priority; /* Lower values are higher priorities. */
4153};
8b61709d 4154
c1c9c9c4 4155static struct htb *
b5d57fc8 4156htb_get__(const struct netdev *netdev_)
c1c9c9c4 4157{
b5d57fc8
BP
4158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4159 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
4160}
4161
24045e35 4162static void
b5d57fc8 4163htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 4164{
b5d57fc8 4165 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4166 struct htb *htb;
4167
4168 htb = xmalloc(sizeof *htb);
4169 tc_init(&htb->tc, &tc_ops_htb);
4170 htb->max_rate = max_rate;
4171
b5d57fc8 4172 netdev->tc = &htb->tc;
c1c9c9c4
BP
4173}
4174
4175/* Create an HTB qdisc.
4176 *
a339aa81 4177 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
4178static int
4179htb_setup_qdisc__(struct netdev *netdev)
4180{
4181 size_t opt_offset;
4182 struct tc_htb_glob opt;
4183 struct ofpbuf request;
4184 struct tcmsg *tcmsg;
4185
4186 tc_del_qdisc(netdev);
4187
7874bdff
RD
4188 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4189 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
4190 if (!tcmsg) {
4191 return ENODEV;
4192 }
c1c9c9c4
BP
4193 tcmsg->tcm_handle = tc_make_handle(1, 0);
4194 tcmsg->tcm_parent = TC_H_ROOT;
4195
4196 nl_msg_put_string(&request, TCA_KIND, "htb");
4197
4198 memset(&opt, 0, sizeof opt);
4f631ccd 4199 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 4200 opt.version = 3;
4ecf12d5 4201 opt.defcls = 1;
c1c9c9c4
BP
4202
4203 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4204 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
4205 nl_msg_end_nested(&request, opt_offset);
4206
4207 return tc_transact(&request, NULL);
4208}
4209
4210/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4211 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4212static int
4213htb_setup_class__(struct netdev *netdev, unsigned int handle,
4214 unsigned int parent, struct htb_class *class)
4215{
4216 size_t opt_offset;
4217 struct tc_htb_opt opt;
4218 struct ofpbuf request;
4219 struct tcmsg *tcmsg;
4220 int error;
4221 int mtu;
4222
73371c09 4223 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 4224 if (error) {
f915f1a8
BP
4225 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
4226 netdev_get_name(netdev));
9b020780 4227 return error;
f915f1a8 4228 }
c1c9c9c4
BP
4229
4230 memset(&opt, 0, sizeof opt);
4231 tc_fill_rate(&opt.rate, class->min_rate, mtu);
4232 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
4233 /* Makes sure the quantum is at least MTU. Setting quantum will
4234 * make htb ignore the r2q for this class. */
4235 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
4236 opt.quantum = mtu;
4237 }
c1c9c9c4
BP
4238 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
4239 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
4240 opt.prio = class->priority;
4241
7874bdff
RD
4242 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4243 &request);
23a98ffe
BP
4244 if (!tcmsg) {
4245 return ENODEV;
4246 }
c1c9c9c4
BP
4247 tcmsg->tcm_handle = handle;
4248 tcmsg->tcm_parent = parent;
4249
4250 nl_msg_put_string(&request, TCA_KIND, "htb");
4251 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4252 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
4253 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
4254 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
4255 nl_msg_end_nested(&request, opt_offset);
4256
4257 error = tc_transact(&request, NULL);
4258 if (error) {
4259 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4260 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4261 netdev_get_name(netdev),
4262 tc_get_major(handle), tc_get_minor(handle),
4263 tc_get_major(parent), tc_get_minor(parent),
4264 class->min_rate, class->max_rate,
10a89ef0 4265 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
4266 }
4267 return error;
4268}
4269
4270/* Parses Netlink attributes in 'options' for HTB parameters and stores a
4271 * description of them into 'details'. The description complies with the
4272 * specification given in the vswitch database documentation for linux-htb
4273 * queue details. */
4274static int
4275htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
4276{
4277 static const struct nl_policy tca_htb_policy[] = {
4278 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
4279 .min_len = sizeof(struct tc_htb_opt) },
4280 };
4281
4282 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
4283 const struct tc_htb_opt *htb;
4284
4285 if (!nl_parse_nested(nl_options, tca_htb_policy,
4286 attrs, ARRAY_SIZE(tca_htb_policy))) {
4287 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
4288 return EPROTO;
4289 }
4290
4291 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
4292 class->min_rate = htb->rate.rate;
4293 class->max_rate = htb->ceil.rate;
4294 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
4295 class->priority = htb->prio;
4296 return 0;
4297}
4298
4299static int
4300htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4301 struct htb_class *options,
4302 struct netdev_queue_stats *stats)
4303{
4304 struct nlattr *nl_options;
4305 unsigned int handle;
4306 int error;
4307
4308 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4309 if (!error && queue_id) {
17ee3c1f
BP
4310 unsigned int major = tc_get_major(handle);
4311 unsigned int minor = tc_get_minor(handle);
4312 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4313 *queue_id = minor - 1;
c1c9c9c4
BP
4314 } else {
4315 error = EPROTO;
4316 }
4317 }
4318 if (!error && options) {
4319 error = htb_parse_tca_options__(nl_options, options);
4320 }
4321 return error;
4322}
4323
4324static void
73371c09 4325htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 4326 const struct smap *details, struct htb_class *hc)
c1c9c9c4 4327{
73371c09 4328 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 4329
13c1637f 4330 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 4331 if (!hc->max_rate) {
a00ca915 4332 enum netdev_features current;
c1c9c9c4 4333
73371c09
BP
4334 netdev_linux_read_features(netdev);
4335 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4336 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
4337 }
4338 hc->min_rate = hc->max_rate;
4339 hc->burst = 0;
4340 hc->priority = 0;
4341}
4342
4343static int
4344htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 4345 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
4346{
4347 const struct htb *htb = htb_get__(netdev);
9b020780 4348 int mtu, error;
214117fd 4349 unsigned long long int max_rate_bit;
c1c9c9c4 4350
73371c09 4351 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 4352 if (error) {
f915f1a8
BP
4353 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
4354 netdev_get_name(netdev));
9b020780 4355 return error;
f915f1a8
BP
4356 }
4357
4f104611
EJ
4358 /* HTB requires at least an mtu sized min-rate to send any traffic even
4359 * on uncongested links. */
13c1637f 4360 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 4361 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
4362 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
4363
4364 /* max-rate */
214117fd
KF
4365 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
4366 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
c1c9c9c4
BP
4367 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
4368 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
4369
4370 /* burst
4371 *
4372 * According to hints in the documentation that I've read, it is important
4373 * that 'burst' be at least as big as the largest frame that might be
4374 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4375 * but having it a bit too small is a problem. Since netdev_get_mtu()
4376 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4377 * the MTU. We actually add 64, instead of 14, as a guard against
4378 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 4379 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
4380 hc->burst = MAX(hc->burst, mtu + 64);
4381
4382 /* priority */
13c1637f 4383 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
4384
4385 return 0;
4386}
4387
/* Queries the class 'handle' with parent 'parent' on 'netdev' and decodes
 * the reply into 'options' and/or 'stats' (either may be null, as accepted by
 * htb_parse_tcmsg__()).
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * parsing the reply.  The reply buffer is deleted before returning. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
4403
4404static int
79f1cbe9 4405htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4406{
4407 int error;
4408
4409 error = htb_setup_qdisc__(netdev);
4410 if (!error) {
4411 struct htb_class hc;
4412
4413 htb_parse_qdisc_details__(netdev, details, &hc);
4414 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4415 tc_make_handle(1, 0), &hc);
4416 if (!error) {
4417 htb_install__(netdev, hc.max_rate);
4418 }
4419 }
4420 return error;
4421}
4422
93b13be8
BP
4423static struct htb_class *
4424htb_class_cast__(const struct tc_queue *queue)
4425{
4426 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4427}
4428
c1c9c9c4
BP
4429static void
4430htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
4431 const struct htb_class *hc)
4432{
4433 struct htb *htb = htb_get__(netdev);
93b13be8
BP
4434 size_t hash = hash_int(queue_id, 0);
4435 struct tc_queue *queue;
c1c9c9c4
BP
4436 struct htb_class *hcp;
4437
93b13be8
BP
4438 queue = tc_find_queue__(netdev, queue_id, hash);
4439 if (queue) {
4440 hcp = htb_class_cast__(queue);
4441 } else {
c1c9c9c4 4442 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
4443 queue = &hcp->tc_queue;
4444 queue->queue_id = queue_id;
6dc34a0d 4445 queue->created = time_msec();
93b13be8 4446 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 4447 }
93b13be8
BP
4448
4449 hcp->min_rate = hc->min_rate;
4450 hcp->max_rate = hc->max_rate;
4451 hcp->burst = hc->burst;
4452 hcp->priority = hc->priority;
c1c9c9c4
BP
4453}
4454
4455static int
4456htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4457{
c1c9c9c4 4458 struct ofpbuf msg;
d57695d7 4459 struct queue_dump_state state;
c1c9c9c4 4460 struct htb_class hc;
c1c9c9c4
BP
4461
4462 /* Get qdisc options. */
4463 hc.max_rate = 0;
4464 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4465 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
4466
4467 /* Get queues. */
d57695d7 4468 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
4469 return ENODEV;
4470 }
d57695d7 4471 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
4472 unsigned int queue_id;
4473
4474 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4475 htb_update_queue__(netdev, queue_id, &hc);
4476 }
4477 }
d57695d7 4478 finish_queue_dump(&state);
c1c9c9c4
BP
4479
4480 return 0;
4481}
4482
4483static void
4484htb_tc_destroy(struct tc *tc)
4485{
4486 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 4487 struct htb_class *hc;
c1c9c9c4 4488
4ec3d7c7 4489 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
4490 free(hc);
4491 }
4492 tc_destroy(tc);
4493 free(htb);
4494}
4495
4496static int
79f1cbe9 4497htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
4498{
4499 const struct htb *htb = htb_get__(netdev);
79f1cbe9 4500 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
4501 return 0;
4502}
4503
4504static int
79f1cbe9 4505htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4506{
4507 struct htb_class hc;
4508 int error;
4509
4510 htb_parse_qdisc_details__(netdev, details, &hc);
4511 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4512 tc_make_handle(1, 0), &hc);
4513 if (!error) {
4514 htb_get__(netdev)->max_rate = hc.max_rate;
4515 }
4516 return error;
4517}
4518
4519static int
93b13be8 4520htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4521 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 4522{
93b13be8 4523 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4524
79f1cbe9 4525 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 4526 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4527 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 4528 }
79f1cbe9 4529 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 4530 if (hc->priority) {
79f1cbe9 4531 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
4532 }
4533 return 0;
4534}
4535
4536static int
4537htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4538 const struct smap *details)
c1c9c9c4
BP
4539{
4540 struct htb_class hc;
4541 int error;
4542
4543 error = htb_parse_class_details__(netdev, details, &hc);
4544 if (error) {
4545 return error;
4546 }
4547
17ee3c1f 4548 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
4549 tc_make_handle(1, 0xfffe), &hc);
4550 if (error) {
4551 return error;
4552 }
4553
4554 htb_update_queue__(netdev, queue_id, &hc);
4555 return 0;
4556}
4557
4558static int
93b13be8 4559htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 4560{
93b13be8 4561 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4562 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4563 int error;
4564
93b13be8 4565 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4566 if (!error) {
93b13be8 4567 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4568 free(hc);
c1c9c9c4
BP
4569 }
4570 return error;
4571}
4572
4573static int
93b13be8 4574htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4575 struct netdev_queue_stats *stats)
4576{
93b13be8 4577 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4578 tc_make_handle(1, 0xfffe), NULL, stats);
4579}
4580
4581static int
4582htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4583 const struct ofpbuf *nlmsg,
4584 netdev_dump_queue_stats_cb *cb, void *aux)
4585{
4586 struct netdev_queue_stats stats;
17ee3c1f 4587 unsigned int handle, major, minor;
c1c9c9c4
BP
4588 int error;
4589
4590 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4591 if (error) {
4592 return error;
4593 }
4594
17ee3c1f
BP
4595 major = tc_get_major(handle);
4596 minor = tc_get_minor(handle);
4597 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4598 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4599 }
4600 return 0;
4601}
4602
4603static const struct tc_ops tc_ops_htb = {
89c09c1c
BP
4604 .linux_name = "htb",
4605 .ovs_name = "linux-htb",
4606 .n_queues = HTB_N_QUEUES,
4607 .tc_install = htb_tc_install,
4608 .tc_load = htb_tc_load,
4609 .tc_destroy = htb_tc_destroy,
4610 .qdisc_get = htb_qdisc_get,
4611 .qdisc_set = htb_qdisc_set,
4612 .class_get = htb_class_get,
4613 .class_set = htb_class_set,
4614 .class_delete = htb_class_delete,
4615 .class_get_stats = htb_class_get_stats,
4616 .class_dump_stats = htb_class_dump_stats
c1c9c9c4
BP
4617};
4618\f
a339aa81
EJ
4619/* "linux-hfsc" traffic control class. */
4620
4621#define HFSC_N_QUEUES 0xf000
4622
4623struct hfsc {
4624 struct tc tc;
4625 uint32_t max_rate;
4626};
4627
4628struct hfsc_class {
4629 struct tc_queue tc_queue;
4630 uint32_t min_rate;
4631 uint32_t max_rate;
4632};
4633
4634static struct hfsc *
b5d57fc8 4635hfsc_get__(const struct netdev *netdev_)
a339aa81 4636{
b5d57fc8
BP
4637 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4638 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4639}
4640
4641static struct hfsc_class *
4642hfsc_class_cast__(const struct tc_queue *queue)
4643{
4644 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4645}
4646
24045e35 4647static void
b5d57fc8 4648hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4649{
b5d57fc8 4650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4651 struct hfsc *hfsc;
4652
a339aa81
EJ
4653 hfsc = xmalloc(sizeof *hfsc);
4654 tc_init(&hfsc->tc, &tc_ops_hfsc);
4655 hfsc->max_rate = max_rate;
b5d57fc8 4656 netdev->tc = &hfsc->tc;
a339aa81
EJ
4657}
4658
4659static void
4660hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4661 const struct hfsc_class *hc)
4662{
4663 size_t hash;
4664 struct hfsc *hfsc;
4665 struct hfsc_class *hcp;
4666 struct tc_queue *queue;
4667
4668 hfsc = hfsc_get__(netdev);
4669 hash = hash_int(queue_id, 0);
4670
4671 queue = tc_find_queue__(netdev, queue_id, hash);
4672 if (queue) {
4673 hcp = hfsc_class_cast__(queue);
4674 } else {
4675 hcp = xmalloc(sizeof *hcp);
4676 queue = &hcp->tc_queue;
4677 queue->queue_id = queue_id;
6dc34a0d 4678 queue->created = time_msec();
a339aa81
EJ
4679 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4680 }
4681
4682 hcp->min_rate = hc->min_rate;
4683 hcp->max_rate = hc->max_rate;
4684}
4685
4686static int
4687hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4688{
4689 const struct tc_service_curve *rsc, *fsc, *usc;
4690 static const struct nl_policy tca_hfsc_policy[] = {
4691 [TCA_HFSC_RSC] = {
4692 .type = NL_A_UNSPEC,
4693 .optional = false,
4694 .min_len = sizeof(struct tc_service_curve),
4695 },
4696 [TCA_HFSC_FSC] = {
4697 .type = NL_A_UNSPEC,
4698 .optional = false,
4699 .min_len = sizeof(struct tc_service_curve),
4700 },
4701 [TCA_HFSC_USC] = {
4702 .type = NL_A_UNSPEC,
4703 .optional = false,
4704 .min_len = sizeof(struct tc_service_curve),
4705 },
4706 };
4707 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4708
4709 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4710 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4711 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4712 return EPROTO;
4713 }
4714
4715 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4716 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4717 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4718
4719 if (rsc->m1 != 0 || rsc->d != 0 ||
4720 fsc->m1 != 0 || fsc->d != 0 ||
4721 usc->m1 != 0 || usc->d != 0) {
4722 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4723 "Non-linear service curves are not supported.");
4724 return EPROTO;
4725 }
4726
4727 if (rsc->m2 != fsc->m2) {
4728 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4729 "Real-time service curves are not supported ");
4730 return EPROTO;
4731 }
4732
4733 if (rsc->m2 > usc->m2) {
4734 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4735 "Min-rate service curve is greater than "
4736 "the max-rate service curve.");
4737 return EPROTO;
4738 }
4739
4740 class->min_rate = fsc->m2;
4741 class->max_rate = usc->m2;
4742 return 0;
4743}
4744
4745static int
4746hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4747 struct hfsc_class *options,
4748 struct netdev_queue_stats *stats)
4749{
4750 int error;
4751 unsigned int handle;
4752 struct nlattr *nl_options;
4753
4754 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4755 if (error) {
4756 return error;
4757 }
4758
4759 if (queue_id) {
4760 unsigned int major, minor;
4761
4762 major = tc_get_major(handle);
4763 minor = tc_get_minor(handle);
4764 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4765 *queue_id = minor - 1;
4766 } else {
4767 return EPROTO;
4768 }
4769 }
4770
4771 if (options) {
4772 error = hfsc_parse_tca_options__(nl_options, options);
4773 }
4774
4775 return error;
4776}
4777
/* Queries the class 'handle' with parent 'parent' on 'netdev' and decodes
 * the reply into 'options' and/or 'stats' (either may be null, as accepted by
 * hfsc_parse_tcmsg__()).
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or from
 * parsing the reply.  The reply buffer is deleted before returning. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
4795
4796static void
73371c09 4797hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4798 struct hfsc_class *class)
4799{
73371c09 4800 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4801
13c1637f 4802 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4803 if (!max_rate) {
a00ca915 4804 enum netdev_features current;
a339aa81 4805
73371c09
BP
4806 netdev_linux_read_features(netdev);
4807 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4808 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4809 }
4810
4811 class->min_rate = max_rate;
4812 class->max_rate = max_rate;
4813}
4814
4815static int
4816hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4817 const struct smap *details,
a339aa81
EJ
4818 struct hfsc_class * class)
4819{
4820 const struct hfsc *hfsc;
4821 uint32_t min_rate, max_rate;
a339aa81
EJ
4822
4823 hfsc = hfsc_get__(netdev);
a339aa81 4824
13c1637f 4825 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4826 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4827 min_rate = MIN(min_rate, hfsc->max_rate);
4828
13c1637f 4829 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4830 max_rate = MAX(max_rate, min_rate);
4831 max_rate = MIN(max_rate, hfsc->max_rate);
4832
4833 class->min_rate = min_rate;
4834 class->max_rate = max_rate;
4835
4836 return 0;
4837}
4838
4839/* Create an HFSC qdisc.
4840 *
4841 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4842static int
4843hfsc_setup_qdisc__(struct netdev * netdev)
4844{
4845 struct tcmsg *tcmsg;
4846 struct ofpbuf request;
4847 struct tc_hfsc_qopt opt;
4848
4849 tc_del_qdisc(netdev);
4850
7874bdff
RD
4851 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4852 NLM_F_EXCL | NLM_F_CREATE, &request);
a339aa81
EJ
4853
4854 if (!tcmsg) {
4855 return ENODEV;
4856 }
4857
4858 tcmsg->tcm_handle = tc_make_handle(1, 0);
4859 tcmsg->tcm_parent = TC_H_ROOT;
4860
4861 memset(&opt, 0, sizeof opt);
4862 opt.defcls = 1;
4863
4864 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4865 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4866
4867 return tc_transact(&request, NULL);
4868}
4869
4870/* Create an HFSC class.
4871 *
4872 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4873 * sc rate <min_rate> ul rate <max_rate>" */
4874static int
4875hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4876 unsigned int parent, struct hfsc_class *class)
4877{
4878 int error;
4879 size_t opt_offset;
4880 struct tcmsg *tcmsg;
4881 struct ofpbuf request;
4882 struct tc_service_curve min, max;
4883
7874bdff
RD
4884 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4885 &request);
a339aa81
EJ
4886
4887 if (!tcmsg) {
4888 return ENODEV;
4889 }
4890
4891 tcmsg->tcm_handle = handle;
4892 tcmsg->tcm_parent = parent;
4893
4894 min.m1 = 0;
4895 min.d = 0;
4896 min.m2 = class->min_rate;
4897
4898 max.m1 = 0;
4899 max.d = 0;
4900 max.m2 = class->max_rate;
4901
4902 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4903 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4904 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4905 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4906 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4907 nl_msg_end_nested(&request, opt_offset);
4908
4909 error = tc_transact(&request, NULL);
4910 if (error) {
4911 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4912 "min-rate %ubps, max-rate %ubps (%s)",
4913 netdev_get_name(netdev),
4914 tc_get_major(handle), tc_get_minor(handle),
4915 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4916 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4917 }
4918
4919 return error;
4920}
4921
4922static int
79f1cbe9 4923hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4924{
4925 int error;
4926 struct hfsc_class class;
4927
4928 error = hfsc_setup_qdisc__(netdev);
4929
4930 if (error) {
4931 return error;
4932 }
4933
4934 hfsc_parse_qdisc_details__(netdev, details, &class);
4935 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4936 tc_make_handle(1, 0), &class);
4937
4938 if (error) {
4939 return error;
4940 }
4941
4942 hfsc_install__(netdev, class.max_rate);
4943 return 0;
4944}
4945
4946static int
4947hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4948{
4949 struct ofpbuf msg;
d57695d7 4950 struct queue_dump_state state;
a339aa81
EJ
4951 struct hfsc_class hc;
4952
4953 hc.max_rate = 0;
4954 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4955 hfsc_install__(netdev, hc.max_rate);
a339aa81 4956
d57695d7 4957 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4958 return ENODEV;
4959 }
4960
d57695d7 4961 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4962 unsigned int queue_id;
4963
4964 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4965 hfsc_update_queue__(netdev, queue_id, &hc);
4966 }
4967 }
4968
d57695d7 4969 finish_queue_dump(&state);
a339aa81
EJ
4970 return 0;
4971}
4972
4973static void
4974hfsc_tc_destroy(struct tc *tc)
4975{
4976 struct hfsc *hfsc;
4977 struct hfsc_class *hc, *next;
4978
4979 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4980
4981 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4982 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4983 free(hc);
4984 }
4985
4986 tc_destroy(tc);
4987 free(hfsc);
4988}
4989
4990static int
79f1cbe9 4991hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4992{
4993 const struct hfsc *hfsc;
4994 hfsc = hfsc_get__(netdev);
79f1cbe9 4995 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4996 return 0;
4997}
4998
4999static int
79f1cbe9 5000hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
5001{
5002 int error;
5003 struct hfsc_class class;
5004
5005 hfsc_parse_qdisc_details__(netdev, details, &class);
5006 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5007 tc_make_handle(1, 0), &class);
5008
5009 if (!error) {
5010 hfsc_get__(netdev)->max_rate = class.max_rate;
5011 }
5012
5013 return error;
5014}
5015
5016static int
5017hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 5018 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
5019{
5020 const struct hfsc_class *hc;
5021
5022 hc = hfsc_class_cast__(queue);
79f1cbe9 5023 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 5024 if (hc->min_rate != hc->max_rate) {
79f1cbe9 5025 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
5026 }
5027 return 0;
5028}
5029
5030static int
5031hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 5032 const struct smap *details)
a339aa81
EJ
5033{
5034 int error;
5035 struct hfsc_class class;
5036
5037 error = hfsc_parse_class_details__(netdev, details, &class);
5038 if (error) {
5039 return error;
5040 }
5041
5042 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5043 tc_make_handle(1, 0xfffe), &class);
5044 if (error) {
5045 return error;
5046 }
5047
5048 hfsc_update_queue__(netdev, queue_id, &class);
5049 return 0;
5050}
5051
5052static int
5053hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5054{
5055 int error;
5056 struct hfsc *hfsc;
5057 struct hfsc_class *hc;
5058
5059 hc = hfsc_class_cast__(queue);
5060 hfsc = hfsc_get__(netdev);
5061
5062 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5063 if (!error) {
5064 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5065 free(hc);
5066 }
5067 return error;
5068}
5069
5070static int
5071hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5072 struct netdev_queue_stats *stats)
5073{
5074 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5075 tc_make_handle(1, 0xfffe), NULL, stats);
5076}
5077
5078static int
5079hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5080 const struct ofpbuf *nlmsg,
5081 netdev_dump_queue_stats_cb *cb, void *aux)
5082{
5083 struct netdev_queue_stats stats;
5084 unsigned int handle, major, minor;
5085 int error;
5086
5087 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5088 if (error) {
5089 return error;
5090 }
5091
5092 major = tc_get_major(handle);
5093 minor = tc_get_minor(handle);
5094 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5095 (*cb)(minor - 1, &stats, aux);
5096 }
5097 return 0;
5098}
5099
5100static const struct tc_ops tc_ops_hfsc = {
89c09c1c
BP
5101 .linux_name = "hfsc",
5102 .ovs_name = "linux-hfsc",
5103 .n_queues = HFSC_N_QUEUES, /* n_queues */
5104 .tc_install = hfsc_tc_install,
5105 .tc_load = hfsc_tc_load,
5106 .tc_destroy = hfsc_tc_destroy,
5107 .qdisc_get = hfsc_qdisc_get,
5108 .qdisc_set = hfsc_qdisc_set,
5109 .class_get = hfsc_class_get,
5110 .class_set = hfsc_class_set,
5111 .class_delete = hfsc_class_delete,
5112 .class_get_stats = hfsc_class_get_stats,
5113 .class_dump_stats = hfsc_class_dump_stats,
a339aa81
EJ
5114};
5115\f
6cf888b8
BS
5116/* "linux-noop" traffic control class. */
5117
5118static void
5119noop_install__(struct netdev *netdev_)
5120{
5121 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5122 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
5123
5124 netdev->tc = CONST_CAST(struct tc *, &tc);
5125}
5126
5127static int
5128noop_tc_install(struct netdev *netdev,
5129 const struct smap *details OVS_UNUSED)
5130{
5131 noop_install__(netdev);
5132 return 0;
5133}
5134
5135static int
5136noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5137{
5138 noop_install__(netdev);
5139 return 0;
5140}
5141
5142static const struct tc_ops tc_ops_noop = {
89c09c1c
BP
5143 .ovs_name = "linux-noop", /* ovs_name */
5144 .tc_install = noop_tc_install,
5145 .tc_load = noop_tc_load,
6cf888b8
BS
5146};
5147\f
c1c9c9c4
BP
5148/* "linux-default" traffic control class.
5149 *
5150 * This class represents the default, unnamed Linux qdisc. It corresponds to
5151 * the "" (empty string) QoS type in the OVS database. */
5152
5153static void
b5d57fc8 5154default_install__(struct netdev *netdev_)
c1c9c9c4 5155{
b5d57fc8 5156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 5157 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 5158
559eb230
BP
5159 /* Nothing but a tc class implementation is allowed to write to a tc. This
5160 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 5161 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
5162}
5163
5164static int
5165default_tc_install(struct netdev *netdev,
79f1cbe9 5166 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
5167{
5168 default_install__(netdev);
5169 return 0;
5170}
5171
5172static int
5173default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5174{
5175 default_install__(netdev);
5176 return 0;
5177}
5178
5179static const struct tc_ops tc_ops_default = {
89c09c1c
BP
5180 .ovs_name = "", /* ovs_name */
5181 .tc_install = default_tc_install,
5182 .tc_load = default_tc_load,
c1c9c9c4
BP
5183};
5184\f
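/* "linux-other" traffic control class.
 *
 * Fallback class that tc_query_qdisc() selects when the kernel reports a
 * qdisc whose kind matches no other class, or when the qdisc could not be
 * queried or parsed. */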
5188
5189static int
b5d57fc8 5190other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 5191{
b5d57fc8 5192 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 5193 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 5194
559eb230
BP
5195 /* Nothing but a tc class implementation is allowed to write to a tc. This
5196 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 5197 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
5198 return 0;
5199}
5200
5201static const struct tc_ops tc_ops_other = {
89c09c1c
BP
5202 .ovs_name = "linux-other",
5203 .tc_load = other_tc_load,
c1c9c9c4
BP
5204};
5205\f
5206/* Traffic control. */
5207
5208/* Number of kernel "tc" ticks per second. */
5209static double ticks_per_s;
5210
5211/* Number of kernel "jiffies" per second. This is used for the purpose of
5212 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5213 * one jiffy's worth of data.
5214 *
5215 * There are two possibilities here:
5216 *
5217 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5218 * approximate range of 100 to 1024. That means that we really need to
5219 * make sure that the qdisc can buffer that much data.
5220 *
5221 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5222 * has finely granular timers and there's no need to fudge additional room
5223 * for buffers. (There's no extra effort needed to implement that: the
5224 * large 'buffer_hz' is used as a divisor, so practically any number will
5225 * come out as 0 in the division. Small integer results in the case of
5226 * really high dividends won't have any real effect anyhow.)
5227 */
5228static unsigned int buffer_hz;
5229
7874bdff
RD
/* Looks up the ifindex of 'netdev' and starts a traffic-control request of
 * the given 'type' and 'flags' in 'request' via tc_make_request().
 *
 * Returns what tc_make_request() returns, or NULL if the ifindex lookup
 * failed (in which case 'request' is left untouched). */
static struct tcmsg *
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
                             unsigned int flags, struct ofpbuf *request)
{
    int ifindex;

    if (get_ifindex(netdev, &ifindex)) {
        return NULL;
    }
    return tc_make_request(ifindex, type, flags, request);
}
5244
f8500004
JP
5245/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5246 * of 'kbits_burst'.
5247 *
5248 * This function is equivalent to running:
5249 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5250 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5251 * mtu 65535 drop
5252 *
5253 * The configuration and stats may be seen with the following command:
c7952afb 5254 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
5255 *
5256 * Returns 0 if successful, otherwise a positive errno value.
5257 */
5258static int
c7952afb
BP
5259tc_add_policer(struct netdev *netdev,
5260 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
5261{
5262 struct tc_police tc_police;
5263 struct ofpbuf request;
5264 struct tcmsg *tcmsg;
5265 size_t basic_offset;
5266 size_t police_offset;
5267 int error;
5268 int mtu = 65535;
5269
5270 memset(&tc_police, 0, sizeof tc_police);
5271 tc_police.action = TC_POLICE_SHOT;
5272 tc_police.mtu = mtu;
1aca400c 5273 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 5274
79abacc8
MAA
5275 /* The following appears wrong in one way: In networking a kilobit is
5276 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
5277 *
5278 * However if you "fix" those problems then "tc filter show ..." shows
5279 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5280 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5281 * tc's point of view. Whatever. */
5282 tc_police.burst = tc_bytes_to_ticks(
79abacc8 5283 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 5284
7874bdff
RD
5285 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5286 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
5287 if (!tcmsg) {
5288 return ENODEV;
5289 }
5290 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5291 tcmsg->tcm_info = tc_make_handle(49,
5292 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5293
5294 nl_msg_put_string(&request, TCA_KIND, "basic");
5295 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5296 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5297 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5298 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5299 nl_msg_end_nested(&request, police_offset);
5300 nl_msg_end_nested(&request, basic_offset);
5301
5302 error = tc_transact(&request, NULL);
5303 if (error) {
5304 return error;
5305 }
5306
5307 return 0;
5308}
5309
c1c9c9c4
BP
5310static void
5311read_psched(void)
5312{
5313 /* The values in psched are not individually very meaningful, but they are
5314 * important. The tables below show some values seen in the wild.
5315 *
5316 * Some notes:
5317 *
5318 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5319 * (Before that, there are hints that it was 1000000000.)
5320 *
5321 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5322 * above.
5323 *
5324 * /proc/net/psched
5325 * -----------------------------------
5326 * [1] 000c8000 000f4240 000f4240 00000064
5327 * [2] 000003e8 00000400 000f4240 3b9aca00
5328 * [3] 000003e8 00000400 000f4240 3b9aca00
5329 * [4] 000003e8 00000400 000f4240 00000064
5330 * [5] 000003e8 00000040 000f4240 3b9aca00
5331 * [6] 000003e8 00000040 000f4240 000000f9
5332 *
5333 * a b c d ticks_per_s buffer_hz
5334 * ------- --------- ---------- ------------- ----------- -------------
5335 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5336 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5337 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5338 * [4] 1,000 1,024 1,000,000 100 976,562 100
5339 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5340 * [6] 1,000 64 1,000,000 249 15,625,000 249
5341 *
5342 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5343 * [2] 2.6.26-1-686-bigmem from Debian lenny
5344 * [3] 2.6.26-2-sparc64 from Debian lenny
5345 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5346 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5347 * [6] 2.6.34 from kernel.org on KVM
5348 */
23882115 5349 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
5350 static const char fn[] = "/proc/net/psched";
5351 unsigned int a, b, c, d;
5352 FILE *stream;
5353
23882115
BP
5354 if (!ovsthread_once_start(&once)) {
5355 return;
5356 }
5357
c1c9c9c4
BP
5358 ticks_per_s = 1.0;
5359 buffer_hz = 100;
5360
5361 stream = fopen(fn, "r");
5362 if (!stream) {
10a89ef0 5363 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 5364 goto exit;
c1c9c9c4
BP
5365 }
5366
5367 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
5368 VLOG_WARN("%s: read failed", fn);
5369 fclose(stream);
23882115 5370 goto exit;
c1c9c9c4
BP
5371 }
5372 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
5373 fclose(stream);
5374
1bab4901 5375 if (!a || !b || !c) {
c1c9c9c4 5376 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 5377 goto exit;
c1c9c9c4
BP
5378 }
5379
5380 ticks_per_s = (double) a * c / b;
5381 if (c == 1000000) {
5382 buffer_hz = d;
5383 } else {
5384 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5385 fn, a, b, c, d);
5386 }
5387 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
5388
5389exit:
5390 ovsthread_once_done(&once);
c1c9c9c4
BP
5391}
5392
5393/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5394 * rate of 'rate' bytes per second. */
5395static unsigned int
5396tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5397{
23882115 5398 read_psched();
c1c9c9c4
BP
5399 return (rate * ticks) / ticks_per_s;
5400}
5401
5402/* Returns the number of ticks that it would take to transmit 'size' bytes at a
5403 * rate of 'rate' bytes per second. */
5404static unsigned int
5405tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5406{
23882115 5407 read_psched();
015c93a4 5408 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
5409}
5410
5411/* Returns the number of bytes that need to be reserved for qdisc buffering at
5412 * a transmission rate of 'rate' bytes per second. */
5413static unsigned int
5414tc_buffer_per_jiffy(unsigned int rate)
5415{
23882115 5416 read_psched();
c1c9c9c4
BP
5417 return rate / buffer_hz;
5418}
5419
2f564bb1
S
5420static uint32_t
5421tc_time_to_ticks(uint32_t time) {
5422 read_psched();
5423 return time * (ticks_per_s / 1000000);
5424}
5425
c1c9c9c4
BP
5426/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5427 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5428 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5429 * stores NULL into it if it is absent.
5430 *
5431 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5432 * 'msg'.
5433 *
5434 * Returns 0 if successful, otherwise a positive errno value. */
5435static int
5436tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5437 struct nlattr **options)
5438{
5439 static const struct nl_policy tca_policy[] = {
5440 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5441 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5442 };
5443 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5444
5445 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5446 tca_policy, ta, ARRAY_SIZE(ta))) {
5447 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5448 goto error;
5449 }
5450
5451 if (kind) {
5452 *kind = nl_attr_get_string(ta[TCA_KIND]);
5453 }
5454
5455 if (options) {
5456 *options = ta[TCA_OPTIONS];
5457 }
5458
5459 return 0;
5460
5461error:
5462 if (kind) {
5463 *kind = NULL;
5464 }
5465 if (options) {
5466 *options = NULL;
5467 }
5468 return EPROTO;
5469}
5470
5471/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5472 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5473 * into '*options', and its queue statistics into '*stats'. Any of the output
5474 * arguments may be null.
5475 *
5476 * Returns 0 if successful, otherwise a positive errno value. */
5477static int
5478tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5479 struct nlattr **options, struct netdev_queue_stats *stats)
5480{
5481 static const struct nl_policy tca_policy[] = {
5482 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5483 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5484 };
5485 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5486
5487 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5488 tca_policy, ta, ARRAY_SIZE(ta))) {
5489 VLOG_WARN_RL(&rl, "failed to parse class message");
5490 goto error;
5491 }
5492
5493 if (handlep) {
5494 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5495 *handlep = tc->tcm_handle;
5496 }
5497
5498 if (options) {
5499 *options = ta[TCA_OPTIONS];
5500 }
5501
5502 if (stats) {
5503 const struct gnet_stats_queue *gsq;
5504 struct gnet_stats_basic gsb;
5505
5506 static const struct nl_policy stats_policy[] = {
5507 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5508 .min_len = sizeof gsb },
5509 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5510 .min_len = sizeof *gsq },
5511 };
5512 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5513
5514 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5515 sa, ARRAY_SIZE(sa))) {
5516 VLOG_WARN_RL(&rl, "failed to parse class stats");
5517 goto error;
5518 }
5519
5520 /* Alignment issues screw up the length of struct gnet_stats_basic on
5521 * some arch/bitsize combinations. Newer versions of Linux have a
5522 * struct gnet_stats_basic_packed, but we can't depend on that. The
5523 * easiest thing to do is just to make a copy. */
5524 memset(&gsb, 0, sizeof gsb);
5525 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5526 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5527 stats->tx_bytes = gsb.bytes;
5528 stats->tx_packets = gsb.packets;
5529
5530 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5531 stats->tx_errors = gsq->drops;
5532 }
5533
5534 return 0;
5535
5536error:
5537 if (options) {
5538 *options = NULL;
5539 }
5540 if (stats) {
5541 memset(stats, 0, sizeof *stats);
5542 }
5543 return EPROTO;
5544}
5545
5546/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5547 * on 'netdev'. */
5548static int
5549tc_query_class(const struct netdev *netdev,
5550 unsigned int handle, unsigned int parent,
5551 struct ofpbuf **replyp)
5552{
5553 struct ofpbuf request;
5554 struct tcmsg *tcmsg;
5555 int error;
5556
7874bdff
RD
5557 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5558 &request);
23a98ffe
BP
5559 if (!tcmsg) {
5560 return ENODEV;
5561 }
c1c9c9c4
BP
5562 tcmsg->tcm_handle = handle;
5563 tcmsg->tcm_parent = parent;
5564
5565 error = tc_transact(&request, replyp);
5566 if (error) {
5567 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5568 netdev_get_name(netdev),
5569 tc_get_major(handle), tc_get_minor(handle),
5570 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5571 ovs_strerror(error));
c1c9c9c4
BP
5572 }
5573 return error;
5574}
5575
5576/* Equivalent to "tc class del dev <name> handle <handle>". */
5577static int
5578tc_delete_class(const struct netdev *netdev, unsigned int handle)
5579{
5580 struct ofpbuf request;
5581 struct tcmsg *tcmsg;
5582 int error;
5583
7874bdff 5584 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5585 if (!tcmsg) {
5586 return ENODEV;
5587 }
c1c9c9c4
BP
5588 tcmsg->tcm_handle = handle;
5589 tcmsg->tcm_parent = 0;
5590
5591 error = tc_transact(&request, NULL);
5592 if (error) {
5593 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5594 netdev_get_name(netdev),
5595 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5596 ovs_strerror(error));
c1c9c9c4
BP
5597 }
5598 return error;
5599}
5600
5601/* Equivalent to "tc qdisc del dev <name> root". */
5602static int
b5d57fc8 5603tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5604{
b5d57fc8 5605 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5606 struct ofpbuf request;
5607 struct tcmsg *tcmsg;
5608 int error;
5609
7874bdff 5610 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5611 if (!tcmsg) {
5612 return ENODEV;
5613 }
c1c9c9c4
BP
5614 tcmsg->tcm_handle = tc_make_handle(1, 0);
5615 tcmsg->tcm_parent = TC_H_ROOT;
5616
5617 error = tc_transact(&request, NULL);
5618 if (error == EINVAL) {
5619 /* EINVAL probably means that the default qdisc was in use, in which
5620 * case we've accomplished our purpose. */
5621 error = 0;
5622 }
b5d57fc8
BP
5623 if (!error && netdev->tc) {
5624 if (netdev->tc->ops->tc_destroy) {
5625 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5626 }
b5d57fc8 5627 netdev->tc = NULL;
c1c9c9c4
BP
5628 }
5629 return error;
5630}
5631
ac3e3aaa
BP
5632static bool
5633getqdisc_is_safe(void)
5634{
5635 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5636 static bool safe = false;
5637
5638 if (ovsthread_once_start(&once)) {
5639 struct utsname utsname;
5640 int major, minor;
5641
5642 if (uname(&utsname) == -1) {
5643 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5644 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5645 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5646 } else if (major < 2 || (major == 2 && minor < 35)) {
5647 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5648 utsname.release);
5649 } else {
5650 safe = true;
5651 }
5652 ovsthread_once_done(&once);
5653 }
5654 return safe;
5655}
5656
c1c9c9c4
BP
5657/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5658 * kernel to determine what they are. Returns 0 if successful, otherwise a
5659 * positive errno value. */
5660static int
b5d57fc8 5661tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5662{
b5d57fc8 5663 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5664 struct ofpbuf request, *qdisc;
5665 const struct tc_ops *ops;
5666 struct tcmsg *tcmsg;
5667 int load_error;
5668 int error;
5669
b5d57fc8 5670 if (netdev->tc) {
c1c9c9c4
BP
5671 return 0;
5672 }
5673
5674 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5675 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5676 * 2.6.35 without that fix backported to it.
5677 *
5678 * To avoid the OOPS, we must not make a request that would attempt to dump
5679 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5680 * few others. There are a few ways that I can see to do this, but most of
5681 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5682 * technique chosen here is to assume that any non-default qdisc that we
5683 * create will have a class with handle 1:0. The built-in qdiscs only have
5684 * a class with handle 0:0.
5685 *
ac3e3aaa
BP
5686 * On Linux 2.6.35+ we use the straightforward method because it allows us
5687 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5688 * in such a case we get no response at all from the kernel (!) if a
5689 * builtin qdisc is in use (which is later caught by "!error &&
5690 * !qdisc->size"). */
7874bdff
RD
5691 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5692 &request);
23a98ffe
BP
5693 if (!tcmsg) {
5694 return ENODEV;
5695 }
ac3e3aaa
BP
5696 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5697 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5698
5699 /* Figure out what tc class to instantiate. */
5700 error = tc_transact(&request, &qdisc);
ac3e3aaa 5701 if (!error && qdisc->size) {
c1c9c9c4
BP
5702 const char *kind;
5703
5704 error = tc_parse_qdisc(qdisc, &kind, NULL);
5705 if (error) {
5706 ops = &tc_ops_other;
5707 } else {
5708 ops = tc_lookup_linux_name(kind);
5709 if (!ops) {
5710 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5711 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5712
5713 ops = &tc_ops_other;
5714 }
5715 }
ac3e3aaa
BP
5716 } else if ((!error && !qdisc->size) || error == ENOENT) {
5717 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5718 * set up by some other entity that doesn't have a handle 1:0. We will
5719 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5720 ops = &tc_ops_default;
5721 error = 0;
5722 } else {
5723 /* Who knows? Maybe the device got deleted. */
5724 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5725 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5726 ops = &tc_ops_other;
5727 }
5728
5729 /* Instantiate it. */
b5d57fc8
BP
5730 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5731 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5732 ofpbuf_delete(qdisc);
5733
5734 return error ? error : load_error;
5735}
5736
5737/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5738 approximate the time to transmit packets of various lengths. For an MTU of
5739 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5740 represents two possible packet lengths; for a MTU of 513 through 1024, four
5741 possible lengths; and so on.
5742
5743 Returns, for the specified 'mtu', the number of bits that packet lengths
5744 need to be shifted right to fit within such a 256-entry table. */
5745static int
5746tc_calc_cell_log(unsigned int mtu)
5747{
5748 int cell_log;
5749
5750 if (!mtu) {
5751 mtu = ETH_PAYLOAD_MAX;
5752 }
5753 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5754
5755 for (cell_log = 0; mtu >= 256; cell_log++) {
5756 mtu >>= 1;
5757 }
5758
5759 return cell_log;
5760}
5761
5762/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5763 * of 'mtu'. */
5764static void
5765tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5766{
5767 memset(rate, 0, sizeof *rate);
5768 rate->cell_log = tc_calc_cell_log(mtu);
5769 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5770 /* rate->cell_align = 0; */ /* distro headers. */
5771 rate->mpu = ETH_TOTAL_MIN;
5772 rate->rate = Bps;
5773}
5774
5775/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5776 * attribute of the specified "type".
5777 *
5778 * See tc_calc_cell_log() above for a description of "rtab"s. */
e7f6ba22 5779void
c1c9c9c4
BP
5780tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5781{
5782 uint32_t *rtab;
5783 unsigned int i;
5784
5785 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5786 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5787 unsigned packet_size = (i + 1) << rate->cell_log;
5788 if (packet_size < rate->mpu) {
5789 packet_size = rate->mpu;
5790 }
5791 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5792 }
5793}
5794
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  Pass 0 for 'burst_bytes' if no burst size was
 * requested.
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t effective_burst = burst_bytes;

    if (!(effective_burst > floor_bytes)) {
        effective_burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, effective_burst);
}
d3980822 5805\f
aaf2fb1a
BP
5806/* Linux-only functions declared in netdev-linux.h */
5807
5808/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5809 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5810int
5811netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5812 const char *flag_name, bool enable)
5813{
5814 const char *netdev_name = netdev_get_name(netdev);
5815 struct ethtool_value evalue;
5816 uint32_t new_flags;
5817 int error;
5818
ab985a77 5819 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5820 memset(&evalue, 0, sizeof evalue);
5821 error = netdev_linux_do_ethtool(netdev_name,
5822 (struct ethtool_cmd *)&evalue,
5823 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5824 if (error) {
5825 return error;
5826 }
5827
ab985a77 5828 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5829 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5830 if (new_flags == evalue.data) {
5831 return 0;
5832 }
5833 evalue.data = new_flags;
aaf2fb1a
BP
5834 error = netdev_linux_do_ethtool(netdev_name,
5835 (struct ethtool_cmd *)&evalue,
5836 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5837 if (error) {
5838 return error;
5839 }
5840
ab985a77 5841 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5842 memset(&evalue, 0, sizeof evalue);
5843 error = netdev_linux_do_ethtool(netdev_name,
5844 (struct ethtool_cmd *)&evalue,
5845 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5846 if (error) {
5847 return error;
5848 }
5849
5850 if (new_flags != evalue.data) {
5851 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5852 "device %s failed", enable ? "enable" : "disable",
5853 flag_name, netdev_name);
5854 return EOPNOTSUPP;
5855 }
5856
5857 return 0;
5858}
5859\f
5860/* Utility functions. */
5861
d3980822 5862/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5863static void
d3980822
BP
5864netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5865 const struct rtnl_link_stats *src)
5866{
f613a0d7
PS
5867 dst->rx_packets = src->rx_packets;
5868 dst->tx_packets = src->tx_packets;
5869 dst->rx_bytes = src->rx_bytes;
5870 dst->tx_bytes = src->tx_bytes;
5871 dst->rx_errors = src->rx_errors;
5872 dst->tx_errors = src->tx_errors;
5873 dst->rx_dropped = src->rx_dropped;
5874 dst->tx_dropped = src->tx_dropped;
5875 dst->multicast = src->multicast;
5876 dst->collisions = src->collisions;
5877 dst->rx_length_errors = src->rx_length_errors;
5878 dst->rx_over_errors = src->rx_over_errors;
5879 dst->rx_crc_errors = src->rx_crc_errors;
5880 dst->rx_frame_errors = src->rx_frame_errors;
5881 dst->rx_fifo_errors = src->rx_fifo_errors;
5882 dst->rx_missed_errors = src->rx_missed_errors;
5883 dst->tx_aborted_errors = src->tx_aborted_errors;
5884 dst->tx_carrier_errors = src->tx_carrier_errors;
5885 dst->tx_fifo_errors = src->tx_fifo_errors;
5886 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5887 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5888}
5889
337c9b99
BP
5890/* Copies 'src' into 'dst', performing format conversion in the process. */
5891static void
5892netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5893 const struct rtnl_link_stats64 *src)
5894{
5895 dst->rx_packets = src->rx_packets;
5896 dst->tx_packets = src->tx_packets;
5897 dst->rx_bytes = src->rx_bytes;
5898 dst->tx_bytes = src->tx_bytes;
5899 dst->rx_errors = src->rx_errors;
5900 dst->tx_errors = src->tx_errors;
5901 dst->rx_dropped = src->rx_dropped;
5902 dst->tx_dropped = src->tx_dropped;
5903 dst->multicast = src->multicast;
5904 dst->collisions = src->collisions;
5905 dst->rx_length_errors = src->rx_length_errors;
5906 dst->rx_over_errors = src->rx_over_errors;
5907 dst->rx_crc_errors = src->rx_crc_errors;
5908 dst->rx_frame_errors = src->rx_frame_errors;
5909 dst->rx_fifo_errors = src->rx_fifo_errors;
5910 dst->rx_missed_errors = src->rx_missed_errors;
5911 dst->tx_aborted_errors = src->tx_aborted_errors;
5912 dst->tx_carrier_errors = src->tx_carrier_errors;
5913 dst->tx_fifo_errors = src->tx_fifo_errors;
5914 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5915 dst->tx_window_errors = src->tx_window_errors;
5916}
5917
c1c9c9c4 5918static int
35eef899 5919get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5920{
c1c9c9c4
BP
5921 struct ofpbuf request;
5922 struct ofpbuf *reply;
c1c9c9c4
BP
5923 int error;
5924
d6e3feb5 5925 /* Filtering all counters by default */
5926 memset(stats, 0xFF, sizeof(struct netdev_stats));
5927
c1c9c9c4 5928 ofpbuf_init(&request, 0);
13a24df8
BP
5929 nl_msg_put_nlmsghdr(&request,
5930 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5931 RTM_GETLINK, NLM_F_REQUEST);
5932 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5933 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5934 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5935 ofpbuf_uninit(&request);
5936 if (error) {
5937 return error;
5938 }
5939
13a24df8 5940 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5941 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5942 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5943 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5944 error = 0;
5945 } else {
71f21279 5946 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
5947 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5948 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5949 error = 0;
5950 } else {
5951 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5952 error = EPROTO;
5953 }
13a24df8
BP
5954 }
5955 } else {
5956 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5957 error = EPROTO;
c1c9c9c4 5958 }
8b61709d 5959
8b61709d 5960
576e26d7 5961 ofpbuf_delete(reply);
35eef899 5962 return error;
8b61709d 5963}
c1c9c9c4 5964
3a183124 5965static int
b5d57fc8 5966get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5967{
5968 struct ifreq ifr;
5969 int error;
5970
755be9ea 5971 *flags = 0;
259e0b1a 5972 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5973 if (!error) {
5974 *flags = ifr.ifr_flags;
5975 }
8b61709d
BP
5976 return error;
5977}
5978
5979static int
4b609110 5980set_flags(const char *name, unsigned int flags)
8b61709d
BP
5981{
5982 struct ifreq ifr;
5983
5984 ifr.ifr_flags = flags;
259e0b1a 5985 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5986}
5987
01b25786
PB
5988int
5989linux_get_ifindex(const char *netdev_name)
8b61709d
BP
5990{
5991 struct ifreq ifr;
259e0b1a 5992 int error;
8b61709d 5993
71d7c22f 5994 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5995 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5996
5997 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5998 if (error) {
580e1152
RD
5999 /* ENODEV probably means that a vif disappeared asynchronously and
6000 * hasn't been removed from the database yet, so reduce the log level
6001 * to INFO for that case. */
6002 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6003 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6004 netdev_name, ovs_strerror(error));
259e0b1a 6005 return -error;
8b61709d
BP
6006 }
6007 return ifr.ifr_ifindex;
6008}
6009
6010static int
6011get_ifindex(const struct netdev *netdev_, int *ifindexp)
6012{
b5d57fc8 6013 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 6014
b5d57fc8 6015 if (!(netdev->cache_valid & VALID_IFINDEX)) {
756819dd
FL
6016 netdev_linux_update_via_netlink(netdev);
6017 }
6018
6019 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6020 /* Fall back to ioctl if netlink fails */
01b25786 6021 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 6022
8b61709d 6023 if (ifindex < 0) {
b5d57fc8
BP
6024 netdev->get_ifindex_error = -ifindex;
6025 netdev->ifindex = 0;
c7b1b0a5 6026 } else {
b5d57fc8
BP
6027 netdev->get_ifindex_error = 0;
6028 netdev->ifindex = ifindex;
8b61709d 6029 }
b5d57fc8 6030 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 6031 }
c7b1b0a5 6032
b5d57fc8
BP
6033 *ifindexp = netdev->ifindex;
6034 return netdev->get_ifindex_error;
8b61709d
BP
6035}
6036
6037static int
756819dd
FL
6038netdev_linux_update_via_netlink(struct netdev_linux *netdev)
6039{
6040 struct ofpbuf request;
6041 struct ofpbuf *reply;
6042 struct rtnetlink_change chg;
6043 struct rtnetlink_change *change = &chg;
6044 int error;
6045
6046 ofpbuf_init(&request, 0);
6047 nl_msg_put_nlmsghdr(&request,
b43762a5
FL
6048 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
6049 NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
756819dd
FL
6050 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6051
6052 /* The correct identifiers for a Linux device are netnsid and ifindex,
6053 * but ifindex changes as the port is moved to another network namespace
6054 * and the interface name statically stored in ovsdb. */
6055 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
6056 if (netdev_linux_netnsid_is_remote(netdev)) {
23fa50f6 6057 nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
756819dd
FL
6058 }
6059 error = nl_transact(NETLINK_ROUTE, &request, &reply);
6060 ofpbuf_uninit(&request);
6061 if (error) {
6062 ofpbuf_delete(reply);
6063 return error;
6064 }
6065
6066 if (rtnetlink_parse(reply, change)
6067 && change->nlmsg_type == RTM_NEWLINK) {
6068 bool changed = false;
6069 error = 0;
6070
6071 /* Update netdev from rtnl msg and increment its seq if needed. */
6072 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
6073 netdev->carrier_resets++;
6074 changed = true;
6075 }
6076 if (change->ifi_flags != netdev->ifi_flags) {
6077 netdev->ifi_flags = change->ifi_flags;
6078 changed = true;
6079 }
6080 if (change->mtu && change->mtu != netdev->mtu) {
6081 netdev->mtu = change->mtu;
6082 netdev->cache_valid |= VALID_MTU;
6083 netdev->netdev_mtu_error = 0;
6084 changed = true;
6085 }
6086 if (!eth_addr_is_zero(change->mac)
6087 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
6088 netdev->etheraddr = change->mac;
6089 netdev->cache_valid |= VALID_ETHERADDR;
6090 netdev->ether_addr_error = 0;
6091 changed = true;
6092 }
6093 if (change->if_index != netdev->ifindex) {
6094 netdev->ifindex = change->if_index;
6095 netdev->cache_valid |= VALID_IFINDEX;
6096 netdev->get_ifindex_error = 0;
6097 changed = true;
6098 }
3d9c99ab
JH
6099 if (change->master && netdev_linux_kind_is_lag(change->master)) {
6100 netdev->is_lag_master = true;
6101 }
756819dd
FL
6102 if (changed) {
6103 netdev_change_seq_changed(&netdev->up);
6104 }
6105 } else {
6106 error = EINVAL;
6107 }
6108
6109 ofpbuf_delete(reply);
6110 return error;
6111}
6112
6113static int
74ff3298 6114get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
6115{
6116 struct ifreq ifr;
6117 int hwaddr_family;
259e0b1a 6118 int error;
8b61709d
BP
6119
6120 memset(&ifr, 0, sizeof ifr);
71d7c22f 6121 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 6122 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
6123 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6124 if (error) {
78857dfb
BP
6125 /* ENODEV probably means that a vif disappeared asynchronously and
6126 * hasn't been removed from the database yet, so reduce the log level
6127 * to INFO for that case. */
259e0b1a 6128 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 6129 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
6130 netdev_name, ovs_strerror(error));
6131 return error;
8b61709d
BP
6132 }
6133 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
6134 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6135 hwaddr_family != ARPHRD_NONE) {
c9697f35 6136 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 6137 netdev_name, hwaddr_family);
c9697f35 6138 return EINVAL;
8b61709d
BP
6139 }
6140 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6141 return 0;
6142}
6143
6144static int
74ff3298 6145set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
6146{
6147 struct ifreq ifr;
259e0b1a 6148 int error;
8b61709d
BP
6149
6150 memset(&ifr, 0, sizeof ifr);
71d7c22f 6151 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 6152 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 6153 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 6154 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
6155 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6156 if (error) {
8b61709d 6157 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 6158 netdev_name, ovs_strerror(error));
8b61709d 6159 }
259e0b1a 6160 return error;
8b61709d
BP
6161}
6162
6163static int
0b0544d7 6164netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
6165 int cmd, const char *cmd_name)
6166{
6167 struct ifreq ifr;
259e0b1a 6168 int error;
8b61709d
BP
6169
6170 memset(&ifr, 0, sizeof ifr);
71d7c22f 6171 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
6172 ifr.ifr_data = (caddr_t) ecmd;
6173
6174 ecmd->cmd = cmd;
259e0b1a
BP
6175 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6176 if (error) {
6177 if (error != EOPNOTSUPP) {
8b61709d 6178 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 6179 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
6180 } else {
6181 /* The device doesn't support this operation. That's pretty
6182 * common, so there's no point in logging anything. */
6183 }
8b61709d 6184 }
259e0b1a 6185 return error;
8b61709d 6186}
f1acd62b 6187
488d734d
BP
6188/* Returns an AF_PACKET raw socket or a negative errno value. */
6189static int
6190af_packet_sock(void)
6191{
23882115
BP
6192 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6193 static int sock;
488d734d 6194
23882115 6195 if (ovsthread_once_start(&once)) {
488d734d
BP
6196 sock = socket(AF_PACKET, SOCK_RAW, 0);
6197 if (sock >= 0) {
8450059e
BP
6198 int error = set_nonblocking(sock);
6199 if (error) {
6200 close(sock);
6201 sock = -error;
6202 }
488d734d
BP
6203 } else {
6204 sock = -errno;
10a89ef0
BP
6205 VLOG_ERR("failed to create packet socket: %s",
6206 ovs_strerror(errno));
488d734d 6207 }
23882115 6208 ovsthread_once_done(&once);
488d734d
BP
6209 }
6210
6211 return sock;
6212}