]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
netlink: provide network namespace id from a msg.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
b2befd5b
BP
23#include <sys/types.h>
24#include <netinet/in.h>
55bc98d6 25#include <arpa/inet.h>
8b61709d 26#include <inttypes.h>
32383c3b 27#include <linux/filter.h>
c1c9c9c4 28#include <linux/gen_stats.h>
bb7d0e22 29#include <linux/if_ether.h>
8b61709d
BP
30#include <linux/if_tun.h>
31#include <linux/types.h>
32#include <linux/ethtool.h>
63331829 33#include <linux/mii.h>
ef3767f5 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/ioctl.h>
37#include <sys/socket.h>
ac3e3aaa 38#include <sys/utsname.h>
55bc98d6 39#include <netpacket/packet.h>
8b61709d
BP
40#include <net/if.h>
41#include <net/if_arp.h>
55bc98d6 42#include <net/if_packet.h>
8b61709d 43#include <net/route.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
e14deea0 50#include "dp-packet.h"
93451a0a 51#include "dpif-netlink.h"
df1e5a3b 52#include "dpif-netdev.h"
3e8a2ad1 53#include "openvswitch/dynamic-string.h"
8b61709d 54#include "fatal-signal.h"
93b13be8 55#include "hash.h"
ee89ea7b 56#include "openvswitch/hmap.h"
8b61709d 57#include "netdev-provider.h"
18ebd48c 58#include "netdev-tc-offloads.h"
7fbef77a 59#include "netdev-vport.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
c060c4cf 62#include "netlink.h"
64c96779 63#include "openvswitch/ofpbuf.h"
8b61709d 64#include "openflow/openflow.h"
19c8e9c1 65#include "ovs-atomic.h"
8b61709d 66#include "packets.h"
fd016ae3 67#include "openvswitch/poll-loop.h"
7e9dcc0f 68#include "rtnetlink.h"
ee89ea7b 69#include "openvswitch/shash.h"
c060c4cf 70#include "socket-util.h"
19993ef3 71#include "sset.h"
c1c5c723 72#include "tc.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
ee89ea7b 76#include "util.h"
5136ce49 77
d98e6007 78VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 79
d76f09ea
BP
80COVERAGE_DEFINE(netdev_set_policing);
81COVERAGE_DEFINE(netdev_arp_lookup);
82COVERAGE_DEFINE(netdev_get_ifindex);
83COVERAGE_DEFINE(netdev_get_hwaddr);
84COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
85COVERAGE_DEFINE(netdev_get_ethtool);
86COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 87
8b61709d
BP
88\f
/* Pause-frame advertisement bits appeared in Linux 2.6.14.  Supply them here
 * when the installed kernel headers are too old to define them. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* The ethtool "flags" commands appeared in Linux 2.6.24.  Supply them here
 * when the installed kernel headers are too old to define them. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif
106
c1c9c9c4
BP
/* TC_RTAB_SIZE appeared in Linux 2.6.25.  Supply it here when the installed
 * kernel headers are too old to define it. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
112
b73c8518
SH
/* History of struct tpacket_auxdata in the kernel headers:
 *
 *   - Linux 2.6.21 introduced the structure.
 *   - Linux 2.6.27 added the tp_vlan_tci member.
 *   - Linux 3.0 defined TP_STATUS_VLAN_VALID.
 *   - Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 *     TP_STATUS_VLAN_TPID_VALID.
 *
 * Rather than probing for each of these, always substitute our own structure
 * that carries every member we use, and fill in any missing constants.
 */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA                  8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID            (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
#endif

/* Redirect every later use of the tag to the replacement defined below. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
142
0c615356
SH
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed.
 *
 * Returns the 32-bit link speed stored in 'ep', combining 'ep->speed' (low 16
 * bits) with 'ep->speed_hi' (high 16 bits). */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Widen before shifting: 'speed_hi' is a 16-bit field that would
     * otherwise be promoted to (signed) int, and shifting a value with bit 15
     * set left by 16 overflows int, which is undefined behavior. */
    return (uint32_t) ep->speed | ((uint32_t) ep->speed_hi << 16);
}
154
67bed84c
SH
/* Backplane link modes (1G base KX; 10G base KX4, KR and R) gained supported
 * and advertised flags in Linux 2.6.30.  All eight are supplied together when
 * the first one is missing. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full      (1 << 17)
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
#define SUPPORTED_10000baseKR_Full     (1 << 19)
#define SUPPORTED_10000baseR_FEC       (1 << 20)
#define ADVERTISED_1000baseKX_Full     (1 << 17)
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
#define ADVERTISED_10000baseKR_Full    (1 << 19)
#define ADVERTISED_10000baseR_FEC      (1 << 20)
#endif

/* 40G link modes (base KR4, CR4, SR4 and LR4) gained supported and advertised
 * flags in Linux 3.5.  All eight are supplied together when the first one is
 * missing. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
#endif
180
fa373af4
BP
/* IFLA_STATS64 and struct rtnl_link_stats64 arrived in Linux 2.6.35.
 *
 * Detecting rtnl_link_stats64 at configure time has proven unreliable, e.g.
 * on 2.6.32-431.29.2.el6.x86_64 (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html);
 * perhaps if_link.h is not self-contained on those kernels.  The simplest
 * remedy is to always substitute our own definition of the structure. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif

/* Redirect every later use of the tag to the replacement defined below. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 220
/* Bits for 'cache_valid' in struct netdev_linux.  A cached member of that
 * structure may be trusted only while its bit is set. */
enum {
    VALID_IFINDEX           = 1 << 0,   /* 'ifindex'. */
    VALID_ETHERADDR         = 1 << 1,   /* 'etheraddr'. */
    VALID_IN                = 1 << 2,   /* IP addresses (in4, in6). */
    VALID_MTU               = 1 << 3,   /* 'mtu'. */
    VALID_POLICING          = 1 << 4,   /* Presumably the policing data. */
    VALID_VPORT_STAT_ERROR  = 1 << 5,   /* Presumably 'vport_stats_error'. */
    VALID_DRVINFO           = 1 << 6,   /* Driver info. */
    VALID_FEATURES          = 1 << 7,   /* Presumably the ETHTOOL_GSET
                                         * feature sets. */
};
c1c9c9c4
BP
231\f
232/* Traffic control. */
233
234/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
235 * network device.
236 *
237 * Each TC implementation subclasses this with whatever additional data it
238 * needs. */
c1c9c9c4
BP
239struct tc {
240 const struct tc_ops *ops;
93b13be8
BP
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
244};
c1c9c9c4 245
559eb230
BP
246#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
93b13be8
BP
248/* One traffic control queue.
249 *
250 * Each TC implementation subclasses this with whatever additional data it
251 * needs. */
252struct tc_queue {
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 255 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
256};
257
258/* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
260 *
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
264struct tc_ops {
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
269
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
272
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
276
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
282 *
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
286 *
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
289 *
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
79f1cbe9 292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
293
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
297 *
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
303 * 'netdev'.
304 *
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
311 * tc_destroy(tc).
312 *
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
316 *
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
319
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 *
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
325 *
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
329 *
330 * This function may be null if 'tc' is not configurable.
331 */
79f1cbe9 332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
333
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
336 *
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
340 *
341 * This function may be null if 'tc' is not configurable.
342 */
79f1cbe9 343 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 344
93b13be8
BP
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
347 *
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 *
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
355 *
356 * This function may be null if 'tc' does not have queues ('n_queues' is
357 * 0). */
93b13be8 358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 359 struct smap *details);
c1c9c9c4
BP
360
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
364 * 'n_queues'.
365 *
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 *
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 373 const struct smap *details);
c1c9c9c4 374
93b13be8
BP
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
377 *
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
93b13be8 380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 381
93b13be8
BP
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
384 *
385 * On success, initializes '*stats'.
386 *
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
93b13be8
BP
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
c1c9c9c4
BP
391 struct netdev_queue_stats *stats);
392
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 *
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
401};
402
403static void
404tc_init(struct tc *tc, const struct tc_ops *ops)
405{
406 tc->ops = ops;
93b13be8 407 hmap_init(&tc->queues);
c1c9c9c4
BP
408}
409
410static void
411tc_destroy(struct tc *tc)
412{
93b13be8 413 hmap_destroy(&tc->queues);
c1c9c9c4
BP
414}
415
416static const struct tc_ops tc_ops_htb;
a339aa81 417static const struct tc_ops tc_ops_hfsc;
677d9158
JV
418static const struct tc_ops tc_ops_codel;
419static const struct tc_ops tc_ops_fqcodel;
420static const struct tc_ops tc_ops_sfq;
c1c9c9c4 421static const struct tc_ops tc_ops_default;
6cf888b8 422static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
423static const struct tc_ops tc_ops_other;
424
559eb230 425static const struct tc_ops *const tcs[] = {
c1c9c9c4 426 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 427 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
428 &tc_ops_codel, /* Controlled delay */
429 &tc_ops_fqcodel, /* Fair queue controlled delay */
430 &tc_ops_sfq, /* Stochastic fair queueing */
6cf888b8 431 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
432 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other, /* Some other qdisc. */
434 NULL
435};
149f577a 436
c1c9c9c4
BP
/* Unit conversion helpers. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Netlink request construction and policing. */
static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *netdev,
                                                  int type,
                                                  unsigned int flags,
                                                  struct ofpbuf *request);
static int tc_add_policer(struct netdev *netdev,
                          uint32_t kbits_rate, uint32_t kbits_burst);

/* Parsing of and queries for qdiscs and classes. */
static int tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *msg, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *stats);
static int tc_query_class(const struct netdev *netdev,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *netdev, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate table helpers. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *msg, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
466\f
b5d57fc8
BP
467struct netdev_linux {
468 struct netdev up;
149f577a 469
86383816
BP
470 /* Protects all members below. */
471 struct ovs_mutex mutex;
472
149f577a 473 unsigned int cache_valid;
8b61709d 474
1670c579
EJ
475 bool miimon; /* Link status of last poll. */
476 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
477 struct timer miimon_timer;
478
8722022c
BP
479 /* The following are figured out "on demand" only. They are only valid
480 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 481 int ifindex;
74ff3298 482 struct eth_addr etheraddr;
8b61709d 483 int mtu;
059e5f4f 484 unsigned int ifi_flags;
65c3058c 485 long long int carrier_resets;
80a86fbe
BP
486 uint32_t kbits_rate; /* Policing data. */
487 uint32_t kbits_burst;
bba1e6f3
PS
488 int vport_stats_error; /* Cached error code from vport_get_stats().
489 0 or an errno value. */
90a6637d 490 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 491 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 492 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 493 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 494 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 495
a00ca915
EJ
496 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
497 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
498 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 499
4f925bd3 500 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 501 struct tc *tc;
149f577a 502
d0d08f8a
BP
503 /* For devices of class netdev_tap_class only. */
504 int tap_fd;
22dcb534
FL
505 bool present; /* If the device is present in the namespace */
506 uint64_t tx_dropped; /* tap device can drop if the iface is down */
8b61709d
BP
507};
508
f7791740
PS
509struct netdev_rxq_linux {
510 struct netdev_rxq up;
796223f5 511 bool is_tap;
5b7448ed 512 int fd;
149f577a 513};
8b61709d 514
8b61709d
BP
515/* This is set pretty low because we probably won't learn anything from the
516 * additional log messages. */
517static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
518
19c8e9c1
JS
519/* Polling miimon status for all ports causes performance degradation when
520 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
521 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
522 *
523 * Readers do not depend on this variable synchronizing with the related
524 * changes in the device miimon status, so we can use atomic_count. */
525static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 526
1c33f0c3 527static void netdev_linux_run(const struct netdev_class *);
6f643e49 528
0b0544d7 529static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 530 int cmd, const char *cmd_name);
b5d57fc8 531static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 532static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
533static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
534 enum netdev_flags on, enum netdev_flags *old_flagsp)
535 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
536static int get_ifindex(const struct netdev *, int *ifindexp);
537static int do_set_addr(struct netdev *netdev,
538 int ioctl_nr, const char *ioctl_name,
539 struct in_addr addr);
74ff3298
JR
540static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
541static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 542static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 543static int af_packet_sock(void);
19c8e9c1 544static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
545static void netdev_linux_miimon_run(void);
546static void netdev_linux_miimon_wait(void);
df1e5a3b 547static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 548
15b3596a
JG
549static bool
550is_netdev_linux_class(const struct netdev_class *netdev_class)
551{
259e0b1a 552 return netdev_class->run == netdev_linux_run;
15b3596a
JG
553}
554
796223f5
BP
555static bool
556is_tap_netdev(const struct netdev *netdev)
557{
b5d57fc8 558 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
559}
560
8b61709d
BP
561static struct netdev_linux *
562netdev_linux_cast(const struct netdev *netdev)
563{
b5d57fc8 564 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 565
180c6d0b 566 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 567}
796223f5 568
f7791740
PS
569static struct netdev_rxq_linux *
570netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 571{
9dc63482 572 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 573 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 574}
ff4ed3c9 575\f
cee87338 576static void netdev_linux_update(struct netdev_linux *netdev,
7e9dcc0f 577 const struct rtnetlink_change *)
86383816 578 OVS_REQUIRES(netdev->mutex);
cee87338 579static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
580 unsigned int ifi_flags, unsigned int mask)
581 OVS_REQUIRES(netdev->mutex);
cee87338 582
d6384a3a
AW
583/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
584 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
585 * if no such socket could be created. */
586static struct nl_sock *
587netdev_linux_notify_sock(void)
588{
589 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
590 static struct nl_sock *sock;
989d7135
PS
591 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
592 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
593
594 if (ovsthread_once_start(&once)) {
595 int error;
596
597 error = nl_sock_create(NETLINK_ROUTE, &sock);
598 if (!error) {
d6384a3a
AW
599 size_t i;
600
601 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
602 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
603 if (error) {
604 nl_sock_destroy(sock);
605 sock = NULL;
606 break;
607 }
cee87338
BP
608 }
609 }
610 ovsthread_once_done(&once);
611 }
612
613 return sock;
614}
615
19c8e9c1
JS
616static bool
617netdev_linux_miimon_enabled(void)
618{
812c272c 619 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
620}
621
8b61709d 622static void
1c33f0c3 623netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 624{
cee87338
BP
625 struct nl_sock *sock;
626 int error;
627
19c8e9c1
JS
628 if (netdev_linux_miimon_enabled()) {
629 netdev_linux_miimon_run();
630 }
cee87338
BP
631
632 sock = netdev_linux_notify_sock();
633 if (!sock) {
634 return;
635 }
636
637 do {
cee87338
BP
638 uint64_t buf_stub[4096 / 8];
639 struct ofpbuf buf;
640
641 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
a86bd14e 642 error = nl_sock_recv(sock, &buf, NULL, false);
cee87338 643 if (!error) {
7e9dcc0f 644 struct rtnetlink_change change;
cee87338 645
7e9dcc0f 646 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
647 struct netdev *netdev_ = NULL;
648 char dev_name[IFNAMSIZ];
649
650 if (!change.ifname) {
651 change.ifname = if_indextoname(change.if_index, dev_name);
652 }
653
654 if (change.ifname) {
655 netdev_ = netdev_from_name(change.ifname);
656 }
cee87338
BP
657 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
658 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
659
660 ovs_mutex_lock(&netdev->mutex);
cee87338 661 netdev_linux_update(netdev, &change);
86383816 662 ovs_mutex_unlock(&netdev->mutex);
cee87338 663 }
38e0065b 664 netdev_close(netdev_);
cee87338
BP
665 }
666 } else if (error == ENOBUFS) {
667 struct shash device_shash;
668 struct shash_node *node;
669
670 nl_sock_drain(sock);
671
672 shash_init(&device_shash);
673 netdev_get_devices(&netdev_linux_class, &device_shash);
674 SHASH_FOR_EACH (node, &device_shash) {
675 struct netdev *netdev_ = node->data;
676 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
677 unsigned int flags;
678
86383816 679 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
680 get_flags(netdev_, &flags);
681 netdev_linux_changed(netdev, flags, 0);
86383816
BP
682 ovs_mutex_unlock(&netdev->mutex);
683
cee87338
BP
684 netdev_close(netdev_);
685 }
686 shash_destroy(&device_shash);
687 } else if (error != EAGAIN) {
7ed58d4a
JP
688 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
689 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
cee87338
BP
690 ovs_strerror(error));
691 }
692 ofpbuf_uninit(&buf);
693 } while (!error);
8b61709d
BP
694}
695
696static void
1c33f0c3 697netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 698{
cee87338
BP
699 struct nl_sock *sock;
700
19c8e9c1
JS
701 if (netdev_linux_miimon_enabled()) {
702 netdev_linux_miimon_wait();
703 }
cee87338
BP
704 sock = netdev_linux_notify_sock();
705 if (sock) {
706 nl_sock_wait(sock, POLLIN);
707 }
8b61709d
BP
708}
709
ac4d3bcb 710static void
b5d57fc8
BP
711netdev_linux_changed(struct netdev_linux *dev,
712 unsigned int ifi_flags, unsigned int mask)
86383816 713 OVS_REQUIRES(dev->mutex)
ac4d3bcb 714{
3e912ffc 715 netdev_change_seq_changed(&dev->up);
8aa77183
BP
716
717 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
718 dev->carrier_resets++;
719 }
720 dev->ifi_flags = ifi_flags;
721
4f925bd3 722 dev->cache_valid &= mask;
6b6e1329 723 if (!(mask & VALID_IN)) {
a8704b50
PS
724 netdev_get_addrs_list_flush();
725 }
4f925bd3
PS
726}
727
728static void
b5d57fc8 729netdev_linux_update(struct netdev_linux *dev,
7e9dcc0f 730 const struct rtnetlink_change *change)
86383816 731 OVS_REQUIRES(dev->mutex)
4f925bd3 732{
d6384a3a
AW
733 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
734 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 735 /* Keep drv-info, and ip addresses. */
d6384a3a 736 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 737 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
738
739 /* Update netdev from rtnl-change msg. */
740 if (change->mtu) {
741 dev->mtu = change->mtu;
742 dev->cache_valid |= VALID_MTU;
743 dev->netdev_mtu_error = 0;
744 }
90a6637d 745
74ff3298
JR
746 if (!eth_addr_is_zero(change->mac)) {
747 dev->etheraddr = change->mac;
d6384a3a
AW
748 dev->cache_valid |= VALID_ETHERADDR;
749 dev->ether_addr_error = 0;
e8e1a409
TZ
750
751 /* The mac addr has been changed, report it now. */
752 rtnetlink_report_link();
d6384a3a 753 }
44445cac 754
d6384a3a
AW
755 dev->ifindex = change->if_index;
756 dev->cache_valid |= VALID_IFINDEX;
757 dev->get_ifindex_error = 0;
22dcb534 758 dev->present = true;
d6384a3a
AW
759 } else {
760 netdev_linux_changed(dev, change->ifi_flags, 0);
22dcb534 761 dev->present = false;
d6384a3a
AW
762 }
763 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
764 /* Invalidates in4, in6. */
6b6e1329 765 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 766 } else {
d6384a3a 767 OVS_NOT_REACHED();
4f925bd3 768 }
ac4d3bcb
EJ
769}
770
9dc63482
BP
771static struct netdev *
772netdev_linux_alloc(void)
773{
774 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
775 return &netdev->up;
776}
777
48c6733c
WT
778static int
779netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 780{
48c6733c
WT
781 /* Prevent any attempt to create (or open) a network device named "default"
782 * or "all". These device names are effectively reserved on Linux because
783 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
784 * itself this wouldn't call for any special treatment, but in practice if
785 * a program tries to create devices with these names, it causes the kernel
786 * to fire a "new device" notification event even though creation failed,
787 * and in turn that causes OVS to wake up and try to create them again,
788 * which ends up as a 100% CPU loop. */
789 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
790 const char *name = netdev_->name;
791 if (!strcmp(name, "default") || !strcmp(name, "all")) {
7ed58d4a
JP
792 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
793 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
48c6733c
WT
794 name);
795 return EINVAL;
796 }
797
834d6caf 798 ovs_mutex_init(&netdev->mutex);
48c6733c 799 return 0;
9dc63482
BP
800}
801
1f6e0fbd
BP
802/* Creates system and internal devices. */
803static int
9dc63482 804netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 805{
9dc63482 806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
807 int error = netdev_linux_common_construct(netdev_);
808 if (error) {
809 return error;
810 }
1f6e0fbd 811
b5d57fc8
BP
812 error = get_flags(&netdev->up, &netdev->ifi_flags);
813 if (error == ENODEV) {
9dc63482 814 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 815 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
816 return ENODEV;
817 } else {
818 /* "Internal" netdevs have to be created as netdev objects before
819 * they exist in the kernel, because creating them in the kernel
820 * happens by passing a netdev object to dpif_port_add().
821 * Therefore, ignore the error. */
822 }
823 }
46415c90 824
a740f0de
JG
825 return 0;
826}
827
5b7448ed
JG
828/* For most types of netdevs we open the device for each call of
829 * netdev_open(). However, this is not the case with tap devices,
830 * since it is only possible to open the device once. In this
831 * situation we share a single file descriptor, and consequently
832 * buffers, across all readers. Therefore once data is read it will
833 * be unavailable to other reads for tap devices. */
a740f0de 834static int
9dc63482 835netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 836{
9dc63482 837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 838 static const char tap_dev[] = "/dev/net/tun";
9dc63482 839 const char *name = netdev_->name;
a740f0de 840 struct ifreq ifr;
a740f0de 841
48c6733c
WT
842 int error = netdev_linux_common_construct(netdev_);
843 if (error) {
844 return error;
845 }
1f6e0fbd 846
6c88d577 847 /* Open tap device. */
d0d08f8a
BP
848 netdev->tap_fd = open(tap_dev, O_RDWR);
849 if (netdev->tap_fd < 0) {
6c88d577 850 error = errno;
10a89ef0 851 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 852 return error;
6c88d577
JP
853 }
854
855 /* Create tap device. */
61b9d078 856 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 857 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 858 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 859 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 860 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 861 ovs_strerror(errno));
6c88d577 862 error = errno;
f61d8d29 863 goto error_close;
6c88d577
JP
864 }
865
866 /* Make non-blocking. */
d0d08f8a 867 error = set_nonblocking(netdev->tap_fd);
a740f0de 868 if (error) {
f61d8d29 869 goto error_close;
a740f0de
JG
870 }
871
0f28164b
FL
872 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
873 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
874 ovs_strerror(errno));
875 error = errno;
876 goto error_close;
877 }
878
a740f0de
JG
879 return 0;
880
f61d8d29 881error_close:
d0d08f8a 882 close(netdev->tap_fd);
a740f0de
JG
883 return error;
884}
885
6c88d577 886static void
9dc63482 887netdev_linux_destruct(struct netdev *netdev_)
6c88d577 888{
b5d57fc8 889 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 890
b5d57fc8
BP
891 if (netdev->tc && netdev->tc->ops->tc_destroy) {
892 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
893 }
894
d0d08f8a
BP
895 if (netdev_get_class(netdev_) == &netdev_tap_class
896 && netdev->tap_fd >= 0)
897 {
0f28164b 898 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
d0d08f8a 899 close(netdev->tap_fd);
6c88d577 900 }
86383816 901
19c8e9c1 902 if (netdev->miimon_interval > 0) {
812c272c 903 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
904 }
905
86383816 906 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
907}
908
9dc63482
BP
/* Frees the memory of the netdev_linux that embeds 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
915
f7791740
PS
916static struct netdev_rxq *
917netdev_linux_rxq_alloc(void)
9dc63482 918{
f7791740 919 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
920 return &rx->up;
921}
922
7b6b0ef4 923static int
f7791740 924netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 925{
f7791740 926 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 927 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 929 int error;
7b6b0ef4 930
86383816 931 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
932 rx->is_tap = is_tap_netdev(netdev_);
933 if (rx->is_tap) {
934 rx->fd = netdev->tap_fd;
796223f5
BP
935 } else {
936 struct sockaddr_ll sll;
b73c8518 937 int ifindex, val;
32383c3b 938 /* Result of tcpdump -dd inbound */
259e0b1a 939 static const struct sock_filter filt[] = {
32383c3b
MM
940 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
941 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
942 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
943 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
944 };
259e0b1a
BP
945 static const struct sock_fprog fprog = {
946 ARRAY_SIZE(filt), (struct sock_filter *) filt
947 };
7b6b0ef4 948
796223f5 949 /* Create file descriptor. */
9dc63482
BP
950 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
951 if (rx->fd < 0) {
796223f5 952 error = errno;
10a89ef0 953 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
954 goto error;
955 }
33d82a56 956
b73c8518
SH
957 val = 1;
958 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
959 error = errno;
960 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
961 netdev_get_name(netdev_), ovs_strerror(error));
962 goto error;
963 }
964
796223f5 965 /* Set non-blocking mode. */
9dc63482 966 error = set_nonblocking(rx->fd);
796223f5
BP
967 if (error) {
968 goto error;
969 }
7b6b0ef4 970
796223f5 971 /* Get ethernet device index. */
180c6d0b 972 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
973 if (error) {
974 goto error;
975 }
7b6b0ef4 976
796223f5
BP
977 /* Bind to specific ethernet device. */
978 memset(&sll, 0, sizeof sll);
979 sll.sll_family = AF_PACKET;
980 sll.sll_ifindex = ifindex;
b73c8518 981 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 982 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
983 error = errno;
984 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 985 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
986 goto error;
987 }
32383c3b
MM
988
989 /* Filter for only inbound packets. */
9dc63482 990 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
991 sizeof fprog);
992 if (error) {
993 error = errno;
259e0b1a 994 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 995 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
996 goto error;
997 }
7b6b0ef4 998 }
86383816 999 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 1000
7b6b0ef4
BP
1001 return 0;
1002
1003error:
9dc63482
BP
1004 if (rx->fd >= 0) {
1005 close(rx->fd);
7b6b0ef4 1006 }
86383816 1007 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1008 return error;
1009}
1010
796223f5 1011static void
f7791740 1012netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1013{
f7791740 1014 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1015
796223f5
BP
1016 if (!rx->is_tap) {
1017 close(rx->fd);
8b61709d 1018 }
9dc63482
BP
1019}
1020
/* Frees the memory of the netdev_rxq_linux that embeds 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1028
b73c8518 1029static ovs_be16
1ebdc7eb 1030auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1031{
1032 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1033 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1034 } else if (double_tagged) {
1035 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1036 } else {
1ebdc7eb 1037 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1038 }
1039}
1040
1041static bool
1042auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1043{
1044 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1045}
1046
796223f5 1047static int
cf62fa4c 1048netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1049{
b73c8518 1050 size_t size;
796223f5 1051 ssize_t retval;
b73c8518
SH
1052 struct iovec iov;
1053 struct cmsghdr *cmsg;
1054 union {
1055 struct cmsghdr cmsg;
1056 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1057 } cmsg_buffer;
1058 struct msghdr msgh;
1059
1060 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1061 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1062 size = dp_packet_tailroom(buffer);
b73c8518 1063
cf62fa4c 1064 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1065 iov.iov_len = size;
1066 msgh.msg_name = NULL;
1067 msgh.msg_namelen = 0;
1068 msgh.msg_iov = &iov;
1069 msgh.msg_iovlen = 1;
1070 msgh.msg_control = &cmsg_buffer;
1071 msgh.msg_controllen = sizeof cmsg_buffer;
1072 msgh.msg_flags = 0;
8e8cddf7 1073
796223f5 1074 do {
b73c8518 1075 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1076 } while (retval < 0 && errno == EINTR);
1077
bfd3367b 1078 if (retval < 0) {
b73c8518
SH
1079 return errno;
1080 } else if (retval > size) {
1081 return EMSGSIZE;
1082 }
1083
cf62fa4c 1084 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1085
1086 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1087 const struct tpacket_auxdata *aux;
1088
1089 if (cmsg->cmsg_level != SOL_PACKET
1090 || cmsg->cmsg_type != PACKET_AUXDATA
1091 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1092 continue;
8b61709d 1093 }
b73c8518
SH
1094
1095 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1096 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1097 struct eth_header *eth;
1098 bool double_tagged;
1099
b73c8518
SH
1100 if (retval < ETH_HEADER_LEN) {
1101 return EINVAL;
1102 }
1103
1ebdc7eb
EG
1104 eth = dp_packet_data(buffer);
1105 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1106
1107 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1108 htons(aux->tp_vlan_tci));
1109 break;
1110 }
1111 }
1112
1113 return 0;
1114}
1115
/* Reads one packet from the tap file descriptor 'fd' into the tailroom of
 * 'buffer', retrying if interrupted by a signal.  Returns 0 on success,
 * otherwise the positive errno value from read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n_read < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1133
1134static int
64839cf4 1135netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
b73c8518 1136{
f7791740 1137 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1138 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1139 struct dp_packet *buffer;
df1e5a3b
PS
1140 ssize_t retval;
1141 int mtu;
1142
1143 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1144 mtu = ETH_PAYLOAD_MAX;
1145 }
1146
2482b0b0 1147 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1148 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1149 DP_NETDEV_HEADROOM);
b73c8518 1150 retval = (rx->is_tap
f7791740
PS
1151 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1152 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1153
1154 if (retval) {
1155 if (retval != EAGAIN && retval != EMSGSIZE) {
1156 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1157 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1158 }
cf62fa4c 1159 dp_packet_delete(buffer);
df1e5a3b 1160 } else {
72c84bc2 1161 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1162 }
1163
1164 return retval;
8b61709d
BP
1165}
1166
8b61709d 1167static void
f7791740 1168netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1169{
f7791740 1170 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1171 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1172}
1173
8b61709d 1174static int
f7791740 1175netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1176{
f7791740 1177 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1178 if (rx->is_tap) {
8b61709d 1179 struct ifreq ifr;
f7791740 1180 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1181 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1182 if (error) {
1183 return error;
1184 }
796223f5 1185 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1186 return 0;
1187 } else {
796223f5 1188 return drain_rcvbuf(rx->fd);
8b61709d
BP
1189 }
1190}
1191
d19cf8bb
ZG
1192static int
1193netdev_linux_sock_batch_send(int sock, int ifindex,
1194 struct dp_packet_batch *batch)
1195{
e0a00cee 1196 const size_t size = dp_packet_batch_size(batch);
d19cf8bb
ZG
1197 /* We don't bother setting most fields in sockaddr_ll because the
1198 * kernel ignores them for SOCK_RAW. */
1199 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1200 .sll_ifindex = ifindex };
1201
e0a00cee
BB
1202 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1203 struct iovec *iov = xmalloc(sizeof(*iov) * size);
d19cf8bb 1204
e0a00cee 1205 struct dp_packet *packet;
e883448e 1206 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
d19cf8bb 1207 iov[i].iov_base = dp_packet_data(packet);
ad8b0b4f 1208 iov[i].iov_len = dp_packet_size(packet);
d19cf8bb
ZG
1209 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1210 .msg_namelen = sizeof sll,
1211 .msg_iov = &iov[i],
1212 .msg_iovlen = 1 };
1213 }
1214
1215 int error = 0;
e0a00cee 1216 for (uint32_t ofs = 0; ofs < size; ) {
d19cf8bb
ZG
1217 ssize_t retval;
1218 do {
e0a00cee 1219 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
d19cf8bb
ZG
1220 error = retval < 0 ? errno : 0;
1221 } while (error == EINTR);
1222 if (error) {
1223 break;
1224 }
1225 ofs += retval;
1226 }
1227
1228 free(mmsg);
1229 free(iov);
1230 return error;
1231}
1232
1233/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1234 * essential, because packets sent to a tap device with an AF_PACKET socket
1235 * will loop back to be *received* again on the tap device. This doesn't occur
1236 * on other interface types because we attach a socket filter to the rx
1237 * socket. */
1238static int
1239netdev_linux_tap_batch_send(struct netdev *netdev_,
1240 struct dp_packet_batch *batch)
1241{
1242 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
13708b21 1243 struct dp_packet *packet;
22dcb534
FL
1244
1245 /* The Linux tap driver returns EIO if the device is not up,
1246 * so if the device is not up, don't waste time sending it.
1247 * However, if the device is in another network namespace
1248 * then OVS can't retrieve the state. In that case, send the
1249 * packets anyway. */
1250 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1251 netdev->tx_dropped += dp_packet_batch_size(batch);
1252 return 0;
1253 }
1254
e883448e 1255 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
ad8b0b4f 1256 size_t size = dp_packet_size(packet);
d19cf8bb
ZG
1257 ssize_t retval;
1258 int error;
1259
1260 do {
1261 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1262 error = retval < 0 ? errno : 0;
1263 } while (error == EINTR);
1264
1265 if (error) {
1266 /* The Linux tap driver returns EIO if the device is not up. From
1267 * the OVS side this is not an error, so we ignore it; otherwise,
1268 * return the erro. */
1269 if (error != EIO) {
1270 return error;
1271 }
1272 } else if (retval != size) {
1273 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1274 "bytes of %"PRIuSIZE") on %s",
1275 retval, size, netdev_get_name(netdev_));
1276 return EMSGSIZE;
1277 }
1278 }
1279 return 0;
1280}
1281
1282/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
8b61709d
BP
1283 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1284 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1285 * the packet is too big or too small to transmit on the device.
1286 *
8b61709d
BP
1287 * The kernel maintains a packet transmission queue, so the caller is not
1288 * expected to do additional queuing of packets. */
1289static int
f00fa8cb 1290netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
b30896c9 1291 struct dp_packet_batch *batch,
324c8374 1292 bool concurrent_txq OVS_UNUSED)
8b61709d 1293{
f4fd623c 1294 int error = 0;
0a62ae2c
ZG
1295 int sock = 0;
1296
0a62ae2c
ZG
1297 if (!is_tap_netdev(netdev_)) {
1298 sock = af_packet_sock();
1299 if (sock < 0) {
1300 error = -sock;
1301 goto free_batch;
1302 }
1303
1304 int ifindex = netdev_get_ifindex(netdev_);
1305 if (ifindex < 0) {
1306 error = -ifindex;
1307 goto free_batch;
1308 }
1309
d19cf8bb
ZG
1310 error = netdev_linux_sock_batch_send(sock, ifindex, batch);
1311 } else {
1312 error = netdev_linux_tap_batch_send(netdev_, batch);
0a62ae2c 1313 }
d19cf8bb
ZG
1314 if (error) {
1315 if (error == ENOBUFS) {
1316 /* The Linux AF_PACKET implementation never blocks waiting
1317 * for room for packets, instead returning ENOBUFS.
1318 * Translate this into EAGAIN for the caller. */
1319 error = EAGAIN;
f23347ea 1320 } else {
f4fd623c
DDP
1321 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1322 netdev_get_name(netdev_), ovs_strerror(error));
d19cf8bb 1323 }
f4fd623c
DDP
1324 }
1325
0a62ae2c 1326free_batch:
b30896c9 1327 dp_packet_delete_batch(batch, true);
f4fd623c 1328 return error;
8b61709d
BP
1329}
1330
1331/* Registers with the poll loop to wake up from the next call to poll_block()
1332 * when the packet transmission queue has sufficient room to transmit a packet
1333 * with netdev_send().
1334 *
1335 * The kernel maintains a packet transmission queue, so the client is not
1336 * expected to do additional queuing of packets. Thus, this function is
1337 * unlikely to ever be used. It is included for completeness. */
1338static void
f00fa8cb 1339netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1340{
796223f5 1341 if (is_tap_netdev(netdev)) {
8b61709d
BP
1342 /* TAP device always accepts packets.*/
1343 poll_immediate_wake();
1344 }
1345}
1346
1347/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1348 * otherwise a positive errno value. */
1349static int
74ff3298 1350netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1351{
b5d57fc8 1352 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1353 enum netdev_flags old_flags = 0;
eb395f2e
BP
1354 int error;
1355
86383816
BP
1356 ovs_mutex_lock(&netdev->mutex);
1357
b5d57fc8 1358 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1359 error = netdev->ether_addr_error;
1360 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1361 goto exit;
44445cac 1362 }
b5d57fc8 1363 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1364 }
1365
7eb1bd81 1366 /* Tap devices must be brought down before setting the address. */
796223f5 1367 if (is_tap_netdev(netdev_)) {
4f9f3f21 1368 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1369 }
44445cac
PS
1370 error = set_etheraddr(netdev_get_name(netdev_), mac);
1371 if (!error || error == ENODEV) {
b5d57fc8
BP
1372 netdev->ether_addr_error = error;
1373 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1374 if (!error) {
74ff3298 1375 netdev->etheraddr = mac;
eb395f2e 1376 }
8b61709d 1377 }
44445cac 1378
4f9f3f21
BP
1379 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1380 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1381 }
7eb1bd81 1382
86383816
BP
1383exit:
1384 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1385 return error;
1386}
1387
44445cac 1388/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1389static int
74ff3298 1390netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1391{
b5d57fc8 1392 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1393 int error;
44445cac 1394
86383816 1395 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1396 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816 1397 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1398 &netdev->etheraddr);
b5d57fc8 1399 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1400 }
44445cac 1401
86383816
BP
1402 error = netdev->ether_addr_error;
1403 if (!error) {
74ff3298 1404 *mac = netdev->etheraddr;
44445cac 1405 }
86383816 1406 ovs_mutex_unlock(&netdev->mutex);
44445cac 1407
86383816 1408 return error;
8b61709d
BP
1409}
1410
8b61709d 1411static int
73371c09 1412netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1413{
86383816
BP
1414 int error;
1415
b5d57fc8 1416 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1417 struct ifreq ifr;
90a6637d 1418
86383816 1419 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1420 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1421 netdev->mtu = ifr.ifr_mtu;
1422 netdev->cache_valid |= VALID_MTU;
8b61709d 1423 }
90a6637d 1424
86383816
BP
1425 error = netdev->netdev_mtu_error;
1426 if (!error) {
b5d57fc8 1427 *mtup = netdev->mtu;
90a6637d 1428 }
73371c09
BP
1429
1430 return error;
1431}
1432
1433/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1434 * in bytes, not including the hardware header; thus, this is typically 1500
1435 * bytes for Ethernet devices. */
1436static int
1437netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1438{
1439 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1440 int error;
1441
1442 ovs_mutex_lock(&netdev->mutex);
1443 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1444 ovs_mutex_unlock(&netdev->mutex);
1445
1446 return error;
8b61709d
BP
1447}
1448
9b020780
PS
1449/* Sets the maximum size of transmitted (MTU) for given device using linux
1450 * networking ioctl interface.
1451 */
1452static int
4124cb12 1453netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1454{
b5d57fc8 1455 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1456 struct ifreq ifr;
1457 int error;
1458
86383816 1459 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1460 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1461 error = netdev->netdev_mtu_error;
1462 if (error || netdev->mtu == mtu) {
1463 goto exit;
90a6637d 1464 }
b5d57fc8 1465 netdev->cache_valid &= ~VALID_MTU;
153e5481 1466 }
9b020780 1467 ifr.ifr_mtu = mtu;
259e0b1a
BP
1468 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1469 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1470 if (!error || error == ENODEV) {
b5d57fc8
BP
1471 netdev->netdev_mtu_error = error;
1472 netdev->mtu = ifr.ifr_mtu;
1473 netdev->cache_valid |= VALID_MTU;
9b020780 1474 }
86383816
BP
1475exit:
1476 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1477 return error;
9b020780
PS
1478}
1479
9ab3d9a3
BP
1480/* Returns the ifindex of 'netdev', if successful, as a positive number.
1481 * On failure, returns a negative errno value. */
1482static int
86383816 1483netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1484{
86383816 1485 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1486 int ifindex, error;
1487
86383816
BP
1488 ovs_mutex_lock(&netdev->mutex);
1489 error = get_ifindex(netdev_, &ifindex);
1490 ovs_mutex_unlock(&netdev->mutex);
1491
9ab3d9a3
BP
1492 return error ? -error : ifindex;
1493}
1494
8b61709d
BP
1495static int
1496netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1497{
b5d57fc8 1498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1499
86383816 1500 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1501 if (netdev->miimon_interval > 0) {
1502 *carrier = netdev->miimon;
3a183124 1503 } else {
b5d57fc8 1504 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1505 }
86383816 1506 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1507
3a183124 1508 return 0;
8b61709d
BP
1509}
1510
65c3058c 1511static long long int
86383816 1512netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1513{
86383816
BP
1514 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1515 long long int carrier_resets;
1516
1517 ovs_mutex_lock(&netdev->mutex);
1518 carrier_resets = netdev->carrier_resets;
1519 ovs_mutex_unlock(&netdev->mutex);
1520
1521 return carrier_resets;
65c3058c
EJ
1522}
1523
63331829 1524static int
1670c579
EJ
1525netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1526 struct mii_ioctl_data *data)
63331829 1527{
63331829 1528 struct ifreq ifr;
782e6111 1529 int error;
63331829 1530
63331829 1531 memset(&ifr, 0, sizeof ifr);
782e6111 1532 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1533 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1534 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1535
782e6111
EJ
1536 return error;
1537}
1538
1539static int
1670c579 1540netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1541{
782e6111
EJ
1542 struct mii_ioctl_data data;
1543 int error;
63331829 1544
782e6111
EJ
1545 *miimon = false;
1546
1547 memset(&data, 0, sizeof data);
1670c579 1548 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1549 if (!error) {
1550 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1551 data.reg_num = MII_BMSR;
1670c579 1552 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1553 &data);
63331829
EJ
1554
1555 if (!error) {
782e6111 1556 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1557 }
9120cfc0
DH
1558 }
1559 if (error) {
63331829 1560 struct ethtool_cmd ecmd;
63331829
EJ
1561
1562 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1563 name);
1564
ab985a77 1565 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1566 memset(&ecmd, 0, sizeof ecmd);
1567 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1568 "ETHTOOL_GLINK");
1569 if (!error) {
782e6111
EJ
1570 struct ethtool_value eval;
1571
1572 memcpy(&eval, &ecmd, sizeof eval);
1573 *miimon = !!eval.data;
63331829
EJ
1574 } else {
1575 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1576 }
1577 }
1578
1579 return error;
1580}
1581
1670c579
EJ
1582static int
1583netdev_linux_set_miimon_interval(struct netdev *netdev_,
1584 long long int interval)
1585{
b5d57fc8 1586 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1587
86383816 1588 ovs_mutex_lock(&netdev->mutex);
1670c579 1589 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1590 if (netdev->miimon_interval != interval) {
19c8e9c1 1591 if (interval && !netdev->miimon_interval) {
812c272c 1592 atomic_count_inc(&miimon_cnt);
19c8e9c1 1593 } else if (!interval && netdev->miimon_interval) {
812c272c 1594 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1595 }
1596
b5d57fc8
BP
1597 netdev->miimon_interval = interval;
1598 timer_set_expired(&netdev->miimon_timer);
1670c579 1599 }
86383816 1600 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1601
1602 return 0;
1603}
1604
1605static void
1606netdev_linux_miimon_run(void)
1607{
1608 struct shash device_shash;
1609 struct shash_node *node;
1610
1611 shash_init(&device_shash);
b5d57fc8 1612 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1613 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1614 struct netdev *netdev = node->data;
1615 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1616 bool miimon;
1617
86383816
BP
1618 ovs_mutex_lock(&dev->mutex);
1619 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1620 netdev_linux_get_miimon(dev->up.name, &miimon);
1621 if (miimon != dev->miimon) {
1622 dev->miimon = miimon;
1623 netdev_linux_changed(dev, dev->ifi_flags, 0);
1624 }
1670c579 1625
86383816 1626 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1627 }
86383816 1628 ovs_mutex_unlock(&dev->mutex);
2f980d74 1629 netdev_close(netdev);
1670c579
EJ
1630 }
1631
1632 shash_destroy(&device_shash);
1633}
1634
1635static void
1636netdev_linux_miimon_wait(void)
1637{
1638 struct shash device_shash;
1639 struct shash_node *node;
1640
1641 shash_init(&device_shash);
b5d57fc8 1642 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1643 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1644 struct netdev *netdev = node->data;
1645 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1646
86383816 1647 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1648 if (dev->miimon_interval > 0) {
1649 timer_wait(&dev->miimon_timer);
1650 }
86383816 1651 ovs_mutex_unlock(&dev->mutex);
2f980d74 1652 netdev_close(netdev);
1670c579
EJ
1653 }
1654 shash_destroy(&device_shash);
1655}
1656
92df599c
JG
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1664
c060c4cf
EJ
1665/* Copies 'src' into 'dst', performing format conversion in the process.
1666 *
1667 * 'src' is allowed to be misaligned. */
1668static void
1669netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1670 const struct ovs_vport_stats *src)
1671{
6a54dedc
BP
1672 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1673 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1674 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1675 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1676 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1677 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1678 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1679 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1680 dst->multicast = 0;
1681 dst->collisions = 0;
1682 dst->rx_length_errors = 0;
1683 dst->rx_over_errors = 0;
1684 dst->rx_crc_errors = 0;
1685 dst->rx_frame_errors = 0;
1686 dst->rx_fifo_errors = 0;
1687 dst->rx_missed_errors = 0;
1688 dst->tx_aborted_errors = 0;
1689 dst->tx_carrier_errors = 0;
1690 dst->tx_fifo_errors = 0;
1691 dst->tx_heartbeat_errors = 0;
1692 dst->tx_window_errors = 0;
1693}
1694
1695static int
1696get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1697{
93451a0a 1698 struct dpif_netlink_vport reply;
c060c4cf
EJ
1699 struct ofpbuf *buf;
1700 int error;
1701
93451a0a 1702 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1703 if (error) {
1704 return error;
1705 } else if (!reply.stats) {
1706 ofpbuf_delete(buf);
1707 return EOPNOTSUPP;
1708 }
1709
1710 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1711
1712 ofpbuf_delete(buf);
1713
1714 return 0;
1715}
1716
f613a0d7
PS
1717static void
1718get_stats_via_vport(const struct netdev *netdev_,
1719 struct netdev_stats *stats)
8b61709d 1720{
b5d57fc8 1721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1722
b5d57fc8
BP
1723 if (!netdev->vport_stats_error ||
1724 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1725 int error;
7fbef77a 1726
c060c4cf 1727 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1728 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1729 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1730 "(%s)",
1731 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1732 }
b5d57fc8
BP
1733 netdev->vport_stats_error = error;
1734 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1735 }
f613a0d7 1736}
8b61709d 1737
f613a0d7
PS
1738/* Retrieves current device stats for 'netdev-linux'. */
1739static int
1740netdev_linux_get_stats(const struct netdev *netdev_,
1741 struct netdev_stats *stats)
1742{
b5d57fc8 1743 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1744 struct netdev_stats dev_stats;
1745 int error;
1746
86383816 1747 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1748 get_stats_via_vport(netdev_, stats);
35eef899 1749 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1750 if (error) {
86383816
BP
1751 if (!netdev->vport_stats_error) {
1752 error = 0;
f613a0d7 1753 }
86383816 1754 } else if (netdev->vport_stats_error) {
04c881eb 1755 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1756 *stats = dev_stats;
1757 } else {
04c881eb
AZ
1758 /* Use kernel netdev's packet and byte counts since vport's counters
1759 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1760 * enabled. */
1761 stats->rx_packets = dev_stats.rx_packets;
1762 stats->rx_bytes = dev_stats.rx_bytes;
1763 stats->tx_packets = dev_stats.tx_packets;
1764 stats->tx_bytes = dev_stats.tx_bytes;
1765
f613a0d7
PS
1766 stats->rx_errors += dev_stats.rx_errors;
1767 stats->tx_errors += dev_stats.tx_errors;
1768 stats->rx_dropped += dev_stats.rx_dropped;
1769 stats->tx_dropped += dev_stats.tx_dropped;
1770 stats->multicast += dev_stats.multicast;
1771 stats->collisions += dev_stats.collisions;
1772 stats->rx_length_errors += dev_stats.rx_length_errors;
1773 stats->rx_over_errors += dev_stats.rx_over_errors;
1774 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1775 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1776 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1777 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1778 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1779 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1780 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1781 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1782 stats->tx_window_errors += dev_stats.tx_window_errors;
1783 }
86383816
BP
1784 ovs_mutex_unlock(&netdev->mutex);
1785
1786 return error;
f613a0d7
PS
1787}
1788
1789/* Retrieves current device stats for 'netdev-tap' netdev or
1790 * netdev-internal. */
1791static int
15aee116 1792netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1793{
b5d57fc8 1794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1795 struct netdev_stats dev_stats;
1796 int error;
1797
86383816 1798 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1799 get_stats_via_vport(netdev_, stats);
35eef899 1800 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1801 if (error) {
86383816
BP
1802 if (!netdev->vport_stats_error) {
1803 error = 0;
8b61709d 1804 }
86383816
BP
1805 } else if (netdev->vport_stats_error) {
1806 /* Transmit and receive stats will appear to be swapped relative to the
1807 * other ports since we are the one sending the data, not a remote
1808 * computer. For consistency, we swap them back here. This does not
1809 * apply if we are getting stats from the vport layer because it always
1810 * tracks stats from the perspective of the switch. */
fe6b0e03 1811
f613a0d7 1812 *stats = dev_stats;
92df599c
JG
1813 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1814 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1815 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1816 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1817 stats->rx_length_errors = 0;
1818 stats->rx_over_errors = 0;
1819 stats->rx_crc_errors = 0;
1820 stats->rx_frame_errors = 0;
1821 stats->rx_fifo_errors = 0;
1822 stats->rx_missed_errors = 0;
1823 stats->tx_aborted_errors = 0;
1824 stats->tx_carrier_errors = 0;
1825 stats->tx_fifo_errors = 0;
1826 stats->tx_heartbeat_errors = 0;
1827 stats->tx_window_errors = 0;
f613a0d7 1828 } else {
04c881eb
AZ
1829 /* Use kernel netdev's packet and byte counts since vport counters
1830 * do not reflect packet counts on the wire when GSO, TSO or GRO
1831 * are enabled. */
1832 stats->rx_packets = dev_stats.tx_packets;
1833 stats->rx_bytes = dev_stats.tx_bytes;
1834 stats->tx_packets = dev_stats.rx_packets;
1835 stats->tx_bytes = dev_stats.rx_bytes;
1836
f613a0d7
PS
1837 stats->rx_dropped += dev_stats.tx_dropped;
1838 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1839
f613a0d7
PS
1840 stats->rx_errors += dev_stats.tx_errors;
1841 stats->tx_errors += dev_stats.rx_errors;
1842
1843 stats->multicast += dev_stats.multicast;
1844 stats->collisions += dev_stats.collisions;
1845 }
22dcb534 1846 stats->tx_dropped += netdev->tx_dropped;
86383816
BP
1847 ovs_mutex_unlock(&netdev->mutex);
1848
1849 return error;
8b61709d
BP
1850}
1851
bba1e6f3
PS
1852static int
1853netdev_internal_get_stats(const struct netdev *netdev_,
1854 struct netdev_stats *stats)
1855{
b5d57fc8 1856 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1857 int error;
bba1e6f3 1858
86383816 1859 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1860 get_stats_via_vport(netdev_, stats);
86383816
BP
1861 error = netdev->vport_stats_error;
1862 ovs_mutex_unlock(&netdev->mutex);
1863
1864 return error;
bba1e6f3
PS
1865}
1866
51f87458 1867static void
b5d57fc8 1868netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1869{
1870 struct ethtool_cmd ecmd;
6c038611 1871 uint32_t speed;
8b61709d
BP
1872 int error;
1873
b5d57fc8 1874 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1875 return;
1876 }
1877
ab985a77 1878 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1879 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1880 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1881 ETHTOOL_GSET, "ETHTOOL_GSET");
1882 if (error) {
51f87458 1883 goto out;
8b61709d
BP
1884 }
1885
1886 /* Supported features. */
b5d57fc8 1887 netdev->supported = 0;
8b61709d 1888 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1889 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1890 }
1891 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1892 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1893 }
1894 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1895 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1896 }
1897 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1898 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1899 }
1900 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1901 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 1902 }
67bed84c
SH
1903 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1904 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 1905 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 1906 }
67bed84c
SH
1907 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1908 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1909 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1910 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 1911 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 1912 }
67bed84c
SH
1913 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1914 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1915 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1916 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1917 netdev->supported |= NETDEV_F_40GB_FD;
1918 }
8b61709d 1919 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1920 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1921 }
1922 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1923 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1924 }
1925 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1926 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1927 }
1928 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1929 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1930 }
1931 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1932 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1933 }
1934
1935 /* Advertised features. */
b5d57fc8 1936 netdev->advertised = 0;
8b61709d 1937 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1938 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1939 }
1940 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1941 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1942 }
1943 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1944 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1945 }
1946 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1947 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1948 }
1949 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1950 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 1951 }
67bed84c
SH
1952 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1953 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 1954 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 1955 }
67bed84c
SH
1956 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1957 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1958 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1959 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 1960 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 1961 }
67bed84c
SH
1962 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1963 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1964 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1965 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1966 netdev->advertised |= NETDEV_F_40GB_FD;
1967 }
8b61709d 1968 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1969 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1970 }
1971 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1972 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1973 }
1974 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1975 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1976 }
1977 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1978 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1979 }
1980 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1981 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1982 }
1983
1984 /* Current settings. */
0c615356 1985 speed = ethtool_cmd_speed(&ecmd);
6c038611 1986 if (speed == SPEED_10) {
b5d57fc8 1987 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1988 } else if (speed == SPEED_100) {
b5d57fc8 1989 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1990 } else if (speed == SPEED_1000) {
b5d57fc8 1991 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1992 } else if (speed == SPEED_10000) {
b5d57fc8 1993 netdev->current = NETDEV_F_10GB_FD;
6c038611 1994 } else if (speed == 40000) {
b5d57fc8 1995 netdev->current = NETDEV_F_40GB_FD;
6c038611 1996 } else if (speed == 100000) {
b5d57fc8 1997 netdev->current = NETDEV_F_100GB_FD;
6c038611 1998 } else if (speed == 1000000) {
b5d57fc8 1999 netdev->current = NETDEV_F_1TB_FD;
8b61709d 2000 } else {
b5d57fc8 2001 netdev->current = 0;
8b61709d
BP
2002 }
2003
2004 if (ecmd.port == PORT_TP) {
b5d57fc8 2005 netdev->current |= NETDEV_F_COPPER;
8b61709d 2006 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 2007 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
2008 }
2009
2010 if (ecmd.autoneg) {
b5d57fc8 2011 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
2012 }
2013
51f87458 2014out:
b5d57fc8
BP
2015 netdev->cache_valid |= VALID_FEATURES;
2016 netdev->get_features_error = error;
51f87458
PS
2017}
2018
887ed8b2
BP
2019/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2020 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2021 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2022static int
2023netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2024 enum netdev_features *current,
2025 enum netdev_features *advertised,
2026 enum netdev_features *supported,
2027 enum netdev_features *peer)
51f87458 2028{
b5d57fc8 2029 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2030 int error;
51f87458 2031
86383816 2032 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2033 netdev_linux_read_features(netdev);
b5d57fc8
BP
2034 if (!netdev->get_features_error) {
2035 *current = netdev->current;
2036 *advertised = netdev->advertised;
2037 *supported = netdev->supported;
887ed8b2 2038 *peer = 0; /* XXX */
51f87458 2039 }
86383816
BP
2040 error = netdev->get_features_error;
2041 ovs_mutex_unlock(&netdev->mutex);
2042
2043 return error;
8b61709d
BP
2044}
2045
2046/* Set the features advertised by 'netdev' to 'advertise'. */
2047static int
86383816 2048netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2049 enum netdev_features advertise)
8b61709d 2050{
86383816 2051 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2052 struct ethtool_cmd ecmd;
2053 int error;
2054
86383816
BP
2055 ovs_mutex_lock(&netdev->mutex);
2056
ab985a77 2057 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2058 memset(&ecmd, 0, sizeof ecmd);
86383816 2059 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2060 ETHTOOL_GSET, "ETHTOOL_GSET");
2061 if (error) {
86383816 2062 goto exit;
8b61709d
BP
2063 }
2064
2065 ecmd.advertising = 0;
6c038611 2066 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2067 ecmd.advertising |= ADVERTISED_10baseT_Half;
2068 }
6c038611 2069 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2070 ecmd.advertising |= ADVERTISED_10baseT_Full;
2071 }
6c038611 2072 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2073 ecmd.advertising |= ADVERTISED_100baseT_Half;
2074 }
6c038611 2075 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2076 ecmd.advertising |= ADVERTISED_100baseT_Full;
2077 }
6c038611 2078 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2079 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2080 }
6c038611 2081 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2082 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2083 }
6c038611 2084 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2085 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2086 }
6c038611 2087 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2088 ecmd.advertising |= ADVERTISED_TP;
2089 }
6c038611 2090 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2091 ecmd.advertising |= ADVERTISED_FIBRE;
2092 }
6c038611 2093 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2094 ecmd.advertising |= ADVERTISED_Autoneg;
2095 }
6c038611 2096 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2097 ecmd.advertising |= ADVERTISED_Pause;
2098 }
6c038611 2099 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2100 ecmd.advertising |= ADVERTISED_Asym_Pause;
2101 }
ab985a77 2102 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2103 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2104 ETHTOOL_SSET, "ETHTOOL_SSET");
2105
2106exit:
2107 ovs_mutex_unlock(&netdev->mutex);
2108 return error;
8b61709d
BP
2109}
2110
f8500004
JP
2111/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2112 * successful, otherwise a positive errno value. */
8b61709d 2113static int
b5d57fc8 2114netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2115 uint32_t kbits_rate, uint32_t kbits_burst)
2116{
b5d57fc8
BP
2117 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2118 const char *netdev_name = netdev_get_name(netdev_);
7874bdff 2119 int ifindex;
f8500004 2120 int error;
8b61709d 2121
d5ae4a60
PB
2122 if (netdev_is_flow_api_enabled()) {
2123 if (kbits_rate) {
2124 VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
2125 netdev_name);
2126 }
2127 return EOPNOTSUPP;
2128 }
2129
80a86fbe 2130 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2131 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2132 : kbits_burst); /* Stick with user-specified value. */
2133
86383816 2134 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2135 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2136 error = netdev->netdev_policing_error;
2137 if (error || (netdev->kbits_rate == kbits_rate &&
2138 netdev->kbits_burst == kbits_burst)) {
c9f71668 2139 /* Assume that settings haven't changed since we last set them. */
86383816 2140 goto out;
c9f71668 2141 }
b5d57fc8 2142 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2143 }
2144
7874bdff
RD
2145 error = get_ifindex(netdev_, &ifindex);
2146 if (error) {
2147 goto out;
2148 }
2149
ac8c3412 2150 COVERAGE_INC(netdev_set_policing);
f8500004 2151 /* Remove any existing ingress qdisc. */
7874bdff 2152 error = tc_add_del_ingress_qdisc(ifindex, false);
f8500004
JP
2153 if (error) {
2154 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2155 netdev_name, ovs_strerror(error));
c9f71668 2156 goto out;
f8500004
JP
2157 }
2158
8b61709d 2159 if (kbits_rate) {
7874bdff 2160 error = tc_add_del_ingress_qdisc(ifindex, true);
f8500004
JP
2161 if (error) {
2162 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2163 netdev_name, ovs_strerror(error));
c9f71668 2164 goto out;
8b61709d
BP
2165 }
2166
b5d57fc8 2167 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2168 if (error){
2169 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2170 netdev_name, ovs_strerror(error));
c9f71668 2171 goto out;
8b61709d 2172 }
8b61709d
BP
2173 }
2174
b5d57fc8
BP
2175 netdev->kbits_rate = kbits_rate;
2176 netdev->kbits_burst = kbits_burst;
f8500004 2177
c9f71668
PS
2178out:
2179 if (!error || error == ENODEV) {
b5d57fc8
BP
2180 netdev->netdev_policing_error = error;
2181 netdev->cache_valid |= VALID_POLICING;
c9f71668 2182 }
86383816 2183 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2184 return error;
8b61709d
BP
2185}
2186
c1c9c9c4
BP
2187static int
2188netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2189 struct sset *types)
c1c9c9c4 2190{
559eb230 2191 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2192 for (opsp = tcs; *opsp != NULL; opsp++) {
2193 const struct tc_ops *ops = *opsp;
2194 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2195 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2196 }
2197 }
2198 return 0;
2199}
2200
2201static const struct tc_ops *
2202tc_lookup_ovs_name(const char *name)
2203{
559eb230 2204 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2205
2206 for (opsp = tcs; *opsp != NULL; opsp++) {
2207 const struct tc_ops *ops = *opsp;
2208 if (!strcmp(name, ops->ovs_name)) {
2209 return ops;
2210 }
2211 }
2212 return NULL;
2213}
2214
2215static const struct tc_ops *
2216tc_lookup_linux_name(const char *name)
2217{
559eb230 2218 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2219
2220 for (opsp = tcs; *opsp != NULL; opsp++) {
2221 const struct tc_ops *ops = *opsp;
2222 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2223 return ops;
2224 }
2225 }
2226 return NULL;
2227}
2228
93b13be8 2229static struct tc_queue *
b5d57fc8 2230tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2231 size_t hash)
2232{
b5d57fc8 2233 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2234 struct tc_queue *queue;
2235
b5d57fc8 2236 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2237 if (queue->queue_id == queue_id) {
2238 return queue;
2239 }
2240 }
2241 return NULL;
2242}
2243
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The bucket is selected with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2249
c1c9c9c4
BP
2250static int
2251netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2252 const char *type,
2253 struct netdev_qos_capabilities *caps)
2254{
2255 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2256 if (!ops) {
2257 return EOPNOTSUPP;
2258 }
2259 caps->n_queues = ops->n_queues;
2260 return 0;
2261}
2262
2263static int
b5d57fc8 2264netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2265 const char **typep, struct smap *details)
c1c9c9c4 2266{
b5d57fc8 2267 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2268 int error;
2269
86383816 2270 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2271 error = tc_query_qdisc(netdev_);
86383816
BP
2272 if (!error) {
2273 *typep = netdev->tc->ops->ovs_name;
2274 error = (netdev->tc->ops->qdisc_get
2275 ? netdev->tc->ops->qdisc_get(netdev_, details)
2276 : 0);
c1c9c9c4 2277 }
86383816 2278 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2279
86383816 2280 return error;
c1c9c9c4
BP
2281}
2282
2283static int
b5d57fc8 2284netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2285 const char *type, const struct smap *details)
c1c9c9c4 2286{
b5d57fc8 2287 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2288 const struct tc_ops *new_ops;
2289 int error;
2290
2291 new_ops = tc_lookup_ovs_name(type);
2292 if (!new_ops || !new_ops->tc_install) {
2293 return EOPNOTSUPP;
2294 }
2295
6cf888b8
BS
2296 if (new_ops == &tc_ops_noop) {
2297 return new_ops->tc_install(netdev_, details);
2298 }
2299
86383816 2300 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2301 error = tc_query_qdisc(netdev_);
c1c9c9c4 2302 if (error) {
86383816 2303 goto exit;
c1c9c9c4
BP
2304 }
2305
b5d57fc8 2306 if (new_ops == netdev->tc->ops) {
86383816 2307 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2308 } else {
2309 /* Delete existing qdisc. */
b5d57fc8 2310 error = tc_del_qdisc(netdev_);
c1c9c9c4 2311 if (error) {
86383816 2312 goto exit;
c1c9c9c4 2313 }
b5d57fc8 2314 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2315
2316 /* Install new qdisc. */
b5d57fc8
BP
2317 error = new_ops->tc_install(netdev_, details);
2318 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2319 }
86383816
BP
2320
2321exit:
2322 ovs_mutex_unlock(&netdev->mutex);
2323 return error;
c1c9c9c4
BP
2324}
2325
2326static int
b5d57fc8 2327netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2328 unsigned int queue_id, struct smap *details)
c1c9c9c4 2329{
b5d57fc8 2330 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2331 int error;
2332
86383816 2333 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2334 error = tc_query_qdisc(netdev_);
86383816 2335 if (!error) {
b5d57fc8 2336 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2337 error = (queue
b5d57fc8 2338 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2339 : ENOENT);
c1c9c9c4 2340 }
86383816
BP
2341 ovs_mutex_unlock(&netdev->mutex);
2342
2343 return error;
c1c9c9c4
BP
2344}
2345
2346static int
b5d57fc8 2347netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2348 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2349{
b5d57fc8 2350 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2351 int error;
2352
86383816 2353 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2354 error = tc_query_qdisc(netdev_);
86383816
BP
2355 if (!error) {
2356 error = (queue_id < netdev->tc->ops->n_queues
2357 && netdev->tc->ops->class_set
2358 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2359 : EINVAL);
c1c9c9c4 2360 }
86383816 2361 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2362
86383816 2363 return error;
c1c9c9c4
BP
2364}
2365
2366static int
b5d57fc8 2367netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2368{
b5d57fc8 2369 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2370 int error;
2371
86383816 2372 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2373 error = tc_query_qdisc(netdev_);
86383816
BP
2374 if (!error) {
2375 if (netdev->tc->ops->class_delete) {
2376 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2377 error = (queue
2378 ? netdev->tc->ops->class_delete(netdev_, queue)
2379 : ENOENT);
2380 } else {
2381 error = EINVAL;
2382 }
c1c9c9c4 2383 }
86383816
BP
2384 ovs_mutex_unlock(&netdev->mutex);
2385
2386 return error;
c1c9c9c4
BP
2387}
2388
2389static int
b5d57fc8 2390netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2391 unsigned int queue_id,
2392 struct netdev_queue_stats *stats)
2393{
b5d57fc8 2394 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2395 int error;
2396
86383816 2397 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2398 error = tc_query_qdisc(netdev_);
86383816
BP
2399 if (!error) {
2400 if (netdev->tc->ops->class_get_stats) {
2401 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2402 if (queue) {
2403 stats->created = queue->created;
2404 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2405 stats);
2406 } else {
2407 error = ENOENT;
2408 }
2409 } else {
2410 error = EOPNOTSUPP;
6dc34a0d 2411 }
c1c9c9c4 2412 }
86383816
BP
2413 ovs_mutex_unlock(&netdev->mutex);
2414
2415 return error;
c1c9c9c4
BP
2416}
2417
d57695d7
JS
2418struct queue_dump_state {
2419 struct nl_dump dump;
2420 struct ofpbuf buf;
2421};
2422
23a98ffe 2423static bool
d57695d7 2424start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2425{
2426 struct ofpbuf request;
2427 struct tcmsg *tcmsg;
2428
7874bdff 2429 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2430 if (!tcmsg) {
2431 return false;
2432 }
3c4de644 2433 tcmsg->tcm_parent = 0;
d57695d7 2434 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2435 ofpbuf_uninit(&request);
d57695d7
JS
2436
2437 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2438 return true;
c1c9c9c4
BP
2439}
2440
d57695d7
JS
2441static int
2442finish_queue_dump(struct queue_dump_state *state)
2443{
2444 ofpbuf_uninit(&state->buf);
2445 return nl_dump_done(&state->dump);
2446}
2447
/* Iteration state for netdev_linux_queue_dump_start(), _next() and
 * _done(): a snapshot of queue IDs plus a cursor into it. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of 'n_queues' queue IDs. */
    size_t cur_queue;           /* Index of the next entry to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2453
c1c9c9c4 2454static int
89454bf4 2455netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2456{
89454bf4 2457 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2458 int error;
2459
86383816 2460 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2461 error = tc_query_qdisc(netdev_);
86383816
BP
2462 if (!error) {
2463 if (netdev->tc->ops->class_get) {
89454bf4
BP
2464 struct netdev_linux_queue_state *state;
2465 struct tc_queue *queue;
2466 size_t i;
2467
2468 *statep = state = xmalloc(sizeof *state);
2469 state->n_queues = hmap_count(&netdev->tc->queues);
2470 state->cur_queue = 0;
2471 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2472
2473 i = 0;
2474 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2475 state->queues[i++] = queue->queue_id;
86383816 2476 }
c1c9c9c4 2477 } else {
86383816 2478 error = EOPNOTSUPP;
c1c9c9c4
BP
2479 }
2480 }
86383816 2481 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2482
86383816 2483 return error;
c1c9c9c4
BP
2484}
2485
89454bf4
BP
2486static int
2487netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2488 unsigned int *queue_idp, struct smap *details)
2489{
2490 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2491 struct netdev_linux_queue_state *state = state_;
2492 int error = EOF;
2493
2494 ovs_mutex_lock(&netdev->mutex);
2495 while (state->cur_queue < state->n_queues) {
2496 unsigned int queue_id = state->queues[state->cur_queue++];
2497 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2498
2499 if (queue) {
2500 *queue_idp = queue_id;
2501 error = netdev->tc->ops->class_get(netdev_, queue, details);
2502 break;
2503 }
2504 }
2505 ovs_mutex_unlock(&netdev->mutex);
2506
2507 return error;
2508}
2509
2510static int
2511netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2512 void *state_)
2513{
2514 struct netdev_linux_queue_state *state = state_;
2515
2516 free(state->queues);
2517 free(state);
2518 return 0;
2519}
2520
c1c9c9c4 2521static int
b5d57fc8 2522netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2523 netdev_dump_queue_stats_cb *cb, void *aux)
2524{
b5d57fc8 2525 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2526 int error;
2527
86383816 2528 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2529 error = tc_query_qdisc(netdev_);
86383816 2530 if (!error) {
d57695d7 2531 struct queue_dump_state state;
c1c9c9c4 2532
86383816
BP
2533 if (!netdev->tc->ops->class_dump_stats) {
2534 error = EOPNOTSUPP;
d57695d7 2535 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2536 error = ENODEV;
2537 } else {
2538 struct ofpbuf msg;
2539 int retval;
2540
d57695d7 2541 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2542 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2543 cb, aux);
2544 if (retval) {
2545 error = retval;
2546 }
2547 }
2548
d57695d7 2549 retval = finish_queue_dump(&state);
86383816
BP
2550 if (retval) {
2551 error = retval;
2552 }
c1c9c9c4
BP
2553 }
2554 }
86383816 2555 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2556
86383816 2557 return error;
c1c9c9c4
BP
2558}
2559
8b61709d 2560static int
f1acd62b
BP
2561netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2562 struct in_addr netmask)
8b61709d 2563{
b5d57fc8 2564 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2565 int error;
2566
86383816 2567 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2568 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2569 if (!error) {
f1acd62b 2570 if (address.s_addr != INADDR_ANY) {
8b61709d 2571 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2572 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2573 }
2574 }
49af9a3d 2575
86383816
BP
2576 ovs_mutex_unlock(&netdev->mutex);
2577
8b61709d
BP
2578 return error;
2579}
2580
7df6932e
AW
2581/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2582 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2583 * error. */
8b61709d 2584static int
a8704b50
PS
2585netdev_linux_get_addr_list(const struct netdev *netdev_,
2586 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2587{
b5d57fc8 2588 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2589 int error;
86383816
BP
2590
2591 ovs_mutex_lock(&netdev->mutex);
a8704b50 2592 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816
BP
2593 ovs_mutex_unlock(&netdev->mutex);
2594
7df6932e 2595 return error;
8b61709d
BP
2596}
2597
/* Overwrites '*sa' with an AF_INET socket address for 'addr' and port 0.
 * Every byte of '*sa' that the IPv4 address does not occupy is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form on the side, starting from all zeros. */
    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    /* Then publish it through the generic sockaddr. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2610
2611static int
2612do_set_addr(struct netdev *netdev,
2613 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2614{
2615 struct ifreq ifr;
149f577a 2616
259e0b1a
BP
2617 make_in4_sockaddr(&ifr.ifr_addr, addr);
2618 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2619 ioctl_name);
8b61709d
BP
2620}
2621
2622/* Adds 'router' as a default IP gateway. */
2623static int
67a4917b 2624netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2625{
2626 struct in_addr any = { INADDR_ANY };
2627 struct rtentry rt;
2628 int error;
2629
2630 memset(&rt, 0, sizeof rt);
2631 make_in4_sockaddr(&rt.rt_dst, any);
2632 make_in4_sockaddr(&rt.rt_gateway, router);
2633 make_in4_sockaddr(&rt.rt_genmask, any);
2634 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2635 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2636 if (error) {
10a89ef0 2637 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2638 }
2639 return error;
2640}
2641
f1acd62b
BP
2642static int
2643netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2644 char **netdev_name)
2645{
2646 static const char fn[] = "/proc/net/route";
2647 FILE *stream;
2648 char line[256];
2649 int ln;
2650
2651 *netdev_name = NULL;
2652 stream = fopen(fn, "r");
2653 if (stream == NULL) {
10a89ef0 2654 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2655 return errno;
2656 }
2657
2658 ln = 0;
2659 while (fgets(line, sizeof line, stream)) {
2660 if (++ln >= 2) {
2661 char iface[17];
dbba996b 2662 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2663 int refcnt, metric, mtu;
2664 unsigned int flags, use, window, irtt;
2665
c2c28dfd
BP
2666 if (!ovs_scan(line,
2667 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2668 " %d %u %u\n",
2669 iface, &dest, &gateway, &flags, &refcnt,
2670 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2671 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2672 fn, ln, line);
2673 continue;
2674 }
2675 if (!(flags & RTF_UP)) {
2676 /* Skip routes that aren't up. */
2677 continue;
2678 }
2679
2680 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2681 * network byte order, so we don't need need any endian
f1acd62b
BP
2682 * conversions here. */
2683 if ((dest & mask) == (host->s_addr & mask)) {
2684 if (!gateway) {
2685 /* The host is directly reachable. */
2686 next_hop->s_addr = 0;
2687 } else {
2688 /* To reach the host, we must go through a gateway. */
2689 next_hop->s_addr = gateway;
2690 }
2691 *netdev_name = xstrdup(iface);
2692 fclose(stream);
2693 return 0;
2694 }
2695 }
2696 }
2697
2698 fclose(stream);
2699 return ENXIO;
2700}
2701
e210037e 2702static int
b5d57fc8 2703netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2704{
b5d57fc8 2705 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2706 int error = 0;
2707
86383816 2708 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2709 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2710 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2711
2712 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2713 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2714 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2715 cmd,
2716 ETHTOOL_GDRVINFO,
2717 "ETHTOOL_GDRVINFO");
2718 if (!error) {
b5d57fc8 2719 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2720 }
2721 }
e210037e 2722
e210037e 2723 if (!error) {
b5d57fc8
BP
2724 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2725 smap_add(smap, "driver_version", netdev->drvinfo.version);
2726 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2727 }
86383816
BP
2728 ovs_mutex_unlock(&netdev->mutex);
2729
e210037e
AE
2730 return error;
2731}
2732
4f925bd3 2733static int
275707c3
EJ
2734netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2735 struct smap *smap)
4f925bd3 2736{
79f1cbe9 2737 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2738 return 0;
2739}
2740
8b61709d
BP
2741/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2742 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2743 * returns 0. Otherwise, it returns a positive errno value; in particular,
2744 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2745static int
2746netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2747 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2748{
2749 struct arpreq r;
c100e025 2750 struct sockaddr_in sin;
8b61709d
BP
2751 int retval;
2752
2753 memset(&r, 0, sizeof r);
f2cc621b 2754 memset(&sin, 0, sizeof sin);
c100e025
BP
2755 sin.sin_family = AF_INET;
2756 sin.sin_addr.s_addr = ip;
2757 sin.sin_port = 0;
2758 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2759 r.arp_ha.sa_family = ARPHRD_ETHER;
2760 r.arp_flags = 0;
71d7c22f 2761 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2762 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2763 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2764 if (!retval) {
2765 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2766 } else if (retval != ENXIO) {
2767 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2768 netdev_get_name(netdev), IP_ARGS(ip),
2769 ovs_strerror(retval));
8b61709d
BP
2770 }
2771 return retval;
2772}
2773
2774static int
2775nd_to_iff_flags(enum netdev_flags nd)
2776{
2777 int iff = 0;
2778 if (nd & NETDEV_UP) {
2779 iff |= IFF_UP;
2780 }
2781 if (nd & NETDEV_PROMISC) {
2782 iff |= IFF_PROMISC;
2783 }
7ba19d41
AC
2784 if (nd & NETDEV_LOOPBACK) {
2785 iff |= IFF_LOOPBACK;
2786 }
8b61709d
BP
2787 return iff;
2788}
2789
2790static int
2791iff_to_nd_flags(int iff)
2792{
2793 enum netdev_flags nd = 0;
2794 if (iff & IFF_UP) {
2795 nd |= NETDEV_UP;
2796 }
2797 if (iff & IFF_PROMISC) {
2798 nd |= NETDEV_PROMISC;
2799 }
7ba19d41
AC
2800 if (iff & IFF_LOOPBACK) {
2801 nd |= NETDEV_LOOPBACK;
2802 }
8b61709d
BP
2803 return nd;
2804}
2805
2806static int
4f9f3f21
BP
2807update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2808 enum netdev_flags on, enum netdev_flags *old_flagsp)
2809 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2810{
2811 int old_flags, new_flags;
c37d4da4
EJ
2812 int error = 0;
2813
b5d57fc8 2814 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2815 *old_flagsp = iff_to_nd_flags(old_flags);
2816 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2817 if (new_flags != old_flags) {
4f9f3f21
BP
2818 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2819 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2820 }
4f9f3f21
BP
2821
2822 return error;
2823}
2824
2825static int
2826netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2827 enum netdev_flags on, enum netdev_flags *old_flagsp)
2828{
2829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2830 int error;
2831
2832 ovs_mutex_lock(&netdev->mutex);
2833 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2834 ovs_mutex_unlock(&netdev->mutex);
2835
8b61709d
BP
2836 return error;
2837}
2838
/* Expands to a brace-enclosed positional initializer for a
 * 'struct netdev_class' that shares the netdev_linux_* implementation.
 *
 * Only the members that differ between the classes defined below are
 * parameters: the class NAME, the CONSTRUCT function, the GET_STATS,
 * GET_FEATURES and GET_STATUS callbacks, and the trailing FLOW_OFFLOAD_API
 * initializer(s).
 *
 * The initializer is positional, so the order of the entries must not be
 * changed. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS,            \
                           FLOW_OFFLOAD_API)                    \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    NULL,                                                       \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
    NULL,                       /* get_pt_mode */               \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
                                                                \
    FLOW_OFFLOAD_API                                            \
}
2915
/* "system" class: regular construct, stats, features and ethtool-based
 * status callbacks, plus LINUX_FLOW_OFFLOAD_API. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        LINUX_FLOW_OFFLOAD_API);

/* "tap" class: tap-specific construct and stats callbacks; NO_OFFLOAD_API. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status,
        NO_OFFLOAD_API);

/* "internal" class: internal-specific stats and status callbacks, no
 * get_features callback; NO_OFFLOAD_API. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status,
        NO_OFFLOAD_API);
8b61709d 2942\f
677d9158
JV
2943
/* CoDel traffic control class. */

/* Value used for the n_queues member of tc_ops_codel. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Cached CoDel qdisc configuration.  'tc' is the embedded generic part that
 * codel_get__() uses to recover the containing struct from 'netdev->tc'. */
struct codel {
    struct tc tc;
    uint32_t target;            /* Value of TCA_CODEL_TARGET. */
    uint32_t limit;             /* Value of TCA_CODEL_LIMIT. */
    uint32_t interval;          /* Value of TCA_CODEL_INTERVAL. */
};
2960
2961static struct codel *
2962codel_get__(const struct netdev *netdev_)
2963{
2964 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2965 return CONTAINER_OF(netdev->tc, struct codel, tc);
2966}
2967
2968static void
2969codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2970 uint32_t interval)
2971{
2972 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2973 struct codel *codel;
2974
2975 codel = xmalloc(sizeof *codel);
2976 tc_init(&codel->tc, &tc_ops_codel);
2977 codel->target = target;
2978 codel->limit = limit;
2979 codel->interval = interval;
2980
2981 netdev->tc = &codel->tc;
2982}
2983
2984static int
2985codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2986 uint32_t interval)
2987{
2988 size_t opt_offset;
2989 struct ofpbuf request;
2990 struct tcmsg *tcmsg;
2991 uint32_t otarget, olimit, ointerval;
2992 int error;
2993
2994 tc_del_qdisc(netdev);
2995
7874bdff
RD
2996 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
2997 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
2998 if (!tcmsg) {
2999 return ENODEV;
3000 }
3001 tcmsg->tcm_handle = tc_make_handle(1, 0);
3002 tcmsg->tcm_parent = TC_H_ROOT;
3003
3004 otarget = target ? target : 5000;
3005 olimit = limit ? limit : 10240;
3006 ointerval = interval ? interval : 100000;
3007
3008 nl_msg_put_string(&request, TCA_KIND, "codel");
3009 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3010 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3011 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3012 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3013 nl_msg_end_nested(&request, opt_offset);
3014
3015 error = tc_transact(&request, NULL);
3016 if (error) {
3017 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3018 "target %u, limit %u, interval %u error %d(%s)",
3019 netdev_get_name(netdev),
3020 otarget, olimit, ointerval,
3021 error, ovs_strerror(error));
3022 }
3023 return error;
3024}
3025
3026static void
3027codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3028 const struct smap *details, struct codel *codel)
3029{
13c1637f
BP
3030 codel->target = smap_get_ullong(details, "target", 0);
3031 codel->limit = smap_get_ullong(details, "limit", 0);
3032 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3033
3034 if (!codel->target) {
3035 codel->target = 5000;
3036 }
3037 if (!codel->limit) {
3038 codel->limit = 10240;
3039 }
3040 if (!codel->interval) {
3041 codel->interval = 100000;
3042 }
3043}
3044
3045static int
3046codel_tc_install(struct netdev *netdev, const struct smap *details)
3047{
3048 int error;
3049 struct codel codel;
3050
3051 codel_parse_qdisc_details__(netdev, details, &codel);
3052 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3053 codel.interval);
3054 if (!error) {
3055 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3056 }
3057 return error;
3058}
3059
3060static int
3061codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3062{
3063 static const struct nl_policy tca_codel_policy[] = {
3064 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3065 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3066 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3067 };
3068
3069 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3070
3071 if (!nl_parse_nested(nl_options, tca_codel_policy,
3072 attrs, ARRAY_SIZE(tca_codel_policy))) {
3073 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3074 return EPROTO;
3075 }
3076
3077 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3078 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3079 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3080 return 0;
3081}
3082
3083static int
3084codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3085{
3086 struct nlattr *nlattr;
3087 const char * kind;
3088 int error;
3089 struct codel codel;
3090
3091 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3092 if (error != 0) {
3093 return error;
3094 }
3095
3096 error = codel_parse_tca_options__(nlattr, &codel);
3097 if (error != 0) {
3098 return error;
3099 }
3100
3101 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3102 return 0;
3103}
3104
3105
3106static void
3107codel_tc_destroy(struct tc *tc)
3108{
3109 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3110 tc_destroy(tc);
3111 free(codel);
3112}
3113
3114static int
3115codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3116{
3117 const struct codel *codel = codel_get__(netdev);
3118 smap_add_format(details, "target", "%u", codel->target);
3119 smap_add_format(details, "limit", "%u", codel->limit);
3120 smap_add_format(details, "interval", "%u", codel->interval);
3121 return 0;
3122}
3123
3124static int
3125codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3126{
3127 struct codel codel;
3128
3129 codel_parse_qdisc_details__(netdev, details, &codel);
3130 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3131 codel_get__(netdev)->target = codel.target;
3132 codel_get__(netdev)->limit = codel.limit;
3133 codel_get__(netdev)->interval = codel.interval;
3134 return 0;
3135}
3136
/* Operations table for the CoDel qdisc.  The initializer is positional; the
 * trailing NULL entries mean that no callback is supplied for those slots. */
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3152\f
/* FQ-CoDel traffic control class. */

/* Value used for the n_queues member of tc_ops_fqcodel. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Cached FQ-CoDel qdisc configuration.  'tc' is the embedded generic part
 * that fqcodel_get__() uses to recover the containing struct from
 * 'netdev->tc'. */
struct fqcodel {
    struct tc tc;
    uint32_t target;            /* Value of TCA_FQ_CODEL_TARGET. */
    uint32_t limit;             /* Value of TCA_FQ_CODEL_LIMIT. */
    uint32_t interval;          /* Value of TCA_FQ_CODEL_INTERVAL. */
    uint32_t flows;             /* Value of TCA_FQ_CODEL_FLOWS. */
    uint32_t quantum;           /* Value of TCA_FQ_CODEL_QUANTUM. */
};
3176
3177static struct fqcodel *
3178fqcodel_get__(const struct netdev *netdev_)
3179{
3180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3181 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3182}
3183
3184static void
3185fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3186 uint32_t interval, uint32_t flows, uint32_t quantum)
3187{
3188 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3189 struct fqcodel *fqcodel;
3190
3191 fqcodel = xmalloc(sizeof *fqcodel);
3192 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3193 fqcodel->target = target;
3194 fqcodel->limit = limit;
3195 fqcodel->interval = interval;
3196 fqcodel->flows = flows;
3197 fqcodel->quantum = quantum;
3198
3199 netdev->tc = &fqcodel->tc;
3200}
3201
3202static int
3203fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3204 uint32_t interval, uint32_t flows, uint32_t quantum)
3205{
3206 size_t opt_offset;
3207 struct ofpbuf request;
3208 struct tcmsg *tcmsg;
3209 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3210 int error;
3211
3212 tc_del_qdisc(netdev);
3213
7874bdff
RD
3214 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3215 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3216 if (!tcmsg) {
3217 return ENODEV;
3218 }
3219 tcmsg->tcm_handle = tc_make_handle(1, 0);
3220 tcmsg->tcm_parent = TC_H_ROOT;
3221
3222 otarget = target ? target : 5000;
3223 olimit = limit ? limit : 10240;
3224 ointerval = interval ? interval : 100000;
3225 oflows = flows ? flows : 1024;
3226 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3227 not mtu */
3228
3229 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3230 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3231 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3232 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3233 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3234 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3235 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3236 nl_msg_end_nested(&request, opt_offset);
3237
3238 error = tc_transact(&request, NULL);
3239 if (error) {
3240 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3241 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3242 netdev_get_name(netdev),
3243 otarget, olimit, ointerval, oflows, oquantum,
3244 error, ovs_strerror(error));
3245 }
3246 return error;
3247}
3248
3249static void
3250fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3251 const struct smap *details, struct fqcodel *fqcodel)
3252{
13c1637f
BP
3253 fqcodel->target = smap_get_ullong(details, "target", 0);
3254 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3255 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3256 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3257 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3258
677d9158
JV
3259 if (!fqcodel->target) {
3260 fqcodel->target = 5000;
3261 }
3262 if (!fqcodel->limit) {
3263 fqcodel->limit = 10240;
3264 }
3265 if (!fqcodel->interval) {
3266 fqcodel->interval = 1000000;
3267 }
3268 if (!fqcodel->flows) {
3269 fqcodel->flows = 1024;
3270 }
3271 if (!fqcodel->quantum) {
3272 fqcodel->quantum = 1514;
3273 }
3274}
3275
3276static int
3277fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3278{
3279 int error;
3280 struct fqcodel fqcodel;
3281
3282 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3283 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3284 fqcodel.interval, fqcodel.flows,
3285 fqcodel.quantum);
3286 if (!error) {
3287 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3288 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3289 }
3290 return error;
3291}
3292
3293static int
3294fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3295{
3296 static const struct nl_policy tca_fqcodel_policy[] = {
3297 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3298 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3299 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3300 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3301 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3302 };
3303
3304 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3305
3306 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3307 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3308 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3309 return EPROTO;
3310 }
3311
3312 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3313 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3314 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3315 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3316 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3317 return 0;
3318}
3319
3320static int
3321fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3322{
3323 struct nlattr *nlattr;
3324 const char * kind;
3325 int error;
3326 struct fqcodel fqcodel;
3327
3328 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3329 if (error != 0) {
3330 return error;
3331 }
3332
3333 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3334 if (error != 0) {
3335 return error;
3336 }
3337
3338 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3339 fqcodel.flows, fqcodel.quantum);
3340 return 0;
3341}
3342
3343static void
3344fqcodel_tc_destroy(struct tc *tc)
3345{
3346 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3347 tc_destroy(tc);
3348 free(fqcodel);
3349}
3350
3351static int
3352fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3353{
3354 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3355 smap_add_format(details, "target", "%u", fqcodel->target);
3356 smap_add_format(details, "limit", "%u", fqcodel->limit);
3357 smap_add_format(details, "interval", "%u", fqcodel->interval);
3358 smap_add_format(details, "flows", "%u", fqcodel->flows);
3359 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3360 return 0;
3361}
3362
3363static int
3364fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3365{
3366 struct fqcodel fqcodel;
3367
3368 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3369 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3370 fqcodel.flows, fqcodel.quantum);
3371 fqcodel_get__(netdev)->target = fqcodel.target;
3372 fqcodel_get__(netdev)->limit = fqcodel.limit;
3373 fqcodel_get__(netdev)->interval = fqcodel.interval;
3374 fqcodel_get__(netdev)->flows = fqcodel.flows;
3375 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3376 return 0;
3377}
3378
/* Operations table for the FQ-CoDel qdisc.  The initializer is positional;
 * the trailing NULL entries mean that no callback is supplied for those
 * slots. */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                      /* linux_name */
    "linux-fq_codel",                /* ovs_name */
    FQCODEL_N_QUEUES,                /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3394\f
/* SFQ traffic control class. */

/* Value used for the n_queues member of tc_ops_sfq. */
#define SFQ_N_QUEUES 0x0000

/* Cached SFQ qdisc configuration.  'tc' is the embedded generic part that
 * sfq_get__() uses to recover the containing struct from 'netdev->tc'. */
struct sfq {
    struct tc tc;
    uint32_t quantum;           /* Sent as tc_sfq_qopt.quantum. */
    uint32_t perturb;           /* Sent as tc_sfq_qopt.perturb_period. */
};
3404
3405static struct sfq *
3406sfq_get__(const struct netdev *netdev_)
3407{
3408 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3409 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3410}
3411
3412static void
3413sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3414{
3415 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3416 struct sfq *sfq;
3417
3418 sfq = xmalloc(sizeof *sfq);
3419 tc_init(&sfq->tc, &tc_ops_sfq);
3420 sfq->perturb = perturb;
3421 sfq->quantum = quantum;
3422
3423 netdev->tc = &sfq->tc;
3424}
3425
3426static int
3427sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3428{
3429 struct tc_sfq_qopt opt;
3430 struct ofpbuf request;
3431 struct tcmsg *tcmsg;
3432 int mtu;
3433 int mtu_error, error;
3434 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3435
3436 tc_del_qdisc(netdev);
3437
7874bdff
RD
3438 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3439 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3440 if (!tcmsg) {
3441 return ENODEV;
3442 }
3443 tcmsg->tcm_handle = tc_make_handle(1, 0);
3444 tcmsg->tcm_parent = TC_H_ROOT;
3445
3446 memset(&opt, 0, sizeof opt);
3447 if (!quantum) {
3448 if (!mtu_error) {
3449 opt.quantum = mtu; /* if we cannot find mtu, use default */
3450 }
3451 } else {
3452 opt.quantum = quantum;
3453 }
3454
3455 if (!perturb) {
3456 opt.perturb_period = 10;
3457 } else {
3458 opt.perturb_period = perturb;
3459 }
3460
3461 nl_msg_put_string(&request, TCA_KIND, "sfq");
3462 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3463
3464 error = tc_transact(&request, NULL);
3465 if (error) {
3466 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3467 "quantum %u, perturb %u error %d(%s)",
3468 netdev_get_name(netdev),
3469 opt.quantum, opt.perturb_period,
3470 error, ovs_strerror(error));
3471 }
3472 return error;
3473}
3474
3475static void
3476sfq_parse_qdisc_details__(struct netdev *netdev,
3477 const struct smap *details, struct sfq *sfq)
3478{
13c1637f
BP
3479 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3480 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3481
677d9158
JV
3482 if (!sfq->perturb) {
3483 sfq->perturb = 10;
3484 }
3485
3486 if (!sfq->quantum) {
13c1637f
BP
3487 int mtu;
3488 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3489 sfq->quantum = mtu;
3490 } else {
3491 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3492 "device without mtu");
677d9158
JV
3493 }
3494 }
3495}
3496
3497static int
3498sfq_tc_install(struct netdev *netdev, const struct smap *details)
3499{
3500 int error;
3501 struct sfq sfq;
3502
3503 sfq_parse_qdisc_details__(netdev, details, &sfq);
3504 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3505 if (!error) {
3506 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3507 }
3508 return error;
3509}
3510
/* Reconstructs the SFQ configuration of 'netdev' from the qdisc message
 * 'nlmsg' and records it on the netdev.  Returns 0 on success, otherwise the
 * error from tc_parse_qdisc().
 *
 * sfq_install__() takes (netdev, quantum, perturb).  The two values used to
 * be passed here in the opposite order, so a loaded qdisc had its quantum and
 * perturb period cached (and later reported) swapped. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error) {
        return error;
    }

    sfq = nl_attr_get(nlattr);
    sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
    return 0;
}
3528
3529static void
3530sfq_tc_destroy(struct tc *tc)
3531{
3532 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3533 tc_destroy(tc);
3534 free(sfq);
3535}
3536
3537static int
3538sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3539{
3540 const struct sfq *sfq = sfq_get__(netdev);
3541 smap_add_format(details, "quantum", "%u", sfq->quantum);
3542 smap_add_format(details, "perturb", "%u", sfq->perturb);
3543 return 0;
3544}
3545
3546static int
3547sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3548{
3549 struct sfq sfq;
3550
3551 sfq_parse_qdisc_details__(netdev, details, &sfq);
3552 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3553 sfq_get__(netdev)->quantum = sfq.quantum;
3554 sfq_get__(netdev)->perturb = sfq.perturb;
3555 return 0;
3556}
3557
/* Operations table for the SFQ qdisc.  The initializer is positional; the
 * trailing NULL entries mean that no callback is supplied for those slots. */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3573\f
/* HTB traffic control class. */

/* Upper bound on the class minor number accepted by htb_parse_tcmsg__(). */
#define HTB_N_QUEUES 0xf000
/* Value sent as 'rate2quantum' when the qdisc is created, and the divisor
 * used by htb_setup_class__() to decide whether to force a class quantum. */
#define HTB_RATE2QUANTUM 10

/* Cached HTB qdisc configuration. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

/* Configuration of one HTB class. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
8b61709d 3591
c1c9c9c4 3592static struct htb *
b5d57fc8 3593htb_get__(const struct netdev *netdev_)
c1c9c9c4 3594{
b5d57fc8
BP
3595 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3596 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3597}
3598
24045e35 3599static void
b5d57fc8 3600htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3601{
b5d57fc8 3602 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3603 struct htb *htb;
3604
3605 htb = xmalloc(sizeof *htb);
3606 tc_init(&htb->tc, &tc_ops_htb);
3607 htb->max_rate = max_rate;
3608
b5d57fc8 3609 netdev->tc = &htb->tc;
c1c9c9c4
BP
3610}
3611
3612/* Create an HTB qdisc.
3613 *
a339aa81 3614 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3615static int
3616htb_setup_qdisc__(struct netdev *netdev)
3617{
3618 size_t opt_offset;
3619 struct tc_htb_glob opt;
3620 struct ofpbuf request;
3621 struct tcmsg *tcmsg;
3622
3623 tc_del_qdisc(netdev);
3624
7874bdff
RD
3625 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3626 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3627 if (!tcmsg) {
3628 return ENODEV;
3629 }
c1c9c9c4
BP
3630 tcmsg->tcm_handle = tc_make_handle(1, 0);
3631 tcmsg->tcm_parent = TC_H_ROOT;
3632
3633 nl_msg_put_string(&request, TCA_KIND, "htb");
3634
3635 memset(&opt, 0, sizeof opt);
4f631ccd 3636 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3637 opt.version = 3;
4ecf12d5 3638 opt.defcls = 1;
c1c9c9c4
BP
3639
3640 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3641 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3642 nl_msg_end_nested(&request, opt_offset);
3643
3644 return tc_transact(&request, NULL);
3645}
3646
3647/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3648 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3649static int
3650htb_setup_class__(struct netdev *netdev, unsigned int handle,
3651 unsigned int parent, struct htb_class *class)
3652{
3653 size_t opt_offset;
3654 struct tc_htb_opt opt;
3655 struct ofpbuf request;
3656 struct tcmsg *tcmsg;
3657 int error;
3658 int mtu;
3659
73371c09 3660 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3661 if (error) {
f915f1a8
BP
3662 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3663 netdev_get_name(netdev));
9b020780 3664 return error;
f915f1a8 3665 }
c1c9c9c4
BP
3666
3667 memset(&opt, 0, sizeof opt);
3668 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3669 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3670 /* Makes sure the quantum is at least MTU. Setting quantum will
3671 * make htb ignore the r2q for this class. */
3672 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3673 opt.quantum = mtu;
3674 }
c1c9c9c4
BP
3675 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3676 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3677 opt.prio = class->priority;
3678
7874bdff
RD
3679 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
3680 &request);
23a98ffe
BP
3681 if (!tcmsg) {
3682 return ENODEV;
3683 }
c1c9c9c4
BP
3684 tcmsg->tcm_handle = handle;
3685 tcmsg->tcm_parent = parent;
3686
3687 nl_msg_put_string(&request, TCA_KIND, "htb");
3688 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3689 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3690 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3691 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3692 nl_msg_end_nested(&request, opt_offset);
3693
3694 error = tc_transact(&request, NULL);
3695 if (error) {
3696 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3697 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3698 netdev_get_name(netdev),
3699 tc_get_major(handle), tc_get_minor(handle),
3700 tc_get_major(parent), tc_get_minor(parent),
3701 class->min_rate, class->max_rate,
10a89ef0 3702 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3703 }
3704 return error;
3705}
3706
3707/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3708 * description of them into 'details'. The description complies with the
3709 * specification given in the vswitch database documentation for linux-htb
3710 * queue details. */
3711static int
3712htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3713{
3714 static const struct nl_policy tca_htb_policy[] = {
3715 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3716 .min_len = sizeof(struct tc_htb_opt) },
3717 };
3718
3719 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3720 const struct tc_htb_opt *htb;
3721
3722 if (!nl_parse_nested(nl_options, tca_htb_policy,
3723 attrs, ARRAY_SIZE(tca_htb_policy))) {
3724 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3725 return EPROTO;
3726 }
3727
3728 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3729 class->min_rate = htb->rate.rate;
3730 class->max_rate = htb->ceil.rate;
3731 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3732 class->priority = htb->prio;
3733 return 0;
3734}
3735
3736static int
3737htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3738 struct htb_class *options,
3739 struct netdev_queue_stats *stats)
3740{
3741 struct nlattr *nl_options;
3742 unsigned int handle;
3743 int error;
3744
3745 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3746 if (!error && queue_id) {
17ee3c1f
BP
3747 unsigned int major = tc_get_major(handle);
3748 unsigned int minor = tc_get_minor(handle);
3749 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3750 *queue_id = minor - 1;
c1c9c9c4
BP
3751 } else {
3752 error = EPROTO;
3753 }
3754 }
3755 if (!error && options) {
3756 error = htb_parse_tca_options__(nl_options, options);
3757 }
3758 return error;
3759}
3760
3761static void
73371c09 3762htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3763 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3764{
73371c09 3765 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 3766
13c1637f 3767 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 3768 if (!hc->max_rate) {
a00ca915 3769 enum netdev_features current;
c1c9c9c4 3770
73371c09
BP
3771 netdev_linux_read_features(netdev);
3772 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3773 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3774 }
3775 hc->min_rate = hc->max_rate;
3776 hc->burst = 0;
3777 hc->priority = 0;
3778}
3779
3780static int
3781htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3782 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3783{
3784 const struct htb *htb = htb_get__(netdev);
9b020780 3785 int mtu, error;
214117fd 3786 unsigned long long int max_rate_bit;
c1c9c9c4 3787
73371c09 3788 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3789 if (error) {
f915f1a8
BP
3790 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3791 netdev_get_name(netdev));
9b020780 3792 return error;
f915f1a8
BP
3793 }
3794
4f104611
EJ
3795 /* HTB requires at least an mtu sized min-rate to send any traffic even
3796 * on uncongested links. */
13c1637f 3797 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 3798 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3799 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3800
3801 /* max-rate */
214117fd
KF
3802 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
3803 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
c1c9c9c4
BP
3804 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3805 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3806
3807 /* burst
3808 *
3809 * According to hints in the documentation that I've read, it is important
3810 * that 'burst' be at least as big as the largest frame that might be
3811 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3812 * but having it a bit too small is a problem. Since netdev_get_mtu()
3813 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3814 * the MTU. We actually add 64, instead of 14, as a guard against
3815 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 3816 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
3817 hc->burst = MAX(hc->burst, mtu + 64);
3818
3819 /* priority */
13c1637f 3820 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
3821
3822 return 0;
3823}
3824
3825static int
3826htb_query_class__(const struct netdev *netdev, unsigned int handle,
3827 unsigned int parent, struct htb_class *options,
3828 struct netdev_queue_stats *stats)
3829{
3830 struct ofpbuf *reply;
3831 int error;
3832
3833 error = tc_query_class(netdev, handle, parent, &reply);
3834 if (!error) {
3835 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3836 ofpbuf_delete(reply);
3837 }
3838 return error;
3839}
3840
3841static int
79f1cbe9 3842htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3843{
3844 int error;
3845
3846 error = htb_setup_qdisc__(netdev);
3847 if (!error) {
3848 struct htb_class hc;
3849
3850 htb_parse_qdisc_details__(netdev, details, &hc);
3851 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3852 tc_make_handle(1, 0), &hc);
3853 if (!error) {
3854 htb_install__(netdev, hc.max_rate);
3855 }
3856 }
3857 return error;
3858}
3859
93b13be8
BP
3860static struct htb_class *
3861htb_class_cast__(const struct tc_queue *queue)
3862{
3863 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3864}
3865
c1c9c9c4
BP
3866static void
3867htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3868 const struct htb_class *hc)
3869{
3870 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3871 size_t hash = hash_int(queue_id, 0);
3872 struct tc_queue *queue;
c1c9c9c4
BP
3873 struct htb_class *hcp;
3874
93b13be8
BP
3875 queue = tc_find_queue__(netdev, queue_id, hash);
3876 if (queue) {
3877 hcp = htb_class_cast__(queue);
3878 } else {
c1c9c9c4 3879 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3880 queue = &hcp->tc_queue;
3881 queue->queue_id = queue_id;
6dc34a0d 3882 queue->created = time_msec();
93b13be8 3883 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3884 }
93b13be8
BP
3885
3886 hcp->min_rate = hc->min_rate;
3887 hcp->max_rate = hc->max_rate;
3888 hcp->burst = hc->burst;
3889 hcp->priority = hc->priority;
c1c9c9c4
BP
3890}
3891
3892static int
3893htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3894{
c1c9c9c4 3895 struct ofpbuf msg;
d57695d7 3896 struct queue_dump_state state;
c1c9c9c4 3897 struct htb_class hc;
c1c9c9c4
BP
3898
3899 /* Get qdisc options. */
3900 hc.max_rate = 0;
3901 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3902 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3903
3904 /* Get queues. */
d57695d7 3905 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3906 return ENODEV;
3907 }
d57695d7 3908 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3909 unsigned int queue_id;
3910
3911 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3912 htb_update_queue__(netdev, queue_id, &hc);
3913 }
3914 }
d57695d7 3915 finish_queue_dump(&state);
c1c9c9c4
BP
3916
3917 return 0;
3918}
3919
3920static void
3921htb_tc_destroy(struct tc *tc)
3922{
3923 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 3924 struct htb_class *hc;
c1c9c9c4 3925
4ec3d7c7 3926 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
3927 free(hc);
3928 }
3929 tc_destroy(tc);
3930 free(htb);
3931}
3932
3933static int
79f1cbe9 3934htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3935{
3936 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3937 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3938 return 0;
3939}
3940
3941static int
79f1cbe9 3942htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3943{
3944 struct htb_class hc;
3945 int error;
3946
3947 htb_parse_qdisc_details__(netdev, details, &hc);
3948 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3949 tc_make_handle(1, 0), &hc);
3950 if (!error) {
3951 htb_get__(netdev)->max_rate = hc.max_rate;
3952 }
3953 return error;
3954}
3955
3956static int
93b13be8 3957htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3958 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3959{
93b13be8 3960 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3961
79f1cbe9 3962 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3963 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3964 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3965 }
79f1cbe9 3966 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3967 if (hc->priority) {
79f1cbe9 3968 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3969 }
3970 return 0;
3971}
3972
3973static int
3974htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3975 const struct smap *details)
c1c9c9c4
BP
3976{
3977 struct htb_class hc;
3978 int error;
3979
3980 error = htb_parse_class_details__(netdev, details, &hc);
3981 if (error) {
3982 return error;
3983 }
3984
17ee3c1f 3985 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3986 tc_make_handle(1, 0xfffe), &hc);
3987 if (error) {
3988 return error;
3989 }
3990
3991 htb_update_queue__(netdev, queue_id, &hc);
3992 return 0;
3993}
3994
3995static int
93b13be8 3996htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3997{
93b13be8 3998 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3999 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4000 int error;
4001
93b13be8 4002 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4003 if (!error) {
93b13be8 4004 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4005 free(hc);
c1c9c9c4
BP
4006 }
4007 return error;
4008}
4009
4010static int
93b13be8 4011htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4012 struct netdev_queue_stats *stats)
4013{
93b13be8 4014 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4015 tc_make_handle(1, 0xfffe), NULL, stats);
4016}
4017
4018static int
4019htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4020 const struct ofpbuf *nlmsg,
4021 netdev_dump_queue_stats_cb *cb, void *aux)
4022{
4023 struct netdev_queue_stats stats;
17ee3c1f 4024 unsigned int handle, major, minor;
c1c9c9c4
BP
4025 int error;
4026
4027 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4028 if (error) {
4029 return error;
4030 }
4031
17ee3c1f
BP
4032 major = tc_get_major(handle);
4033 minor = tc_get_minor(handle);
4034 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4035 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4036 }
4037 return 0;
4038}
4039
/* Operations table for the "linux-htb" QoS type.  The initializer is
 * positional, so the entries must stay in struct tc_ops member order. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
4055\f
a339aa81
EJ
4056/* "linux-hfsc" traffic control class. */
4057
4058#define HFSC_N_QUEUES 0xf000
4059
/* Per-netdev state for an HFSC qdisc. */
struct hfsc {
    struct tc tc;               /* Generic tc state; hfsc_get__() recovers the
                                 * enclosing struct from a pointer to it. */
    uint32_t max_rate;          /* Qdisc-wide rate limit.  Configured values
                                 * are divided by 8 before being stored here
                                 * and multiplied by 8 when reported. */
};
4064
/* Cached configuration of a single HFSC class (queue). */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Generic queue state; hfsc_class_cast__()
                                 * recovers the enclosing struct from it. */
    uint32_t min_rate;          /* Sent to the kernel as 'm2' of the RSC and
                                 * FSC service curves. */
    uint32_t max_rate;          /* Sent to the kernel as 'm2' of the USC
                                 * service curve. */
};
4070
4071static struct hfsc *
b5d57fc8 4072hfsc_get__(const struct netdev *netdev_)
a339aa81 4073{
b5d57fc8
BP
4074 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4075 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4076}
4077
4078static struct hfsc_class *
4079hfsc_class_cast__(const struct tc_queue *queue)
4080{
4081 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4082}
4083
24045e35 4084static void
b5d57fc8 4085hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4086{
b5d57fc8 4087 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4088 struct hfsc *hfsc;
4089
a339aa81
EJ
4090 hfsc = xmalloc(sizeof *hfsc);
4091 tc_init(&hfsc->tc, &tc_ops_hfsc);
4092 hfsc->max_rate = max_rate;
b5d57fc8 4093 netdev->tc = &hfsc->tc;
a339aa81
EJ
4094}
4095
4096static void
4097hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4098 const struct hfsc_class *hc)
4099{
4100 size_t hash;
4101 struct hfsc *hfsc;
4102 struct hfsc_class *hcp;
4103 struct tc_queue *queue;
4104
4105 hfsc = hfsc_get__(netdev);
4106 hash = hash_int(queue_id, 0);
4107
4108 queue = tc_find_queue__(netdev, queue_id, hash);
4109 if (queue) {
4110 hcp = hfsc_class_cast__(queue);
4111 } else {
4112 hcp = xmalloc(sizeof *hcp);
4113 queue = &hcp->tc_queue;
4114 queue->queue_id = queue_id;
6dc34a0d 4115 queue->created = time_msec();
a339aa81
EJ
4116 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4117 }
4118
4119 hcp->min_rate = hc->min_rate;
4120 hcp->max_rate = hc->max_rate;
4121}
4122
4123static int
4124hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4125{
4126 const struct tc_service_curve *rsc, *fsc, *usc;
4127 static const struct nl_policy tca_hfsc_policy[] = {
4128 [TCA_HFSC_RSC] = {
4129 .type = NL_A_UNSPEC,
4130 .optional = false,
4131 .min_len = sizeof(struct tc_service_curve),
4132 },
4133 [TCA_HFSC_FSC] = {
4134 .type = NL_A_UNSPEC,
4135 .optional = false,
4136 .min_len = sizeof(struct tc_service_curve),
4137 },
4138 [TCA_HFSC_USC] = {
4139 .type = NL_A_UNSPEC,
4140 .optional = false,
4141 .min_len = sizeof(struct tc_service_curve),
4142 },
4143 };
4144 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4145
4146 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4147 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4148 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4149 return EPROTO;
4150 }
4151
4152 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4153 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4154 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4155
4156 if (rsc->m1 != 0 || rsc->d != 0 ||
4157 fsc->m1 != 0 || fsc->d != 0 ||
4158 usc->m1 != 0 || usc->d != 0) {
4159 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4160 "Non-linear service curves are not supported.");
4161 return EPROTO;
4162 }
4163
4164 if (rsc->m2 != fsc->m2) {
4165 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4166 "Real-time service curves are not supported ");
4167 return EPROTO;
4168 }
4169
4170 if (rsc->m2 > usc->m2) {
4171 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4172 "Min-rate service curve is greater than "
4173 "the max-rate service curve.");
4174 return EPROTO;
4175 }
4176
4177 class->min_rate = fsc->m2;
4178 class->max_rate = usc->m2;
4179 return 0;
4180}
4181
4182static int
4183hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4184 struct hfsc_class *options,
4185 struct netdev_queue_stats *stats)
4186{
4187 int error;
4188 unsigned int handle;
4189 struct nlattr *nl_options;
4190
4191 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4192 if (error) {
4193 return error;
4194 }
4195
4196 if (queue_id) {
4197 unsigned int major, minor;
4198
4199 major = tc_get_major(handle);
4200 minor = tc_get_minor(handle);
4201 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4202 *queue_id = minor - 1;
4203 } else {
4204 return EPROTO;
4205 }
4206 }
4207
4208 if (options) {
4209 error = hfsc_parse_tca_options__(nl_options, options);
4210 }
4211
4212 return error;
4213}
4214
4215static int
4216hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4217 unsigned int parent, struct hfsc_class *options,
4218 struct netdev_queue_stats *stats)
4219{
4220 int error;
4221 struct ofpbuf *reply;
4222
4223 error = tc_query_class(netdev, handle, parent, &reply);
4224 if (error) {
4225 return error;
4226 }
4227
4228 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4229 ofpbuf_delete(reply);
4230 return error;
4231}
4232
4233static void
73371c09 4234hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4235 struct hfsc_class *class)
4236{
73371c09 4237 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4238
13c1637f 4239 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4240 if (!max_rate) {
a00ca915 4241 enum netdev_features current;
a339aa81 4242
73371c09
BP
4243 netdev_linux_read_features(netdev);
4244 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4245 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4246 }
4247
4248 class->min_rate = max_rate;
4249 class->max_rate = max_rate;
4250}
4251
4252static int
4253hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4254 const struct smap *details,
a339aa81
EJ
4255 struct hfsc_class * class)
4256{
4257 const struct hfsc *hfsc;
4258 uint32_t min_rate, max_rate;
a339aa81
EJ
4259
4260 hfsc = hfsc_get__(netdev);
a339aa81 4261
13c1637f 4262 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4263 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4264 min_rate = MIN(min_rate, hfsc->max_rate);
4265
13c1637f 4266 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4267 max_rate = MAX(max_rate, min_rate);
4268 max_rate = MIN(max_rate, hfsc->max_rate);
4269
4270 class->min_rate = min_rate;
4271 class->max_rate = max_rate;
4272
4273 return 0;
4274}
4275
4276/* Create an HFSC qdisc.
4277 *
4278 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4279static int
4280hfsc_setup_qdisc__(struct netdev * netdev)
4281{
4282 struct tcmsg *tcmsg;
4283 struct ofpbuf request;
4284 struct tc_hfsc_qopt opt;
4285
4286 tc_del_qdisc(netdev);
4287
7874bdff
RD
4288 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4289 NLM_F_EXCL | NLM_F_CREATE, &request);
a339aa81
EJ
4290
4291 if (!tcmsg) {
4292 return ENODEV;
4293 }
4294
4295 tcmsg->tcm_handle = tc_make_handle(1, 0);
4296 tcmsg->tcm_parent = TC_H_ROOT;
4297
4298 memset(&opt, 0, sizeof opt);
4299 opt.defcls = 1;
4300
4301 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4302 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4303
4304 return tc_transact(&request, NULL);
4305}
4306
4307/* Create an HFSC class.
4308 *
4309 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4310 * sc rate <min_rate> ul rate <max_rate>" */
4311static int
4312hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4313 unsigned int parent, struct hfsc_class *class)
4314{
4315 int error;
4316 size_t opt_offset;
4317 struct tcmsg *tcmsg;
4318 struct ofpbuf request;
4319 struct tc_service_curve min, max;
4320
7874bdff
RD
4321 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4322 &request);
a339aa81
EJ
4323
4324 if (!tcmsg) {
4325 return ENODEV;
4326 }
4327
4328 tcmsg->tcm_handle = handle;
4329 tcmsg->tcm_parent = parent;
4330
4331 min.m1 = 0;
4332 min.d = 0;
4333 min.m2 = class->min_rate;
4334
4335 max.m1 = 0;
4336 max.d = 0;
4337 max.m2 = class->max_rate;
4338
4339 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4340 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4341 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4342 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4343 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4344 nl_msg_end_nested(&request, opt_offset);
4345
4346 error = tc_transact(&request, NULL);
4347 if (error) {
4348 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4349 "min-rate %ubps, max-rate %ubps (%s)",
4350 netdev_get_name(netdev),
4351 tc_get_major(handle), tc_get_minor(handle),
4352 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4353 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4354 }
4355
4356 return error;
4357}
4358
4359static int
79f1cbe9 4360hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4361{
4362 int error;
4363 struct hfsc_class class;
4364
4365 error = hfsc_setup_qdisc__(netdev);
4366
4367 if (error) {
4368 return error;
4369 }
4370
4371 hfsc_parse_qdisc_details__(netdev, details, &class);
4372 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4373 tc_make_handle(1, 0), &class);
4374
4375 if (error) {
4376 return error;
4377 }
4378
4379 hfsc_install__(netdev, class.max_rate);
4380 return 0;
4381}
4382
4383static int
4384hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4385{
4386 struct ofpbuf msg;
d57695d7 4387 struct queue_dump_state state;
a339aa81
EJ
4388 struct hfsc_class hc;
4389
4390 hc.max_rate = 0;
4391 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4392 hfsc_install__(netdev, hc.max_rate);
a339aa81 4393
d57695d7 4394 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4395 return ENODEV;
4396 }
4397
d57695d7 4398 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4399 unsigned int queue_id;
4400
4401 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4402 hfsc_update_queue__(netdev, queue_id, &hc);
4403 }
4404 }
4405
d57695d7 4406 finish_queue_dump(&state);
a339aa81
EJ
4407 return 0;
4408}
4409
4410static void
4411hfsc_tc_destroy(struct tc *tc)
4412{
4413 struct hfsc *hfsc;
4414 struct hfsc_class *hc, *next;
4415
4416 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4417
4418 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4419 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4420 free(hc);
4421 }
4422
4423 tc_destroy(tc);
4424 free(hfsc);
4425}
4426
4427static int
79f1cbe9 4428hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4429{
4430 const struct hfsc *hfsc;
4431 hfsc = hfsc_get__(netdev);
79f1cbe9 4432 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4433 return 0;
4434}
4435
4436static int
79f1cbe9 4437hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4438{
4439 int error;
4440 struct hfsc_class class;
4441
4442 hfsc_parse_qdisc_details__(netdev, details, &class);
4443 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4444 tc_make_handle(1, 0), &class);
4445
4446 if (!error) {
4447 hfsc_get__(netdev)->max_rate = class.max_rate;
4448 }
4449
4450 return error;
4451}
4452
4453static int
4454hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4455 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4456{
4457 const struct hfsc_class *hc;
4458
4459 hc = hfsc_class_cast__(queue);
79f1cbe9 4460 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4461 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4462 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4463 }
4464 return 0;
4465}
4466
4467static int
4468hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4469 const struct smap *details)
a339aa81
EJ
4470{
4471 int error;
4472 struct hfsc_class class;
4473
4474 error = hfsc_parse_class_details__(netdev, details, &class);
4475 if (error) {
4476 return error;
4477 }
4478
4479 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4480 tc_make_handle(1, 0xfffe), &class);
4481 if (error) {
4482 return error;
4483 }
4484
4485 hfsc_update_queue__(netdev, queue_id, &class);
4486 return 0;
4487}
4488
4489static int
4490hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4491{
4492 int error;
4493 struct hfsc *hfsc;
4494 struct hfsc_class *hc;
4495
4496 hc = hfsc_class_cast__(queue);
4497 hfsc = hfsc_get__(netdev);
4498
4499 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4500 if (!error) {
4501 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4502 free(hc);
4503 }
4504 return error;
4505}
4506
4507static int
4508hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4509 struct netdev_queue_stats *stats)
4510{
4511 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4512 tc_make_handle(1, 0xfffe), NULL, stats);
4513}
4514
4515static int
4516hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4517 const struct ofpbuf *nlmsg,
4518 netdev_dump_queue_stats_cb *cb, void *aux)
4519{
4520 struct netdev_queue_stats stats;
4521 unsigned int handle, major, minor;
4522 int error;
4523
4524 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4525 if (error) {
4526 return error;
4527 }
4528
4529 major = tc_get_major(handle);
4530 minor = tc_get_minor(handle);
4531 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4532 (*cb)(minor - 1, &stats, aux);
4533 }
4534 return 0;
4535}
4536
/* Operations table for the "linux-hfsc" QoS type.  The initializer is
 * positional, so the entries must stay in struct tc_ops member order. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4552\f
6cf888b8
BS
4553/* "linux-noop" traffic control class. */
4554
4555static void
4556noop_install__(struct netdev *netdev_)
4557{
4558 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4559 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4560
4561 netdev->tc = CONST_CAST(struct tc *, &tc);
4562}
4563
4564static int
4565noop_tc_install(struct netdev *netdev,
4566 const struct smap *details OVS_UNUSED)
4567{
4568 noop_install__(netdev);
4569 return 0;
4570}
4571
4572static int
4573noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4574{
4575 noop_install__(netdev);
4576 return 0;
4577}
4578
/* Operations table for the "linux-noop" QoS type.  It has no Linux qdisc
 * name and no queues; only the install and load callbacks are provided.  The
 * initializer is positional, so the entries must stay in struct tc_ops
 * member order. */
static const struct tc_ops tc_ops_noop = {
    NULL,                       /* linux_name */
    "linux-noop",               /* ovs_name */
    0,                          /* n_queues */
    noop_tc_install,            /* tc_install */
    noop_tc_load,               /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4594\f
c1c9c9c4
BP
4595/* "linux-default" traffic control class.
4596 *
4597 * This class represents the default, unnamed Linux qdisc. It corresponds to
4598 * the "" (empty string) QoS type in the OVS database. */
4599
4600static void
b5d57fc8 4601default_install__(struct netdev *netdev_)
c1c9c9c4 4602{
b5d57fc8 4603 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4604 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4605
559eb230
BP
4606 /* Nothing but a tc class implementation is allowed to write to a tc. This
4607 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4608 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4609}
4610
4611static int
4612default_tc_install(struct netdev *netdev,
79f1cbe9 4613 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4614{
4615 default_install__(netdev);
4616 return 0;
4617}
4618
4619static int
4620default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4621{
4622 default_install__(netdev);
4623 return 0;
4624}
4625
/* Operations table for the default QoS type, whose OVS name is the empty
 * string.  It has no Linux qdisc name and no queues; only the install and
 * load callbacks are provided.  The initializer is positional, so the
 * entries must stay in struct tc_ops member order. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4641\f
4642/* "linux-other" traffic control class.
4643 *
4644 * */
4645
4646static int
b5d57fc8 4647other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4648{
b5d57fc8 4649 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4650 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4651
559eb230
BP
4652 /* Nothing but a tc class implementation is allowed to write to a tc. This
4653 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4654 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4655 return 0;
4656}
4657
/* Operations table for the "linux-other" QoS type.  It can only be loaded:
 * there is no install callback, no Linux qdisc name and no queues.  The
 * initializer is positional, so the entries must stay in struct tc_ops
 * member order. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4673\f
4674/* Traffic control. */
4675
4676/* Number of kernel "tc" ticks per second. */
4677static double ticks_per_s;
4678
4679/* Number of kernel "jiffies" per second. This is used for the purpose of
4680 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4681 * one jiffy's worth of data.
4682 *
4683 * There are two possibilities here:
4684 *
4685 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4686 * approximate range of 100 to 1024. That means that we really need to
4687 * make sure that the qdisc can buffer that much data.
4688 *
4689 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4690 * has finely granular timers and there's no need to fudge additional room
4691 * for buffers. (There's no extra effort needed to implement that: the
4692 * large 'buffer_hz' is used as a divisor, so practically any number will
4693 * come out as 0 in the division. Small integer results in the case of
4694 * really high dividends won't have any real effect anyhow.)
4695 */
4696static unsigned int buffer_hz;
4697
7874bdff
RD
4698static struct tcmsg *
4699netdev_linux_tc_make_request(const struct netdev *netdev, int type,
4700 unsigned int flags, struct ofpbuf *request)
4701{
4702 int ifindex;
4703 int error;
4704
4705 error = get_ifindex(netdev, &ifindex);
4706 if (error) {
4707 return NULL;
4708 }
4709
4710 return tc_make_request(ifindex, type, flags, request);
4711}
4712
f8500004
JP
4713/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4714 * of 'kbits_burst'.
4715 *
4716 * This function is equivalent to running:
4717 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4718 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4719 * mtu 65535 drop
4720 *
4721 * The configuration and stats may be seen with the following command:
c7952afb 4722 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4723 *
4724 * Returns 0 if successful, otherwise a positive errno value.
4725 */
4726static int
c7952afb
BP
4727tc_add_policer(struct netdev *netdev,
4728 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4729{
4730 struct tc_police tc_police;
4731 struct ofpbuf request;
4732 struct tcmsg *tcmsg;
4733 size_t basic_offset;
4734 size_t police_offset;
4735 int error;
4736 int mtu = 65535;
4737
4738 memset(&tc_police, 0, sizeof tc_police);
4739 tc_police.action = TC_POLICE_SHOT;
4740 tc_police.mtu = mtu;
1aca400c 4741 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4742
79abacc8
MAA
4743 /* The following appears wrong in one way: In networking a kilobit is
4744 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4745 *
4746 * However if you "fix" those problems then "tc filter show ..." shows
4747 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4748 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4749 * tc's point of view. Whatever. */
4750 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4751 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 4752
7874bdff
RD
4753 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
4754 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
4755 if (!tcmsg) {
4756 return ENODEV;
4757 }
4758 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4759 tcmsg->tcm_info = tc_make_handle(49,
4760 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4761
4762 nl_msg_put_string(&request, TCA_KIND, "basic");
4763 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4764 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4765 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4766 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4767 nl_msg_end_nested(&request, police_offset);
4768 nl_msg_end_nested(&request, basic_offset);
4769
4770 error = tc_transact(&request, NULL);
4771 if (error) {
4772 return error;
4773 }
4774
4775 return 0;
4776}
4777
c1c9c9c4
BP
4778static void
4779read_psched(void)
4780{
4781 /* The values in psched are not individually very meaningful, but they are
4782 * important. The tables below show some values seen in the wild.
4783 *
4784 * Some notes:
4785 *
4786 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4787 * (Before that, there are hints that it was 1000000000.)
4788 *
4789 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4790 * above.
4791 *
4792 * /proc/net/psched
4793 * -----------------------------------
4794 * [1] 000c8000 000f4240 000f4240 00000064
4795 * [2] 000003e8 00000400 000f4240 3b9aca00
4796 * [3] 000003e8 00000400 000f4240 3b9aca00
4797 * [4] 000003e8 00000400 000f4240 00000064
4798 * [5] 000003e8 00000040 000f4240 3b9aca00
4799 * [6] 000003e8 00000040 000f4240 000000f9
4800 *
4801 * a b c d ticks_per_s buffer_hz
4802 * ------- --------- ---------- ------------- ----------- -------------
4803 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4804 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4805 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4806 * [4] 1,000 1,024 1,000,000 100 976,562 100
4807 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4808 * [6] 1,000 64 1,000,000 249 15,625,000 249
4809 *
4810 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4811 * [2] 2.6.26-1-686-bigmem from Debian lenny
4812 * [3] 2.6.26-2-sparc64 from Debian lenny
4813 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4814 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4815 * [6] 2.6.34 from kernel.org on KVM
4816 */
23882115 4817 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4818 static const char fn[] = "/proc/net/psched";
4819 unsigned int a, b, c, d;
4820 FILE *stream;
4821
23882115
BP
4822 if (!ovsthread_once_start(&once)) {
4823 return;
4824 }
4825
c1c9c9c4
BP
4826 ticks_per_s = 1.0;
4827 buffer_hz = 100;
4828
4829 stream = fopen(fn, "r");
4830 if (!stream) {
10a89ef0 4831 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4832 goto exit;
c1c9c9c4
BP
4833 }
4834
4835 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4836 VLOG_WARN("%s: read failed", fn);
4837 fclose(stream);
23882115 4838 goto exit;
c1c9c9c4
BP
4839 }
4840 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4841 fclose(stream);
4842
4843 if (!a || !c) {
4844 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4845 goto exit;
c1c9c9c4
BP
4846 }
4847
4848 ticks_per_s = (double) a * c / b;
4849 if (c == 1000000) {
4850 buffer_hz = d;
4851 } else {
4852 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4853 fn, a, b, c, d);
4854 }
4855 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4856
4857exit:
4858 ovsthread_once_done(&once);
c1c9c9c4
BP
4859}
4860
4861/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4862 * rate of 'rate' bytes per second. */
4863static unsigned int
4864tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4865{
23882115 4866 read_psched();
c1c9c9c4
BP
4867 return (rate * ticks) / ticks_per_s;
4868}
4869
4870/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4871 * rate of 'rate' bytes per second. */
4872static unsigned int
4873tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4874{
23882115 4875 read_psched();
015c93a4 4876 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4877}
4878
4879/* Returns the number of bytes that need to be reserved for qdisc buffering at
4880 * a transmission rate of 'rate' bytes per second. */
4881static unsigned int
4882tc_buffer_per_jiffy(unsigned int rate)
4883{
23882115 4884 read_psched();
c1c9c9c4
BP
4885 return rate / buffer_hz;
4886}
4887
4888/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4889 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4890 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4891 * stores NULL into it if it is absent.
4892 *
4893 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4894 * 'msg'.
4895 *
4896 * Returns 0 if successful, otherwise a positive errno value. */
4897static int
4898tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4899 struct nlattr **options)
4900{
4901 static const struct nl_policy tca_policy[] = {
4902 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4903 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4904 };
4905 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4906
4907 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4908 tca_policy, ta, ARRAY_SIZE(ta))) {
4909 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4910 goto error;
4911 }
4912
4913 if (kind) {
4914 *kind = nl_attr_get_string(ta[TCA_KIND]);
4915 }
4916
4917 if (options) {
4918 *options = ta[TCA_OPTIONS];
4919 }
4920
4921 return 0;
4922
4923error:
4924 if (kind) {
4925 *kind = NULL;
4926 }
4927 if (options) {
4928 *options = NULL;
4929 }
4930 return EPROTO;
4931}
4932
4933/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4934 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4935 * into '*options', and its queue statistics into '*stats'. Any of the output
4936 * arguments may be null.
4937 *
4938 * Returns 0 if successful, otherwise a positive errno value. */
4939static int
4940tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4941 struct nlattr **options, struct netdev_queue_stats *stats)
4942{
4943 static const struct nl_policy tca_policy[] = {
4944 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4945 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4946 };
4947 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4948
4949 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4950 tca_policy, ta, ARRAY_SIZE(ta))) {
4951 VLOG_WARN_RL(&rl, "failed to parse class message");
4952 goto error;
4953 }
4954
4955 if (handlep) {
4956 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4957 *handlep = tc->tcm_handle;
4958 }
4959
4960 if (options) {
4961 *options = ta[TCA_OPTIONS];
4962 }
4963
4964 if (stats) {
4965 const struct gnet_stats_queue *gsq;
4966 struct gnet_stats_basic gsb;
4967
4968 static const struct nl_policy stats_policy[] = {
4969 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4970 .min_len = sizeof gsb },
4971 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4972 .min_len = sizeof *gsq },
4973 };
4974 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4975
4976 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4977 sa, ARRAY_SIZE(sa))) {
4978 VLOG_WARN_RL(&rl, "failed to parse class stats");
4979 goto error;
4980 }
4981
4982 /* Alignment issues screw up the length of struct gnet_stats_basic on
4983 * some arch/bitsize combinations. Newer versions of Linux have a
4984 * struct gnet_stats_basic_packed, but we can't depend on that. The
4985 * easiest thing to do is just to make a copy. */
4986 memset(&gsb, 0, sizeof gsb);
4987 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4988 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4989 stats->tx_bytes = gsb.bytes;
4990 stats->tx_packets = gsb.packets;
4991
4992 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4993 stats->tx_errors = gsq->drops;
4994 }
4995
4996 return 0;
4997
4998error:
4999 if (options) {
5000 *options = NULL;
5001 }
5002 if (stats) {
5003 memset(stats, 0, sizeof *stats);
5004 }
5005 return EPROTO;
5006}
5007
5008/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5009 * on 'netdev'. */
5010static int
5011tc_query_class(const struct netdev *netdev,
5012 unsigned int handle, unsigned int parent,
5013 struct ofpbuf **replyp)
5014{
5015 struct ofpbuf request;
5016 struct tcmsg *tcmsg;
5017 int error;
5018
7874bdff
RD
5019 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5020 &request);
23a98ffe
BP
5021 if (!tcmsg) {
5022 return ENODEV;
5023 }
c1c9c9c4
BP
5024 tcmsg->tcm_handle = handle;
5025 tcmsg->tcm_parent = parent;
5026
5027 error = tc_transact(&request, replyp);
5028 if (error) {
5029 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5030 netdev_get_name(netdev),
5031 tc_get_major(handle), tc_get_minor(handle),
5032 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5033 ovs_strerror(error));
c1c9c9c4
BP
5034 }
5035 return error;
5036}
5037
5038/* Equivalent to "tc class del dev <name> handle <handle>". */
5039static int
5040tc_delete_class(const struct netdev *netdev, unsigned int handle)
5041{
5042 struct ofpbuf request;
5043 struct tcmsg *tcmsg;
5044 int error;
5045
7874bdff 5046 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5047 if (!tcmsg) {
5048 return ENODEV;
5049 }
c1c9c9c4
BP
5050 tcmsg->tcm_handle = handle;
5051 tcmsg->tcm_parent = 0;
5052
5053 error = tc_transact(&request, NULL);
5054 if (error) {
5055 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5056 netdev_get_name(netdev),
5057 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5058 ovs_strerror(error));
c1c9c9c4
BP
5059 }
5060 return error;
5061}
5062
5063/* Equivalent to "tc qdisc del dev <name> root". */
5064static int
b5d57fc8 5065tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5066{
b5d57fc8 5067 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5068 struct ofpbuf request;
5069 struct tcmsg *tcmsg;
5070 int error;
5071
7874bdff 5072 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5073 if (!tcmsg) {
5074 return ENODEV;
5075 }
c1c9c9c4
BP
5076 tcmsg->tcm_handle = tc_make_handle(1, 0);
5077 tcmsg->tcm_parent = TC_H_ROOT;
5078
5079 error = tc_transact(&request, NULL);
5080 if (error == EINVAL) {
5081 /* EINVAL probably means that the default qdisc was in use, in which
5082 * case we've accomplished our purpose. */
5083 error = 0;
5084 }
b5d57fc8
BP
5085 if (!error && netdev->tc) {
5086 if (netdev->tc->ops->tc_destroy) {
5087 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5088 }
b5d57fc8 5089 netdev->tc = NULL;
c1c9c9c4
BP
5090 }
5091 return error;
5092}
5093
ac3e3aaa
BP
5094static bool
5095getqdisc_is_safe(void)
5096{
5097 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5098 static bool safe = false;
5099
5100 if (ovsthread_once_start(&once)) {
5101 struct utsname utsname;
5102 int major, minor;
5103
5104 if (uname(&utsname) == -1) {
5105 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5106 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5107 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5108 } else if (major < 2 || (major == 2 && minor < 35)) {
5109 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5110 utsname.release);
5111 } else {
5112 safe = true;
5113 }
5114 ovsthread_once_done(&once);
5115 }
5116 return safe;
5117}
5118
c1c9c9c4
BP
5119/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5120 * kernel to determine what they are. Returns 0 if successful, otherwise a
5121 * positive errno value. */
5122static int
b5d57fc8 5123tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5124{
b5d57fc8 5125 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5126 struct ofpbuf request, *qdisc;
5127 const struct tc_ops *ops;
5128 struct tcmsg *tcmsg;
5129 int load_error;
5130 int error;
5131
b5d57fc8 5132 if (netdev->tc) {
c1c9c9c4
BP
5133 return 0;
5134 }
5135
5136 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5137 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5138 * 2.6.35 without that fix backported to it.
5139 *
5140 * To avoid the OOPS, we must not make a request that would attempt to dump
5141 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5142 * few others. There are a few ways that I can see to do this, but most of
5143 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5144 * technique chosen here is to assume that any non-default qdisc that we
5145 * create will have a class with handle 1:0. The built-in qdiscs only have
5146 * a class with handle 0:0.
5147 *
ac3e3aaa
BP
5148 * On Linux 2.6.35+ we use the straightforward method because it allows us
5149 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5150 * in such a case we get no response at all from the kernel (!) if a
5151 * builtin qdisc is in use (which is later caught by "!error &&
5152 * !qdisc->size"). */
7874bdff
RD
5153 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5154 &request);
23a98ffe
BP
5155 if (!tcmsg) {
5156 return ENODEV;
5157 }
ac3e3aaa
BP
5158 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5159 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5160
5161 /* Figure out what tc class to instantiate. */
5162 error = tc_transact(&request, &qdisc);
ac3e3aaa 5163 if (!error && qdisc->size) {
c1c9c9c4
BP
5164 const char *kind;
5165
5166 error = tc_parse_qdisc(qdisc, &kind, NULL);
5167 if (error) {
5168 ops = &tc_ops_other;
5169 } else {
5170 ops = tc_lookup_linux_name(kind);
5171 if (!ops) {
5172 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5173 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5174
5175 ops = &tc_ops_other;
5176 }
5177 }
ac3e3aaa
BP
5178 } else if ((!error && !qdisc->size) || error == ENOENT) {
5179 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5180 * set up by some other entity that doesn't have a handle 1:0. We will
5181 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5182 ops = &tc_ops_default;
5183 error = 0;
5184 } else {
5185 /* Who knows? Maybe the device got deleted. */
5186 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5187 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5188 ops = &tc_ops_other;
5189 }
5190
5191 /* Instantiate it. */
b5d57fc8
BP
5192 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5193 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5194 ofpbuf_delete(qdisc);
5195
5196 return error ? error : load_error;
5197}
5198
5199/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5200 approximate the time to transmit packets of various lengths. For an MTU of
5201 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5202 represents two possible packet lengths; for a MTU of 513 through 1024, four
5203 possible lengths; and so on.
5204
5205 Returns, for the specified 'mtu', the number of bits that packet lengths
5206 need to be shifted right to fit within such a 256-entry table. */
5207static int
5208tc_calc_cell_log(unsigned int mtu)
5209{
5210 int cell_log;
5211
5212 if (!mtu) {
5213 mtu = ETH_PAYLOAD_MAX;
5214 }
5215 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5216
5217 for (cell_log = 0; mtu >= 256; cell_log++) {
5218 mtu >>= 1;
5219 }
5220
5221 return cell_log;
5222}
5223
5224/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5225 * of 'mtu'. */
5226static void
5227tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5228{
5229 memset(rate, 0, sizeof *rate);
5230 rate->cell_log = tc_calc_cell_log(mtu);
5231 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5232 /* rate->cell_align = 0; */ /* distro headers. */
5233 rate->mpu = ETH_TOTAL_MIN;
5234 rate->rate = Bps;
5235}
5236
5237/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5238 * attribute of the specified "type".
5239 *
5240 * See tc_calc_cell_log() above for a description of "rtab"s. */
5241static void
5242tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5243{
5244 uint32_t *rtab;
5245 unsigned int i;
5246
5247 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5248 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5249 unsigned packet_size = (i + 1) << rate->cell_log;
5250 if (packet_size < rate->mpu) {
5251 packet_size = rate->mpu;
5252 }
5253 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5254 }
5255}
5256
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data
 * plus 'mtu', and never larger than UINT32_MAX bytes. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);

    /* tc_bytes_to_ticks() takes a 32-bit size, so clamp explicitly instead of
     * letting a huge requested burst be truncated to an arbitrary small
     * value. */
    return tc_bytes_to_ticks(Bps, MIN(burst, UINT32_MAX));
}
d3980822 5267\f
aaf2fb1a
BP
5268/* Linux-only functions declared in netdev-linux.h */
5269
5270/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5271 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5272int
5273netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5274 const char *flag_name, bool enable)
5275{
5276 const char *netdev_name = netdev_get_name(netdev);
5277 struct ethtool_value evalue;
5278 uint32_t new_flags;
5279 int error;
5280
ab985a77 5281 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5282 memset(&evalue, 0, sizeof evalue);
5283 error = netdev_linux_do_ethtool(netdev_name,
5284 (struct ethtool_cmd *)&evalue,
5285 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5286 if (error) {
5287 return error;
5288 }
5289
ab985a77 5290 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5291 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5292 if (new_flags == evalue.data) {
5293 return 0;
5294 }
5295 evalue.data = new_flags;
aaf2fb1a
BP
5296 error = netdev_linux_do_ethtool(netdev_name,
5297 (struct ethtool_cmd *)&evalue,
5298 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5299 if (error) {
5300 return error;
5301 }
5302
ab985a77 5303 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5304 memset(&evalue, 0, sizeof evalue);
5305 error = netdev_linux_do_ethtool(netdev_name,
5306 (struct ethtool_cmd *)&evalue,
5307 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5308 if (error) {
5309 return error;
5310 }
5311
5312 if (new_flags != evalue.data) {
5313 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5314 "device %s failed", enable ? "enable" : "disable",
5315 flag_name, netdev_name);
5316 return EOPNOTSUPP;
5317 }
5318
5319 return 0;
5320}
5321\f
5322/* Utility functions. */
5323
d3980822 5324/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5325static void
d3980822
BP
5326netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5327 const struct rtnl_link_stats *src)
5328{
f613a0d7
PS
5329 dst->rx_packets = src->rx_packets;
5330 dst->tx_packets = src->tx_packets;
5331 dst->rx_bytes = src->rx_bytes;
5332 dst->tx_bytes = src->tx_bytes;
5333 dst->rx_errors = src->rx_errors;
5334 dst->tx_errors = src->tx_errors;
5335 dst->rx_dropped = src->rx_dropped;
5336 dst->tx_dropped = src->tx_dropped;
5337 dst->multicast = src->multicast;
5338 dst->collisions = src->collisions;
5339 dst->rx_length_errors = src->rx_length_errors;
5340 dst->rx_over_errors = src->rx_over_errors;
5341 dst->rx_crc_errors = src->rx_crc_errors;
5342 dst->rx_frame_errors = src->rx_frame_errors;
5343 dst->rx_fifo_errors = src->rx_fifo_errors;
5344 dst->rx_missed_errors = src->rx_missed_errors;
5345 dst->tx_aborted_errors = src->tx_aborted_errors;
5346 dst->tx_carrier_errors = src->tx_carrier_errors;
5347 dst->tx_fifo_errors = src->tx_fifo_errors;
5348 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5349 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5350}
5351
337c9b99
BP
5352/* Copies 'src' into 'dst', performing format conversion in the process. */
5353static void
5354netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5355 const struct rtnl_link_stats64 *src)
5356{
5357 dst->rx_packets = src->rx_packets;
5358 dst->tx_packets = src->tx_packets;
5359 dst->rx_bytes = src->rx_bytes;
5360 dst->tx_bytes = src->tx_bytes;
5361 dst->rx_errors = src->rx_errors;
5362 dst->tx_errors = src->tx_errors;
5363 dst->rx_dropped = src->rx_dropped;
5364 dst->tx_dropped = src->tx_dropped;
5365 dst->multicast = src->multicast;
5366 dst->collisions = src->collisions;
5367 dst->rx_length_errors = src->rx_length_errors;
5368 dst->rx_over_errors = src->rx_over_errors;
5369 dst->rx_crc_errors = src->rx_crc_errors;
5370 dst->rx_frame_errors = src->rx_frame_errors;
5371 dst->rx_fifo_errors = src->rx_fifo_errors;
5372 dst->rx_missed_errors = src->rx_missed_errors;
5373 dst->tx_aborted_errors = src->tx_aborted_errors;
5374 dst->tx_carrier_errors = src->tx_carrier_errors;
5375 dst->tx_fifo_errors = src->tx_fifo_errors;
5376 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5377 dst->tx_window_errors = src->tx_window_errors;
5378}
5379
c1c9c9c4 5380static int
35eef899 5381get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5382{
c1c9c9c4
BP
5383 struct ofpbuf request;
5384 struct ofpbuf *reply;
c1c9c9c4
BP
5385 int error;
5386
d6e3feb5 5387 /* Filtering all counters by default */
5388 memset(stats, 0xFF, sizeof(struct netdev_stats));
5389
c1c9c9c4 5390 ofpbuf_init(&request, 0);
13a24df8
BP
5391 nl_msg_put_nlmsghdr(&request,
5392 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5393 RTM_GETLINK, NLM_F_REQUEST);
5394 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5395 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5396 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5397 ofpbuf_uninit(&request);
5398 if (error) {
5399 return error;
5400 }
5401
13a24df8 5402 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5403 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5404 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5405 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5406 error = 0;
5407 } else {
71f21279 5408 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
5409 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5410 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5411 error = 0;
5412 } else {
5413 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5414 error = EPROTO;
5415 }
13a24df8
BP
5416 }
5417 } else {
5418 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5419 error = EPROTO;
c1c9c9c4 5420 }
8b61709d 5421
8b61709d 5422
576e26d7 5423 ofpbuf_delete(reply);
35eef899 5424 return error;
8b61709d 5425}
c1c9c9c4 5426
3a183124 5427static int
b5d57fc8 5428get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5429{
5430 struct ifreq ifr;
5431 int error;
5432
755be9ea 5433 *flags = 0;
259e0b1a 5434 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5435 if (!error) {
5436 *flags = ifr.ifr_flags;
5437 }
8b61709d
BP
5438 return error;
5439}
5440
5441static int
4b609110 5442set_flags(const char *name, unsigned int flags)
8b61709d
BP
5443{
5444 struct ifreq ifr;
5445
5446 ifr.ifr_flags = flags;
259e0b1a 5447 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5448}
5449
01b25786
PB
5450int
5451linux_get_ifindex(const char *netdev_name)
8b61709d
BP
5452{
5453 struct ifreq ifr;
259e0b1a 5454 int error;
8b61709d 5455
71d7c22f 5456 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5457 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5458
5459 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5460 if (error) {
580e1152
RD
5461 /* ENODEV probably means that a vif disappeared asynchronously and
5462 * hasn't been removed from the database yet, so reduce the log level
5463 * to INFO for that case. */
5464 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5465 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5466 netdev_name, ovs_strerror(error));
259e0b1a 5467 return -error;
8b61709d
BP
5468 }
5469 return ifr.ifr_ifindex;
5470}
5471
5472static int
5473get_ifindex(const struct netdev *netdev_, int *ifindexp)
5474{
b5d57fc8 5475 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5476
b5d57fc8 5477 if (!(netdev->cache_valid & VALID_IFINDEX)) {
01b25786 5478 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5479
8b61709d 5480 if (ifindex < 0) {
b5d57fc8
BP
5481 netdev->get_ifindex_error = -ifindex;
5482 netdev->ifindex = 0;
c7b1b0a5 5483 } else {
b5d57fc8
BP
5484 netdev->get_ifindex_error = 0;
5485 netdev->ifindex = ifindex;
8b61709d 5486 }
b5d57fc8 5487 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5488 }
c7b1b0a5 5489
b5d57fc8
BP
5490 *ifindexp = netdev->ifindex;
5491 return netdev->get_ifindex_error;
8b61709d
BP
5492}
5493
5494static int
74ff3298 5495get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5496{
5497 struct ifreq ifr;
5498 int hwaddr_family;
259e0b1a 5499 int error;
8b61709d
BP
5500
5501 memset(&ifr, 0, sizeof ifr);
71d7c22f 5502 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5503 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5504 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5505 if (error) {
78857dfb
BP
5506 /* ENODEV probably means that a vif disappeared asynchronously and
5507 * hasn't been removed from the database yet, so reduce the log level
5508 * to INFO for that case. */
259e0b1a 5509 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5510 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5511 netdev_name, ovs_strerror(error));
5512 return error;
8b61709d
BP
5513 }
5514 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
5515 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5516 hwaddr_family != ARPHRD_NONE) {
c9697f35 5517 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5518 netdev_name, hwaddr_family);
c9697f35 5519 return EINVAL;
8b61709d
BP
5520 }
5521 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5522 return 0;
5523}
5524
5525static int
74ff3298 5526set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5527{
5528 struct ifreq ifr;
259e0b1a 5529 int error;
8b61709d
BP
5530
5531 memset(&ifr, 0, sizeof ifr);
71d7c22f 5532 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5533 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5534 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5535 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5536 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5537 if (error) {
8b61709d 5538 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5539 netdev_name, ovs_strerror(error));
8b61709d 5540 }
259e0b1a 5541 return error;
8b61709d
BP
5542}
5543
5544static int
0b0544d7 5545netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5546 int cmd, const char *cmd_name)
5547{
5548 struct ifreq ifr;
259e0b1a 5549 int error;
8b61709d
BP
5550
5551 memset(&ifr, 0, sizeof ifr);
71d7c22f 5552 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5553 ifr.ifr_data = (caddr_t) ecmd;
5554
5555 ecmd->cmd = cmd;
259e0b1a
BP
5556 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5557 if (error) {
5558 if (error != EOPNOTSUPP) {
8b61709d 5559 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5560 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5561 } else {
5562 /* The device doesn't support this operation. That's pretty
5563 * common, so there's no point in logging anything. */
5564 }
8b61709d 5565 }
259e0b1a 5566 return error;
8b61709d 5567}
f1acd62b 5568
488d734d
BP
5569/* Returns an AF_PACKET raw socket or a negative errno value. */
5570static int
5571af_packet_sock(void)
5572{
23882115
BP
5573 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5574 static int sock;
488d734d 5575
23882115 5576 if (ovsthread_once_start(&once)) {
488d734d
BP
5577 sock = socket(AF_PACKET, SOCK_RAW, 0);
5578 if (sock >= 0) {
8450059e
BP
5579 int error = set_nonblocking(sock);
5580 if (error) {
5581 close(sock);
5582 sock = -error;
5583 }
488d734d
BP
5584 } else {
5585 sock = -errno;
10a89ef0
BP
5586 VLOG_ERR("failed to create packet socket: %s",
5587 ovs_strerror(errno));
488d734d 5588 }
23882115 5589 ovsthread_once_done(&once);
488d734d
BP
5590 }
5591
5592 return sock;
5593}