lib/netdev-linux.c
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
55bc98d6 43#include <net/if_packet.h>
8b61709d
BP
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
e14deea0 52#include "dp-packet.h"
93451a0a 53#include "dpif-netlink.h"
df1e5a3b 54#include "dpif-netdev.h"
3e8a2ad1 55#include "openvswitch/dynamic-string.h"
8b61709d 56#include "fatal-signal.h"
93b13be8 57#include "hash.h"
ee89ea7b 58#include "openvswitch/hmap.h"
8b61709d 59#include "netdev-provider.h"
7fbef77a 60#include "netdev-vport.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
c060c4cf 63#include "netlink.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d
BP
67#include "packets.h"
68#include "poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
ee89ea7b 70#include "openvswitch/shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
ee89ea7b 76#include "util.h"
5136ce49 77
d98e6007 78VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 79
d76f09ea
BP
80COVERAGE_DEFINE(netdev_set_policing);
81COVERAGE_DEFINE(netdev_arp_lookup);
82COVERAGE_DEFINE(netdev_get_ifindex);
83COVERAGE_DEFINE(netdev_get_hwaddr);
84COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
85COVERAGE_DEFINE(netdev_get_ethtool);
86COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 87
8b61709d
BP
88\f
89/* These were introduced in Linux 2.6.14, so they might be missing if we have
90 * old headers. */
91#ifndef ADVERTISED_Pause
92#define ADVERTISED_Pause (1 << 13)
93#endif
94#ifndef ADVERTISED_Asym_Pause
95#define ADVERTISED_Asym_Pause (1 << 14)
96#endif
97
e47bd51a
JP
98/* These were introduced in Linux 2.6.24, so they might be missing if we
99 * have old headers. */
100#ifndef ETHTOOL_GFLAGS
101#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102#endif
103#ifndef ETHTOOL_SFLAGS
104#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105#endif
106
c1c9c9c4
BP
107/* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 * headers. */
109#ifndef TC_RTAB_SIZE
110#define TC_RTAB_SIZE 1024
111#endif
112
b73c8518
SH
113/* Linux 2.6.21 introduced struct tpacket_auxdata.
114 * Linux 2.6.27 added the tp_vlan_tci member.
115 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
116 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
117 * TP_STATUS_VLAN_TPID_VALID.
118 *
119 * With all this churn it's easiest to unconditionally define a replacement
120 * structure that has everything we want.
121 */
55bc98d6
BP
122#ifndef PACKET_AUXDATA
123#define PACKET_AUXDATA 8
124#endif
b73c8518
SH
125#ifndef TP_STATUS_VLAN_VALID
126#define TP_STATUS_VLAN_VALID (1 << 4)
127#endif
128#ifndef TP_STATUS_VLAN_TPID_VALID
129#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130#endif
131#undef tpacket_auxdata
132#define tpacket_auxdata rpl_tpacket_auxdata
133struct tpacket_auxdata {
134 uint32_t tp_status;
135 uint32_t tp_len;
136 uint32_t tp_snaplen;
137 uint16_t tp_mac;
138 uint16_t tp_net;
139 uint16_t tp_vlan_tci;
140 uint16_t tp_vlan_tpid;
141};
142
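/* Illustrative sketch (not part of the upstream code): the replacement
 * structure above is what a PACKET_AUXDATA control message gets cast to when
 * VLAN information is recovered from an AF_PACKET socket, roughly:
 *
 *     const struct tpacket_auxdata *aux
 *         = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
 *     if (aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID) {
 *         ...push aux->tp_vlan_tci (and, on new enough kernels,
 *            aux->tp_vlan_tpid) back onto the packet...
 *     }
 *
 * The real consumer is netdev_linux_rxq_recv_sock() further down in this
 * file. */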
0c615356
SH
143/* Linux 2.6.27 introduced ethtool_cmd_speed
144 *
145 * To avoid revisiting problems reported with using configure to detect
146 * compatibility (see report at
147 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
148 * unconditionally replace ethtool_cmd_speed. */
149#define ethtool_cmd_speed rpl_ethtool_cmd_speed
150static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
151{
152 return ep->speed | (ep->speed_hi << 16);
153}
154
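/* For example, a 100000 Mbps (100G) link that the kernel reports as
 * ep->speed = 0x86A0 and ep->speed_hi = 0x0001 yields
 * 0x86A0 | (0x0001 << 16) == 0x000186A0 == 100000. */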
67bed84c
SH
155/* Linux 2.6.30 introduced supported and advertised flags for
156 * 1G base KX, and 10G base KX4, KR and R. */
157#ifndef SUPPORTED_1000baseKX_Full
158#define SUPPORTED_1000baseKX_Full (1 << 17)
159#define SUPPORTED_10000baseKX4_Full (1 << 18)
160#define SUPPORTED_10000baseKR_Full (1 << 19)
161#define SUPPORTED_10000baseR_FEC (1 << 20)
162#define ADVERTISED_1000baseKX_Full (1 << 17)
163#define ADVERTISED_10000baseKX4_Full (1 << 18)
164#define ADVERTISED_10000baseKR_Full (1 << 19)
165#define ADVERTISED_10000baseR_FEC (1 << 20)
166#endif
167
168/* Linux 3.5 introduced supported and advertised flags for
169 * 40G base KR4, CR4, SR4 and LR4. */
170#ifndef SUPPORTED_40000baseKR4_Full
171#define SUPPORTED_40000baseKR4_Full (1 << 23)
172#define SUPPORTED_40000baseCR4_Full (1 << 24)
173#define SUPPORTED_40000baseSR4_Full (1 << 25)
174#define SUPPORTED_40000baseLR4_Full (1 << 26)
175#define ADVERTISED_40000baseKR4_Full (1 << 23)
176#define ADVERTISED_40000baseCR4_Full (1 << 24)
177#define ADVERTISED_40000baseSR4_Full (1 << 25)
178#define ADVERTISED_40000baseLR4_Full (1 << 26)
179#endif
180
fa373af4
BP
181/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 *
183 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
184 * 2.6.32-431.29.2.el6.x86_64 (see report at
185 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
186 * if_link.h is not self-contained on those kernels. It is easiest to
187 * unconditionally define a replacement. */
188#ifndef IFLA_STATS64
337c9b99 189#define IFLA_STATS64 23
fa373af4
BP
190#endif
191#define rtnl_link_stats64 rpl_rtnl_link_stats64
337c9b99
BP
192struct rtnl_link_stats64 {
193 uint64_t rx_packets;
194 uint64_t tx_packets;
195 uint64_t rx_bytes;
196 uint64_t tx_bytes;
197 uint64_t rx_errors;
198 uint64_t tx_errors;
199 uint64_t rx_dropped;
200 uint64_t tx_dropped;
201 uint64_t multicast;
202 uint64_t collisions;
203
204 uint64_t rx_length_errors;
205 uint64_t rx_over_errors;
206 uint64_t rx_crc_errors;
207 uint64_t rx_frame_errors;
208 uint64_t rx_fifo_errors;
209 uint64_t rx_missed_errors;
210
211 uint64_t tx_aborted_errors;
212 uint64_t tx_carrier_errors;
213 uint64_t tx_fifo_errors;
214 uint64_t tx_heartbeat_errors;
215 uint64_t tx_window_errors;
216
217 uint64_t rx_compressed;
218 uint64_t tx_compressed;
219};
337c9b99 220
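/* The layout above mirrors the kernel's struct rtnl_link_stats64 from
 * <linux/if_link.h>; it is the structure that the IFLA_STATS64 attribute is
 * parsed into by the netlink stats path (get_stats_via_netlink()) further
 * down in this file. */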
8b61709d 221enum {
7fbef77a
JG
222 VALID_IFINDEX = 1 << 0,
223 VALID_ETHERADDR = 1 << 1,
6b6e1329
PS
224 VALID_IN = 1 << 2,
225 VALID_MTU = 1 << 3,
226 VALID_POLICING = 1 << 4,
227 VALID_VPORT_STAT_ERROR = 1 << 5,
228 VALID_DRVINFO = 1 << 6,
229 VALID_FEATURES = 1 << 7,
8b61709d 230};
c1c9c9c4
BP
231\f
232/* Traffic control. */
233
234/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
235 * network device.
236 *
237 * Each TC implementation subclasses this with whatever additional data it
238 * needs. */
c1c9c9c4
BP
239struct tc {
240 const struct tc_ops *ops;
93b13be8
BP
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
244};
c1c9c9c4 245
559eb230
BP
246#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
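/* Illustrative sketch (not a quote of the code below): TC_INITIALIZER lets a
 * TC implementation with no per-device state share a single statically
 * initialized instance, along the lines of:
 *
 *     static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_example);
 *
 * where tc_ops_example stands in for a concrete 'struct tc_ops'. */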
93b13be8
BP
248/* One traffic control queue.
249 *
250 * Each TC implementation subclasses this with whatever additional data it
251 * needs. */
252struct tc_queue {
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 255 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
256};
257
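/* Illustrative sketch (assumption about usage, not upstream text): the
 * generic layer walks a TC implementation's queues with the usual hmap
 * idiom, e.g.
 *
 *     struct tc_queue *queue;
 *     HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
 *         ...
 *     }
 *
 * Implementations embed 'struct tc_queue' inside their own per-queue
 * structures and recover those with CONTAINER_OF(). */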
258/* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
260 *
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
264struct tc_ops {
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
269
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
272
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
276
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
282 *
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
286 *
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
289 *
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
79f1cbe9 292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
293
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
297 *
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
303 * 'netdev'.
304 *
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
 311 * tc_destroy(tc).)
312 *
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
316 *
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
319
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 *
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
325 *
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
329 *
330 * This function may be null if 'tc' is not configurable.
331 */
79f1cbe9 332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
333
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
336 *
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
340 *
341 * This function may be null if 'tc' is not configurable.
342 */
79f1cbe9 343 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 344
93b13be8
BP
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
347 *
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 *
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
355 *
356 * This function may be null if 'tc' does not have queues ('n_queues' is
357 * 0). */
93b13be8 358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 359 struct smap *details);
c1c9c9c4
BP
360
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
 362 * 'details', performing any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
364 * 'n_queues'.
365 *
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 *
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 373 const struct smap *details);
c1c9c9c4 374
93b13be8
BP
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
377 *
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
93b13be8 380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 381
93b13be8
BP
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
384 *
385 * On success, initializes '*stats'.
386 *
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
93b13be8
BP
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
c1c9c9c4
BP
391 struct netdev_queue_stats *stats);
392
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 *
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
401};
402
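/* Illustrative sketch (hypothetical, not upstream code): a TC implementation
 * with no configurable queues would fill in the structure above roughly as
 * follows, where the example_* names are placeholders and the trailing NULLs
 * cover tc_destroy, qdisc_get/set, class_get/set/delete and the two stats
 * callbacks, all of which are optional here:
 *
 *     static const struct tc_ops tc_ops_example = {
 *         "example",         // linux_name: TCA_KIND used by the kernel
 *         "linux-example",   // ovs_name: QoS "type" in the OVS database
 *         0,                 // n_queues: no OpenFlow queues
 *         example_install,
 *         example_load,
 *         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 *     };
 *
 * The concrete instances (tc_ops_htb, tc_ops_hfsc, ...) are defined further
 * down in the file. */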
403static void
404tc_init(struct tc *tc, const struct tc_ops *ops)
405{
406 tc->ops = ops;
93b13be8 407 hmap_init(&tc->queues);
c1c9c9c4
BP
408}
409
410static void
411tc_destroy(struct tc *tc)
412{
93b13be8 413 hmap_destroy(&tc->queues);
c1c9c9c4
BP
414}
415
416static const struct tc_ops tc_ops_htb;
a339aa81 417static const struct tc_ops tc_ops_hfsc;
677d9158
JV
418static const struct tc_ops tc_ops_codel;
419static const struct tc_ops tc_ops_fqcodel;
420static const struct tc_ops tc_ops_sfq;
c1c9c9c4 421static const struct tc_ops tc_ops_default;
6cf888b8 422static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
423static const struct tc_ops tc_ops_other;
424
559eb230 425static const struct tc_ops *const tcs[] = {
c1c9c9c4 426 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 427 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
428 &tc_ops_codel, /* Controlled delay */
429 &tc_ops_fqcodel, /* Fair queue controlled delay */
430 &tc_ops_sfq, /* Stochastic fair queueing */
6cf888b8 431 &tc_ops_noop, /* Non-operating QoS type. */
c1c9c9c4
BP
432 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other, /* Some other qdisc. */
434 NULL
435};
149f577a 436
c1c9c9c4
BP
437static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
438static unsigned int tc_get_major(unsigned int handle);
439static unsigned int tc_get_minor(unsigned int handle);
440
441static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
445static struct tcmsg *tc_make_request(const struct netdev *, int type,
446 unsigned int flags, struct ofpbuf *);
447static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004 448static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
c7952afb
BP
449static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
451
452static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462static int tc_del_qdisc(struct netdev *netdev);
463static int tc_query_qdisc(const struct netdev *netdev);
464
465static int tc_calc_cell_log(unsigned int mtu);
466static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470\f
b5d57fc8
BP
471struct netdev_linux {
472 struct netdev up;
149f577a 473
86383816
BP
474 /* Protects all members below. */
475 struct ovs_mutex mutex;
476
149f577a 477 unsigned int cache_valid;
8b61709d 478
1670c579
EJ
479 bool miimon; /* Link status of last poll. */
480 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer;
482
8722022c
BP
483 /* The following are figured out "on demand" only. They are only valid
484 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 485 int ifindex;
74ff3298 486 struct eth_addr etheraddr;
8b61709d 487 int mtu;
059e5f4f 488 unsigned int ifi_flags;
65c3058c 489 long long int carrier_resets;
80a86fbe
BP
490 uint32_t kbits_rate; /* Policing data. */
491 uint32_t kbits_burst;
bba1e6f3
PS
492 int vport_stats_error; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
90a6637d 494 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 495 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 496 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 497 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 498 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 499
a00ca915
EJ
500 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 503
4f925bd3 504 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 505 struct tc *tc;
149f577a 506
d0d08f8a
BP
507 /* For devices of class netdev_tap_class only. */
508 int tap_fd;
8b61709d
BP
509};
510
f7791740
PS
511struct netdev_rxq_linux {
512 struct netdev_rxq up;
796223f5 513 bool is_tap;
5b7448ed 514 int fd;
149f577a 515};
8b61709d 516
8b61709d
BP
517/* This is set pretty low because we probably won't learn anything from the
518 * additional log messages. */
519static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
520
19c8e9c1
JS
521/* Polling miimon status for all ports causes performance degradation when
522 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
523 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 *
525 * Readers do not depend on this variable synchronizing with the related
526 * changes in the device miimon status, so we can use atomic_count. */
527static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 528
1c33f0c3 529static void netdev_linux_run(const struct netdev_class *);
6f643e49 530
0b0544d7 531static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 532 int cmd, const char *cmd_name);
b5d57fc8 533static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 534static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
535static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
536 enum netdev_flags on, enum netdev_flags *old_flagsp)
537 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
538static int do_get_ifindex(const char *netdev_name);
539static int get_ifindex(const struct netdev *, int *ifindexp);
540static int do_set_addr(struct netdev *netdev,
541 int ioctl_nr, const char *ioctl_name,
542 struct in_addr addr);
74ff3298
JR
543static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
544static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 545static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 546static int af_packet_sock(void);
19c8e9c1 547static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
548static void netdev_linux_miimon_run(void);
549static void netdev_linux_miimon_wait(void);
df1e5a3b 550static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 551
15b3596a
JG
552static bool
553is_netdev_linux_class(const struct netdev_class *netdev_class)
554{
259e0b1a 555 return netdev_class->run == netdev_linux_run;
15b3596a
JG
556}
557
796223f5
BP
558static bool
559is_tap_netdev(const struct netdev *netdev)
560{
b5d57fc8 561 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
562}
563
8b61709d
BP
564static struct netdev_linux *
565netdev_linux_cast(const struct netdev *netdev)
566{
b5d57fc8 567 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 568
180c6d0b 569 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 570}
796223f5 571
f7791740
PS
572static struct netdev_rxq_linux *
573netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 574{
9dc63482 575 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 576 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 577}
ff4ed3c9 578\f
cee87338 579static void netdev_linux_update(struct netdev_linux *netdev,
7e9dcc0f 580 const struct rtnetlink_change *)
86383816 581 OVS_REQUIRES(netdev->mutex);
cee87338 582static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
583 unsigned int ifi_flags, unsigned int mask)
584 OVS_REQUIRES(netdev->mutex);
cee87338 585
d6384a3a
AW
586/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
 587 * RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR and RTNLGRP_IPV6_IFINFO changes, or NULL
cee87338
BP
588 * if no such socket could be created. */
589static struct nl_sock *
590netdev_linux_notify_sock(void)
591{
592 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
593 static struct nl_sock *sock;
989d7135
PS
594 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
595 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
596
597 if (ovsthread_once_start(&once)) {
598 int error;
599
600 error = nl_sock_create(NETLINK_ROUTE, &sock);
601 if (!error) {
d6384a3a
AW
602 size_t i;
603
604 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
605 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
606 if (error) {
607 nl_sock_destroy(sock);
608 sock = NULL;
609 break;
610 }
cee87338
BP
611 }
612 }
613 ovsthread_once_done(&once);
614 }
615
616 return sock;
617}
618
19c8e9c1
JS
619static bool
620netdev_linux_miimon_enabled(void)
621{
812c272c 622 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
623}
624
8b61709d 625static void
1c33f0c3 626netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 627{
cee87338
BP
628 struct nl_sock *sock;
629 int error;
630
19c8e9c1
JS
631 if (netdev_linux_miimon_enabled()) {
632 netdev_linux_miimon_run();
633 }
cee87338
BP
634
635 sock = netdev_linux_notify_sock();
636 if (!sock) {
637 return;
638 }
639
640 do {
641 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
642 uint64_t buf_stub[4096 / 8];
643 struct ofpbuf buf;
644
645 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
646 error = nl_sock_recv(sock, &buf, false);
647 if (!error) {
7e9dcc0f 648 struct rtnetlink_change change;
cee87338 649
7e9dcc0f 650 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
651 struct netdev *netdev_ = NULL;
652 char dev_name[IFNAMSIZ];
653
654 if (!change.ifname) {
655 change.ifname = if_indextoname(change.if_index, dev_name);
656 }
657
658 if (change.ifname) {
659 netdev_ = netdev_from_name(change.ifname);
660 }
cee87338
BP
661 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
662 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
663
664 ovs_mutex_lock(&netdev->mutex);
cee87338 665 netdev_linux_update(netdev, &change);
86383816 666 ovs_mutex_unlock(&netdev->mutex);
cee87338 667 }
38e0065b 668 netdev_close(netdev_);
cee87338
BP
669 }
670 } else if (error == ENOBUFS) {
671 struct shash device_shash;
672 struct shash_node *node;
673
674 nl_sock_drain(sock);
675
676 shash_init(&device_shash);
677 netdev_get_devices(&netdev_linux_class, &device_shash);
678 SHASH_FOR_EACH (node, &device_shash) {
679 struct netdev *netdev_ = node->data;
680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
681 unsigned int flags;
682
86383816 683 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
684 get_flags(netdev_, &flags);
685 netdev_linux_changed(netdev, flags, 0);
86383816
BP
686 ovs_mutex_unlock(&netdev->mutex);
687
cee87338
BP
688 netdev_close(netdev_);
689 }
690 shash_destroy(&device_shash);
691 } else if (error != EAGAIN) {
692 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
693 ovs_strerror(error));
694 }
695 ofpbuf_uninit(&buf);
696 } while (!error);
8b61709d
BP
697}
698
699static void
1c33f0c3 700netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 701{
cee87338
BP
702 struct nl_sock *sock;
703
19c8e9c1
JS
704 if (netdev_linux_miimon_enabled()) {
705 netdev_linux_miimon_wait();
706 }
cee87338
BP
707 sock = netdev_linux_notify_sock();
708 if (sock) {
709 nl_sock_wait(sock, POLLIN);
710 }
8b61709d
BP
711}
712
ac4d3bcb 713static void
b5d57fc8
BP
714netdev_linux_changed(struct netdev_linux *dev,
715 unsigned int ifi_flags, unsigned int mask)
86383816 716 OVS_REQUIRES(dev->mutex)
ac4d3bcb 717{
3e912ffc 718 netdev_change_seq_changed(&dev->up);
8aa77183
BP
719
720 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
721 dev->carrier_resets++;
722 }
723 dev->ifi_flags = ifi_flags;
724
4f925bd3 725 dev->cache_valid &= mask;
6b6e1329 726 if (!(mask & VALID_IN)) {
a8704b50
PS
727 netdev_get_addrs_list_flush();
728 }
4f925bd3
PS
729}
730
731static void
b5d57fc8 732netdev_linux_update(struct netdev_linux *dev,
7e9dcc0f 733 const struct rtnetlink_change *change)
86383816 734 OVS_REQUIRES(dev->mutex)
4f925bd3 735{
d6384a3a
AW
736 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
737 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 738 /* Keep drv-info, and ip addresses. */
d6384a3a 739 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 740 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
741
742 /* Update netdev from rtnl-change msg. */
743 if (change->mtu) {
744 dev->mtu = change->mtu;
745 dev->cache_valid |= VALID_MTU;
746 dev->netdev_mtu_error = 0;
747 }
90a6637d 748
74ff3298
JR
749 if (!eth_addr_is_zero(change->mac)) {
750 dev->etheraddr = change->mac;
d6384a3a
AW
751 dev->cache_valid |= VALID_ETHERADDR;
752 dev->ether_addr_error = 0;
753 }
44445cac 754
d6384a3a
AW
755 dev->ifindex = change->if_index;
756 dev->cache_valid |= VALID_IFINDEX;
757 dev->get_ifindex_error = 0;
758 } else {
759 netdev_linux_changed(dev, change->ifi_flags, 0);
760 }
761 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
762 /* Invalidates in4, in6. */
6b6e1329 763 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 764 } else {
d6384a3a 765 OVS_NOT_REACHED();
4f925bd3 766 }
ac4d3bcb
EJ
767}
768
9dc63482
BP
769static struct netdev *
770netdev_linux_alloc(void)
771{
772 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
773 return &netdev->up;
774}
775
48c6733c
WT
776static int
777netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 778{
48c6733c
WT
779 /* Prevent any attempt to create (or open) a network device named "default"
780 * or "all". These device names are effectively reserved on Linux because
781 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
782 * itself this wouldn't call for any special treatment, but in practice if
783 * a program tries to create devices with these names, it causes the kernel
784 * to fire a "new device" notification event even though creation failed,
785 * and in turn that causes OVS to wake up and try to create them again,
786 * which ends up as a 100% CPU loop. */
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 const char *name = netdev_->name;
789 if (!strcmp(name, "default") || !strcmp(name, "all")) {
790 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
791 VLOG_WARN_RL(&rl, "%s: Linux forbids network device with this name",
792 name);
793 return EINVAL;
794 }
795
834d6caf 796 ovs_mutex_init(&netdev->mutex);
48c6733c 797 return 0;
9dc63482
BP
798}
799
1f6e0fbd
BP
800/* Creates system and internal devices. */
801static int
9dc63482 802netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 803{
9dc63482 804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
805 int error = netdev_linux_common_construct(netdev_);
806 if (error) {
807 return error;
808 }
1f6e0fbd 809
b5d57fc8
BP
810 error = get_flags(&netdev->up, &netdev->ifi_flags);
811 if (error == ENODEV) {
9dc63482 812 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 813 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
814 return ENODEV;
815 } else {
816 /* "Internal" netdevs have to be created as netdev objects before
817 * they exist in the kernel, because creating them in the kernel
818 * happens by passing a netdev object to dpif_port_add().
819 * Therefore, ignore the error. */
820 }
821 }
46415c90 822
a740f0de
JG
823 return 0;
824}
825
5b7448ed
JG
826/* For most types of netdevs we open the device for each call of
827 * netdev_open(). However, this is not the case with tap devices,
828 * since it is only possible to open the device once. In this
829 * situation we share a single file descriptor, and consequently
 830 * buffers, across all readers. Therefore, once data is read it will
 831 * be unavailable to other readers of the tap device. */
a740f0de 832static int
9dc63482 833netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 834{
9dc63482 835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 836 static const char tap_dev[] = "/dev/net/tun";
9dc63482 837 const char *name = netdev_->name;
a740f0de 838 struct ifreq ifr;
a740f0de 839
48c6733c
WT
840 int error = netdev_linux_common_construct(netdev_);
841 if (error) {
842 return error;
843 }
1f6e0fbd 844
6c88d577 845 /* Open tap device. */
d0d08f8a
BP
846 netdev->tap_fd = open(tap_dev, O_RDWR);
847 if (netdev->tap_fd < 0) {
6c88d577 848 error = errno;
10a89ef0 849 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 850 return error;
6c88d577
JP
851 }
852
853 /* Create tap device. */
61b9d078 854 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 855 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 856 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 857 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 858 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 859 ovs_strerror(errno));
6c88d577 860 error = errno;
f61d8d29 861 goto error_close;
6c88d577
JP
862 }
863
864 /* Make non-blocking. */
d0d08f8a 865 error = set_nonblocking(netdev->tap_fd);
a740f0de 866 if (error) {
f61d8d29 867 goto error_close;
a740f0de
JG
868 }
869
0f28164b
FL
870 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
871 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
872 ovs_strerror(errno));
873 error = errno;
874 goto error_close;
875 }
876
a740f0de
JG
877 return 0;
878
f61d8d29 879error_close:
d0d08f8a 880 close(netdev->tap_fd);
a740f0de
JG
881 return error;
882}
883
6c88d577 884static void
9dc63482 885netdev_linux_destruct(struct netdev *netdev_)
6c88d577 886{
b5d57fc8 887 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 888
b5d57fc8
BP
889 if (netdev->tc && netdev->tc->ops->tc_destroy) {
890 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
891 }
892
d0d08f8a
BP
893 if (netdev_get_class(netdev_) == &netdev_tap_class
894 && netdev->tap_fd >= 0)
895 {
0f28164b 896 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
d0d08f8a 897 close(netdev->tap_fd);
6c88d577 898 }
86383816 899
19c8e9c1 900 if (netdev->miimon_interval > 0) {
812c272c 901 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
902 }
903
86383816 904 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
905}
906
9dc63482
BP
907static void
908netdev_linux_dealloc(struct netdev *netdev_)
909{
910 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
911 free(netdev);
912}
913
f7791740
PS
914static struct netdev_rxq *
915netdev_linux_rxq_alloc(void)
9dc63482 916{
f7791740 917 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
918 return &rx->up;
919}
920
7b6b0ef4 921static int
f7791740 922netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 923{
f7791740 924 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 925 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 926 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 927 int error;
7b6b0ef4 928
86383816 929 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
930 rx->is_tap = is_tap_netdev(netdev_);
931 if (rx->is_tap) {
932 rx->fd = netdev->tap_fd;
796223f5
BP
933 } else {
934 struct sockaddr_ll sll;
b73c8518 935 int ifindex, val;
32383c3b 936 /* Result of tcpdump -dd inbound */
259e0b1a 937 static const struct sock_filter filt[] = {
32383c3b
MM
938 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
939 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
940 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
941 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
942 };
259e0b1a
BP
943 static const struct sock_fprog fprog = {
944 ARRAY_SIZE(filt), (struct sock_filter *) filt
945 };
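        /* A classifier like 'filt' above can be regenerated with something
         * along the lines of "tcpdump -p -ni <dev> -dd inbound", which prints
         * the compiled BPF program as C array initializers (exact flags vary
         * by tcpdump version). */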
7b6b0ef4 946
796223f5 947 /* Create file descriptor. */
9dc63482
BP
948 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
949 if (rx->fd < 0) {
796223f5 950 error = errno;
10a89ef0 951 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
952 goto error;
953 }
33d82a56 954
b73c8518
SH
955 val = 1;
956 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
957 error = errno;
958 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
959 netdev_get_name(netdev_), ovs_strerror(error));
960 goto error;
961 }
962
796223f5 963 /* Set non-blocking mode. */
9dc63482 964 error = set_nonblocking(rx->fd);
796223f5
BP
965 if (error) {
966 goto error;
967 }
7b6b0ef4 968
796223f5 969 /* Get ethernet device index. */
180c6d0b 970 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
971 if (error) {
972 goto error;
973 }
7b6b0ef4 974
796223f5
BP
975 /* Bind to specific ethernet device. */
976 memset(&sll, 0, sizeof sll);
977 sll.sll_family = AF_PACKET;
978 sll.sll_ifindex = ifindex;
b73c8518 979 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 980 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
981 error = errno;
982 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 983 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
984 goto error;
985 }
32383c3b
MM
986
987 /* Filter for only inbound packets. */
9dc63482 988 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
989 sizeof fprog);
990 if (error) {
991 error = errno;
259e0b1a 992 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 993 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
994 goto error;
995 }
7b6b0ef4 996 }
86383816 997 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 998
7b6b0ef4
BP
999 return 0;
1000
1001error:
9dc63482
BP
1002 if (rx->fd >= 0) {
1003 close(rx->fd);
7b6b0ef4 1004 }
86383816 1005 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1006 return error;
1007}
1008
796223f5 1009static void
f7791740 1010netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1011{
f7791740 1012 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1013
796223f5
BP
1014 if (!rx->is_tap) {
1015 close(rx->fd);
8b61709d 1016 }
9dc63482
BP
1017}
1018
1019static void
f7791740 1020netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
9dc63482 1021{
f7791740 1022 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 1023
796223f5
BP
1024 free(rx);
1025}
8b61709d 1026
b73c8518 1027static ovs_be16
1ebdc7eb 1028auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1029{
1030 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1031 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1032 } else if (double_tagged) {
1033 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1034 } else {
1ebdc7eb 1035 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1036 }
1037}
1038
1039static bool
1040auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1041{
1042 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1043}
1044
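/* Both tests above matter because kernels older than Linux 3.0 never set
 * TP_STATUS_VLAN_VALID, so a nonzero tp_vlan_tci is the only hint that a tag
 * was present there, while on newer kernels the status bit also covers the
 * corner case of a valid tag whose TCI happens to be zero. */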
796223f5 1045static int
cf62fa4c 1046netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1047{
b73c8518 1048 size_t size;
796223f5 1049 ssize_t retval;
b73c8518
SH
1050 struct iovec iov;
1051 struct cmsghdr *cmsg;
1052 union {
1053 struct cmsghdr cmsg;
1054 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1055 } cmsg_buffer;
1056 struct msghdr msgh;
1057
1058 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1059 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1060 size = dp_packet_tailroom(buffer);
b73c8518 1061
cf62fa4c 1062 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1063 iov.iov_len = size;
1064 msgh.msg_name = NULL;
1065 msgh.msg_namelen = 0;
1066 msgh.msg_iov = &iov;
1067 msgh.msg_iovlen = 1;
1068 msgh.msg_control = &cmsg_buffer;
1069 msgh.msg_controllen = sizeof cmsg_buffer;
1070 msgh.msg_flags = 0;
8e8cddf7 1071
796223f5 1072 do {
b73c8518 1073 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1074 } while (retval < 0 && errno == EINTR);
1075
bfd3367b 1076 if (retval < 0) {
b73c8518
SH
1077 return errno;
1078 } else if (retval > size) {
1079 return EMSGSIZE;
1080 }
1081
cf62fa4c 1082 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1083
1084 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1085 const struct tpacket_auxdata *aux;
1086
1087 if (cmsg->cmsg_level != SOL_PACKET
1088 || cmsg->cmsg_type != PACKET_AUXDATA
1089 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1090 continue;
8b61709d 1091 }
b73c8518
SH
1092
1093 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1094 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1095 struct eth_header *eth;
1096 bool double_tagged;
1097
b73c8518
SH
1098 if (retval < ETH_HEADER_LEN) {
1099 return EINVAL;
1100 }
1101
1ebdc7eb
EG
1102 eth = dp_packet_data(buffer);
1103 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1104
1105 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1106 htons(aux->tp_vlan_tci));
1107 break;
1108 }
1109 }
1110
1111 return 0;
1112}
1113
1114static int
cf62fa4c 1115netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
b73c8518
SH
1116{
1117 ssize_t retval;
cf62fa4c 1118 size_t size = dp_packet_tailroom(buffer);
b73c8518
SH
1119
1120 do {
cf62fa4c 1121 retval = read(fd, dp_packet_data(buffer), size);
b73c8518
SH
1122 } while (retval < 0 && errno == EINTR);
1123
1124 if (retval < 0) {
bfd3367b 1125 return errno;
8b61709d 1126 }
b73c8518 1127
cf62fa4c 1128 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1129 return 0;
1130}
1131
1132static int
64839cf4 1133netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
b73c8518 1134{
f7791740 1135 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1136 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1137 struct dp_packet *buffer;
df1e5a3b
PS
1138 ssize_t retval;
1139 int mtu;
1140
1141 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1142 mtu = ETH_PAYLOAD_MAX;
1143 }
1144
2482b0b0 1145 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1146 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1147 DP_NETDEV_HEADROOM);
b73c8518 1148 retval = (rx->is_tap
f7791740
PS
1149 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1150 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1151
1152 if (retval) {
1153 if (retval != EAGAIN && retval != EMSGSIZE) {
1154 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1155 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1156 }
cf62fa4c 1157 dp_packet_delete(buffer);
df1e5a3b 1158 } else {
72c84bc2 1159 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1160 }
1161
1162 return retval;
8b61709d
BP
1163}
1164
8b61709d 1165static void
f7791740 1166netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1167{
f7791740 1168 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1169 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1170}
1171
8b61709d 1172static int
f7791740 1173netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1174{
f7791740 1175 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1176 if (rx->is_tap) {
8b61709d 1177 struct ifreq ifr;
f7791740 1178 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1179 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1180 if (error) {
1181 return error;
1182 }
796223f5 1183 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1184 return 0;
1185 } else {
796223f5 1186 return drain_rcvbuf(rx->fd);
8b61709d
BP
1187 }
1188}
1189
 1190/* Sends the packets in 'batch' on 'netdev'. Returns 0 if successful,
 1191 * otherwise a positive errno value. Returns EAGAIN without blocking if a
 1192 * packet cannot be queued immediately. Returns EMSGSIZE if a partial packet
 1193 * was transmitted or if a packet is too big or too small to transmit.
 1194 *
 1195 * The caller retains ownership of the packets unless 'may_steal' is true.
1196 *
1197 * The kernel maintains a packet transmission queue, so the caller is not
1198 * expected to do additional queuing of packets. */
1199static int
f00fa8cb 1200netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
324c8374
IM
1201 struct dp_packet_batch *batch, bool may_steal,
1202 bool concurrent_txq OVS_UNUSED)
8b61709d 1203{
f4fd623c 1204 int error = 0;
0a62ae2c
ZG
1205 int sock = 0;
1206
1207 struct sockaddr_ll sll;
1208 struct msghdr msg;
1209 if (!is_tap_netdev(netdev_)) {
1210 sock = af_packet_sock();
1211 if (sock < 0) {
1212 error = -sock;
1213 goto free_batch;
1214 }
1215
1216 int ifindex = netdev_get_ifindex(netdev_);
1217 if (ifindex < 0) {
1218 error = -ifindex;
1219 goto free_batch;
1220 }
1221
1222 /* We don't bother setting most fields in sockaddr_ll because the
1223 * kernel ignores them for SOCK_RAW. */
1224 memset(&sll, 0, sizeof sll);
1225 sll.sll_family = AF_PACKET;
1226 sll.sll_ifindex = ifindex;
1227
1228 msg.msg_name = &sll;
1229 msg.msg_namelen = sizeof sll;
1230 msg.msg_iovlen = 1;
1231 msg.msg_control = NULL;
1232 msg.msg_controllen = 0;
1233 msg.msg_flags = 0;
1234 }
40d26f04 1235
f4fd623c 1236 /* 'i' is incremented only if there's no error */
0a62ae2c 1237 for (int i = 0; i < batch->count; ) {
64839cf4
WT
1238 const void *data = dp_packet_data(batch->packets[i]);
1239 size_t size = dp_packet_size(batch->packets[i]);
f23347ea 1240 ssize_t retval;
8b61709d 1241
aaca4fe0 1242 /* Truncate the packet if it is configured. */
64839cf4 1243 size -= dp_packet_get_cutlen(batch->packets[i]);
aaca4fe0 1244
796223f5 1245 if (!is_tap_netdev(netdev_)) {
f23347ea 1246 /* Use our AF_PACKET socket to send to this device. */
f23347ea 1247 struct iovec iov;
76c308b5 1248
ebc56baa 1249 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1250 iov.iov_len = size;
76c308b5 1251
f23347ea 1252 msg.msg_iov = &iov;
f23347ea 1253
488d734d 1254 retval = sendmsg(sock, &msg, 0);
f23347ea 1255 } else {
796223f5
BP
1256 /* Use the tap fd to send to this device. This is essential for
1257 * tap devices, because packets sent to a tap device with an
1258 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1259 * tap device. This doesn't occur on other interface types
1260 * because we attach a socket filter to the rx socket. */
b5d57fc8 1261 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1262
d0d08f8a 1263 retval = write(netdev->tap_fd, data, size);
f23347ea 1264 }
76c308b5 1265
8b61709d 1266 if (retval < 0) {
29736cc0
DDP
1267 if (errno == EINTR) {
1268 /* The send was interrupted by a signal. Retry the packet by
1269 * continuing without incrementing 'i'.*/
8b61709d 1270 continue;
29736cc0
DDP
1271 } else if (errno == EIO && is_tap_netdev(netdev_)) {
1272 /* The Linux tap driver returns EIO if the device is not up.
1273 * From the OVS side this is not an error, so ignore it. */
1274 } else {
1275 /* The Linux AF_PACKET implementation never blocks waiting for
1276 * room for packets, instead returning ENOBUFS. Translate this
1277 * into EAGAIN for the caller. */
1278 error = errno == ENOBUFS ? EAGAIN : errno;
1279 break;
8b61709d 1280 }
8b61709d 1281 } else if (retval != size) {
f4fd623c
DDP
1282 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1283 " of %"PRIuSIZE") on %s", retval, size,
1284 netdev_get_name(netdev_));
1285 error = EMSGSIZE;
1286 break;
1287 }
1288
1289 /* Process the next packet in the batch */
1290 i++;
1291 }
1292
f4fd623c
DDP
1293 if (error && error != EAGAIN) {
1294 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1295 netdev_get_name(netdev_), ovs_strerror(error));
1296 }
1297
0a62ae2c
ZG
1298free_batch:
1299 dp_packet_delete_batch(batch, may_steal);
1300
f4fd623c
DDP
1301 return error;
1302
8b61709d
BP
1303}
1304
1305/* Registers with the poll loop to wake up from the next call to poll_block()
1306 * when the packet transmission queue has sufficient room to transmit a packet
1307 * with netdev_send().
1308 *
1309 * The kernel maintains a packet transmission queue, so the client is not
1310 * expected to do additional queuing of packets. Thus, this function is
1311 * unlikely to ever be used. It is included for completeness. */
1312static void
f00fa8cb 1313netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1314{
796223f5 1315 if (is_tap_netdev(netdev)) {
8b61709d
BP
1316 /* TAP device always accepts packets.*/
1317 poll_immediate_wake();
1318 }
1319}
1320
1321/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1322 * otherwise a positive errno value. */
1323static int
74ff3298 1324netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1325{
b5d57fc8 1326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1327 enum netdev_flags old_flags = 0;
eb395f2e
BP
1328 int error;
1329
86383816
BP
1330 ovs_mutex_lock(&netdev->mutex);
1331
b5d57fc8 1332 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1333 error = netdev->ether_addr_error;
1334 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1335 goto exit;
44445cac 1336 }
b5d57fc8 1337 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1338 }
1339
7eb1bd81 1340 /* Tap devices must be brought down before setting the address. */
796223f5 1341 if (is_tap_netdev(netdev_)) {
4f9f3f21 1342 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1343 }
44445cac
PS
1344 error = set_etheraddr(netdev_get_name(netdev_), mac);
1345 if (!error || error == ENODEV) {
b5d57fc8
BP
1346 netdev->ether_addr_error = error;
1347 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1348 if (!error) {
74ff3298 1349 netdev->etheraddr = mac;
eb395f2e 1350 }
8b61709d 1351 }
44445cac 1352
4f9f3f21
BP
1353 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1354 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1355 }
7eb1bd81 1356
86383816
BP
1357exit:
1358 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1359 return error;
1360}
1361
44445cac 1362/* Copies 'netdev''s MAC address into 'mac', which the caller provides. */
8b61709d 1363static int
74ff3298 1364netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1365{
b5d57fc8 1366 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1367 int error;
44445cac 1368
86383816 1369 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1370 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816 1371 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1372 &netdev->etheraddr);
b5d57fc8 1373 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1374 }
44445cac 1375
86383816
BP
1376 error = netdev->ether_addr_error;
1377 if (!error) {
74ff3298 1378 *mac = netdev->etheraddr;
44445cac 1379 }
86383816 1380 ovs_mutex_unlock(&netdev->mutex);
44445cac 1381
86383816 1382 return error;
8b61709d
BP
1383}
1384
8b61709d 1385static int
73371c09 1386netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1387{
86383816
BP
1388 int error;
1389
b5d57fc8 1390 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1391 struct ifreq ifr;
90a6637d 1392
86383816 1393 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1394 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1395 netdev->mtu = ifr.ifr_mtu;
1396 netdev->cache_valid |= VALID_MTU;
8b61709d 1397 }
90a6637d 1398
86383816
BP
1399 error = netdev->netdev_mtu_error;
1400 if (!error) {
b5d57fc8 1401 *mtup = netdev->mtu;
90a6637d 1402 }
73371c09
BP
1403
1404 return error;
1405}
1406
1407/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1408 * in bytes, not including the hardware header; thus, this is typically 1500
1409 * bytes for Ethernet devices. */
1410static int
1411netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1412{
1413 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1414 int error;
1415
1416 ovs_mutex_lock(&netdev->mutex);
1417 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1418 ovs_mutex_unlock(&netdev->mutex);
1419
1420 return error;
8b61709d
BP
1421}
1422
9b020780
PS
1423/* Sets the maximum size of transmitted (MTU) for given device using linux
1424 * networking ioctl interface.
1425 */
1426static int
4124cb12 1427netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1428{
b5d57fc8 1429 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1430 struct ifreq ifr;
1431 int error;
1432
86383816 1433 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1434 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1435 error = netdev->netdev_mtu_error;
1436 if (error || netdev->mtu == mtu) {
1437 goto exit;
90a6637d 1438 }
b5d57fc8 1439 netdev->cache_valid &= ~VALID_MTU;
153e5481 1440 }
9b020780 1441 ifr.ifr_mtu = mtu;
259e0b1a
BP
1442 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1443 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1444 if (!error || error == ENODEV) {
b5d57fc8
BP
1445 netdev->netdev_mtu_error = error;
1446 netdev->mtu = ifr.ifr_mtu;
1447 netdev->cache_valid |= VALID_MTU;
9b020780 1448 }
86383816
BP
1449exit:
1450 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1451 return error;
9b020780
PS
1452}
1453
9ab3d9a3
BP
1454/* Returns the ifindex of 'netdev', if successful, as a positive number.
1455 * On failure, returns a negative errno value. */
1456static int
86383816 1457netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1458{
86383816 1459 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1460 int ifindex, error;
1461
86383816
BP
1462 ovs_mutex_lock(&netdev->mutex);
1463 error = get_ifindex(netdev_, &ifindex);
1464 ovs_mutex_unlock(&netdev->mutex);
1465
9ab3d9a3
BP
1466 return error ? -error : ifindex;
1467}
1468
8b61709d
BP
1469static int
1470netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1471{
b5d57fc8 1472 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1473
86383816 1474 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1475 if (netdev->miimon_interval > 0) {
1476 *carrier = netdev->miimon;
3a183124 1477 } else {
b5d57fc8 1478 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1479 }
86383816 1480 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1481
3a183124 1482 return 0;
8b61709d
BP
1483}
1484
65c3058c 1485static long long int
86383816 1486netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1487{
86383816
BP
1488 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1489 long long int carrier_resets;
1490
1491 ovs_mutex_lock(&netdev->mutex);
1492 carrier_resets = netdev->carrier_resets;
1493 ovs_mutex_unlock(&netdev->mutex);
1494
1495 return carrier_resets;
65c3058c
EJ
1496}
1497
63331829 1498static int
1670c579
EJ
1499netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1500 struct mii_ioctl_data *data)
63331829 1501{
63331829 1502 struct ifreq ifr;
782e6111 1503 int error;
63331829 1504
63331829 1505 memset(&ifr, 0, sizeof ifr);
782e6111 1506 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1507 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1508 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1509
782e6111
EJ
1510 return error;
1511}
1512
1513static int
1670c579 1514netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1515{
782e6111
EJ
1516 struct mii_ioctl_data data;
1517 int error;
63331829 1518
782e6111
EJ
1519 *miimon = false;
1520
1521 memset(&data, 0, sizeof data);
1670c579 1522 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1523 if (!error) {
1524 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1525 data.reg_num = MII_BMSR;
1670c579 1526 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1527 &data);
63331829
EJ
1528
1529 if (!error) {
782e6111 1530 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1531 }
9120cfc0
DH
1532 }
1533 if (error) {
63331829 1534 struct ethtool_cmd ecmd;
63331829
EJ
1535
1536 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1537 name);
1538
ab985a77 1539 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1540 memset(&ecmd, 0, sizeof ecmd);
1541 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1542 "ETHTOOL_GLINK");
1543 if (!error) {
782e6111
EJ
1544 struct ethtool_value eval;
1545
1546 memcpy(&eval, &ecmd, sizeof eval);
1547 *miimon = !!eval.data;
63331829
EJ
1548 } else {
1549 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1550 }
1551 }
1552
1553 return error;
1554}
1555
1670c579
EJ
1556static int
1557netdev_linux_set_miimon_interval(struct netdev *netdev_,
1558 long long int interval)
1559{
b5d57fc8 1560 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1561
86383816 1562 ovs_mutex_lock(&netdev->mutex);
1670c579 1563 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1564 if (netdev->miimon_interval != interval) {
19c8e9c1 1565 if (interval && !netdev->miimon_interval) {
812c272c 1566 atomic_count_inc(&miimon_cnt);
19c8e9c1 1567 } else if (!interval && netdev->miimon_interval) {
812c272c 1568 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1569 }
1570
b5d57fc8
BP
1571 netdev->miimon_interval = interval;
1572 timer_set_expired(&netdev->miimon_timer);
1670c579 1573 }
86383816 1574 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1575
1576 return 0;
1577}
1578
1579static void
1580netdev_linux_miimon_run(void)
1581{
1582 struct shash device_shash;
1583 struct shash_node *node;
1584
1585 shash_init(&device_shash);
b5d57fc8 1586 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1587 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1588 struct netdev *netdev = node->data;
1589 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1590 bool miimon;
1591
86383816
BP
1592 ovs_mutex_lock(&dev->mutex);
1593 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1594 netdev_linux_get_miimon(dev->up.name, &miimon);
1595 if (miimon != dev->miimon) {
1596 dev->miimon = miimon;
1597 netdev_linux_changed(dev, dev->ifi_flags, 0);
1598 }
1670c579 1599
86383816 1600 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1601 }
86383816 1602 ovs_mutex_unlock(&dev->mutex);
2f980d74 1603 netdev_close(netdev);
1670c579
EJ
1604 }
1605
1606 shash_destroy(&device_shash);
1607}
1608
1609static void
1610netdev_linux_miimon_wait(void)
1611{
1612 struct shash device_shash;
1613 struct shash_node *node;
1614
1615 shash_init(&device_shash);
b5d57fc8 1616 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1617 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1618 struct netdev *netdev = node->data;
1619 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1620
86383816 1621 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1622 if (dev->miimon_interval > 0) {
1623 timer_wait(&dev->miimon_timer);
1624 }
86383816 1625 ovs_mutex_unlock(&dev->mutex);
2f980d74 1626 netdev_close(netdev);
1670c579
EJ
1627 }
1628 shash_destroy(&device_shash);
1629}
1630
92df599c
JG
1631static void
1632swap_uint64(uint64_t *a, uint64_t *b)
1633{
1de0e8ae
BP
1634 uint64_t tmp = *a;
1635 *a = *b;
1636 *b = tmp;
92df599c
JG
1637}
1638
c060c4cf
EJ
1639/* Copies 'src' into 'dst', performing format conversion in the process.
1640 *
1641 * 'src' is allowed to be misaligned. */
1642static void
1643netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1644 const struct ovs_vport_stats *src)
1645{
6a54dedc
BP
1646 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1647 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1648 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1649 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1650 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1651 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1652 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1653 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1654 dst->multicast = 0;
1655 dst->collisions = 0;
1656 dst->rx_length_errors = 0;
1657 dst->rx_over_errors = 0;
1658 dst->rx_crc_errors = 0;
1659 dst->rx_frame_errors = 0;
1660 dst->rx_fifo_errors = 0;
1661 dst->rx_missed_errors = 0;
1662 dst->tx_aborted_errors = 0;
1663 dst->tx_carrier_errors = 0;
1664 dst->tx_fifo_errors = 0;
1665 dst->tx_heartbeat_errors = 0;
1666 dst->tx_window_errors = 0;
1667}
1668
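/* Hedged aside, not part of OVS: the conversion above goes through
 * get_32aligned_u64() because a 'struct ovs_vport_stats' taken out of a
 * Netlink attribute is only guaranteed 32-bit alignment.  A generic, portable
 * way to read a 64-bit counter from possibly misaligned memory (what the OVS
 * helper accomplishes by other means) is a bytewise copy; the example_* name
 * is hypothetical. */
static inline uint64_t
example_get_unaligned_u64(const void *p)
{
    uint64_t x;

    memcpy(&x, p, sizeof x);    /* Legal for any alignment of 'p'. */
    return x;
}
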
1669static int
1670get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1671{
93451a0a 1672 struct dpif_netlink_vport reply;
c060c4cf
EJ
1673 struct ofpbuf *buf;
1674 int error;
1675
93451a0a 1676 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1677 if (error) {
1678 return error;
1679 } else if (!reply.stats) {
1680 ofpbuf_delete(buf);
1681 return EOPNOTSUPP;
1682 }
1683
1684 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1685
1686 ofpbuf_delete(buf);
1687
1688 return 0;
1689}
1690
f613a0d7
PS
1691static void
1692get_stats_via_vport(const struct netdev *netdev_,
1693 struct netdev_stats *stats)
8b61709d 1694{
b5d57fc8 1695 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1696
b5d57fc8
BP
1697 if (!netdev->vport_stats_error ||
1698 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1699 int error;
7fbef77a 1700
c060c4cf 1701 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1702 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1703 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1704 "(%s)",
1705 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1706 }
b5d57fc8
BP
1707 netdev->vport_stats_error = error;
1708 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1709 }
f613a0d7 1710}
8b61709d 1711
f613a0d7
PS
1712/* Retrieves current device stats for 'netdev-linux'. */
1713static int
1714netdev_linux_get_stats(const struct netdev *netdev_,
1715 struct netdev_stats *stats)
1716{
b5d57fc8 1717 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1718 struct netdev_stats dev_stats;
1719 int error;
1720
86383816 1721 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1722 get_stats_via_vport(netdev_, stats);
35eef899 1723 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1724 if (error) {
86383816
BP
1725 if (!netdev->vport_stats_error) {
1726 error = 0;
f613a0d7 1727 }
86383816 1728 } else if (netdev->vport_stats_error) {
04c881eb 1729 /* Stats are not available from OVS, so use the kernel netdev stats. */
f613a0d7
PS
1730 *stats = dev_stats;
1731 } else {
04c881eb
AZ
1732 /* Use kernel netdev's packet and byte counts since vport's counters
1733 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1734 * enabled. */
1735 stats->rx_packets = dev_stats.rx_packets;
1736 stats->rx_bytes = dev_stats.rx_bytes;
1737 stats->tx_packets = dev_stats.tx_packets;
1738 stats->tx_bytes = dev_stats.tx_bytes;
1739
f613a0d7
PS
1740 stats->rx_errors += dev_stats.rx_errors;
1741 stats->tx_errors += dev_stats.tx_errors;
1742 stats->rx_dropped += dev_stats.rx_dropped;
1743 stats->tx_dropped += dev_stats.tx_dropped;
1744 stats->multicast += dev_stats.multicast;
1745 stats->collisions += dev_stats.collisions;
1746 stats->rx_length_errors += dev_stats.rx_length_errors;
1747 stats->rx_over_errors += dev_stats.rx_over_errors;
1748 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1749 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1750 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1751 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1752 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1753 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1754 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1755 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1756 stats->tx_window_errors += dev_stats.tx_window_errors;
1757 }
86383816
BP
1758 ovs_mutex_unlock(&netdev->mutex);
1759
1760 return error;
f613a0d7
PS
1761}
1762
1763/* Retrieves current device stats for a 'netdev-tap' or 'netdev-internal'
1764 * device. */
1765static int
15aee116 1766netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1767{
b5d57fc8 1768 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1769 struct netdev_stats dev_stats;
1770 int error;
1771
86383816 1772 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1773 get_stats_via_vport(netdev_, stats);
35eef899 1774 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1775 if (error) {
86383816
BP
1776 if (!netdev->vport_stats_error) {
1777 error = 0;
8b61709d 1778 }
86383816
BP
1779 } else if (netdev->vport_stats_error) {
1780 /* Transmit and receive stats will appear to be swapped relative to the
1781 * other ports since we are the one sending the data, not a remote
1782 * computer. For consistency, we swap them back here. This does not
1783 * apply if we are getting stats from the vport layer because it always
1784 * tracks stats from the perspective of the switch. */
fe6b0e03 1785
f613a0d7 1786 *stats = dev_stats;
92df599c
JG
1787 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1788 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1789 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1790 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1791 stats->rx_length_errors = 0;
1792 stats->rx_over_errors = 0;
1793 stats->rx_crc_errors = 0;
1794 stats->rx_frame_errors = 0;
1795 stats->rx_fifo_errors = 0;
1796 stats->rx_missed_errors = 0;
1797 stats->tx_aborted_errors = 0;
1798 stats->tx_carrier_errors = 0;
1799 stats->tx_fifo_errors = 0;
1800 stats->tx_heartbeat_errors = 0;
1801 stats->tx_window_errors = 0;
f613a0d7 1802 } else {
04c881eb
AZ
1803 /* Use kernel netdev's packet and byte counts since vport counters
1804 * do not reflect packet counts on the wire when GSO, TSO or GRO
1805 * are enabled. */
1806 stats->rx_packets = dev_stats.tx_packets;
1807 stats->rx_bytes = dev_stats.tx_bytes;
1808 stats->tx_packets = dev_stats.rx_packets;
1809 stats->tx_bytes = dev_stats.rx_bytes;
1810
f613a0d7
PS
1811 stats->rx_dropped += dev_stats.tx_dropped;
1812 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1813
f613a0d7
PS
1814 stats->rx_errors += dev_stats.tx_errors;
1815 stats->tx_errors += dev_stats.rx_errors;
1816
1817 stats->multicast += dev_stats.multicast;
1818 stats->collisions += dev_stats.collisions;
1819 }
86383816
BP
1820 ovs_mutex_unlock(&netdev->mutex);
1821
1822 return error;
8b61709d
BP
1823}
1824
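/* Illustrative sketch, not part of OVS: what the swap above means from the
 * switch's point of view.  Packets the kernel counts as received on the tap
 * device were transmitted by the switch, and vice versa, so a caller reading
 * these stats sees the counters from the switch side.  The example_* name is
 * hypothetical. */
static void
example_tap_stats_swap(void)
{
    struct netdev_stats stats;

    memset(&stats, 0, sizeof stats);
    stats.rx_packets = 10;      /* Kernel view: received on the tap. */
    stats.tx_packets = 3;       /* Kernel view: sent to the tap. */

    swap_uint64(&stats.rx_packets, &stats.tx_packets);
    ovs_assert(stats.rx_packets == 3 && stats.tx_packets == 10);
}
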
bba1e6f3
PS
1825static int
1826netdev_internal_get_stats(const struct netdev *netdev_,
1827 struct netdev_stats *stats)
1828{
b5d57fc8 1829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1830 int error;
bba1e6f3 1831
86383816 1832 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1833 get_stats_via_vport(netdev_, stats);
86383816
BP
1834 error = netdev->vport_stats_error;
1835 ovs_mutex_unlock(&netdev->mutex);
1836
1837 return error;
bba1e6f3
PS
1838}
1839
51f87458 1840static void
b5d57fc8 1841netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1842{
1843 struct ethtool_cmd ecmd;
6c038611 1844 uint32_t speed;
8b61709d
BP
1845 int error;
1846
b5d57fc8 1847 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1848 return;
1849 }
1850
ab985a77 1851 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1852 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1853 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1854 ETHTOOL_GSET, "ETHTOOL_GSET");
1855 if (error) {
51f87458 1856 goto out;
8b61709d
BP
1857 }
1858
1859 /* Supported features. */
b5d57fc8 1860 netdev->supported = 0;
8b61709d 1861 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1862 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1863 }
1864 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1865 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1866 }
1867 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1868 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1869 }
1870 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1871 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1872 }
1873 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1874 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 1875 }
67bed84c
SH
1876 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1877 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 1878 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 1879 }
67bed84c
SH
1880 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1881 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1882 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1883 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 1884 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 1885 }
67bed84c
SH
1886 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1887 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1888 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1889 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1890 netdev->supported |= NETDEV_F_40GB_FD;
1891 }
8b61709d 1892 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1893 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1894 }
1895 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1896 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1897 }
1898 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1899 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1900 }
1901 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1902 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1903 }
1904 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1905 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1906 }
1907
1908 /* Advertised features. */
b5d57fc8 1909 netdev->advertised = 0;
8b61709d 1910 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1911 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1912 }
1913 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1914 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1915 }
1916 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1917 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1918 }
1919 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1920 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1921 }
1922 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1923 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 1924 }
67bed84c
SH
1925 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1926 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 1927 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 1928 }
67bed84c
SH
1929 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1930 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1931 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1932 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 1933 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 1934 }
67bed84c
SH
1935 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1936 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1937 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1938 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1939 netdev->advertised |= NETDEV_F_40GB_FD;
1940 }
8b61709d 1941 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1942 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1943 }
1944 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1945 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1946 }
1947 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1948 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1949 }
1950 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1951 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1952 }
1953 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1954 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1955 }
1956
1957 /* Current settings. */
0c615356 1958 speed = ethtool_cmd_speed(&ecmd);
6c038611 1959 if (speed == SPEED_10) {
b5d57fc8 1960 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1961 } else if (speed == SPEED_100) {
b5d57fc8 1962 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1963 } else if (speed == SPEED_1000) {
b5d57fc8 1964 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1965 } else if (speed == SPEED_10000) {
b5d57fc8 1966 netdev->current = NETDEV_F_10GB_FD;
6c038611 1967 } else if (speed == 40000) {
b5d57fc8 1968 netdev->current = NETDEV_F_40GB_FD;
6c038611 1969 } else if (speed == 100000) {
b5d57fc8 1970 netdev->current = NETDEV_F_100GB_FD;
6c038611 1971 } else if (speed == 1000000) {
b5d57fc8 1972 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1973 } else {
b5d57fc8 1974 netdev->current = 0;
8b61709d
BP
1975 }
1976
1977 if (ecmd.port == PORT_TP) {
b5d57fc8 1978 netdev->current |= NETDEV_F_COPPER;
8b61709d 1979 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1980 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1981 }
1982
1983 if (ecmd.autoneg) {
b5d57fc8 1984 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1985 }
1986
51f87458 1987out:
b5d57fc8
BP
1988 netdev->cache_valid |= VALID_FEATURES;
1989 netdev->get_features_error = error;
51f87458
PS
1990}
1991
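/* Hedged restatement, not part of OVS: the speed/duplex to NETDEV_F_* mapping
 * applied above, pulled out into a lookup helper so the table is easier to
 * read.  SPEED_* comes from <linux/ethtool.h>; the 40G and faster rates have
 * no SPEED_* macro in older headers, hence the raw numbers.  The example_*
 * name is hypothetical. */
static enum netdev_features
example_speed_to_feature(uint32_t speed, bool full_duplex)
{
    switch (speed) {
    case SPEED_10:    return full_duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    case SPEED_100:   return full_duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    case SPEED_1000:  return full_duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    case SPEED_10000: return NETDEV_F_10GB_FD;
    case 40000:       return NETDEV_F_40GB_FD;
    case 100000:      return NETDEV_F_100GB_FD;
    case 1000000:     return NETDEV_F_1TB_FD;
    default:          return 0;
    }
}
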
887ed8b2
BP
1992/* Stores the features supported by 'netdev' into '*current', '*advertised',
1993 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1994 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1995static int
1996netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1997 enum netdev_features *current,
1998 enum netdev_features *advertised,
1999 enum netdev_features *supported,
2000 enum netdev_features *peer)
51f87458 2001{
b5d57fc8 2002 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2003 int error;
51f87458 2004
86383816 2005 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2006 netdev_linux_read_features(netdev);
b5d57fc8
BP
2007 if (!netdev->get_features_error) {
2008 *current = netdev->current;
2009 *advertised = netdev->advertised;
2010 *supported = netdev->supported;
887ed8b2 2011 *peer = 0; /* XXX */
51f87458 2012 }
86383816
BP
2013 error = netdev->get_features_error;
2014 ovs_mutex_unlock(&netdev->mutex);
2015
2016 return error;
8b61709d
BP
2017}
2018
2019/* Set the features advertised by 'netdev' to 'advertise'. */
2020static int
86383816 2021netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2022 enum netdev_features advertise)
8b61709d 2023{
86383816 2024 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2025 struct ethtool_cmd ecmd;
2026 int error;
2027
86383816
BP
2028 ovs_mutex_lock(&netdev->mutex);
2029
ab985a77 2030 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2031 memset(&ecmd, 0, sizeof ecmd);
86383816 2032 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2033 ETHTOOL_GSET, "ETHTOOL_GSET");
2034 if (error) {
86383816 2035 goto exit;
8b61709d
BP
2036 }
2037
2038 ecmd.advertising = 0;
6c038611 2039 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2040 ecmd.advertising |= ADVERTISED_10baseT_Half;
2041 }
6c038611 2042 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2043 ecmd.advertising |= ADVERTISED_10baseT_Full;
2044 }
6c038611 2045 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2046 ecmd.advertising |= ADVERTISED_100baseT_Half;
2047 }
6c038611 2048 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2049 ecmd.advertising |= ADVERTISED_100baseT_Full;
2050 }
6c038611 2051 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2052 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2053 }
6c038611 2054 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2055 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2056 }
6c038611 2057 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2058 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2059 }
6c038611 2060 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2061 ecmd.advertising |= ADVERTISED_TP;
2062 }
6c038611 2063 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2064 ecmd.advertising |= ADVERTISED_FIBRE;
2065 }
6c038611 2066 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2067 ecmd.advertising |= ADVERTISED_Autoneg;
2068 }
6c038611 2069 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2070 ecmd.advertising |= ADVERTISED_Pause;
2071 }
6c038611 2072 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2073 ecmd.advertising |= ADVERTISED_Asym_Pause;
2074 }
ab985a77 2075 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2076 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2077 ETHTOOL_SSET, "ETHTOOL_SSET");
2078
2079exit:
2080 ovs_mutex_unlock(&netdev->mutex);
2081 return error;
8b61709d
BP
2082}
2083
f8500004
JP
2084/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2085 * successful, otherwise a positive errno value. */
8b61709d 2086static int
b5d57fc8 2087netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2088 uint32_t kbits_rate, uint32_t kbits_burst)
2089{
b5d57fc8
BP
2090 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2091 const char *netdev_name = netdev_get_name(netdev_);
f8500004 2092 int error;
8b61709d 2093
80a86fbe 2094 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2095 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2096 : kbits_burst); /* Stick with user-specified value. */
2097
86383816 2098 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2099 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2100 error = netdev->netdev_policing_error;
2101 if (error || (netdev->kbits_rate == kbits_rate &&
2102 netdev->kbits_burst == kbits_burst)) {
c9f71668 2103 /* Assume that settings haven't changed since we last set them. */
86383816 2104 goto out;
c9f71668 2105 }
b5d57fc8 2106 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2107 }
2108
ac8c3412 2109 COVERAGE_INC(netdev_set_policing);
f8500004 2110 /* Remove any existing ingress qdisc. */
b5d57fc8 2111 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
2112 if (error) {
2113 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2114 netdev_name, ovs_strerror(error));
c9f71668 2115 goto out;
f8500004
JP
2116 }
2117
8b61709d 2118 if (kbits_rate) {
b5d57fc8 2119 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
2120 if (error) {
2121 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2122 netdev_name, ovs_strerror(error));
c9f71668 2123 goto out;
8b61709d
BP
2124 }
2125
b5d57fc8 2126 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2127 if (error) {
2128 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2129 netdev_name, ovs_strerror(error));
c9f71668 2130 goto out;
8b61709d 2131 }
8b61709d
BP
2132 }
2133
b5d57fc8
BP
2134 netdev->kbits_rate = kbits_rate;
2135 netdev->kbits_burst = kbits_burst;
f8500004 2136
c9f71668
PS
2137out:
2138 if (!error || error == ENODEV) {
b5d57fc8
BP
2139 netdev->netdev_policing_error = error;
2140 netdev->cache_valid |= VALID_POLICING;
c9f71668 2141 }
86383816 2142 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2143 return error;
8b61709d
BP
2144}
2145
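/* Hedged sketch, not part of OVS: the burst-default rule applied above, as a
 * tiny pure function with a self-check.  A zero rate forces the burst to
 * zero, a zero burst defaults to 8000 kbits, and anything else is taken as
 * given.  The example_* names are hypothetical. */
static uint32_t
example_policing_burst(uint32_t kbits_rate, uint32_t kbits_burst)
{
    return (!kbits_rate ? 0         /* Force to 0 if no rate specified. */
            : !kbits_burst ? 8000   /* Default to 8000 kbits if 0. */
            : kbits_burst);         /* Stick with the user-specified value. */
}

static void
example_policing_burst_check(void)
{
    ovs_assert(example_policing_burst(0, 500) == 0);
    ovs_assert(example_policing_burst(1000, 0) == 8000);
    ovs_assert(example_policing_burst(1000, 100) == 100);
}
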
c1c9c9c4
BP
2146static int
2147netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2148 struct sset *types)
c1c9c9c4 2149{
559eb230 2150 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2151 for (opsp = tcs; *opsp != NULL; opsp++) {
2152 const struct tc_ops *ops = *opsp;
2153 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2154 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2155 }
2156 }
2157 return 0;
2158}
2159
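/* Hedged usage sketch, not part of OVS: how a caller might enumerate the QoS
 * types that netdev_linux_get_qos_types() above reports, going through the
 * public netdev API.  The prototypes of netdev_get_qos_types() and the sset
 * helpers are assumed to match lib/netdev.h and lib/sset.h in this tree; the
 * example_* name is hypothetical. */
static void
example_list_qos_types(const struct netdev *netdev)
{
    struct sset types;
    const char *type;

    sset_init(&types);
    if (!netdev_get_qos_types(netdev, &types)) {
        SSET_FOR_EACH (type, &types) {
            VLOG_INFO("%s supports QoS type \"%s\"",
                      netdev_get_name(netdev), type);
        }
    }
    sset_destroy(&types);
}
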
2160static const struct tc_ops *
2161tc_lookup_ovs_name(const char *name)
2162{
559eb230 2163 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2164
2165 for (opsp = tcs; *opsp != NULL; opsp++) {
2166 const struct tc_ops *ops = *opsp;
2167 if (!strcmp(name, ops->ovs_name)) {
2168 return ops;
2169 }
2170 }
2171 return NULL;
2172}
2173
2174static const struct tc_ops *
2175tc_lookup_linux_name(const char *name)
2176{
559eb230 2177 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2178
2179 for (opsp = tcs; *opsp != NULL; opsp++) {
2180 const struct tc_ops *ops = *opsp;
2181 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2182 return ops;
2183 }
2184 }
2185 return NULL;
2186}
2187
93b13be8 2188static struct tc_queue *
b5d57fc8 2189tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2190 size_t hash)
2191{
b5d57fc8 2192 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2193 struct tc_queue *queue;
2194
b5d57fc8 2195 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2196 if (queue->queue_id == queue_id) {
2197 return queue;
2198 }
2199 }
2200 return NULL;
2201}
2202
2203static struct tc_queue *
2204tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2205{
2206 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2207}
2208
c1c9c9c4
BP
2209static int
2210netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2211 const char *type,
2212 struct netdev_qos_capabilities *caps)
2213{
2214 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2215 if (!ops) {
2216 return EOPNOTSUPP;
2217 }
2218 caps->n_queues = ops->n_queues;
2219 return 0;
2220}
2221
2222static int
b5d57fc8 2223netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2224 const char **typep, struct smap *details)
c1c9c9c4 2225{
b5d57fc8 2226 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2227 int error;
2228
86383816 2229 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2230 error = tc_query_qdisc(netdev_);
86383816
BP
2231 if (!error) {
2232 *typep = netdev->tc->ops->ovs_name;
2233 error = (netdev->tc->ops->qdisc_get
2234 ? netdev->tc->ops->qdisc_get(netdev_, details)
2235 : 0);
c1c9c9c4 2236 }
86383816 2237 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2238
86383816 2239 return error;
c1c9c9c4
BP
2240}
2241
2242static int
b5d57fc8 2243netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2244 const char *type, const struct smap *details)
c1c9c9c4 2245{
b5d57fc8 2246 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2247 const struct tc_ops *new_ops;
2248 int error;
2249
2250 new_ops = tc_lookup_ovs_name(type);
2251 if (!new_ops || !new_ops->tc_install) {
2252 return EOPNOTSUPP;
2253 }
2254
6cf888b8
BS
2255 if (new_ops == &tc_ops_noop) {
2256 return new_ops->tc_install(netdev_, details);
2257 }
2258
86383816 2259 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2260 error = tc_query_qdisc(netdev_);
c1c9c9c4 2261 if (error) {
86383816 2262 goto exit;
c1c9c9c4
BP
2263 }
2264
b5d57fc8 2265 if (new_ops == netdev->tc->ops) {
86383816 2266 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2267 } else {
2268 /* Delete existing qdisc. */
b5d57fc8 2269 error = tc_del_qdisc(netdev_);
c1c9c9c4 2270 if (error) {
86383816 2271 goto exit;
c1c9c9c4 2272 }
b5d57fc8 2273 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2274
2275 /* Install new qdisc. */
b5d57fc8
BP
2276 error = new_ops->tc_install(netdev_, details);
2277 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2278 }
86383816
BP
2279
2280exit:
2281 ovs_mutex_unlock(&netdev->mutex);
2282 return error;
c1c9c9c4
BP
2283}
2284
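/* Hedged usage sketch, not part of OVS: switching a device to the
 * "linux-codel" qdisc through the public netdev API, which lands in
 * netdev_linux_set_qos() above.  The netdev_set_qos() and smap prototypes are
 * assumed to match lib/netdev.h and lib/smap.h in this tree; the key names
 * mirror codel_parse_qdisc_details__() further down, and the example_* name
 * is hypothetical. */
static int
example_set_codel_qos(struct netdev *netdev)
{
    struct smap details;
    int error;

    smap_init(&details);
    smap_add(&details, "target", "5000");       /* Microseconds. */
    smap_add(&details, "limit", "10240");       /* Packets. */
    smap_add(&details, "interval", "100000");   /* Microseconds. */

    error = netdev_set_qos(netdev, "linux-codel", &details);
    smap_destroy(&details);
    return error;
}
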
2285static int
b5d57fc8 2286netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2287 unsigned int queue_id, struct smap *details)
c1c9c9c4 2288{
b5d57fc8 2289 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2290 int error;
2291
86383816 2292 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2293 error = tc_query_qdisc(netdev_);
86383816 2294 if (!error) {
b5d57fc8 2295 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2296 error = (queue
b5d57fc8 2297 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2298 : ENOENT);
c1c9c9c4 2299 }
86383816
BP
2300 ovs_mutex_unlock(&netdev->mutex);
2301
2302 return error;
c1c9c9c4
BP
2303}
2304
2305static int
b5d57fc8 2306netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2307 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2308{
b5d57fc8 2309 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2310 int error;
2311
86383816 2312 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2313 error = tc_query_qdisc(netdev_);
86383816
BP
2314 if (!error) {
2315 error = (queue_id < netdev->tc->ops->n_queues
2316 && netdev->tc->ops->class_set
2317 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2318 : EINVAL);
c1c9c9c4 2319 }
86383816 2320 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2321
86383816 2322 return error;
c1c9c9c4
BP
2323}
2324
2325static int
b5d57fc8 2326netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2327{
b5d57fc8 2328 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2329 int error;
2330
86383816 2331 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2332 error = tc_query_qdisc(netdev_);
86383816
BP
2333 if (!error) {
2334 if (netdev->tc->ops->class_delete) {
2335 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2336 error = (queue
2337 ? netdev->tc->ops->class_delete(netdev_, queue)
2338 : ENOENT);
2339 } else {
2340 error = EINVAL;
2341 }
c1c9c9c4 2342 }
86383816
BP
2343 ovs_mutex_unlock(&netdev->mutex);
2344
2345 return error;
c1c9c9c4
BP
2346}
2347
2348static int
b5d57fc8 2349netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2350 unsigned int queue_id,
2351 struct netdev_queue_stats *stats)
2352{
b5d57fc8 2353 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2354 int error;
2355
86383816 2356 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2357 error = tc_query_qdisc(netdev_);
86383816
BP
2358 if (!error) {
2359 if (netdev->tc->ops->class_get_stats) {
2360 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2361 if (queue) {
2362 stats->created = queue->created;
2363 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2364 stats);
2365 } else {
2366 error = ENOENT;
2367 }
2368 } else {
2369 error = EOPNOTSUPP;
6dc34a0d 2370 }
c1c9c9c4 2371 }
86383816
BP
2372 ovs_mutex_unlock(&netdev->mutex);
2373
2374 return error;
c1c9c9c4
BP
2375}
2376
d57695d7
JS
2377struct queue_dump_state {
2378 struct nl_dump dump;
2379 struct ofpbuf buf;
2380};
2381
23a98ffe 2382static bool
d57695d7 2383start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2384{
2385 struct ofpbuf request;
2386 struct tcmsg *tcmsg;
2387
2388 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2389 if (!tcmsg) {
2390 return false;
2391 }
3c4de644 2392 tcmsg->tcm_parent = 0;
d57695d7 2393 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2394 ofpbuf_uninit(&request);
d57695d7
JS
2395
2396 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2397 return true;
c1c9c9c4
BP
2398}
2399
d57695d7
JS
2400static int
2401finish_queue_dump(struct queue_dump_state *state)
2402{
2403 ofpbuf_uninit(&state->buf);
2404 return nl_dump_done(&state->dump);
2405}
2406
89454bf4
BP
2407struct netdev_linux_queue_state {
2408 unsigned int *queues;
2409 size_t cur_queue;
2410 size_t n_queues;
2411};
2412
c1c9c9c4 2413static int
89454bf4 2414netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2415{
89454bf4 2416 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2417 int error;
2418
86383816 2419 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2420 error = tc_query_qdisc(netdev_);
86383816
BP
2421 if (!error) {
2422 if (netdev->tc->ops->class_get) {
89454bf4
BP
2423 struct netdev_linux_queue_state *state;
2424 struct tc_queue *queue;
2425 size_t i;
2426
2427 *statep = state = xmalloc(sizeof *state);
2428 state->n_queues = hmap_count(&netdev->tc->queues);
2429 state->cur_queue = 0;
2430 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2431
2432 i = 0;
2433 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2434 state->queues[i++] = queue->queue_id;
86383816 2435 }
c1c9c9c4 2436 } else {
86383816 2437 error = EOPNOTSUPP;
c1c9c9c4
BP
2438 }
2439 }
86383816 2440 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2441
86383816 2442 return error;
c1c9c9c4
BP
2443}
2444
89454bf4
BP
2445static int
2446netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2447 unsigned int *queue_idp, struct smap *details)
2448{
2449 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2450 struct netdev_linux_queue_state *state = state_;
2451 int error = EOF;
2452
2453 ovs_mutex_lock(&netdev->mutex);
2454 while (state->cur_queue < state->n_queues) {
2455 unsigned int queue_id = state->queues[state->cur_queue++];
2456 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2457
2458 if (queue) {
2459 *queue_idp = queue_id;
2460 error = netdev->tc->ops->class_get(netdev_, queue, details);
2461 break;
2462 }
2463 }
2464 ovs_mutex_unlock(&netdev->mutex);
2465
2466 return error;
2467}
2468
2469static int
2470netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2471 void *state_)
2472{
2473 struct netdev_linux_queue_state *state = state_;
2474
2475 free(state->queues);
2476 free(state);
2477 return 0;
2478}
2479
c1c9c9c4 2480static int
b5d57fc8 2481netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2482 netdev_dump_queue_stats_cb *cb, void *aux)
2483{
b5d57fc8 2484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2485 int error;
2486
86383816 2487 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2488 error = tc_query_qdisc(netdev_);
86383816 2489 if (!error) {
d57695d7 2490 struct queue_dump_state state;
c1c9c9c4 2491
86383816
BP
2492 if (!netdev->tc->ops->class_dump_stats) {
2493 error = EOPNOTSUPP;
d57695d7 2494 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2495 error = ENODEV;
2496 } else {
2497 struct ofpbuf msg;
2498 int retval;
2499
d57695d7 2500 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2501 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2502 cb, aux);
2503 if (retval) {
2504 error = retval;
2505 }
2506 }
2507
d57695d7 2508 retval = finish_queue_dump(&state);
86383816
BP
2509 if (retval) {
2510 error = retval;
2511 }
c1c9c9c4
BP
2512 }
2513 }
86383816 2514 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2515
86383816 2516 return error;
c1c9c9c4
BP
2517}
2518
8b61709d 2519static int
f1acd62b
BP
2520netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2521 struct in_addr netmask)
8b61709d 2522{
b5d57fc8 2523 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2524 int error;
2525
86383816 2526 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2527 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2528 if (!error) {
f1acd62b 2529 if (address.s_addr != INADDR_ANY) {
8b61709d 2530 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2531 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2532 }
2533 }
49af9a3d 2534
86383816
BP
2535 ovs_mutex_unlock(&netdev->mutex);
2536
8b61709d
BP
2537 return error;
2538}
2539
7df6932e
AW
2540/* Retrieves the IP addresses and netmasks assigned to 'netdev' into '*addr'
2541 * and '*mask', storing the number of entries in '*n_cnt'. Returns 0 if
2542 * successful, otherwise a positive errno value. */
8b61709d 2543static int
a8704b50
PS
2544netdev_linux_get_addr_list(const struct netdev *netdev_,
2545 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2546{
b5d57fc8 2547 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2548 int error;
86383816
BP
2549
2550 ovs_mutex_lock(&netdev->mutex);
a8704b50 2551 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816
BP
2552 ovs_mutex_unlock(&netdev->mutex);
2553
7df6932e 2554 return error;
8b61709d
BP
2555}
2556
2557static void
2558make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
2559{
2560 struct sockaddr_in sin;
2561 memset(&sin, 0, sizeof sin);
2562 sin.sin_family = AF_INET;
2563 sin.sin_addr = addr;
2564 sin.sin_port = 0;
2565
2566 memset(sa, 0, sizeof *sa);
2567 memcpy(sa, &sin, sizeof sin);
2568}
2569
2570static int
2571do_set_addr(struct netdev *netdev,
2572 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2573{
2574 struct ifreq ifr;
149f577a 2575
259e0b1a
BP
2576 make_in4_sockaddr(&ifr.ifr_addr, addr);
2577 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2578 ioctl_name);
8b61709d
BP
2579}
2580
2581/* Adds 'router' as a default IP gateway. */
2582static int
67a4917b 2583netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2584{
2585 struct in_addr any = { INADDR_ANY };
2586 struct rtentry rt;
2587 int error;
2588
2589 memset(&rt, 0, sizeof rt);
2590 make_in4_sockaddr(&rt.rt_dst, any);
2591 make_in4_sockaddr(&rt.rt_gateway, router);
2592 make_in4_sockaddr(&rt.rt_genmask, any);
2593 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2594 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2595 if (error) {
10a89ef0 2596 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2597 }
2598 return error;
2599}
2600
f1acd62b
BP
2601static int
2602netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2603 char **netdev_name)
2604{
2605 static const char fn[] = "/proc/net/route";
2606 FILE *stream;
2607 char line[256];
2608 int ln;
2609
2610 *netdev_name = NULL;
2611 stream = fopen(fn, "r");
2612 if (stream == NULL) {
10a89ef0 2613 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2614 return errno;
2615 }
2616
2617 ln = 0;
2618 while (fgets(line, sizeof line, stream)) {
2619 if (++ln >= 2) {
2620 char iface[17];
dbba996b 2621 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2622 int refcnt, metric, mtu;
2623 unsigned int flags, use, window, irtt;
2624
c2c28dfd
BP
2625 if (!ovs_scan(line,
2626 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2627 " %d %u %u\n",
2628 iface, &dest, &gateway, &flags, &refcnt,
2629 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2630 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2631 fn, ln, line);
2632 continue;
2633 }
2634 if (!(flags & RTF_UP)) {
2635 /* Skip routes that aren't up. */
2636 continue;
2637 }
2638
2639 /* The 'dest', 'mask', and 'gateway' values are in network byte order,
d295e8e9 2640 * so we don't need any endian conversions here. */
2642 if ((dest & mask) == (host->s_addr & mask)) {
2643 if (!gateway) {
2644 /* The host is directly reachable. */
2645 next_hop->s_addr = 0;
2646 } else {
2647 /* To reach the host, we must go through a gateway. */
2648 next_hop->s_addr = gateway;
2649 }
2650 *netdev_name = xstrdup(iface);
2651 fclose(stream);
2652 return 0;
2653 }
2654 }
2655 }
2656
2657 fclose(stream);
2658 return ENXIO;
2659}
2660
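/* Hedged sketch, not part of OVS: the same ovs_scan() field layout used above,
 * applied to a hard-coded sample /proc/net/route line.  The sample values are
 * invented for illustration (default route via 192.168.1.1 on eth0, flags
 * RTF_UP|RTF_GATEWAY) and the example_* name is hypothetical. */
static void
example_parse_route_line(void)
{
    const char *line =
        "eth0\t00000000\t0101A8C0\t0003\t0\t0\t100\t00000000\t0\t0\t0";
    char iface[17];
    ovs_be32 dest, gateway, mask;
    int refcnt, metric, mtu;
    unsigned int flags, use, window, irtt;

    if (ovs_scan(line,
                 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32" %d %u %u",
                 iface, &dest, &gateway, &flags, &refcnt,
                 &use, &metric, &mask, &mtu, &window, &irtt)) {
        ovs_assert(!dest && flags == (RTF_UP | RTF_GATEWAY));
    }
}
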
e210037e 2661static int
b5d57fc8 2662netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2663{
b5d57fc8 2664 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2665 int error = 0;
2666
86383816 2667 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2668 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2669 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2670
2671 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2672 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2673 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2674 cmd,
2675 ETHTOOL_GDRVINFO,
2676 "ETHTOOL_GDRVINFO");
2677 if (!error) {
b5d57fc8 2678 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2679 }
2680 }
e210037e 2681
e210037e 2682 if (!error) {
b5d57fc8
BP
2683 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2684 smap_add(smap, "driver_version", netdev->drvinfo.version);
2685 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2686 }
86383816
BP
2687 ovs_mutex_unlock(&netdev->mutex);
2688
e210037e
AE
2689 return error;
2690}
2691
4f925bd3 2692static int
275707c3
EJ
2693netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2694 struct smap *smap)
4f925bd3 2695{
79f1cbe9 2696 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2697 return 0;
2698}
2699
8b61709d
BP
2700/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2701 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2702 * returns 0. Otherwise, it returns a positive errno value; in particular,
2703 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2704static int
2705netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2706 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2707{
2708 struct arpreq r;
c100e025 2709 struct sockaddr_in sin;
8b61709d
BP
2710 int retval;
2711
2712 memset(&r, 0, sizeof r);
f2cc621b 2713 memset(&sin, 0, sizeof sin);
c100e025
BP
2714 sin.sin_family = AF_INET;
2715 sin.sin_addr.s_addr = ip;
2716 sin.sin_port = 0;
2717 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2718 r.arp_ha.sa_family = ARPHRD_ETHER;
2719 r.arp_flags = 0;
71d7c22f 2720 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2721 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2722 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2723 if (!retval) {
2724 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2725 } else if (retval != ENXIO) {
2726 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2727 netdev_get_name(netdev), IP_ARGS(ip),
2728 ovs_strerror(retval));
8b61709d
BP
2729 }
2730 return retval;
2731}
2732
2733static int
2734nd_to_iff_flags(enum netdev_flags nd)
2735{
2736 int iff = 0;
2737 if (nd & NETDEV_UP) {
2738 iff |= IFF_UP;
2739 }
2740 if (nd & NETDEV_PROMISC) {
2741 iff |= IFF_PROMISC;
2742 }
7ba19d41
AC
2743 if (nd & NETDEV_LOOPBACK) {
2744 iff |= IFF_LOOPBACK;
2745 }
8b61709d
BP
2746 return iff;
2747}
2748
2749static int
2750iff_to_nd_flags(int iff)
2751{
2752 enum netdev_flags nd = 0;
2753 if (iff & IFF_UP) {
2754 nd |= NETDEV_UP;
2755 }
2756 if (iff & IFF_PROMISC) {
2757 nd |= NETDEV_PROMISC;
2758 }
7ba19d41
AC
2759 if (iff & IFF_LOOPBACK) {
2760 nd |= NETDEV_LOOPBACK;
2761 }
8b61709d
BP
2762 return nd;
2763}
2764
2765static int
4f9f3f21
BP
2766update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2767 enum netdev_flags on, enum netdev_flags *old_flagsp)
2768 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2769{
2770 int old_flags, new_flags;
c37d4da4
EJ
2771 int error = 0;
2772
b5d57fc8 2773 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2774 *old_flagsp = iff_to_nd_flags(old_flags);
2775 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2776 if (new_flags != old_flags) {
4f9f3f21
BP
2777 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2778 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2779 }
4f9f3f21
BP
2780
2781 return error;
2782}
2783
2784static int
2785netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2786 enum netdev_flags on, enum netdev_flags *old_flagsp)
2787{
2788 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2789 int error;
2790
2791 ovs_mutex_lock(&netdev->mutex);
2792 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2793 ovs_mutex_unlock(&netdev->mutex);
2794
8b61709d
BP
2795 return error;
2796}
2797
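/* Small self-check sketch, not part of OVS: the nd_to_iff_flags() and
 * iff_to_nd_flags() helpers above are meant to be inverses for the flags they
 * handle, so a round trip should be lossless.  The example_* name is
 * hypothetical. */
static void
example_flags_round_trip(void)
{
    enum netdev_flags nd = NETDEV_UP | NETDEV_PROMISC | NETDEV_LOOPBACK;

    ovs_assert(iff_to_nd_flags(nd_to_iff_flags(nd)) == nd);
}
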
2f9dd77f 2798#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
51f87458 2799 GET_FEATURES, GET_STATUS) \
c3827f61
BP
2800{ \
2801 NAME, \
118c77b1 2802 false, /* is_pmd */ \
c3827f61 2803 \
259e0b1a 2804 NULL, \
c3827f61
BP
2805 netdev_linux_run, \
2806 netdev_linux_wait, \
2807 \
9dc63482
BP
2808 netdev_linux_alloc, \
2809 CONSTRUCT, \
2810 netdev_linux_destruct, \
2811 netdev_linux_dealloc, \
de5cdb90 2812 NULL, /* get_config */ \
6d9e6eb4 2813 NULL, /* set_config */ \
f431bf7d 2814 NULL, /* get_tunnel_config */ \
a36de779
PS
2815 NULL, /* build header */ \
2816 NULL, /* push header */ \
2817 NULL, /* pop header */ \
7dec44fe 2818 NULL, /* get_numa_id */ \
050c60bf 2819 NULL, /* set_tx_multiq */ \
c3827f61 2820 \
c3827f61
BP
2821 netdev_linux_send, \
2822 netdev_linux_send_wait, \
2823 \
2824 netdev_linux_set_etheraddr, \
2825 netdev_linux_get_etheraddr, \
2826 netdev_linux_get_mtu, \
9b020780 2827 netdev_linux_set_mtu, \
c3827f61
BP
2828 netdev_linux_get_ifindex, \
2829 netdev_linux_get_carrier, \
65c3058c 2830 netdev_linux_get_carrier_resets, \
1670c579 2831 netdev_linux_set_miimon_interval, \
f613a0d7 2832 GET_STATS, \
c3827f61 2833 \
51f87458 2834 GET_FEATURES, \
c3827f61 2835 netdev_linux_set_advertisements, \
c3827f61
BP
2836 \
2837 netdev_linux_set_policing, \
2838 netdev_linux_get_qos_types, \
2839 netdev_linux_get_qos_capabilities, \
2840 netdev_linux_get_qos, \
2841 netdev_linux_set_qos, \
2842 netdev_linux_get_queue, \
2843 netdev_linux_set_queue, \
2844 netdev_linux_delete_queue, \
2845 netdev_linux_get_queue_stats, \
89454bf4
BP
2846 netdev_linux_queue_dump_start, \
2847 netdev_linux_queue_dump_next, \
2848 netdev_linux_queue_dump_done, \
c3827f61
BP
2849 netdev_linux_dump_queue_stats, \
2850 \
c3827f61 2851 netdev_linux_set_in4, \
a8704b50 2852 netdev_linux_get_addr_list, \
c3827f61
BP
2853 netdev_linux_add_router, \
2854 netdev_linux_get_next_hop, \
4f925bd3 2855 GET_STATUS, \
c3827f61
BP
2856 netdev_linux_arp_lookup, \
2857 \
2858 netdev_linux_update_flags, \
790fb3b7 2859 NULL, /* reconfigure */ \
c3827f61 2860 \
f7791740
PS
2861 netdev_linux_rxq_alloc, \
2862 netdev_linux_rxq_construct, \
2863 netdev_linux_rxq_destruct, \
2864 netdev_linux_rxq_dealloc, \
2865 netdev_linux_rxq_recv, \
2866 netdev_linux_rxq_wait, \
2867 netdev_linux_rxq_drain, \
c3827f61
BP
2868}
2869
2870const struct netdev_class netdev_linux_class =
2871 NETDEV_LINUX_CLASS(
2872 "system",
9dc63482 2873 netdev_linux_construct,
f613a0d7 2874 netdev_linux_get_stats,
51f87458 2875 netdev_linux_get_features,
275707c3 2876 netdev_linux_get_status);
c3827f61
BP
2877
2878const struct netdev_class netdev_tap_class =
2879 NETDEV_LINUX_CLASS(
2880 "tap",
9dc63482 2881 netdev_linux_construct_tap,
bba1e6f3 2882 netdev_tap_get_stats,
51f87458 2883 netdev_linux_get_features,
275707c3 2884 netdev_linux_get_status);
c3827f61
BP
2885
2886const struct netdev_class netdev_internal_class =
2887 NETDEV_LINUX_CLASS(
2888 "internal",
9dc63482 2889 netdev_linux_construct,
bba1e6f3 2890 netdev_internal_get_stats,
51f87458 2891 NULL, /* get_features */
275707c3 2892 netdev_internal_get_status);
8b61709d 2893\f
677d9158
JV
2894
2895#define CODEL_N_QUEUES 0x0000
2896
2f4298ce
BP
2897/* In sufficiently new kernel headers these are defined as enums in
2898 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2899 * kernels. (This overrides any enum definition in the header file but that's
2900 * harmless.) */
2901#define TCA_CODEL_TARGET 1
2902#define TCA_CODEL_LIMIT 2
2903#define TCA_CODEL_INTERVAL 3
2904
677d9158
JV
2905struct codel {
2906 struct tc tc;
2907 uint32_t target;
2908 uint32_t limit;
2909 uint32_t interval;
2910};
2911
2912static struct codel *
2913codel_get__(const struct netdev *netdev_)
2914{
2915 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2916 return CONTAINER_OF(netdev->tc, struct codel, tc);
2917}
2918
2919static void
2920codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2921 uint32_t interval)
2922{
2923 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2924 struct codel *codel;
2925
2926 codel = xmalloc(sizeof *codel);
2927 tc_init(&codel->tc, &tc_ops_codel);
2928 codel->target = target;
2929 codel->limit = limit;
2930 codel->interval = interval;
2931
2932 netdev->tc = &codel->tc;
2933}
2934
2935static int
2936codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2937 uint32_t interval)
2938{
2939 size_t opt_offset;
2940 struct ofpbuf request;
2941 struct tcmsg *tcmsg;
2942 uint32_t otarget, olimit, ointerval;
2943 int error;
2944
2945 tc_del_qdisc(netdev);
2946
2947 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2948 NLM_F_EXCL | NLM_F_CREATE, &request);
2949 if (!tcmsg) {
2950 return ENODEV;
2951 }
2952 tcmsg->tcm_handle = tc_make_handle(1, 0);
2953 tcmsg->tcm_parent = TC_H_ROOT;
2954
2955 otarget = target ? target : 5000;
2956 olimit = limit ? limit : 10240;
2957 ointerval = interval ? interval : 100000;
2958
2959 nl_msg_put_string(&request, TCA_KIND, "codel");
2960 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2961 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2962 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2963 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2964 nl_msg_end_nested(&request, opt_offset);
2965
2966 error = tc_transact(&request, NULL);
2967 if (error) {
2968 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2969 "target %u, limit %u, interval %u error %d(%s)",
2970 netdev_get_name(netdev),
2971 otarget, olimit, ointerval,
2972 error, ovs_strerror(error));
2973 }
2974 return error;
2975}
2976
2977static void
2978codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2979 const struct smap *details, struct codel *codel)
2980{
13c1637f
BP
2981 codel->target = smap_get_ullong(details, "target", 0);
2982 codel->limit = smap_get_ullong(details, "limit", 0);
2983 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
2984
2985 if (!codel->target) {
2986 codel->target = 5000;
2987 }
2988 if (!codel->limit) {
2989 codel->limit = 10240;
2990 }
2991 if (!codel->interval) {
2992 codel->interval = 100000;
2993 }
2994}
2995
2996static int
2997codel_tc_install(struct netdev *netdev, const struct smap *details)
2998{
2999 int error;
3000 struct codel codel;
3001
3002 codel_parse_qdisc_details__(netdev, details, &codel);
3003 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3004 codel.interval);
3005 if (!error) {
3006 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3007 }
3008 return error;
3009}
3010
3011static int
3012codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3013{
3014 static const struct nl_policy tca_codel_policy[] = {
3015 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3016 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3017 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3018 };
3019
3020 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3021
3022 if (!nl_parse_nested(nl_options, tca_codel_policy,
3023 attrs, ARRAY_SIZE(tca_codel_policy))) {
3024 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3025 return EPROTO;
3026 }
3027
3028 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3029 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3030 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3031 return 0;
3032}
3033
3034static int
3035codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3036{
3037 struct nlattr *nlattr;
3038 const char *kind;
3039 int error;
3040 struct codel codel;
3041
3042 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3043 if (error != 0) {
3044 return error;
3045 }
3046
3047 error = codel_parse_tca_options__(nlattr, &codel);
3048 if (error != 0) {
3049 return error;
3050 }
3051
3052 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3053 return 0;
3054}
3055
3056
3057static void
3058codel_tc_destroy(struct tc *tc)
3059{
3060 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3061 tc_destroy(tc);
3062 free(codel);
3063}
3064
3065static int
3066codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3067{
3068 const struct codel *codel = codel_get__(netdev);
3069 smap_add_format(details, "target", "%u", codel->target);
3070 smap_add_format(details, "limit", "%u", codel->limit);
3071 smap_add_format(details, "interval", "%u", codel->interval);
3072 return 0;
3073}
3074
3075static int
3076codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3077{
3078 struct codel codel;
3079
3080 codel_parse_qdisc_details__(netdev, details, &codel);
3081 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3082 codel_get__(netdev)->target = codel.target;
3083 codel_get__(netdev)->limit = codel.limit;
3084 codel_get__(netdev)->interval = codel.interval;
3085 return 0;
3086}
3087
3088static const struct tc_ops tc_ops_codel = {
3089 "codel", /* linux_name */
3090 "linux-codel", /* ovs_name */
3091 CODEL_N_QUEUES, /* n_queues */
3092 codel_tc_install,
3093 codel_tc_load,
3094 codel_tc_destroy,
3095 codel_qdisc_get,
3096 codel_qdisc_set,
3097 NULL,
3098 NULL,
3099 NULL,
3100 NULL,
3101 NULL
3102};
3103\f
3104/* FQ-CoDel traffic control class. */
3105
3106#define FQCODEL_N_QUEUES 0x0000
3107
2f4298ce
BP
3108/* In sufficiently new kernel headers these are defined as enums in
3109 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3110 * kernels. (This overrides any enum definition in the header file but that's
3111 * harmless.) */
3112#define TCA_FQ_CODEL_TARGET 1
3113#define TCA_FQ_CODEL_LIMIT 2
3114#define TCA_FQ_CODEL_INTERVAL 3
3115#define TCA_FQ_CODEL_ECN 4
3116#define TCA_FQ_CODEL_FLOWS 5
3117#define TCA_FQ_CODEL_QUANTUM 6
3118
677d9158
JV
3119struct fqcodel {
3120 struct tc tc;
3121 uint32_t target;
3122 uint32_t limit;
3123 uint32_t interval;
3124 uint32_t flows;
3125 uint32_t quantum;
3126};
3127
3128static struct fqcodel *
3129fqcodel_get__(const struct netdev *netdev_)
3130{
3131 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3132 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3133}
3134
3135static void
3136fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3137 uint32_t interval, uint32_t flows, uint32_t quantum)
3138{
3139 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3140 struct fqcodel *fqcodel;
3141
3142 fqcodel = xmalloc(sizeof *fqcodel);
3143 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3144 fqcodel->target = target;
3145 fqcodel->limit = limit;
3146 fqcodel->interval = interval;
3147 fqcodel->flows = flows;
3148 fqcodel->quantum = quantum;
3149
3150 netdev->tc = &fqcodel->tc;
3151}
3152
3153static int
3154fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3155 uint32_t interval, uint32_t flows, uint32_t quantum)
3156{
3157 size_t opt_offset;
3158 struct ofpbuf request;
3159 struct tcmsg *tcmsg;
3160 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3161 int error;
3162
3163 tc_del_qdisc(netdev);
3164
3165 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3166 NLM_F_EXCL | NLM_F_CREATE, &request);
3167 if (!tcmsg) {
3168 return ENODEV;
3169 }
3170 tcmsg->tcm_handle = tc_make_handle(1, 0);
3171 tcmsg->tcm_parent = TC_H_ROOT;
3172
3173 otarget = target ? target : 5000;
3174 olimit = limit ? limit : 10240;
3175 ointerval = interval ? interval : 100000;
3176 oflows = flows ? flows : 1024;
3177 oquantum = quantum ? quantum : 1514; /* fq_codel's default quantum is
3178 1514 bytes, not the MTU. */
3179
3180 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3181 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3182 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3183 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3184 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3185 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3186 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3187 nl_msg_end_nested(&request, opt_offset);
3188
3189 error = tc_transact(&request, NULL);
3190 if (error) {
3191 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3192 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3193 netdev_get_name(netdev),
3194 otarget, olimit, ointerval, oflows, oquantum,
3195 error, ovs_strerror(error));
3196 }
3197 return error;
3198}
3199
3200static void
3201fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3202 const struct smap *details, struct fqcodel *fqcodel)
3203{
13c1637f
BP
3204 fqcodel->target = smap_get_ullong(details, "target", 0);
3205 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3206 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3207 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3208 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3209
677d9158
JV
3210 if (!fqcodel->target) {
3211 fqcodel->target = 5000;
3212 }
3213 if (!fqcodel->limit) {
3214 fqcodel->limit = 10240;
3215 }
3216 if (!fqcodel->interval) {
3217 fqcodel->interval = 1000000;
3218 }
3219 if (!fqcodel->flows) {
3220 fqcodel->flows = 1024;
3221 }
3222 if (!fqcodel->quantum) {
3223 fqcodel->quantum = 1514;
3224 }
3225}
3226
3227static int
3228fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3229{
3230 int error;
3231 struct fqcodel fqcodel;
3232
3233 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3234 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3235 fqcodel.interval, fqcodel.flows,
3236 fqcodel.quantum);
3237 if (!error) {
3238 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3239 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3240 }
3241 return error;
3242}
3243
3244static int
3245fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3246{
3247 static const struct nl_policy tca_fqcodel_policy[] = {
3248 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3249 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3250 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3251 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3252 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3253 };
3254
3255 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3256
3257 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3258 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3259 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3260 return EPROTO;
3261 }
3262
3263 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3264 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3265 fqcodel->interval = nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3266 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3267 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3268 return 0;
3269}
3270
3271static int
3272fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3273{
3274 struct nlattr *nlattr;
3275 const char *kind;
3276 int error;
3277 struct fqcodel fqcodel;
3278
3279 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3280 if (error != 0) {
3281 return error;
3282 }
3283
3284 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3285 if (error != 0) {
3286 return error;
3287 }
3288
3289 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3290 fqcodel.flows, fqcodel.quantum);
3291 return 0;
3292}
3293
3294static void
3295fqcodel_tc_destroy(struct tc *tc)
3296{
3297 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3298 tc_destroy(tc);
3299 free(fqcodel);
3300}
3301
3302static int
3303fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3304{
3305 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3306 smap_add_format(details, "target", "%u", fqcodel->target);
3307 smap_add_format(details, "limit", "%u", fqcodel->limit);
3308 smap_add_format(details, "interval", "%u", fqcodel->interval);
3309 smap_add_format(details, "flows", "%u", fqcodel->flows);
3310 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3311 return 0;
3312}
3313
3314static int
3315fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3316{
3317 struct fqcodel fqcodel;
3318
3319 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3320 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3321 fqcodel.flows, fqcodel.quantum);
3322 fqcodel_get__(netdev)->target = fqcodel.target;
3323 fqcodel_get__(netdev)->limit = fqcodel.limit;
3324 fqcodel_get__(netdev)->interval = fqcodel.interval;
3325 fqcodel_get__(netdev)->flows = fqcodel.flows;
3326 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3327 return 0;
3328}
3329
3330static const struct tc_ops tc_ops_fqcodel = {
3331 "fq_codel", /* linux_name */
3332 "linux-fq_codel", /* ovs_name */
3333 FQCODEL_N_QUEUES, /* n_queues */
3334 fqcodel_tc_install,
3335 fqcodel_tc_load,
3336 fqcodel_tc_destroy,
3337 fqcodel_qdisc_get,
3338 fqcodel_qdisc_set,
3339 NULL,
3340 NULL,
3341 NULL,
3342 NULL,
3343 NULL
3344};
3345\f
3346/* SFQ traffic control class. */
3347
3348#define SFQ_N_QUEUES 0x0000
3349
3350struct sfq {
3351 struct tc tc;
3352 uint32_t quantum;
3353 uint32_t perturb;
3354};
3355
3356static struct sfq *
3357sfq_get__(const struct netdev *netdev_)
3358{
3359 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3360 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3361}
3362
3363static void
3364sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3365{
3366 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3367 struct sfq *sfq;
3368
3369 sfq = xmalloc(sizeof *sfq);
3370 tc_init(&sfq->tc, &tc_ops_sfq);
3371 sfq->perturb = perturb;
3372 sfq->quantum = quantum;
3373
3374 netdev->tc = &sfq->tc;
3375}
3376
3377static int
3378sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3379{
3380 struct tc_sfq_qopt opt;
3381 struct ofpbuf request;
3382 struct tcmsg *tcmsg;
3383 int mtu;
3384 int mtu_error, error;
3385 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3386
3387 tc_del_qdisc(netdev);
3388
3389 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3390 NLM_F_EXCL | NLM_F_CREATE, &request);
3391 if (!tcmsg) {
3392 return ENODEV;
3393 }
3394 tcmsg->tcm_handle = tc_make_handle(1, 0);
3395 tcmsg->tcm_parent = TC_H_ROOT;
3396
3397 memset(&opt, 0, sizeof opt);
3398 if (!quantum) {
3399 if (!mtu_error) {
3400 opt.quantum = mtu; /* Use the MTU; if it is unknown, 0 selects the kernel default. */
3401 }
3402 } else {
3403 opt.quantum = quantum;
3404 }
3405
3406 if (!perturb) {
3407 opt.perturb_period = 10;
3408 } else {
3409 opt.perturb_period = perturb;
3410 }
3411
3412 nl_msg_put_string(&request, TCA_KIND, "sfq");
3413 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3414
3415 error = tc_transact(&request, NULL);
3416 if (error) {
3417 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3418 "quantum %u, perturb %u error %d(%s)",
3419 netdev_get_name(netdev),
3420 opt.quantum, opt.perturb_period,
3421 error, ovs_strerror(error));
3422 }
3423 return error;
3424}
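/* Usage sketch (not part of the original source; the helper name is
 * hypothetical): zero quantum and perturb select the fallbacks above, i.e.
 * quantum = MTU when the MTU is known (otherwise the kernel default) and a
 * 10 second perturbation period, roughly
 * "tc qdisc add dev <dev> root sfq perturb 10". */
static int
example_sfq_defaults__(struct netdev *netdev)
{
    return sfq_setup_qdisc__(netdev, 0, 0);
}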
3425
3426static void
3427sfq_parse_qdisc_details__(struct netdev *netdev,
3428 const struct smap *details, struct sfq *sfq)
3429{
13c1637f
BP
3430 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3431 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3432
677d9158
JV
3433 if (!sfq->perturb) {
3434 sfq->perturb = 10;
3435 }
3436
3437 if (!sfq->quantum) {
13c1637f
BP
3438 int mtu;
3439 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3440 sfq->quantum = mtu;
3441 } else {
3442 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3443 "device without mtu");
677d9158
JV
3444 }
3445 }
3446}
3447
3448static int
3449sfq_tc_install(struct netdev *netdev, const struct smap *details)
3450{
3451 int error;
3452 struct sfq sfq;
3453
3454 sfq_parse_qdisc_details__(netdev, details, &sfq);
3455 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3456 if (!error) {
3457 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3458 }
3459 return error;
3460}
3461
3462static int
3463sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3464{
3465 const struct tc_sfq_qopt *sfq;
3466 struct nlattr *nlattr;
3467 const char *kind;
3468 int error;
3469
3470 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3471 if (error == 0) {
3472 sfq = nl_attr_get(nlattr);
3473 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
3474 return 0;
3475 }
3476
3477 return error;
3478}
3479
3480static void
3481sfq_tc_destroy(struct tc *tc)
3482{
3483 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3484 tc_destroy(tc);
3485 free(sfq);
3486}
3487
3488static int
3489sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3490{
3491 const struct sfq *sfq = sfq_get__(netdev);
3492 smap_add_format(details, "quantum", "%u", sfq->quantum);
3493 smap_add_format(details, "perturb", "%u", sfq->perturb);
3494 return 0;
3495}
3496
3497static int
3498sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3499{
3500 struct sfq sfq;
3501
3502 sfq_parse_qdisc_details__(netdev, details, &sfq);
3503 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3504 sfq_get__(netdev)->quantum = sfq.quantum;
3505 sfq_get__(netdev)->perturb = sfq.perturb;
3506 return 0;
3507}
3508
3509static const struct tc_ops tc_ops_sfq = {
3510 "sfq", /* linux_name */
3511 "linux-sfq", /* ovs_name */
3512 SFQ_N_QUEUES, /* n_queues */
3513 sfq_tc_install,
3514 sfq_tc_load,
3515 sfq_tc_destroy,
3516 sfq_qdisc_get,
3517 sfq_qdisc_set,
3518 NULL,
3519 NULL,
3520 NULL,
3521 NULL,
3522 NULL
3523};
3524\f
c1c9c9c4 3525/* HTB traffic control class. */
559843ed 3526
c1c9c9c4 3527#define HTB_N_QUEUES 0xf000
4f631ccd 3528#define HTB_RATE2QUANTUM 10
8b61709d 3529
c1c9c9c4
BP
3530struct htb {
3531 struct tc tc;
3532 unsigned int max_rate; /* In bytes/s. */
3533};
8b61709d 3534
c1c9c9c4 3535struct htb_class {
93b13be8 3536 struct tc_queue tc_queue;
c1c9c9c4
BP
3537 unsigned int min_rate; /* In bytes/s. */
3538 unsigned int max_rate; /* In bytes/s. */
3539 unsigned int burst; /* In bytes. */
3540 unsigned int priority; /* Lower values are higher priorities. */
3541};
8b61709d 3542
c1c9c9c4 3543static struct htb *
b5d57fc8 3544htb_get__(const struct netdev *netdev_)
c1c9c9c4 3545{
b5d57fc8
BP
3546 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3547 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3548}
3549
24045e35 3550static void
b5d57fc8 3551htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3552{
b5d57fc8 3553 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3554 struct htb *htb;
3555
3556 htb = xmalloc(sizeof *htb);
3557 tc_init(&htb->tc, &tc_ops_htb);
3558 htb->max_rate = max_rate;
3559
b5d57fc8 3560 netdev->tc = &htb->tc;
c1c9c9c4
BP
3561}
3562
3563/* Create an HTB qdisc.
3564 *
a339aa81 3565 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3566static int
3567htb_setup_qdisc__(struct netdev *netdev)
3568{
3569 size_t opt_offset;
3570 struct tc_htb_glob opt;
3571 struct ofpbuf request;
3572 struct tcmsg *tcmsg;
3573
3574 tc_del_qdisc(netdev);
3575
3576 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3577 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3578 if (!tcmsg) {
3579 return ENODEV;
3580 }
c1c9c9c4
BP
3581 tcmsg->tcm_handle = tc_make_handle(1, 0);
3582 tcmsg->tcm_parent = TC_H_ROOT;
3583
3584 nl_msg_put_string(&request, TCA_KIND, "htb");
3585
3586 memset(&opt, 0, sizeof opt);
4f631ccd 3587 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3588 opt.version = 3;
4ecf12d5 3589 opt.defcls = 1;
c1c9c9c4
BP
3590
3591 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3592 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3593 nl_msg_end_nested(&request, opt_offset);
3594
3595 return tc_transact(&request, NULL);
3596}
3597
3598/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3599 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3600static int
3601htb_setup_class__(struct netdev *netdev, unsigned int handle,
3602 unsigned int parent, struct htb_class *class)
3603{
3604 size_t opt_offset;
3605 struct tc_htb_opt opt;
3606 struct ofpbuf request;
3607 struct tcmsg *tcmsg;
3608 int error;
3609 int mtu;
3610
73371c09 3611 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3612 if (error) {
f915f1a8
BP
3613 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3614 netdev_get_name(netdev));
9b020780 3615 return error;
f915f1a8 3616 }
c1c9c9c4
BP
3617
3618 memset(&opt, 0, sizeof opt);
3619 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3620 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3621 /* Makes sure the quantum is at least MTU. Setting quantum will
3622 * make htb ignore the r2q for this class. */
3623 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3624 opt.quantum = mtu;
3625 }
c1c9c9c4
BP
3626 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3627 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3628 opt.prio = class->priority;
3629
3630 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
3631 if (!tcmsg) {
3632 return ENODEV;
3633 }
c1c9c9c4
BP
3634 tcmsg->tcm_handle = handle;
3635 tcmsg->tcm_parent = parent;
3636
3637 nl_msg_put_string(&request, TCA_KIND, "htb");
3638 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3639 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3640 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3641 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3642 nl_msg_end_nested(&request, opt_offset);
3643
3644 error = tc_transact(&request, NULL);
3645 if (error) {
3646 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3647 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3648 netdev_get_name(netdev),
3649 tc_get_major(handle), tc_get_minor(handle),
3650 tc_get_major(parent), tc_get_minor(parent),
3651 class->min_rate, class->max_rate,
10a89ef0 3652 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3653 }
3654 return error;
3655}
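/* Worked example (sketch, not part of the original source; the helper name is
 * hypothetical): the quantum rule applied above.  With mtu == 1500 and
 * HTB_RATE2QUANTUM == 10, a 100 kbit/s class (12,500 bytes/s) computes
 * 12,500 / 10 = 1,250 < 1,500, so opt.quantum is pinned to the MTU; a
 * 1 Mbit/s class (125,000 bytes/s) computes 12,500 >= 1,500 and quantum is
 * left at 0, so the kernel derives it from rate2quantum instead. */
static unsigned int
example_htb_quantum__(unsigned int min_rate, unsigned int mtu)
{
    /* Approximately the quantum HTB will end up using for the class. */
    unsigned int quantum = min_rate / HTB_RATE2QUANTUM;
    return quantum < mtu ? mtu : quantum;
}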
3656
3657/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3658 * description of them into 'details'. The description complies with the
3659 * specification given in the vswitch database documentation for linux-htb
3660 * queue details. */
3661static int
3662htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3663{
3664 static const struct nl_policy tca_htb_policy[] = {
3665 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3666 .min_len = sizeof(struct tc_htb_opt) },
3667 };
3668
3669 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3670 const struct tc_htb_opt *htb;
3671
3672 if (!nl_parse_nested(nl_options, tca_htb_policy,
3673 attrs, ARRAY_SIZE(tca_htb_policy))) {
3674 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3675 return EPROTO;
3676 }
3677
3678 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3679 class->min_rate = htb->rate.rate;
3680 class->max_rate = htb->ceil.rate;
3681 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3682 class->priority = htb->prio;
3683 return 0;
3684}
3685
3686static int
3687htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3688 struct htb_class *options,
3689 struct netdev_queue_stats *stats)
3690{
3691 struct nlattr *nl_options;
3692 unsigned int handle;
3693 int error;
3694
3695 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3696 if (!error && queue_id) {
17ee3c1f
BP
3697 unsigned int major = tc_get_major(handle);
3698 unsigned int minor = tc_get_minor(handle);
3699 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3700 *queue_id = minor - 1;
c1c9c9c4
BP
3701 } else {
3702 error = EPROTO;
3703 }
3704 }
3705 if (!error && options) {
3706 error = htb_parse_tca_options__(nl_options, options);
3707 }
3708 return error;
3709}
3710
3711static void
73371c09 3712htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3713 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3714{
73371c09 3715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 3716
13c1637f 3717 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 3718 if (!hc->max_rate) {
a00ca915 3719 enum netdev_features current;
c1c9c9c4 3720
73371c09
BP
3721 netdev_linux_read_features(netdev);
3722 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3723 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3724 }
3725 hc->min_rate = hc->max_rate;
3726 hc->burst = 0;
3727 hc->priority = 0;
3728}
3729
3730static int
3731htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3732 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3733{
3734 const struct htb *htb = htb_get__(netdev);
9b020780 3735 int mtu, error;
c1c9c9c4 3736
73371c09 3737 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3738 if (error) {
f915f1a8
BP
3739 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3740 netdev_get_name(netdev));
9b020780 3741 return error;
f915f1a8
BP
3742 }
3743
4f104611
EJ
3744 /* HTB requires at least an mtu sized min-rate to send any traffic even
3745 * on uncongested links. */
13c1637f 3746 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 3747 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3748 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3749
3750 /* max-rate */
13c1637f
BP
3751 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
3752 if (!hc->max_rate) {
3753 hc->max_rate = htb->max_rate;
3754 }
c1c9c9c4
BP
3755 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3756 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3757
3758 /* burst
3759 *
3760 * According to hints in the documentation that I've read, it is important
3761 * that 'burst' be at least as big as the largest frame that might be
3762 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3763 * but having it a bit too small is a problem. Since netdev_get_mtu()
3764 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3765 * the MTU. We actually add 64, instead of 14, as a guard against
3766 * additional headers getting tacked on somewhere that we're not aware of. */
13c1637f 3767 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
3768 hc->burst = MAX(hc->burst, mtu + 64);
3769
3770 /* priority */
13c1637f 3771 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
3772
3773 return 0;
3774}
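/* Worked example (sketch, not part of the original source; the helper name is
 * hypothetical): the burst floor computed above.  Since netdev_get_mtu() does
 * not include the Ethernet header, an unspecified "burst" on an MTU-1500
 * device comes out as MAX(0, 1500 + 64) == 1564 bytes, enough for one
 * maximum-size frame plus header slack. */
static unsigned int
example_htb_burst_floor__(unsigned int burst_bytes, unsigned int mtu)
{
    return MAX(burst_bytes, mtu + 64);
}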
3775
3776static int
3777htb_query_class__(const struct netdev *netdev, unsigned int handle,
3778 unsigned int parent, struct htb_class *options,
3779 struct netdev_queue_stats *stats)
3780{
3781 struct ofpbuf *reply;
3782 int error;
3783
3784 error = tc_query_class(netdev, handle, parent, &reply);
3785 if (!error) {
3786 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3787 ofpbuf_delete(reply);
3788 }
3789 return error;
3790}
3791
3792static int
79f1cbe9 3793htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3794{
3795 int error;
3796
3797 error = htb_setup_qdisc__(netdev);
3798 if (!error) {
3799 struct htb_class hc;
3800
3801 htb_parse_qdisc_details__(netdev, details, &hc);
3802 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3803 tc_make_handle(1, 0), &hc);
3804 if (!error) {
3805 htb_install__(netdev, hc.max_rate);
3806 }
3807 }
3808 return error;
3809}
3810
93b13be8
BP
3811static struct htb_class *
3812htb_class_cast__(const struct tc_queue *queue)
3813{
3814 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3815}
3816
c1c9c9c4
BP
3817static void
3818htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3819 const struct htb_class *hc)
3820{
3821 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3822 size_t hash = hash_int(queue_id, 0);
3823 struct tc_queue *queue;
c1c9c9c4
BP
3824 struct htb_class *hcp;
3825
93b13be8
BP
3826 queue = tc_find_queue__(netdev, queue_id, hash);
3827 if (queue) {
3828 hcp = htb_class_cast__(queue);
3829 } else {
c1c9c9c4 3830 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3831 queue = &hcp->tc_queue;
3832 queue->queue_id = queue_id;
6dc34a0d 3833 queue->created = time_msec();
93b13be8 3834 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3835 }
93b13be8
BP
3836
3837 hcp->min_rate = hc->min_rate;
3838 hcp->max_rate = hc->max_rate;
3839 hcp->burst = hc->burst;
3840 hcp->priority = hc->priority;
c1c9c9c4
BP
3841}
3842
3843static int
3844htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3845{
c1c9c9c4 3846 struct ofpbuf msg;
d57695d7 3847 struct queue_dump_state state;
c1c9c9c4 3848 struct htb_class hc;
c1c9c9c4
BP
3849
3850 /* Get qdisc options. */
3851 hc.max_rate = 0;
3852 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3853 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3854
3855 /* Get queues. */
d57695d7 3856 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3857 return ENODEV;
3858 }
d57695d7 3859 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3860 unsigned int queue_id;
3861
3862 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3863 htb_update_queue__(netdev, queue_id, &hc);
3864 }
3865 }
d57695d7 3866 finish_queue_dump(&state);
c1c9c9c4
BP
3867
3868 return 0;
3869}
3870
3871static void
3872htb_tc_destroy(struct tc *tc)
3873{
3874 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 3875 struct htb_class *hc;
c1c9c9c4 3876
4ec3d7c7 3877 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
3878 free(hc);
3879 }
3880 tc_destroy(tc);
3881 free(htb);
3882}
3883
3884static int
79f1cbe9 3885htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3886{
3887 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3888 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3889 return 0;
3890}
3891
3892static int
79f1cbe9 3893htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3894{
3895 struct htb_class hc;
3896 int error;
3897
3898 htb_parse_qdisc_details__(netdev, details, &hc);
3899 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3900 tc_make_handle(1, 0), &hc);
3901 if (!error) {
3902 htb_get__(netdev)->max_rate = hc.max_rate;
3903 }
3904 return error;
3905}
3906
3907static int
93b13be8 3908htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3909 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3910{
93b13be8 3911 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3912
79f1cbe9 3913 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3914 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3915 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3916 }
79f1cbe9 3917 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3918 if (hc->priority) {
79f1cbe9 3919 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3920 }
3921 return 0;
3922}
3923
3924static int
3925htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3926 const struct smap *details)
c1c9c9c4
BP
3927{
3928 struct htb_class hc;
3929 int error;
3930
3931 error = htb_parse_class_details__(netdev, details, &hc);
3932 if (error) {
3933 return error;
3934 }
3935
17ee3c1f 3936 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3937 tc_make_handle(1, 0xfffe), &hc);
3938 if (error) {
3939 return error;
3940 }
3941
3942 htb_update_queue__(netdev, queue_id, &hc);
3943 return 0;
3944}
3945
3946static int
93b13be8 3947htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3948{
93b13be8 3949 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3950 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3951 int error;
3952
93b13be8 3953 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3954 if (!error) {
93b13be8 3955 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3956 free(hc);
c1c9c9c4
BP
3957 }
3958 return error;
3959}
3960
3961static int
93b13be8 3962htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3963 struct netdev_queue_stats *stats)
3964{
93b13be8 3965 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3966 tc_make_handle(1, 0xfffe), NULL, stats);
3967}
3968
3969static int
3970htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3971 const struct ofpbuf *nlmsg,
3972 netdev_dump_queue_stats_cb *cb, void *aux)
3973{
3974 struct netdev_queue_stats stats;
17ee3c1f 3975 unsigned int handle, major, minor;
c1c9c9c4
BP
3976 int error;
3977
3978 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3979 if (error) {
3980 return error;
3981 }
3982
17ee3c1f
BP
3983 major = tc_get_major(handle);
3984 minor = tc_get_minor(handle);
3985 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3986 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3987 }
3988 return 0;
3989}
3990
3991static const struct tc_ops tc_ops_htb = {
3992 "htb", /* linux_name */
3993 "linux-htb", /* ovs_name */
3994 HTB_N_QUEUES, /* n_queues */
3995 htb_tc_install,
3996 htb_tc_load,
3997 htb_tc_destroy,
3998 htb_qdisc_get,
3999 htb_qdisc_set,
4000 htb_class_get,
4001 htb_class_set,
4002 htb_class_delete,
4003 htb_class_get_stats,
4004 htb_class_dump_stats
4005};
4006\f
a339aa81
EJ
4007/* "linux-hfsc" traffic control class. */
4008
4009#define HFSC_N_QUEUES 0xf000
4010
4011struct hfsc {
4012 struct tc tc;
4013 uint32_t max_rate;
4014};
4015
4016struct hfsc_class {
4017 struct tc_queue tc_queue;
4018 uint32_t min_rate;
4019 uint32_t max_rate;
4020};
4021
4022static struct hfsc *
b5d57fc8 4023hfsc_get__(const struct netdev *netdev_)
a339aa81 4024{
b5d57fc8
BP
4025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4026 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4027}
4028
4029static struct hfsc_class *
4030hfsc_class_cast__(const struct tc_queue *queue)
4031{
4032 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4033}
4034
24045e35 4035static void
b5d57fc8 4036hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4037{
b5d57fc8 4038 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4039 struct hfsc *hfsc;
4040
a339aa81
EJ
4041 hfsc = xmalloc(sizeof *hfsc);
4042 tc_init(&hfsc->tc, &tc_ops_hfsc);
4043 hfsc->max_rate = max_rate;
b5d57fc8 4044 netdev->tc = &hfsc->tc;
a339aa81
EJ
4045}
4046
4047static void
4048hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4049 const struct hfsc_class *hc)
4050{
4051 size_t hash;
4052 struct hfsc *hfsc;
4053 struct hfsc_class *hcp;
4054 struct tc_queue *queue;
4055
4056 hfsc = hfsc_get__(netdev);
4057 hash = hash_int(queue_id, 0);
4058
4059 queue = tc_find_queue__(netdev, queue_id, hash);
4060 if (queue) {
4061 hcp = hfsc_class_cast__(queue);
4062 } else {
4063 hcp = xmalloc(sizeof *hcp);
4064 queue = &hcp->tc_queue;
4065 queue->queue_id = queue_id;
6dc34a0d 4066 queue->created = time_msec();
a339aa81
EJ
4067 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4068 }
4069
4070 hcp->min_rate = hc->min_rate;
4071 hcp->max_rate = hc->max_rate;
4072}
4073
4074static int
4075hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4076{
4077 const struct tc_service_curve *rsc, *fsc, *usc;
4078 static const struct nl_policy tca_hfsc_policy[] = {
4079 [TCA_HFSC_RSC] = {
4080 .type = NL_A_UNSPEC,
4081 .optional = false,
4082 .min_len = sizeof(struct tc_service_curve),
4083 },
4084 [TCA_HFSC_FSC] = {
4085 .type = NL_A_UNSPEC,
4086 .optional = false,
4087 .min_len = sizeof(struct tc_service_curve),
4088 },
4089 [TCA_HFSC_USC] = {
4090 .type = NL_A_UNSPEC,
4091 .optional = false,
4092 .min_len = sizeof(struct tc_service_curve),
4093 },
4094 };
4095 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4096
4097 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4098 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4099 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4100 return EPROTO;
4101 }
4102
4103 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4104 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4105 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4106
4107 if (rsc->m1 != 0 || rsc->d != 0 ||
4108 fsc->m1 != 0 || fsc->d != 0 ||
4109 usc->m1 != 0 || usc->d != 0) {
4110 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4111 "Non-linear service curves are not supported.");
4112 return EPROTO;
4113 }
4114
4115 if (rsc->m2 != fsc->m2) {
4116 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4117 "Real-time service curves are not supported ");
4118 return EPROTO;
4119 }
4120
4121 if (rsc->m2 > usc->m2) {
4122 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4123 "Min-rate service curve is greater than "
4124 "the max-rate service curve.");
4125 return EPROTO;
4126 }
4127
4128 class->min_rate = fsc->m2;
4129 class->max_rate = usc->m2;
4130 return 0;
4131}
4132
4133static int
4134hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4135 struct hfsc_class *options,
4136 struct netdev_queue_stats *stats)
4137{
4138 int error;
4139 unsigned int handle;
4140 struct nlattr *nl_options;
4141
4142 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4143 if (error) {
4144 return error;
4145 }
4146
4147 if (queue_id) {
4148 unsigned int major, minor;
4149
4150 major = tc_get_major(handle);
4151 minor = tc_get_minor(handle);
4152 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4153 *queue_id = minor - 1;
4154 } else {
4155 return EPROTO;
4156 }
4157 }
4158
4159 if (options) {
4160 error = hfsc_parse_tca_options__(nl_options, options);
4161 }
4162
4163 return error;
4164}
4165
4166static int
4167hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4168 unsigned int parent, struct hfsc_class *options,
4169 struct netdev_queue_stats *stats)
4170{
4171 int error;
4172 struct ofpbuf *reply;
4173
4174 error = tc_query_class(netdev, handle, parent, &reply);
4175 if (error) {
4176 return error;
4177 }
4178
4179 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4180 ofpbuf_delete(reply);
4181 return error;
4182}
4183
4184static void
73371c09 4185hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4186 struct hfsc_class *class)
4187{
73371c09 4188 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4189
13c1637f 4190 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4191 if (!max_rate) {
a00ca915 4192 enum netdev_features current;
a339aa81 4193
73371c09
BP
4194 netdev_linux_read_features(netdev);
4195 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4196 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4197 }
4198
4199 class->min_rate = max_rate;
4200 class->max_rate = max_rate;
4201}
4202
4203static int
4204hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4205 const struct smap *details,
a339aa81
EJ
4206 struct hfsc_class * class)
4207{
4208 const struct hfsc *hfsc;
4209 uint32_t min_rate, max_rate;
a339aa81
EJ
4210
4211 hfsc = hfsc_get__(netdev);
a339aa81 4212
13c1637f 4213 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4214 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4215 min_rate = MIN(min_rate, hfsc->max_rate);
4216
13c1637f 4217 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4218 max_rate = MAX(max_rate, min_rate);
4219 max_rate = MIN(max_rate, hfsc->max_rate);
4220
4221 class->min_rate = min_rate;
4222 class->max_rate = max_rate;
4223
4224 return 0;
4225}
4226
4227/* Create an HFSC qdisc.
4228 *
4229 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4230static int
4231hfsc_setup_qdisc__(struct netdev * netdev)
4232{
4233 struct tcmsg *tcmsg;
4234 struct ofpbuf request;
4235 struct tc_hfsc_qopt opt;
4236
4237 tc_del_qdisc(netdev);
4238
4239 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4240 NLM_F_EXCL | NLM_F_CREATE, &request);
4241
4242 if (!tcmsg) {
4243 return ENODEV;
4244 }
4245
4246 tcmsg->tcm_handle = tc_make_handle(1, 0);
4247 tcmsg->tcm_parent = TC_H_ROOT;
4248
4249 memset(&opt, 0, sizeof opt);
4250 opt.defcls = 1;
4251
4252 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4253 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4254
4255 return tc_transact(&request, NULL);
4256}
4257
4258/* Create an HFSC class.
4259 *
4260 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4261 * sc rate <min_rate> ul rate <max_rate>" */
4262static int
4263hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4264 unsigned int parent, struct hfsc_class *class)
4265{
4266 int error;
4267 size_t opt_offset;
4268 struct tcmsg *tcmsg;
4269 struct ofpbuf request;
4270 struct tc_service_curve min, max;
4271
4272 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4273
4274 if (!tcmsg) {
4275 return ENODEV;
4276 }
4277
4278 tcmsg->tcm_handle = handle;
4279 tcmsg->tcm_parent = parent;
4280
4281 min.m1 = 0;
4282 min.d = 0;
4283 min.m2 = class->min_rate;
4284
4285 max.m1 = 0;
4286 max.d = 0;
4287 max.m2 = class->max_rate;
4288
4289 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4290 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4291 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4292 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4293 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4294 nl_msg_end_nested(&request, opt_offset);
4295
4296 error = tc_transact(&request, NULL);
4297 if (error) {
4298 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4299 "min-rate %ubps, max-rate %ubps (%s)",
4300 netdev_get_name(netdev),
4301 tc_get_major(handle), tc_get_minor(handle),
4302 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4303 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4304 }
4305
4306 return error;
4307}
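/* Sketch (not part of the original source; the helper name is hypothetical):
 * the linear service curves that hfsc_setup_class__() encodes for a class
 * with min-rate 1 Mbit/s and max-rate 2 Mbit/s (125,000 and 250,000 bytes/s).
 * m1 and d stay zero, which is exactly what hfsc_parse_tca_options__() above
 * insists on when reading the configuration back. */
static void
example_hfsc_curves__(void)
{
    struct tc_service_curve sc_min = { .m1 = 0, .d = 0, .m2 = 125000 };
    struct tc_service_curve sc_max = { .m1 = 0, .d = 0, .m2 = 250000 };

    /* TCA_HFSC_RSC and TCA_HFSC_FSC both carry 'sc_min'; TCA_HFSC_USC
     * carries 'sc_max'. */
    ovs_assert(sc_min.m2 <= sc_max.m2);
}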
4308
4309static int
79f1cbe9 4310hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4311{
4312 int error;
4313 struct hfsc_class class;
4314
4315 error = hfsc_setup_qdisc__(netdev);
4316
4317 if (error) {
4318 return error;
4319 }
4320
4321 hfsc_parse_qdisc_details__(netdev, details, &class);
4322 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4323 tc_make_handle(1, 0), &class);
4324
4325 if (error) {
4326 return error;
4327 }
4328
4329 hfsc_install__(netdev, class.max_rate);
4330 return 0;
4331}
4332
4333static int
4334hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4335{
4336 struct ofpbuf msg;
d57695d7 4337 struct queue_dump_state state;
a339aa81
EJ
4338 struct hfsc_class hc;
4339
4340 hc.max_rate = 0;
4341 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4342 hfsc_install__(netdev, hc.max_rate);
a339aa81 4343
d57695d7 4344 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4345 return ENODEV;
4346 }
4347
d57695d7 4348 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4349 unsigned int queue_id;
4350
4351 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4352 hfsc_update_queue__(netdev, queue_id, &hc);
4353 }
4354 }
4355
d57695d7 4356 finish_queue_dump(&state);
a339aa81
EJ
4357 return 0;
4358}
4359
4360static void
4361hfsc_tc_destroy(struct tc *tc)
4362{
4363 struct hfsc *hfsc;
4364 struct hfsc_class *hc, *next;
4365
4366 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4367
4368 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4369 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4370 free(hc);
4371 }
4372
4373 tc_destroy(tc);
4374 free(hfsc);
4375}
4376
4377static int
79f1cbe9 4378hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4379{
4380 const struct hfsc *hfsc;
4381 hfsc = hfsc_get__(netdev);
79f1cbe9 4382 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4383 return 0;
4384}
4385
4386static int
79f1cbe9 4387hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4388{
4389 int error;
4390 struct hfsc_class class;
4391
4392 hfsc_parse_qdisc_details__(netdev, details, &class);
4393 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4394 tc_make_handle(1, 0), &class);
4395
4396 if (!error) {
4397 hfsc_get__(netdev)->max_rate = class.max_rate;
4398 }
4399
4400 return error;
4401}
4402
4403static int
4404hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4405 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4406{
4407 const struct hfsc_class *hc;
4408
4409 hc = hfsc_class_cast__(queue);
79f1cbe9 4410 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4411 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4412 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4413 }
4414 return 0;
4415}
4416
4417static int
4418hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4419 const struct smap *details)
a339aa81
EJ
4420{
4421 int error;
4422 struct hfsc_class class;
4423
4424 error = hfsc_parse_class_details__(netdev, details, &class);
4425 if (error) {
4426 return error;
4427 }
4428
4429 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4430 tc_make_handle(1, 0xfffe), &class);
4431 if (error) {
4432 return error;
4433 }
4434
4435 hfsc_update_queue__(netdev, queue_id, &class);
4436 return 0;
4437}
4438
4439static int
4440hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4441{
4442 int error;
4443 struct hfsc *hfsc;
4444 struct hfsc_class *hc;
4445
4446 hc = hfsc_class_cast__(queue);
4447 hfsc = hfsc_get__(netdev);
4448
4449 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4450 if (!error) {
4451 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4452 free(hc);
4453 }
4454 return error;
4455}
4456
4457static int
4458hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4459 struct netdev_queue_stats *stats)
4460{
4461 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4462 tc_make_handle(1, 0xfffe), NULL, stats);
4463}
4464
4465static int
4466hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4467 const struct ofpbuf *nlmsg,
4468 netdev_dump_queue_stats_cb *cb, void *aux)
4469{
4470 struct netdev_queue_stats stats;
4471 unsigned int handle, major, minor;
4472 int error;
4473
4474 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4475 if (error) {
4476 return error;
4477 }
4478
4479 major = tc_get_major(handle);
4480 minor = tc_get_minor(handle);
4481 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4482 (*cb)(minor - 1, &stats, aux);
4483 }
4484 return 0;
4485}
4486
4487static const struct tc_ops tc_ops_hfsc = {
4488 "hfsc", /* linux_name */
4489 "linux-hfsc", /* ovs_name */
4490 HFSC_N_QUEUES, /* n_queues */
4491 hfsc_tc_install, /* tc_install */
4492 hfsc_tc_load, /* tc_load */
4493 hfsc_tc_destroy, /* tc_destroy */
4494 hfsc_qdisc_get, /* qdisc_get */
4495 hfsc_qdisc_set, /* qdisc_set */
4496 hfsc_class_get, /* class_get */
4497 hfsc_class_set, /* class_set */
4498 hfsc_class_delete, /* class_delete */
4499 hfsc_class_get_stats, /* class_get_stats */
4500 hfsc_class_dump_stats /* class_dump_stats */
4501};
4502\f
6cf888b8
BS
4503/* "linux-noop" traffic control class. */
4504
4505static void
4506noop_install__(struct netdev *netdev_)
4507{
4508 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4509 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4510
4511 netdev->tc = CONST_CAST(struct tc *, &tc);
4512}
4513
4514static int
4515noop_tc_install(struct netdev *netdev,
4516 const struct smap *details OVS_UNUSED)
4517{
4518 noop_install__(netdev);
4519 return 0;
4520}
4521
4522static int
4523noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4524{
4525 noop_install__(netdev);
4526 return 0;
4527}
4528
4529static const struct tc_ops tc_ops_noop = {
4530 NULL, /* linux_name */
4531 "linux-noop", /* ovs_name */
4532 0, /* n_queues */
4533 noop_tc_install,
4534 noop_tc_load,
4535 NULL, /* tc_destroy */
4536 NULL, /* qdisc_get */
4537 NULL, /* qdisc_set */
4538 NULL, /* class_get */
4539 NULL, /* class_set */
4540 NULL, /* class_delete */
4541 NULL, /* class_get_stats */
4542 NULL /* class_dump_stats */
4543};
4544\f
c1c9c9c4
BP
4545/* "linux-default" traffic control class.
4546 *
4547 * This class represents the default, unnamed Linux qdisc. It corresponds to
4548 * the "" (empty string) QoS type in the OVS database. */
4549
4550static void
b5d57fc8 4551default_install__(struct netdev *netdev_)
c1c9c9c4 4552{
b5d57fc8 4553 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4554 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4555
559eb230
BP
4556 /* Nothing but a tc class implementation is allowed to write to a tc. This
4557 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4558 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4559}
4560
4561static int
4562default_tc_install(struct netdev *netdev,
79f1cbe9 4563 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4564{
4565 default_install__(netdev);
4566 return 0;
4567}
4568
4569static int
4570default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4571{
4572 default_install__(netdev);
4573 return 0;
4574}
4575
4576static const struct tc_ops tc_ops_default = {
4577 NULL, /* linux_name */
4578 "", /* ovs_name */
4579 0, /* n_queues */
4580 default_tc_install,
4581 default_tc_load,
4582 NULL, /* tc_destroy */
4583 NULL, /* qdisc_get */
4584 NULL, /* qdisc_set */
4585 NULL, /* class_get */
4586 NULL, /* class_set */
4587 NULL, /* class_delete */
4588 NULL, /* class_get_stats */
4589 NULL /* class_dump_stats */
4590};
4591\f
4592/* "linux-other" traffic control class.
4593 *
4594 * Represents a qdisc configured outside of OVS that this module does not otherwise recognize; it can be reported but not configured. */
4595
4596static int
b5d57fc8 4597other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4598{
b5d57fc8 4599 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4600 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4601
559eb230
BP
4602 /* Nothing but a tc class implementation is allowed to write to a tc. This
4603 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4604 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4605 return 0;
4606}
4607
4608static const struct tc_ops tc_ops_other = {
4609 NULL, /* linux_name */
4610 "linux-other", /* ovs_name */
4611 0, /* n_queues */
4612 NULL, /* tc_install */
4613 other_tc_load,
4614 NULL, /* tc_destroy */
4615 NULL, /* qdisc_get */
4616 NULL, /* qdisc_set */
4617 NULL, /* class_get */
4618 NULL, /* class_set */
4619 NULL, /* class_delete */
4620 NULL, /* class_get_stats */
4621 NULL /* class_dump_stats */
4622};
4623\f
4624/* Traffic control. */
4625
4626/* Number of kernel "tc" ticks per second. */
4627static double ticks_per_s;
4628
4629/* Number of kernel "jiffies" per second. This is used for the purpose of
4630 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4631 * one jiffy's worth of data.
4632 *
4633 * There are two possibilities here:
4634 *
4635 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4636 * approximate range of 100 to 1024. That means that we really need to
4637 * make sure that the qdisc can buffer that much data.
4638 *
4639 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4640 * has finely granular timers and there's no need to fudge additional room
4641 * for buffers. (There's no extra effort needed to implement that: the
4642 * large 'buffer_hz' is used as a divisor, so practically any number will
4643 * come out as 0 in the division. Small integer results in the case of
4644 * really high dividends won't have any real effect anyhow.)
4645 */
4646static unsigned int buffer_hz;
4647
4648/* Returns tc handle 'major':'minor'. */
4649static unsigned int
4650tc_make_handle(unsigned int major, unsigned int minor)
4651{
4652 return TC_H_MAKE(major << 16, minor);
4653}
4654
4655/* Returns the major number from 'handle'. */
4656static unsigned int
4657tc_get_major(unsigned int handle)
4658{
4659 return TC_H_MAJ(handle) >> 16;
4660}
4661
4662/* Returns the minor number from 'handle'. */
4663static unsigned int
4664tc_get_minor(unsigned int handle)
4665{
4666 return TC_H_MIN(handle);
4667}
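/* Illustrative sketch (not part of the original source; the helper name is
 * hypothetical): the handles used for the root qdiscs and default classes
 * above, spelled out.  "1:0" is 0x00010000 and the default class "1:fffe" is
 * 0x0001fffe, with the major number in the upper 16 bits as TC_H_MAKE()
 * arranges. */
static void
example_tc_handles__(void)
{
    unsigned int root = tc_make_handle(1, 0);
    unsigned int defclass = tc_make_handle(1, 0xfffe);

    ovs_assert(root == 0x00010000);
    ovs_assert(defclass == 0x0001fffe);
    ovs_assert(tc_get_major(defclass) == 1);
    ovs_assert(tc_get_minor(defclass) == 0xfffe);
}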
4668
4669static struct tcmsg *
4670tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4671 struct ofpbuf *request)
4672{
4673 struct tcmsg *tcmsg;
4674 int ifindex;
4675 int error;
4676
4677 error = get_ifindex(netdev, &ifindex);
4678 if (error) {
4679 return NULL;
4680 }
4681
4682 ofpbuf_init(request, 512);
4683 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4684 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4685 tcmsg->tcm_family = AF_UNSPEC;
4686 tcmsg->tcm_ifindex = ifindex;
4687 /* Caller should fill in tcmsg->tcm_handle. */
4688 /* Caller should fill in tcmsg->tcm_parent. */
4689
4690 return tcmsg;
4691}
4692
4693static int
4694tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4695{
a88b4e04 4696 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
4697 ofpbuf_uninit(request);
4698 return error;
4699}
4700
f8500004
JP
4701/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4702 * policing configuration.
4703 *
4704 * This function is equivalent to running the following when 'add' is true:
4705 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4706 *
4707 * This function is equivalent to running the following when 'add' is false:
4708 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4709 *
4710 * The configuration and stats may be seen with the following command:
4711 * /sbin/tc -s qdisc show dev <devname>
4712 *
4713 * Returns 0 if successful, otherwise a positive errno value.
4714 */
4715static int
4716tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4717{
4718 struct ofpbuf request;
4719 struct tcmsg *tcmsg;
4720 int error;
4721 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4722 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4723
4724 tcmsg = tc_make_request(netdev, type, flags, &request);
4725 if (!tcmsg) {
4726 return ENODEV;
4727 }
4728 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4729 tcmsg->tcm_parent = TC_H_INGRESS;
4730 nl_msg_put_string(&request, TCA_KIND, "ingress");
4731 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4732
4733 error = tc_transact(&request, NULL);
4734 if (error) {
4735 /* If we're deleting the qdisc, don't worry about some of the
4736 * error conditions. */
4737 if (!add && (error == ENOENT || error == EINVAL)) {
4738 return 0;
4739 }
4740 return error;
4741 }
4742
4743 return 0;
4744}
4745
4746/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4747 * of 'kbits_burst'.
4748 *
4749 * This function is equivalent to running:
4750 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4751 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4752 * mtu 65535 drop
4753 *
4754 * The configuration and stats may be seen with the following command:
c7952afb 4755 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4756 *
4757 * Returns 0 if successful, otherwise a positive errno value.
4758 */
4759static int
c7952afb
BP
4760tc_add_policer(struct netdev *netdev,
4761 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4762{
4763 struct tc_police tc_police;
4764 struct ofpbuf request;
4765 struct tcmsg *tcmsg;
4766 size_t basic_offset;
4767 size_t police_offset;
4768 int error;
4769 int mtu = 65535;
4770
4771 memset(&tc_police, 0, sizeof tc_police);
4772 tc_police.action = TC_POLICE_SHOT;
4773 tc_police.mtu = mtu;
1aca400c 4774 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4775
79abacc8
MAA
4776 /* The following appears wrong in one way: In networking a kilobit is
4777 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4778 *
4779 * However, if you "fix" that problem then "tc filter show ..." shows
4780 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4781 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4782 * tc's point of view. Whatever. */
4783 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4784 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004
JP
4785
4786 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4787 NLM_F_EXCL | NLM_F_CREATE, &request);
4788 if (!tcmsg) {
4789 return ENODEV;
4790 }
4791 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4792 tcmsg->tcm_info = tc_make_handle(49,
4793 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4794
4795 nl_msg_put_string(&request, TCA_KIND, "basic");
4796 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4797 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4798 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4799 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4800 nl_msg_end_nested(&request, police_offset);
4801 nl_msg_end_nested(&request, basic_offset);
4802
4803 error = tc_transact(&request, NULL);
4804 if (error) {
4805 return error;
4806 }
4807
4808 return 0;
4809}
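/* Usage sketch (not part of the original source; the helper name is
 * hypothetical): combining the two helpers above to police ingress traffic at
 * 1,000 kbit/s with a 100 kbit burst, roughly mirroring
 *     tc qdisc add dev <dev> handle ffff: ingress
 *     tc filter add dev <dev> parent ffff: protocol all prio 49 \
 *         basic police rate 1000kbit burst 100k mtu 65535 drop */
static int
example_set_policing__(struct netdev *netdev)
{
    int error = tc_add_del_ingress_qdisc(netdev, true);
    if (!error) {
        error = tc_add_policer(netdev, 1000, 100);
    }
    return error;
}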
4810
c1c9c9c4
BP
4811static void
4812read_psched(void)
4813{
4814 /* The values in psched are not individually very meaningful, but they are
4815 * important. The tables below show some values seen in the wild.
4816 *
4817 * Some notes:
4818 *
4819 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4820 * (Before that, there are hints that it was 1000000000.)
4821 *
4822 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4823 * above.
4824 *
4825 * /proc/net/psched
4826 * -----------------------------------
4827 * [1] 000c8000 000f4240 000f4240 00000064
4828 * [2] 000003e8 00000400 000f4240 3b9aca00
4829 * [3] 000003e8 00000400 000f4240 3b9aca00
4830 * [4] 000003e8 00000400 000f4240 00000064
4831 * [5] 000003e8 00000040 000f4240 3b9aca00
4832 * [6] 000003e8 00000040 000f4240 000000f9
4833 *
4834 * a b c d ticks_per_s buffer_hz
4835 * ------- --------- ---------- ------------- ----------- -------------
4836 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4837 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4838 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4839 * [4] 1,000 1,024 1,000,000 100 976,562 100
4840 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4841 * [6] 1,000 64 1,000,000 249 15,625,000 249
4842 *
4843 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4844 * [2] 2.6.26-1-686-bigmem from Debian lenny
4845 * [3] 2.6.26-2-sparc64 from Debian lenny
4846 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4847 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4848 * [6] 2.6.34 from kernel.org on KVM
4849 */
23882115 4850 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4851 static const char fn[] = "/proc/net/psched";
4852 unsigned int a, b, c, d;
4853 FILE *stream;
4854
23882115
BP
4855 if (!ovsthread_once_start(&once)) {
4856 return;
4857 }
4858
c1c9c9c4
BP
4859 ticks_per_s = 1.0;
4860 buffer_hz = 100;
4861
4862 stream = fopen(fn, "r");
4863 if (!stream) {
10a89ef0 4864 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4865 goto exit;
c1c9c9c4
BP
4866 }
4867
4868 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4869 VLOG_WARN("%s: read failed", fn);
4870 fclose(stream);
23882115 4871 goto exit;
c1c9c9c4
BP
4872 }
4873 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4874 fclose(stream);
4875
4876 if (!a || !c) {
4877 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4878 goto exit;
c1c9c9c4
BP
4879 }
4880
4881 ticks_per_s = (double) a * c / b;
4882 if (c == 1000000) {
4883 buffer_hz = d;
4884 } else {
4885 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4886 fn, a, b, c, d);
4887 }
4888 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4889
4890exit:
4891 ovsthread_once_done(&once);
c1c9c9c4
BP
4892}
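/* Worked example (sketch, not part of the original source; the helper name is
 * hypothetical): the arithmetic above applied to row [2] of the table,
 * "000003e8 00000400 000f4240 3b9aca00". */
static void
example_read_psched_row2__(void)
{
    unsigned int a = 0x3e8;        /* 1,000 */
    unsigned int b = 0x400;        /* 1,024 */
    unsigned int c = 0xf4240;      /* 1,000,000 */
    unsigned int d = 0x3b9aca00;   /* 1,000,000,000 */

    double example_ticks_per_s = (double) a * c / b;            /* ~976,562 */
    unsigned int example_buffer_hz = (c == 1000000) ? d : 100;  /* 1e9 */

    ovs_assert(example_ticks_per_s > 976562 && example_ticks_per_s < 976563);
    ovs_assert(example_buffer_hz == 1000000000);
}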
4893
4894/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4895 * rate of 'rate' bytes per second. */
4896static unsigned int
4897tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4898{
23882115 4899 read_psched();
c1c9c9c4
BP
4900 return (rate * ticks) / ticks_per_s;
4901}
4902
4903/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4904 * rate of 'rate' bytes per second. */
4905static unsigned int
4906tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4907{
23882115 4908 read_psched();
015c93a4 4909 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4910}
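/* Worked example (sketch, not part of the original source; the helper name is
 * hypothetical): round-tripping a burst size through the two conversions
 * above.  With ticks_per_s ~= 976,562 (row [2] of the psched table) and a
 * 1 Mbit/s rate (125,000 bytes/s), a 1,564-byte burst is about 12,200 ticks,
 * and converting back recovers roughly the original 1,564 bytes, less a byte
 * or so of integer truncation. */
static unsigned int
example_burst_roundtrip__(void)
{
    unsigned int rate = 125000;   /* bytes/s */
    unsigned int burst = 1564;    /* bytes */
    unsigned int ticks = tc_bytes_to_ticks(rate, burst);

    return tc_ticks_to_bytes(rate, ticks);  /* ~1,564 bytes. */
}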
4911
4912/* Returns the number of bytes that need to be reserved for qdisc buffering at
4913 * a transmission rate of 'rate' bytes per second. */
4914static unsigned int
4915tc_buffer_per_jiffy(unsigned int rate)
4916{
23882115 4917 read_psched();
c1c9c9c4
BP
4918 return rate / buffer_hz;
4919}
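/* Worked example (sketch, not part of the original source; the helper name is
 * hypothetical): the two 'buffer_hz' regimes described above.  At 1 Gbit/s
 * (125,000,000 bytes/s), a kernel with buffer_hz == 100 needs
 * 125,000,000 / 100 = 1,250,000 bytes of qdisc buffering per jiffy, while a
 * kernel reporting buffer_hz == 1,000,000,000 gets 0, i.e. no extra buffer
 * is reserved. */
static unsigned int
example_buffer_per_jiffy__(void)
{
    return tc_buffer_per_jiffy(125000000);  /* 1,250,000 when buffer_hz==100 */
}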
4920
4921/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4922 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4923 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4924 * stores NULL into it if it is absent.
4925 *
4926 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4927 * 'msg'.
4928 *
4929 * Returns 0 if successful, otherwise a positive errno value. */
4930static int
4931tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4932 struct nlattr **options)
4933{
4934 static const struct nl_policy tca_policy[] = {
4935 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4936 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4937 };
4938 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4939
4940 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4941 tca_policy, ta, ARRAY_SIZE(ta))) {
4942 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4943 goto error;
4944 }
4945
4946 if (kind) {
4947 *kind = nl_attr_get_string(ta[TCA_KIND]);
4948 }
4949
4950 if (options) {
4951 *options = ta[TCA_OPTIONS];
4952 }
4953
4954 return 0;
4955
4956error:
4957 if (kind) {
4958 *kind = NULL;
4959 }
4960 if (options) {
4961 *options = NULL;
4962 }
4963 return EPROTO;
4964}
4965
4966/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4967 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4968 * into '*options', and its queue statistics into '*stats'. Any of the output
4969 * arguments may be null.
4970 *
4971 * Returns 0 if successful, otherwise a positive errno value. */
4972static int
4973tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4974 struct nlattr **options, struct netdev_queue_stats *stats)
4975{
4976 static const struct nl_policy tca_policy[] = {
4977 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4978 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4979 };
4980 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4981
4982 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4983 tca_policy, ta, ARRAY_SIZE(ta))) {
4984 VLOG_WARN_RL(&rl, "failed to parse class message");
4985 goto error;
4986 }
4987
4988 if (handlep) {
4989 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4990 *handlep = tc->tcm_handle;
4991 }
4992
4993 if (options) {
4994 *options = ta[TCA_OPTIONS];
4995 }
4996
4997 if (stats) {
4998 const struct gnet_stats_queue *gsq;
4999 struct gnet_stats_basic gsb;
5000
5001 static const struct nl_policy stats_policy[] = {
5002 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5003 .min_len = sizeof gsb },
5004 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5005 .min_len = sizeof *gsq },
5006 };
5007 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5008
5009 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5010 sa, ARRAY_SIZE(sa))) {
5011 VLOG_WARN_RL(&rl, "failed to parse class stats");
5012 goto error;
5013 }
5014
5015 /* Alignment issues screw up the length of struct gnet_stats_basic on
5016 * some arch/bitsize combinations. Newer versions of Linux have a
5017 * struct gnet_stats_basic_packed, but we can't depend on that. The
5018 * easiest thing to do is just to make a copy. */
5019 memset(&gsb, 0, sizeof gsb);
5020 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5021 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5022 stats->tx_bytes = gsb.bytes;
5023 stats->tx_packets = gsb.packets;
5024
5025 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5026 stats->tx_errors = gsq->drops;
5027 }
5028
5029 return 0;
5030
5031error:
5032 if (options) {
5033 *options = NULL;
5034 }
5035 if (stats) {
5036 memset(stats, 0, sizeof *stats);
5037 }
5038 return EPROTO;
5039}
5040
5041/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5042 * on 'netdev'. */
5043static int
5044tc_query_class(const struct netdev *netdev,
5045 unsigned int handle, unsigned int parent,
5046 struct ofpbuf **replyp)
5047{
5048 struct ofpbuf request;
5049 struct tcmsg *tcmsg;
5050 int error;
5051
5052 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
5053 if (!tcmsg) {
5054 return ENODEV;
5055 }
c1c9c9c4
BP
5056 tcmsg->tcm_handle = handle;
5057 tcmsg->tcm_parent = parent;
5058
5059 error = tc_transact(&request, replyp);
5060 if (error) {
5061 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5062 netdev_get_name(netdev),
5063 tc_get_major(handle), tc_get_minor(handle),
5064 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5065 ovs_strerror(error));
c1c9c9c4
BP
5066 }
5067 return error;
5068}
5069
5070/* Equivalent to "tc class del dev <name> handle <handle>". */
5071static int
5072tc_delete_class(const struct netdev *netdev, unsigned int handle)
5073{
5074 struct ofpbuf request;
5075 struct tcmsg *tcmsg;
5076 int error;
5077
5078 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5079 if (!tcmsg) {
5080 return ENODEV;
5081 }
c1c9c9c4
BP
5082 tcmsg->tcm_handle = handle;
5083 tcmsg->tcm_parent = 0;
5084
5085 error = tc_transact(&request, NULL);
5086 if (error) {
5087 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5088 netdev_get_name(netdev),
5089 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5090 ovs_strerror(error));
c1c9c9c4
BP
5091 }
5092 return error;
5093}
5094
5095/* Equivalent to "tc qdisc del dev <name> root". */
5096static int
b5d57fc8 5097tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5098{
b5d57fc8 5099 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5100 struct ofpbuf request;
5101 struct tcmsg *tcmsg;
5102 int error;
5103
b5d57fc8 5104 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5105 if (!tcmsg) {
5106 return ENODEV;
5107 }
c1c9c9c4
BP
5108 tcmsg->tcm_handle = tc_make_handle(1, 0);
5109 tcmsg->tcm_parent = TC_H_ROOT;
5110
5111 error = tc_transact(&request, NULL);
5112 if (error == EINVAL) {
5113 /* EINVAL probably means that the default qdisc was in use, in which
5114 * case we've accomplished our purpose. */
5115 error = 0;
5116 }
b5d57fc8
BP
5117 if (!error && netdev->tc) {
5118 if (netdev->tc->ops->tc_destroy) {
5119 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5120 }
b5d57fc8 5121 netdev->tc = NULL;
c1c9c9c4
BP
5122 }
5123 return error;
5124}
5125
ac3e3aaa
BP
5126static bool
5127getqdisc_is_safe(void)
5128{
5129 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5130 static bool safe = false;
5131
5132 if (ovsthread_once_start(&once)) {
5133 struct utsname utsname;
5134 int major, minor;
5135
5136 if (uname(&utsname) == -1) {
5137 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5138 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5139 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5140 } else if (major < 2 || (major == 2 && minor < 35)) {
5141 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5142 utsname.release);
5143 } else {
5144 safe = true;
5145 }
5146 ovsthread_once_done(&once);
5147 }
5148 return safe;
5149}
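
/* Example (illustrative): a kernel that reports "3.10.0-957.el7.x86_64"
 * scans as major 3, minor 10, so RTM_GETQDISC dumps are considered safe;
 * "2.6.32-431.el6.x86_64" scans as 2.6 and keeps the conservative handling
 * in tc_query_qdisc() below. */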
5150
5151/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5152 * kernel to determine what they are. Returns 0 if successful, otherwise a
5153 * positive errno value. */
5154static int
b5d57fc8 5155tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5156{
b5d57fc8 5157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5158 struct ofpbuf request, *qdisc;
5159 const struct tc_ops *ops;
5160 struct tcmsg *tcmsg;
5161 int load_error;
5162 int error;
5163
b5d57fc8 5164 if (netdev->tc) {
5165 return 0;
5166 }
5167
5168 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5169 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5170 * 2.6.35 without that fix backported to it.
5171 *
5172 * To avoid the OOPS, we must not make a request that would attempt to dump
5173 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5174 * few others. There are a few ways that I can see to do this, but most of
5175 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5176 * technique chosen here is to assume that any non-default qdisc that we
5177 * create will have a class with handle 1:0. The built-in qdiscs only have
5178 * a class with handle 0:0.
5179 *
5180 * On Linux 2.6.35+ we use the straightforward method because it allows us
5181 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5182 * in such a case we get no response at all from the kernel (!) if a
5183 * builtin qdisc is in use (which is later caught by "!error &&
5184 * !qdisc->size"). */
b5d57fc8 5185 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
5186 if (!tcmsg) {
5187 return ENODEV;
5188 }
5189 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5190 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
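    /* That is: on 2.6.35 and later this asks directly for the root qdisc
     * (handle 0:0, parent TC_H_ROOT); on older kernels it asks for the qdisc
     * with handle 1:0, which exists only if a non-default qdisc was
     * installed, avoiding the qdisc_notify() OOPS described above. */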
5191
5192 /* Figure out what tc class to instantiate. */
5193 error = tc_transact(&request, &qdisc);
ac3e3aaa 5194 if (!error && qdisc->size) {
5195 const char *kind;
5196
5197 error = tc_parse_qdisc(qdisc, &kind, NULL);
5198 if (error) {
5199 ops = &tc_ops_other;
5200 } else {
5201 ops = tc_lookup_linux_name(kind);
5202 if (!ops) {
5203 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5204 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
5205
5206 ops = &tc_ops_other;
5207 }
5208 }
5209 } else if ((!error && !qdisc->size) || error == ENOENT) {
5210 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5211 * set up by some other entity that doesn't have a handle 1:0. We will
5212 * assume that it's the system default qdisc. */
5213 ops = &tc_ops_default;
5214 error = 0;
5215 } else {
5216 /* Who knows? Maybe the device got deleted. */
5217 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5218 netdev_get_name(netdev_), ovs_strerror(error));
5219 ops = &tc_ops_other;
5220 }
5221
5222 /* Instantiate it. */
5223 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5224 ovs_assert((load_error == 0) == (netdev->tc != NULL));
5225 ofpbuf_delete(qdisc);
5226
5227 return error ? error : load_error;
5228}
5229
5230/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5231 approximate the time to transmit packets of various lengths. For an MTU of
5232 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5233 represents two possible packet lengths; for an MTU of 513 through 1024, four
5234 possible lengths; and so on.
5235
5236 Returns, for the specified 'mtu', the number of bits that packet lengths
5237 need to be shifted right to fit within such a 256-entry table. */
5238static int
5239tc_calc_cell_log(unsigned int mtu)
5240{
5241 int cell_log;
5242
5243 if (!mtu) {
5244 mtu = ETH_PAYLOAD_MAX;
5245 }
5246 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5247
5248 for (cell_log = 0; mtu >= 256; cell_log++) {
5249 mtu >>= 1;
5250 }
5251
5252 return cell_log;
5253}
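
/* Worked example (added for illustration): with mtu == 1500, the function
 * computes 1500 + ETH_HEADER_LEN + VLAN_HEADER_LEN = 1518 and shifts right
 * until the value drops below 256: 1518 -> 759 -> 379 -> 189.  The result is
 * a cell_log of 3, i.e. each rtab entry covers an 8-byte bucket and the
 * 256-entry table spans packet lengths up to 2048 bytes. */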
5254
5255/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5256 * of 'mtu'. */
5257static void
5258tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5259{
5260 memset(rate, 0, sizeof *rate);
5261 rate->cell_log = tc_calc_cell_log(mtu);
5262 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5263 /* rate->cell_align = 0; */ /* distro headers. */
5264 rate->mpu = ETH_TOTAL_MIN;
5265 rate->rate = Bps;
5266}
5267
5268/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5269 * attribute of the specified "type".
5270 *
5271 * See tc_calc_cell_log() above for a description of "rtab"s. */
5272static void
5273tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5274{
5275 uint32_t *rtab;
5276 unsigned int i;
5277
5278 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5279 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5280 unsigned packet_size = (i + 1) << rate->cell_log;
5281 if (packet_size < rate->mpu) {
5282 packet_size = rate->mpu;
5283 }
5284 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5285 }
5286}
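
/* Continuing the example above (illustrative): with a cell_log of 3, rtab[0]
 * holds the ticks needed to transmit a 64-byte frame (8 bytes rounded up to
 * the ETH_TOTAL_MIN mpu) and rtab[255] the ticks for a 2048-byte frame. */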
5287
5288/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5289 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5290 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
015c93a4 5291 * 0 is fine.) */
5292static int
5293tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
5294{
5295 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
5296 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
5297}
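
/* Rough figures (assumptions, not from the original source): on a kernel
 * with a 100 Hz clock, tc_buffer_per_jiffy(12500000) -- about 100 Mbps -- is
 * roughly 125000 bytes, so with an MTU of 1500 any requested burst smaller
 * than about 126500 bytes is rounded up to that minimum before conversion to
 * ticks. */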
d3980822 5298\f
5299/* Linux-only functions declared in netdev-linux.h */
5300
5301/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5302 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5303int
5304netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5305 const char *flag_name, bool enable)
5306{
5307 const char *netdev_name = netdev_get_name(netdev);
5308 struct ethtool_value evalue;
5309 uint32_t new_flags;
5310 int error;
5311
ab985a77 5312 COVERAGE_INC(netdev_get_ethtool);
5313 memset(&evalue, 0, sizeof evalue);
5314 error = netdev_linux_do_ethtool(netdev_name,
5315 (struct ethtool_cmd *)&evalue,
5316 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5317 if (error) {
5318 return error;
5319 }
5320
ab985a77 5321 COVERAGE_INC(netdev_set_ethtool);
5322 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5323 if (new_flags == evalue.data) {
5324 return 0;
5325 }
5326 evalue.data = new_flags;
5327 error = netdev_linux_do_ethtool(netdev_name,
5328 (struct ethtool_cmd *)&evalue,
5329 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5330 if (error) {
5331 return error;
5332 }
5333
ab985a77 5334 COVERAGE_INC(netdev_get_ethtool);
5335 memset(&evalue, 0, sizeof evalue);
5336 error = netdev_linux_do_ethtool(netdev_name,
5337 (struct ethtool_cmd *)&evalue,
5338 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5339 if (error) {
5340 return error;
5341 }
5342
5343 if (new_flags != evalue.data) {
5344 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5345 "device %s failed", enable ? "enable" : "disable",
5346 flag_name, netdev_name);
5347 return EOPNOTSUPP;
5348 }
5349
5350 return 0;
5351}
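
/* Typical use (illustrative, call site not shown here): other OVS code
 * disables large receive offload on a device with something along the lines
 * of netdev_linux_ethtool_set_flag(netdev, ETH_FLAG_LRO, "LRO", false),
 * checking the returned positive errno value for failure. */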
5352\f
5353/* Utility functions. */
5354
d3980822 5355/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5356static void
5357netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5358 const struct rtnl_link_stats *src)
5359{
5360 dst->rx_packets = src->rx_packets;
5361 dst->tx_packets = src->tx_packets;
5362 dst->rx_bytes = src->rx_bytes;
5363 dst->tx_bytes = src->tx_bytes;
5364 dst->rx_errors = src->rx_errors;
5365 dst->tx_errors = src->tx_errors;
5366 dst->rx_dropped = src->rx_dropped;
5367 dst->tx_dropped = src->tx_dropped;
5368 dst->multicast = src->multicast;
5369 dst->collisions = src->collisions;
5370 dst->rx_length_errors = src->rx_length_errors;
5371 dst->rx_over_errors = src->rx_over_errors;
5372 dst->rx_crc_errors = src->rx_crc_errors;
5373 dst->rx_frame_errors = src->rx_frame_errors;
5374 dst->rx_fifo_errors = src->rx_fifo_errors;
5375 dst->rx_missed_errors = src->rx_missed_errors;
5376 dst->tx_aborted_errors = src->tx_aborted_errors;
5377 dst->tx_carrier_errors = src->tx_carrier_errors;
5378 dst->tx_fifo_errors = src->tx_fifo_errors;
5379 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5380 dst->tx_window_errors = src->tx_window_errors;
5381}
5382
5383/* Copies 'src' into 'dst', performing format conversion in the process. */
5384static void
5385netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5386 const struct rtnl_link_stats64 *src)
5387{
5388 dst->rx_packets = src->rx_packets;
5389 dst->tx_packets = src->tx_packets;
5390 dst->rx_bytes = src->rx_bytes;
5391 dst->tx_bytes = src->tx_bytes;
5392 dst->rx_errors = src->rx_errors;
5393 dst->tx_errors = src->tx_errors;
5394 dst->rx_dropped = src->rx_dropped;
5395 dst->tx_dropped = src->tx_dropped;
5396 dst->multicast = src->multicast;
5397 dst->collisions = src->collisions;
5398 dst->rx_length_errors = src->rx_length_errors;
5399 dst->rx_over_errors = src->rx_over_errors;
5400 dst->rx_crc_errors = src->rx_crc_errors;
5401 dst->rx_frame_errors = src->rx_frame_errors;
5402 dst->rx_fifo_errors = src->rx_fifo_errors;
5403 dst->rx_missed_errors = src->rx_missed_errors;
5404 dst->tx_aborted_errors = src->tx_aborted_errors;
5405 dst->tx_carrier_errors = src->tx_carrier_errors;
5406 dst->tx_fifo_errors = src->tx_fifo_errors;
5407 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5408 dst->tx_window_errors = src->tx_window_errors;
5409}
5410
c1c9c9c4 5411static int
35eef899 5412get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5413{
5414 struct ofpbuf request;
5415 struct ofpbuf *reply;
5416 int error;
5417
d6e3feb5 5418 /* Initialize every counter to all-ones, which marks it as unsupported; only the counters present in the kernel's reply are filled in below. */
5419 memset(stats, 0xFF, sizeof(struct netdev_stats));
5420
c1c9c9c4 5421 ofpbuf_init(&request, 0);
5422 nl_msg_put_nlmsghdr(&request,
5423 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5424 RTM_GETLINK, NLM_F_REQUEST);
5425 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5426 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5427 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5428 ofpbuf_uninit(&request);
5429 if (error) {
5430 return error;
5431 }
5432
13a24df8 5433 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
5434 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5435 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5436 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
5437 error = 0;
5438 } else {
5439 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5440 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5441 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5442 error = 0;
5443 } else {
5444 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5445 error = EPROTO;
5446 }
5447 }
5448 } else {
5449 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5450 error = EPROTO;
c1c9c9c4 5451 }
8b61709d 5452
8b61709d 5453
576e26d7 5454 ofpbuf_delete(reply);
35eef899 5455 return error;
8b61709d 5456}
c1c9c9c4 5457
3a183124 5458static int
b5d57fc8 5459get_flags(const struct netdev *dev, unsigned int *flags)
5460{
5461 struct ifreq ifr;
5462 int error;
5463
755be9ea 5464 *flags = 0;
259e0b1a 5465 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
5466 if (!error) {
5467 *flags = ifr.ifr_flags;
5468 }
5469 return error;
5470}
5471
5472static int
4b609110 5473set_flags(const char *name, unsigned int flags)
5474{
5475 struct ifreq ifr;
5476
5477 ifr.ifr_flags = flags;
259e0b1a 5478 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
5479}
5480
5481static int
5482do_get_ifindex(const char *netdev_name)
5483{
5484 struct ifreq ifr;
259e0b1a 5485 int error;
8b61709d 5486
71d7c22f 5487 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5488 COVERAGE_INC(netdev_get_ifindex);
5489
5490 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5491 if (error) {
8b61709d 5492 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5493 netdev_name, ovs_strerror(error));
5494 return -error;
5495 }
5496 return ifr.ifr_ifindex;
5497}
5498
5499static int
5500get_ifindex(const struct netdev *netdev_, int *ifindexp)
5501{
b5d57fc8 5502 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5503
b5d57fc8 5504 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 5505 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5506
8b61709d 5507 if (ifindex < 0) {
5508 netdev->get_ifindex_error = -ifindex;
5509 netdev->ifindex = 0;
c7b1b0a5 5510 } else {
5511 netdev->get_ifindex_error = 0;
5512 netdev->ifindex = ifindex;
8b61709d 5513 }
b5d57fc8 5514 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5515 }
c7b1b0a5 5516
5517 *ifindexp = netdev->ifindex;
5518 return netdev->get_ifindex_error;
5519}
5520
5521static int
74ff3298 5522get_etheraddr(const char *netdev_name, struct eth_addr *ea)
5523{
5524 struct ifreq ifr;
5525 int hwaddr_family;
259e0b1a 5526 int error;
5527
5528 memset(&ifr, 0, sizeof ifr);
71d7c22f 5529 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5530 COVERAGE_INC(netdev_get_hwaddr);
5531 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5532 if (error) {
5533 /* ENODEV probably means that a vif disappeared asynchronously and
5534 * hasn't been removed from the database yet, so reduce the log level
5535 * to INFO for that case. */
259e0b1a 5536 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5537 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5538 netdev_name, ovs_strerror(error));
5539 return error;
5540 }
5541 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5542 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5543 hwaddr_family != ARPHRD_NONE) {
c9697f35 5544 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5545 netdev_name, hwaddr_family);
c9697f35 5546 return EINVAL;
5547 }
5548 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5549 return 0;
5550}
5551
5552static int
74ff3298 5553set_etheraddr(const char *netdev_name, const struct eth_addr mac)
5554{
5555 struct ifreq ifr;
259e0b1a 5556 int error;
5557
5558 memset(&ifr, 0, sizeof ifr);
71d7c22f 5559 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5560 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5561 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5562 COVERAGE_INC(netdev_set_hwaddr);
5563 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5564 if (error) {
8b61709d 5565 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5566 netdev_name, ovs_strerror(error));
8b61709d 5567 }
259e0b1a 5568 return error;
5569}
5570
5571static int
0b0544d7 5572netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
5573 int cmd, const char *cmd_name)
5574{
5575 struct ifreq ifr;
259e0b1a 5576 int error;
5577
5578 memset(&ifr, 0, sizeof ifr);
71d7c22f 5579 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
5580 ifr.ifr_data = (caddr_t) ecmd;
5581
5582 ecmd->cmd = cmd;
5583 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5584 if (error) {
5585 if (error != EOPNOTSUPP) {
8b61709d 5586 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5587 "failed: %s", cmd_name, name, ovs_strerror(error));
5588 } else {
5589 /* The device doesn't support this operation. That's pretty
5590 * common, so there's no point in logging anything. */
5591 }
8b61709d 5592 }
259e0b1a 5593 return error;
8b61709d 5594}
f1acd62b 5595
5596/* Returns an AF_PACKET raw socket or a negative errno value. */
5597static int
5598af_packet_sock(void)
5599{
5600 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5601 static int sock;
488d734d 5602
23882115 5603 if (ovsthread_once_start(&once)) {
5604 sock = socket(AF_PACKET, SOCK_RAW, 0);
5605 if (sock >= 0) {
5606 int error = set_nonblocking(sock);
5607 if (error) {
5608 close(sock);
5609 sock = -error;
5610 }
5611 } else {
5612 sock = -errno;
5613 VLOG_ERR("failed to create packet socket: %s",
5614 ovs_strerror(errno));
488d734d 5615 }
23882115 5616 ovsthread_once_done(&once);
5617 }
5618
5619 return sock;
5620}