]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
userspace: Add packet_type in dp_packet and flow
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
55bc98d6 43#include <net/if_packet.h>
8b61709d
BP
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
e14deea0 52#include "dp-packet.h"
93451a0a 53#include "dpif-netlink.h"
df1e5a3b 54#include "dpif-netdev.h"
3e8a2ad1 55#include "openvswitch/dynamic-string.h"
8b61709d 56#include "fatal-signal.h"
93b13be8 57#include "hash.h"
ee89ea7b 58#include "openvswitch/hmap.h"
8b61709d 59#include "netdev-provider.h"
7fbef77a 60#include "netdev-vport.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
c060c4cf 63#include "netlink.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d
BP
67#include "packets.h"
68#include "poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
ee89ea7b 70#include "openvswitch/shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
ee89ea7b 76#include "util.h"
5136ce49 77
d98e6007 78VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 79
d76f09ea
BP
80COVERAGE_DEFINE(netdev_set_policing);
81COVERAGE_DEFINE(netdev_arp_lookup);
82COVERAGE_DEFINE(netdev_get_ifindex);
83COVERAGE_DEFINE(netdev_get_hwaddr);
84COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
85COVERAGE_DEFINE(netdev_get_ethtool);
86COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 87
8b61709d
BP
88\f
89/* These were introduced in Linux 2.6.14, so they might be missing if we have
90 * old headers. */
91#ifndef ADVERTISED_Pause
92#define ADVERTISED_Pause (1 << 13)
93#endif
94#ifndef ADVERTISED_Asym_Pause
95#define ADVERTISED_Asym_Pause (1 << 14)
96#endif
97
e47bd51a
JP
98/* These were introduced in Linux 2.6.24, so they might be missing if we
99 * have old headers. */
100#ifndef ETHTOOL_GFLAGS
101#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102#endif
103#ifndef ETHTOOL_SFLAGS
104#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105#endif
106
c1c9c9c4
BP
107/* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 * headers. */
109#ifndef TC_RTAB_SIZE
110#define TC_RTAB_SIZE 1024
111#endif
112
b73c8518
SH
113/* Linux 2.6.21 introduced struct tpacket_auxdata.
114 * Linux 2.6.27 added the tp_vlan_tci member.
115 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
116 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
117 * TP_STATUS_VLAN_TPID_VALID.
118 *
119 * With all this churn it's easiest to unconditionally define a replacement
120 * structure that has everything we want.
121 */
55bc98d6
BP
122#ifndef PACKET_AUXDATA
123#define PACKET_AUXDATA 8
124#endif
b73c8518
SH
125#ifndef TP_STATUS_VLAN_VALID
126#define TP_STATUS_VLAN_VALID (1 << 4)
127#endif
128#ifndef TP_STATUS_VLAN_TPID_VALID
129#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130#endif
/* Unconditional replacement for the kernel's struct tpacket_auxdata, so that
 * the VLAN members are available regardless of the installed header
 * version. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* Member added in Linux 2.6.27. */
    uint16_t tp_vlan_tpid;      /* Formerly padding; repurposed in 3.13. */
};
142
0c615356
SH
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
 * unconditionally replace ethtool_cmd_speed.
 *
 * Returns the 32-bit link speed stored in 'ep', combining the low 16 bits in
 * 'speed' with the high 16 bits in 'speed_hi'. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* 'speed_hi' is a 16-bit field, so it would be promoted to (signed) int
     * before the shift.  Shifting a value with bit 15 set (e.g. 0xffff for an
     * unknown speed) left by 16 would overflow int, which is undefined
     * behavior, so widen to uint32_t first. */
    return (uint32_t) ep->speed | ((uint32_t) ep->speed_hi << 16);
}
154
67bed84c
SH
155/* Linux 2.6.30 introduced supported and advertised flags for
156 * 1G base KX, and 10G base KX4, KR and R. */
157#ifndef SUPPORTED_1000baseKX_Full
158#define SUPPORTED_1000baseKX_Full (1 << 17)
159#define SUPPORTED_10000baseKX4_Full (1 << 18)
160#define SUPPORTED_10000baseKR_Full (1 << 19)
161#define SUPPORTED_10000baseR_FEC (1 << 20)
162#define ADVERTISED_1000baseKX_Full (1 << 17)
163#define ADVERTISED_10000baseKX4_Full (1 << 18)
164#define ADVERTISED_10000baseKR_Full (1 << 19)
165#define ADVERTISED_10000baseR_FEC (1 << 20)
166#endif
167
168/* Linux 3.5 introduced supported and advertised flags for
169 * 40G base KR4, CR4, SR4 and LR4. */
170#ifndef SUPPORTED_40000baseKR4_Full
171#define SUPPORTED_40000baseKR4_Full (1 << 23)
172#define SUPPORTED_40000baseCR4_Full (1 << 24)
173#define SUPPORTED_40000baseSR4_Full (1 << 25)
174#define SUPPORTED_40000baseLR4_Full (1 << 26)
175#define ADVERTISED_40000baseKR4_Full (1 << 23)
176#define ADVERTISED_40000baseCR4_Full (1 << 24)
177#define ADVERTISED_40000baseSR4_Full (1 << 25)
178#define ADVERTISED_40000baseLR4_Full (1 << 26)
179#endif
180
fa373af4
BP
181/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 *
183 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
184 * 2.6.32-431.29.2.el6.x86_64 (see report at
185 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
186 * if_link.h is not self-contained on those kernels. It is easiest to
187 * unconditionally define a replacement. */
188#ifndef IFLA_STATS64
337c9b99 189#define IFLA_STATS64 23
fa373af4
BP
190#endif
/* Unconditional replacement for the kernel's struct rtnl_link_stats64: 23
 * consecutive 64-bit counters. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compressed-packet counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 220
/* Bits for 'cache_valid' in struct netdev_linux.  A cached member is only
 * meaningful while its corresponding bit is set. */
enum {
    VALID_IFINDEX           = 0x01,     /* 'ifindex'. */
    VALID_ETHERADDR         = 0x02,     /* 'etheraddr'. */
    VALID_IN                = 0x04,     /* IP addresses. */
    VALID_MTU               = 0x08,     /* 'mtu'. */
    VALID_POLICING          = 0x10,
    VALID_VPORT_STAT_ERROR  = 0x20,
    VALID_DRVINFO           = 0x40,     /* 'drvinfo'. */
    VALID_FEATURES          = 0x80,
};
c1c9c9c4
BP
231\f
232/* Traffic control. */
233
234/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
235 * network device.
236 *
237 * Each TC implementation subclasses this with whatever additional data it
238 * needs. */
c1c9c9c4
BP
239struct tc {
240 const struct tc_ops *ops;
93b13be8
BP
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
244};
c1c9c9c4 245
559eb230
BP
246#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
93b13be8
BP
248/* One traffic control queue.
249 *
250 * Each TC implementation subclasses this with whatever additional data it
251 * needs. */
252struct tc_queue {
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 255 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
256};
257
258/* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
260 *
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
264struct tc_ops {
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
269
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
272
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
276
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
282 *
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
286 *
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
289 *
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
79f1cbe9 292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
293
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
297 *
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
303 * 'netdev'.
304 *
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
311 * tc_destroy(tc).
312 *
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
316 *
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
319
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 *
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
325 *
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
329 *
330 * This function may be null if 'tc' is not configurable.
331 */
79f1cbe9 332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
333
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
336 *
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
340 *
341 * This function may be null if 'tc' is not configurable.
342 */
79f1cbe9 343 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 344
93b13be8
BP
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
347 *
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 *
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
355 *
356 * This function may be null if 'tc' does not have queues ('n_queues' is
357 * 0). */
93b13be8 358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 359 struct smap *details);
c1c9c9c4
BP
360
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
364 * 'n_queues'.
365 *
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 *
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 373 const struct smap *details);
c1c9c9c4 374
93b13be8
BP
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
377 *
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
93b13be8 380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 381
93b13be8
BP
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
384 *
385 * On success, initializes '*stats'.
386 *
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
93b13be8
BP
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
c1c9c9c4
BP
391 struct netdev_queue_stats *stats);
392
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 *
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
401};
402
403static void
404tc_init(struct tc *tc, const struct tc_ops *ops)
405{
406 tc->ops = ops;
93b13be8 407 hmap_init(&tc->queues);
c1c9c9c4
BP
408}
409
410static void
411tc_destroy(struct tc *tc)
412{
93b13be8 413 hmap_destroy(&tc->queues);
c1c9c9c4
BP
414}
415
416static const struct tc_ops tc_ops_htb;
a339aa81 417static const struct tc_ops tc_ops_hfsc;
677d9158
JV
418static const struct tc_ops tc_ops_codel;
419static const struct tc_ops tc_ops_fqcodel;
420static const struct tc_ops tc_ops_sfq;
c1c9c9c4 421static const struct tc_ops tc_ops_default;
6cf888b8 422static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
423static const struct tc_ops tc_ops_other;
424
559eb230 425static const struct tc_ops *const tcs[] = {
c1c9c9c4 426 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 427 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
428 &tc_ops_codel, /* Controlled delay */
429 &tc_ops_fqcodel, /* Fair queue controlled delay */
430 &tc_ops_sfq, /* Stochastic fair queueing */
6cf888b8 431 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
432 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other, /* Some other qdisc. */
434 NULL
435};
149f577a 436
c1c9c9c4
BP
437static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
438static unsigned int tc_get_major(unsigned int handle);
439static unsigned int tc_get_minor(unsigned int handle);
440
441static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
445static struct tcmsg *tc_make_request(const struct netdev *, int type,
446 unsigned int flags, struct ofpbuf *);
447static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004 448static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
c7952afb
BP
449static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
451
452static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462static int tc_del_qdisc(struct netdev *netdev);
463static int tc_query_qdisc(const struct netdev *netdev);
464
465static int tc_calc_cell_log(unsigned int mtu);
466static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470\f
b5d57fc8
BP
471struct netdev_linux {
472 struct netdev up;
149f577a 473
86383816
BP
474 /* Protects all members below. */
475 struct ovs_mutex mutex;
476
149f577a 477 unsigned int cache_valid;
8b61709d 478
1670c579
EJ
479 bool miimon; /* Link status of last poll. */
480 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer;
482
8722022c
BP
483 /* The following are figured out "on demand" only. They are only valid
484 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 485 int ifindex;
74ff3298 486 struct eth_addr etheraddr;
8b61709d 487 int mtu;
059e5f4f 488 unsigned int ifi_flags;
65c3058c 489 long long int carrier_resets;
80a86fbe
BP
490 uint32_t kbits_rate; /* Policing data. */
491 uint32_t kbits_burst;
bba1e6f3
PS
492 int vport_stats_error; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
90a6637d 494 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 495 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 496 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 497 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 498 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 499
a00ca915
EJ
500 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 503
4f925bd3 504 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 505 struct tc *tc;
149f577a 506
d0d08f8a
BP
507 /* For devices of class netdev_tap_class only. */
508 int tap_fd;
8b61709d
BP
509};
510
f7791740
PS
511struct netdev_rxq_linux {
512 struct netdev_rxq up;
796223f5 513 bool is_tap;
5b7448ed 514 int fd;
149f577a 515};
8b61709d 516
8b61709d
BP
517/* This is set pretty low because we probably won't learn anything from the
518 * additional log messages. */
519static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
520
19c8e9c1
JS
521/* Polling miimon status for all ports causes performance degradation when
522 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
523 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 *
525 * Readers do not depend on this variable synchronizing with the related
526 * changes in the device miimon status, so we can use atomic_count. */
527static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 528
1c33f0c3 529static void netdev_linux_run(const struct netdev_class *);
6f643e49 530
0b0544d7 531static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 532 int cmd, const char *cmd_name);
b5d57fc8 533static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 534static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
535static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
536 enum netdev_flags on, enum netdev_flags *old_flagsp)
537 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
538static int do_get_ifindex(const char *netdev_name);
539static int get_ifindex(const struct netdev *, int *ifindexp);
540static int do_set_addr(struct netdev *netdev,
541 int ioctl_nr, const char *ioctl_name,
542 struct in_addr addr);
74ff3298
JR
543static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
544static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 545static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 546static int af_packet_sock(void);
19c8e9c1 547static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
548static void netdev_linux_miimon_run(void);
549static void netdev_linux_miimon_wait(void);
df1e5a3b 550static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 551
15b3596a
JG
552static bool
553is_netdev_linux_class(const struct netdev_class *netdev_class)
554{
259e0b1a 555 return netdev_class->run == netdev_linux_run;
15b3596a
JG
556}
557
796223f5
BP
558static bool
559is_tap_netdev(const struct netdev *netdev)
560{
b5d57fc8 561 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
562}
563
8b61709d
BP
564static struct netdev_linux *
565netdev_linux_cast(const struct netdev *netdev)
566{
b5d57fc8 567 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 568
180c6d0b 569 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 570}
796223f5 571
f7791740
PS
572static struct netdev_rxq_linux *
573netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 574{
9dc63482 575 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 576 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 577}
ff4ed3c9 578\f
cee87338 579static void netdev_linux_update(struct netdev_linux *netdev,
7e9dcc0f 580 const struct rtnetlink_change *)
86383816 581 OVS_REQUIRES(netdev->mutex);
cee87338 582static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
583 unsigned int ifi_flags, unsigned int mask)
584 OVS_REQUIRES(netdev->mutex);
cee87338 585
d6384a3a
AW
586/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
587 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
588 * if no such socket could be created. */
589static struct nl_sock *
590netdev_linux_notify_sock(void)
591{
592 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
593 static struct nl_sock *sock;
989d7135
PS
594 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
595 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
596
597 if (ovsthread_once_start(&once)) {
598 int error;
599
600 error = nl_sock_create(NETLINK_ROUTE, &sock);
601 if (!error) {
d6384a3a
AW
602 size_t i;
603
604 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
605 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
606 if (error) {
607 nl_sock_destroy(sock);
608 sock = NULL;
609 break;
610 }
cee87338
BP
611 }
612 }
613 ovsthread_once_done(&once);
614 }
615
616 return sock;
617}
618
19c8e9c1
JS
619static bool
620netdev_linux_miimon_enabled(void)
621{
812c272c 622 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
623}
624
8b61709d 625static void
1c33f0c3 626netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 627{
cee87338
BP
628 struct nl_sock *sock;
629 int error;
630
19c8e9c1
JS
631 if (netdev_linux_miimon_enabled()) {
632 netdev_linux_miimon_run();
633 }
cee87338
BP
634
635 sock = netdev_linux_notify_sock();
636 if (!sock) {
637 return;
638 }
639
640 do {
641 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
642 uint64_t buf_stub[4096 / 8];
643 struct ofpbuf buf;
644
645 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
646 error = nl_sock_recv(sock, &buf, false);
647 if (!error) {
7e9dcc0f 648 struct rtnetlink_change change;
cee87338 649
7e9dcc0f 650 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
651 struct netdev *netdev_ = NULL;
652 char dev_name[IFNAMSIZ];
653
654 if (!change.ifname) {
655 change.ifname = if_indextoname(change.if_index, dev_name);
656 }
657
658 if (change.ifname) {
659 netdev_ = netdev_from_name(change.ifname);
660 }
cee87338
BP
661 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
662 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
663
664 ovs_mutex_lock(&netdev->mutex);
cee87338 665 netdev_linux_update(netdev, &change);
86383816 666 ovs_mutex_unlock(&netdev->mutex);
cee87338 667 }
38e0065b 668 netdev_close(netdev_);
cee87338
BP
669 }
670 } else if (error == ENOBUFS) {
671 struct shash device_shash;
672 struct shash_node *node;
673
674 nl_sock_drain(sock);
675
676 shash_init(&device_shash);
677 netdev_get_devices(&netdev_linux_class, &device_shash);
678 SHASH_FOR_EACH (node, &device_shash) {
679 struct netdev *netdev_ = node->data;
680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
681 unsigned int flags;
682
86383816 683 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
684 get_flags(netdev_, &flags);
685 netdev_linux_changed(netdev, flags, 0);
86383816
BP
686 ovs_mutex_unlock(&netdev->mutex);
687
cee87338
BP
688 netdev_close(netdev_);
689 }
690 shash_destroy(&device_shash);
691 } else if (error != EAGAIN) {
692 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
693 ovs_strerror(error));
694 }
695 ofpbuf_uninit(&buf);
696 } while (!error);
8b61709d
BP
697}
698
699static void
1c33f0c3 700netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 701{
cee87338
BP
702 struct nl_sock *sock;
703
19c8e9c1
JS
704 if (netdev_linux_miimon_enabled()) {
705 netdev_linux_miimon_wait();
706 }
cee87338
BP
707 sock = netdev_linux_notify_sock();
708 if (sock) {
709 nl_sock_wait(sock, POLLIN);
710 }
8b61709d
BP
711}
712
ac4d3bcb 713static void
b5d57fc8
BP
714netdev_linux_changed(struct netdev_linux *dev,
715 unsigned int ifi_flags, unsigned int mask)
86383816 716 OVS_REQUIRES(dev->mutex)
ac4d3bcb 717{
3e912ffc 718 netdev_change_seq_changed(&dev->up);
8aa77183
BP
719
720 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
721 dev->carrier_resets++;
722 }
723 dev->ifi_flags = ifi_flags;
724
4f925bd3 725 dev->cache_valid &= mask;
6b6e1329 726 if (!(mask & VALID_IN)) {
a8704b50
PS
727 netdev_get_addrs_list_flush();
728 }
4f925bd3
PS
729}
730
731static void
b5d57fc8 732netdev_linux_update(struct netdev_linux *dev,
7e9dcc0f 733 const struct rtnetlink_change *change)
86383816 734 OVS_REQUIRES(dev->mutex)
4f925bd3 735{
d6384a3a
AW
736 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
737 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 738 /* Keep drv-info, and ip addresses. */
d6384a3a 739 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 740 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
741
742 /* Update netdev from rtnl-change msg. */
743 if (change->mtu) {
744 dev->mtu = change->mtu;
745 dev->cache_valid |= VALID_MTU;
746 dev->netdev_mtu_error = 0;
747 }
90a6637d 748
74ff3298
JR
749 if (!eth_addr_is_zero(change->mac)) {
750 dev->etheraddr = change->mac;
d6384a3a
AW
751 dev->cache_valid |= VALID_ETHERADDR;
752 dev->ether_addr_error = 0;
753 }
44445cac 754
d6384a3a
AW
755 dev->ifindex = change->if_index;
756 dev->cache_valid |= VALID_IFINDEX;
757 dev->get_ifindex_error = 0;
758 } else {
759 netdev_linux_changed(dev, change->ifi_flags, 0);
760 }
761 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
762 /* Invalidates in4, in6. */
6b6e1329 763 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 764 } else {
d6384a3a 765 OVS_NOT_REACHED();
4f925bd3 766 }
ac4d3bcb
EJ
767}
768
9dc63482
BP
769static struct netdev *
770netdev_linux_alloc(void)
771{
772 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
773 return &netdev->up;
774}
775
48c6733c
WT
776static int
777netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 778{
48c6733c
WT
779 /* Prevent any attempt to create (or open) a network device named "default"
780 * or "all". These device names are effectively reserved on Linux because
781 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
782 * itself this wouldn't call for any special treatment, but in practice if
783 * a program tries to create devices with these names, it causes the kernel
784 * to fire a "new device" notification event even though creation failed,
785 * and in turn that causes OVS to wake up and try to create them again,
786 * which ends up as a 100% CPU loop. */
787 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
788 const char *name = netdev_->name;
789 if (!strcmp(name, "default") || !strcmp(name, "all")) {
790 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
791 VLOG_WARN_RL(&rl, "%s: Linux forbids network device with this name",
792 name);
793 return EINVAL;
794 }
795
834d6caf 796 ovs_mutex_init(&netdev->mutex);
48c6733c 797 return 0;
9dc63482
BP
798}
799
1f6e0fbd
BP
800/* Creates system and internal devices. */
801static int
9dc63482 802netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 803{
9dc63482 804 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
805 int error = netdev_linux_common_construct(netdev_);
806 if (error) {
807 return error;
808 }
1f6e0fbd 809
b5d57fc8
BP
810 error = get_flags(&netdev->up, &netdev->ifi_flags);
811 if (error == ENODEV) {
9dc63482 812 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 813 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
814 return ENODEV;
815 } else {
816 /* "Internal" netdevs have to be created as netdev objects before
817 * they exist in the kernel, because creating them in the kernel
818 * happens by passing a netdev object to dpif_port_add().
819 * Therefore, ignore the error. */
820 }
821 }
46415c90 822
a740f0de
JG
823 return 0;
824}
825
5b7448ed
JG
826/* For most types of netdevs we open the device for each call of
827 * netdev_open(). However, this is not the case with tap devices,
828 * since it is only possible to open the device once. In this
829 * situation we share a single file descriptor, and consequently
830 * buffers, across all readers. Therefore once data is read it will
831 * be unavailable to other reads for tap devices. */
a740f0de 832static int
9dc63482 833netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 834{
9dc63482 835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 836 static const char tap_dev[] = "/dev/net/tun";
9dc63482 837 const char *name = netdev_->name;
a740f0de 838 struct ifreq ifr;
a740f0de 839
48c6733c
WT
840 int error = netdev_linux_common_construct(netdev_);
841 if (error) {
842 return error;
843 }
1f6e0fbd 844
6c88d577 845 /* Open tap device. */
d0d08f8a
BP
846 netdev->tap_fd = open(tap_dev, O_RDWR);
847 if (netdev->tap_fd < 0) {
6c88d577 848 error = errno;
10a89ef0 849 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 850 return error;
6c88d577
JP
851 }
852
853 /* Create tap device. */
854 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 855 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 856 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 857 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 858 ovs_strerror(errno));
6c88d577 859 error = errno;
f61d8d29 860 goto error_close;
6c88d577
JP
861 }
862
863 /* Make non-blocking. */
d0d08f8a 864 error = set_nonblocking(netdev->tap_fd);
a740f0de 865 if (error) {
f61d8d29 866 goto error_close;
a740f0de
JG
867 }
868
869 return 0;
870
f61d8d29 871error_close:
d0d08f8a 872 close(netdev->tap_fd);
a740f0de
JG
873 return error;
874}
875
6c88d577 876static void
9dc63482 877netdev_linux_destruct(struct netdev *netdev_)
6c88d577 878{
b5d57fc8 879 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 880
b5d57fc8
BP
881 if (netdev->tc && netdev->tc->ops->tc_destroy) {
882 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
883 }
884
d0d08f8a
BP
885 if (netdev_get_class(netdev_) == &netdev_tap_class
886 && netdev->tap_fd >= 0)
887 {
888 close(netdev->tap_fd);
6c88d577 889 }
86383816 890
19c8e9c1 891 if (netdev->miimon_interval > 0) {
812c272c 892 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
893 }
894
86383816 895 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
896}
897
9dc63482
BP
/* Frees the 'struct netdev_linux' that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
904
f7791740
PS
905static struct netdev_rxq *
906netdev_linux_rxq_alloc(void)
9dc63482 907{
f7791740 908 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
909 return &rx->up;
910}
911
7b6b0ef4 912static int
f7791740 913netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 914{
f7791740 915 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 916 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 917 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 918 int error;
7b6b0ef4 919
86383816 920 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
921 rx->is_tap = is_tap_netdev(netdev_);
922 if (rx->is_tap) {
923 rx->fd = netdev->tap_fd;
796223f5
BP
924 } else {
925 struct sockaddr_ll sll;
b73c8518 926 int ifindex, val;
32383c3b 927 /* Result of tcpdump -dd inbound */
259e0b1a 928 static const struct sock_filter filt[] = {
32383c3b
MM
929 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
930 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
931 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
932 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
933 };
259e0b1a
BP
934 static const struct sock_fprog fprog = {
935 ARRAY_SIZE(filt), (struct sock_filter *) filt
936 };
7b6b0ef4 937
796223f5 938 /* Create file descriptor. */
9dc63482
BP
939 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
940 if (rx->fd < 0) {
796223f5 941 error = errno;
10a89ef0 942 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
943 goto error;
944 }
33d82a56 945
b73c8518
SH
946 val = 1;
947 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
948 error = errno;
949 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
950 netdev_get_name(netdev_), ovs_strerror(error));
951 goto error;
952 }
953
796223f5 954 /* Set non-blocking mode. */
9dc63482 955 error = set_nonblocking(rx->fd);
796223f5
BP
956 if (error) {
957 goto error;
958 }
7b6b0ef4 959
796223f5 960 /* Get ethernet device index. */
180c6d0b 961 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
962 if (error) {
963 goto error;
964 }
7b6b0ef4 965
796223f5
BP
966 /* Bind to specific ethernet device. */
967 memset(&sll, 0, sizeof sll);
968 sll.sll_family = AF_PACKET;
969 sll.sll_ifindex = ifindex;
b73c8518 970 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 971 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
972 error = errno;
973 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 974 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
975 goto error;
976 }
32383c3b
MM
977
978 /* Filter for only inbound packets. */
9dc63482 979 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
980 sizeof fprog);
981 if (error) {
982 error = errno;
259e0b1a 983 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 984 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
985 goto error;
986 }
7b6b0ef4 987 }
86383816 988 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 989
7b6b0ef4
BP
990 return 0;
991
992error:
9dc63482
BP
993 if (rx->fd >= 0) {
994 close(rx->fd);
7b6b0ef4 995 }
86383816 996 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
997 return error;
998}
999
796223f5 1000static void
f7791740 1001netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1002{
f7791740 1003 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1004
796223f5
BP
1005 if (!rx->is_tap) {
1006 close(rx->fd);
8b61709d 1007 }
9dc63482
BP
1008}
1009
/* Frees the 'struct netdev_rxq_linux' that contains 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1017
b73c8518 1018static ovs_be16
1ebdc7eb 1019auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1020{
1021 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1022 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1023 } else if (double_tagged) {
1024 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1025 } else {
1ebdc7eb 1026 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1027 }
1028}
1029
1030static bool
1031auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1032{
1033 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1034}
1035
796223f5 1036static int
cf62fa4c 1037netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1038{
b73c8518 1039 size_t size;
796223f5 1040 ssize_t retval;
b73c8518
SH
1041 struct iovec iov;
1042 struct cmsghdr *cmsg;
1043 union {
1044 struct cmsghdr cmsg;
1045 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1046 } cmsg_buffer;
1047 struct msghdr msgh;
1048
1049 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1050 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1051 size = dp_packet_tailroom(buffer);
b73c8518 1052
cf62fa4c 1053 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1054 iov.iov_len = size;
1055 msgh.msg_name = NULL;
1056 msgh.msg_namelen = 0;
1057 msgh.msg_iov = &iov;
1058 msgh.msg_iovlen = 1;
1059 msgh.msg_control = &cmsg_buffer;
1060 msgh.msg_controllen = sizeof cmsg_buffer;
1061 msgh.msg_flags = 0;
8e8cddf7 1062
796223f5 1063 do {
b73c8518 1064 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1065 } while (retval < 0 && errno == EINTR);
1066
bfd3367b 1067 if (retval < 0) {
b73c8518
SH
1068 return errno;
1069 } else if (retval > size) {
1070 return EMSGSIZE;
1071 }
1072
cf62fa4c 1073 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1074
1075 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1076 const struct tpacket_auxdata *aux;
1077
1078 if (cmsg->cmsg_level != SOL_PACKET
1079 || cmsg->cmsg_type != PACKET_AUXDATA
1080 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1081 continue;
8b61709d 1082 }
b73c8518
SH
1083
1084 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1085 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1086 struct eth_header *eth;
1087 bool double_tagged;
1088
b73c8518
SH
1089 if (retval < ETH_HEADER_LEN) {
1090 return EINVAL;
1091 }
1092
1ebdc7eb
EG
1093 eth = dp_packet_data(buffer);
1094 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1095
1096 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1097 htons(aux->tp_vlan_tci));
1098 break;
1099 }
1100 }
1101
1102 return 0;
1103}
1104
/* Reads one packet from the tap descriptor 'fd' into the tailroom of
 * 'buffer' and grows the packet size by the number of bytes read.
 *
 * Returns 0 on success, otherwise the errno reported by read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    /* Retry for as long as the call is interrupted by a signal. */
    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n_read < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1122
1123static int
64839cf4 1124netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
b73c8518 1125{
f7791740 1126 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1127 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1128 struct dp_packet *buffer;
df1e5a3b
PS
1129 ssize_t retval;
1130 int mtu;
1131
1132 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1133 mtu = ETH_PAYLOAD_MAX;
1134 }
1135
2482b0b0 1136 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1137 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1138 DP_NETDEV_HEADROOM);
b73c8518 1139 retval = (rx->is_tap
f7791740
PS
1140 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1141 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1142
1143 if (retval) {
1144 if (retval != EAGAIN && retval != EMSGSIZE) {
1145 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1146 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1147 }
cf62fa4c 1148 dp_packet_delete(buffer);
df1e5a3b 1149 } else {
72c84bc2 1150 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1151 }
1152
1153 return retval;
8b61709d
BP
1154}
1155
8b61709d 1156static void
f7791740 1157netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1158{
f7791740 1159 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1160 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1161}
1162
8b61709d 1163static int
f7791740 1164netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1165{
f7791740 1166 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1167 if (rx->is_tap) {
8b61709d 1168 struct ifreq ifr;
f7791740 1169 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1170 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1171 if (error) {
1172 return error;
1173 }
796223f5 1174 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1175 return 0;
1176 } else {
796223f5 1177 return drain_rcvbuf(rx->fd);
8b61709d
BP
1178 }
1179}
1180
1181/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1182 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1183 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1184 * the packet is too big or too small to transmit on the device.
1185 *
1186 * The caller retains ownership of 'buffer' in all cases.
1187 *
1188 * The kernel maintains a packet transmission queue, so the caller is not
1189 * expected to do additional queuing of packets. */
1190static int
f00fa8cb 1191netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
324c8374
IM
1192 struct dp_packet_batch *batch, bool may_steal,
1193 bool concurrent_txq OVS_UNUSED)
8b61709d 1194{
f4fd623c
DDP
1195 int i;
1196 int error = 0;
40d26f04 1197
f4fd623c 1198 /* 'i' is incremented only if there's no error */
64839cf4
WT
1199 for (i = 0; i < batch->count;) {
1200 const void *data = dp_packet_data(batch->packets[i]);
1201 size_t size = dp_packet_size(batch->packets[i]);
f23347ea 1202 ssize_t retval;
8b61709d 1203
aaca4fe0 1204 /* Truncate the packet if it is configured. */
64839cf4 1205 size -= dp_packet_get_cutlen(batch->packets[i]);
aaca4fe0 1206
796223f5 1207 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
1208 /* Use our AF_PACKET socket to send to this device. */
1209 struct sockaddr_ll sll;
1210 struct msghdr msg;
1211 struct iovec iov;
1212 int ifindex;
488d734d
BP
1213 int sock;
1214
1215 sock = af_packet_sock();
1216 if (sock < 0) {
c4c7a3d7 1217 return -sock;
488d734d 1218 }
f23347ea 1219
86383816
BP
1220 ifindex = netdev_get_ifindex(netdev_);
1221 if (ifindex < 0) {
1222 return -ifindex;
f23347ea 1223 }
8b61709d 1224
f23347ea
BP
1225 /* We don't bother setting most fields in sockaddr_ll because the
1226 * kernel ignores them for SOCK_RAW. */
1227 memset(&sll, 0, sizeof sll);
1228 sll.sll_family = AF_PACKET;
1229 sll.sll_ifindex = ifindex;
76c308b5 1230
ebc56baa 1231 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1232 iov.iov_len = size;
76c308b5 1233
f23347ea
BP
1234 msg.msg_name = &sll;
1235 msg.msg_namelen = sizeof sll;
1236 msg.msg_iov = &iov;
1237 msg.msg_iovlen = 1;
1238 msg.msg_control = NULL;
1239 msg.msg_controllen = 0;
1240 msg.msg_flags = 0;
1241
488d734d 1242 retval = sendmsg(sock, &msg, 0);
f23347ea 1243 } else {
796223f5
BP
1244 /* Use the tap fd to send to this device. This is essential for
1245 * tap devices, because packets sent to a tap device with an
1246 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1247 * tap device. This doesn't occur on other interface types
1248 * because we attach a socket filter to the rx socket. */
b5d57fc8 1249 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1250
d0d08f8a 1251 retval = write(netdev->tap_fd, data, size);
f23347ea 1252 }
76c308b5 1253
8b61709d 1254 if (retval < 0) {
29736cc0
DDP
1255 if (errno == EINTR) {
1256 /* The send was interrupted by a signal. Retry the packet by
1257 * continuing without incrementing 'i'.*/
8b61709d 1258 continue;
29736cc0
DDP
1259 } else if (errno == EIO && is_tap_netdev(netdev_)) {
1260 /* The Linux tap driver returns EIO if the device is not up.
1261 * From the OVS side this is not an error, so ignore it. */
1262 } else {
1263 /* The Linux AF_PACKET implementation never blocks waiting for
1264 * room for packets, instead returning ENOBUFS. Translate this
1265 * into EAGAIN for the caller. */
1266 error = errno == ENOBUFS ? EAGAIN : errno;
1267 break;
8b61709d 1268 }
8b61709d 1269 } else if (retval != size) {
f4fd623c
DDP
1270 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1271 " of %"PRIuSIZE") on %s", retval, size,
1272 netdev_get_name(netdev_));
1273 error = EMSGSIZE;
1274 break;
1275 }
1276
1277 /* Process the next packet in the batch */
1278 i++;
1279 }
1280
64839cf4 1281 dp_packet_delete_batch(batch, may_steal);
f4fd623c
DDP
1282
1283 if (error && error != EAGAIN) {
1284 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1285 netdev_get_name(netdev_), ovs_strerror(error));
1286 }
1287
1288 return error;
1289
8b61709d
BP
1290}
1291
1292/* Registers with the poll loop to wake up from the next call to poll_block()
1293 * when the packet transmission queue has sufficient room to transmit a packet
1294 * with netdev_send().
1295 *
1296 * The kernel maintains a packet transmission queue, so the client is not
1297 * expected to do additional queuing of packets. Thus, this function is
1298 * unlikely to ever be used. It is included for completeness. */
1299static void
f00fa8cb 1300netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1301{
796223f5 1302 if (is_tap_netdev(netdev)) {
8b61709d
BP
1303 /* TAP device always accepts packets.*/
1304 poll_immediate_wake();
1305 }
1306}
1307
1308/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1309 * otherwise a positive errno value. */
1310static int
74ff3298 1311netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1312{
b5d57fc8 1313 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1314 enum netdev_flags old_flags = 0;
eb395f2e
BP
1315 int error;
1316
86383816
BP
1317 ovs_mutex_lock(&netdev->mutex);
1318
b5d57fc8 1319 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1320 error = netdev->ether_addr_error;
1321 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1322 goto exit;
44445cac 1323 }
b5d57fc8 1324 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1325 }
1326
7eb1bd81 1327 /* Tap devices must be brought down before setting the address. */
796223f5 1328 if (is_tap_netdev(netdev_)) {
4f9f3f21 1329 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1330 }
44445cac
PS
1331 error = set_etheraddr(netdev_get_name(netdev_), mac);
1332 if (!error || error == ENODEV) {
b5d57fc8
BP
1333 netdev->ether_addr_error = error;
1334 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1335 if (!error) {
74ff3298 1336 netdev->etheraddr = mac;
eb395f2e 1337 }
8b61709d 1338 }
44445cac 1339
4f9f3f21
BP
1340 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1341 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1342 }
7eb1bd81 1343
86383816
BP
1344exit:
1345 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1346 return error;
1347}
1348
44445cac 1349/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1350static int
74ff3298 1351netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1352{
b5d57fc8 1353 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1354 int error;
44445cac 1355
86383816 1356 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1357 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816 1358 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1359 &netdev->etheraddr);
b5d57fc8 1360 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1361 }
44445cac 1362
86383816
BP
1363 error = netdev->ether_addr_error;
1364 if (!error) {
74ff3298 1365 *mac = netdev->etheraddr;
44445cac 1366 }
86383816 1367 ovs_mutex_unlock(&netdev->mutex);
44445cac 1368
86383816 1369 return error;
8b61709d
BP
1370}
1371
8b61709d 1372static int
73371c09 1373netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1374{
86383816
BP
1375 int error;
1376
b5d57fc8 1377 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1378 struct ifreq ifr;
90a6637d 1379
86383816 1380 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1381 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1382 netdev->mtu = ifr.ifr_mtu;
1383 netdev->cache_valid |= VALID_MTU;
8b61709d 1384 }
90a6637d 1385
86383816
BP
1386 error = netdev->netdev_mtu_error;
1387 if (!error) {
b5d57fc8 1388 *mtup = netdev->mtu;
90a6637d 1389 }
73371c09
BP
1390
1391 return error;
1392}
1393
1394/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1395 * in bytes, not including the hardware header; thus, this is typically 1500
1396 * bytes for Ethernet devices. */
1397static int
1398netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1399{
1400 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1401 int error;
1402
1403 ovs_mutex_lock(&netdev->mutex);
1404 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1405 ovs_mutex_unlock(&netdev->mutex);
1406
1407 return error;
8b61709d
BP
1408}
1409
9b020780
PS
1410/* Sets the maximum size of transmitted (MTU) for given device using linux
1411 * networking ioctl interface.
1412 */
1413static int
4124cb12 1414netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1415{
b5d57fc8 1416 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1417 struct ifreq ifr;
1418 int error;
1419
86383816 1420 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1421 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1422 error = netdev->netdev_mtu_error;
1423 if (error || netdev->mtu == mtu) {
1424 goto exit;
90a6637d 1425 }
b5d57fc8 1426 netdev->cache_valid &= ~VALID_MTU;
153e5481 1427 }
9b020780 1428 ifr.ifr_mtu = mtu;
259e0b1a
BP
1429 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1430 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1431 if (!error || error == ENODEV) {
b5d57fc8
BP
1432 netdev->netdev_mtu_error = error;
1433 netdev->mtu = ifr.ifr_mtu;
1434 netdev->cache_valid |= VALID_MTU;
9b020780 1435 }
86383816
BP
1436exit:
1437 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1438 return error;
9b020780
PS
1439}
1440
9ab3d9a3
BP
1441/* Returns the ifindex of 'netdev', if successful, as a positive number.
1442 * On failure, returns a negative errno value. */
1443static int
86383816 1444netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1445{
86383816 1446 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1447 int ifindex, error;
1448
86383816
BP
1449 ovs_mutex_lock(&netdev->mutex);
1450 error = get_ifindex(netdev_, &ifindex);
1451 ovs_mutex_unlock(&netdev->mutex);
1452
9ab3d9a3
BP
1453 return error ? -error : ifindex;
1454}
1455
8b61709d
BP
1456static int
1457netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1458{
b5d57fc8 1459 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1460
86383816 1461 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1462 if (netdev->miimon_interval > 0) {
1463 *carrier = netdev->miimon;
3a183124 1464 } else {
b5d57fc8 1465 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1466 }
86383816 1467 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1468
3a183124 1469 return 0;
8b61709d
BP
1470}
1471
65c3058c 1472static long long int
86383816 1473netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1474{
86383816
BP
1475 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1476 long long int carrier_resets;
1477
1478 ovs_mutex_lock(&netdev->mutex);
1479 carrier_resets = netdev->carrier_resets;
1480 ovs_mutex_unlock(&netdev->mutex);
1481
1482 return carrier_resets;
65c3058c
EJ
1483}
1484
63331829 1485static int
1670c579
EJ
1486netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1487 struct mii_ioctl_data *data)
63331829 1488{
63331829 1489 struct ifreq ifr;
782e6111 1490 int error;
63331829 1491
63331829 1492 memset(&ifr, 0, sizeof ifr);
782e6111 1493 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1494 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1495 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1496
782e6111
EJ
1497 return error;
1498}
1499
1500static int
1670c579 1501netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1502{
782e6111
EJ
1503 struct mii_ioctl_data data;
1504 int error;
63331829 1505
782e6111
EJ
1506 *miimon = false;
1507
1508 memset(&data, 0, sizeof data);
1670c579 1509 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1510 if (!error) {
1511 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1512 data.reg_num = MII_BMSR;
1670c579 1513 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1514 &data);
63331829
EJ
1515
1516 if (!error) {
782e6111 1517 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1518 }
9120cfc0
DH
1519 }
1520 if (error) {
63331829 1521 struct ethtool_cmd ecmd;
63331829
EJ
1522
1523 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1524 name);
1525
ab985a77 1526 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1527 memset(&ecmd, 0, sizeof ecmd);
1528 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1529 "ETHTOOL_GLINK");
1530 if (!error) {
782e6111
EJ
1531 struct ethtool_value eval;
1532
1533 memcpy(&eval, &ecmd, sizeof eval);
1534 *miimon = !!eval.data;
63331829
EJ
1535 } else {
1536 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1537 }
1538 }
1539
1540 return error;
1541}
1542
1670c579
EJ
1543static int
1544netdev_linux_set_miimon_interval(struct netdev *netdev_,
1545 long long int interval)
1546{
b5d57fc8 1547 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1548
86383816 1549 ovs_mutex_lock(&netdev->mutex);
1670c579 1550 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1551 if (netdev->miimon_interval != interval) {
19c8e9c1 1552 if (interval && !netdev->miimon_interval) {
812c272c 1553 atomic_count_inc(&miimon_cnt);
19c8e9c1 1554 } else if (!interval && netdev->miimon_interval) {
812c272c 1555 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1556 }
1557
b5d57fc8
BP
1558 netdev->miimon_interval = interval;
1559 timer_set_expired(&netdev->miimon_timer);
1670c579 1560 }
86383816 1561 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1562
1563 return 0;
1564}
1565
1566static void
1567netdev_linux_miimon_run(void)
1568{
1569 struct shash device_shash;
1570 struct shash_node *node;
1571
1572 shash_init(&device_shash);
b5d57fc8 1573 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1574 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1575 struct netdev *netdev = node->data;
1576 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1577 bool miimon;
1578
86383816
BP
1579 ovs_mutex_lock(&dev->mutex);
1580 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1581 netdev_linux_get_miimon(dev->up.name, &miimon);
1582 if (miimon != dev->miimon) {
1583 dev->miimon = miimon;
1584 netdev_linux_changed(dev, dev->ifi_flags, 0);
1585 }
1670c579 1586
86383816 1587 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1588 }
86383816 1589 ovs_mutex_unlock(&dev->mutex);
2f980d74 1590 netdev_close(netdev);
1670c579
EJ
1591 }
1592
1593 shash_destroy(&device_shash);
1594}
1595
1596static void
1597netdev_linux_miimon_wait(void)
1598{
1599 struct shash device_shash;
1600 struct shash_node *node;
1601
1602 shash_init(&device_shash);
b5d57fc8 1603 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1604 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1605 struct netdev *netdev = node->data;
1606 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1607
86383816 1608 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1609 if (dev->miimon_interval > 0) {
1610 timer_wait(&dev->miimon_timer);
1611 }
86383816 1612 ovs_mutex_unlock(&dev->mutex);
2f980d74 1613 netdev_close(netdev);
1670c579
EJ
1614 }
1615 shash_destroy(&device_shash);
1616}
1617
92df599c
JG
/* Exchanges the values stored at 'a' and 'b'.  Passing the same pointer for
 * both leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t saved_b = *b;

    *b = *a;
    *a = saved_b;
}
1625
c060c4cf
EJ
1626/* Copies 'src' into 'dst', performing format conversion in the process.
1627 *
1628 * 'src' is allowed to be misaligned. */
1629static void
1630netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1631 const struct ovs_vport_stats *src)
1632{
6a54dedc
BP
1633 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1634 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1635 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1636 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1637 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1638 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1639 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1640 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1641 dst->multicast = 0;
1642 dst->collisions = 0;
1643 dst->rx_length_errors = 0;
1644 dst->rx_over_errors = 0;
1645 dst->rx_crc_errors = 0;
1646 dst->rx_frame_errors = 0;
1647 dst->rx_fifo_errors = 0;
1648 dst->rx_missed_errors = 0;
1649 dst->tx_aborted_errors = 0;
1650 dst->tx_carrier_errors = 0;
1651 dst->tx_fifo_errors = 0;
1652 dst->tx_heartbeat_errors = 0;
1653 dst->tx_window_errors = 0;
1654}
1655
1656static int
1657get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1658{
93451a0a 1659 struct dpif_netlink_vport reply;
c060c4cf
EJ
1660 struct ofpbuf *buf;
1661 int error;
1662
93451a0a 1663 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1664 if (error) {
1665 return error;
1666 } else if (!reply.stats) {
1667 ofpbuf_delete(buf);
1668 return EOPNOTSUPP;
1669 }
1670
1671 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1672
1673 ofpbuf_delete(buf);
1674
1675 return 0;
1676}
1677
f613a0d7
PS
1678static void
1679get_stats_via_vport(const struct netdev *netdev_,
1680 struct netdev_stats *stats)
8b61709d 1681{
b5d57fc8 1682 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1683
b5d57fc8
BP
1684 if (!netdev->vport_stats_error ||
1685 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1686 int error;
7fbef77a 1687
c060c4cf 1688 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1689 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1690 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1691 "(%s)",
1692 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1693 }
b5d57fc8
BP
1694 netdev->vport_stats_error = error;
1695 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1696 }
f613a0d7 1697}
8b61709d 1698
f613a0d7
PS
1699/* Retrieves current device stats for 'netdev-linux'. */
1700static int
1701netdev_linux_get_stats(const struct netdev *netdev_,
1702 struct netdev_stats *stats)
1703{
b5d57fc8 1704 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1705 struct netdev_stats dev_stats;
1706 int error;
1707
86383816 1708 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1709 get_stats_via_vport(netdev_, stats);
35eef899 1710 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1711 if (error) {
86383816
BP
1712 if (!netdev->vport_stats_error) {
1713 error = 0;
f613a0d7 1714 }
86383816 1715 } else if (netdev->vport_stats_error) {
04c881eb 1716 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1717 *stats = dev_stats;
1718 } else {
04c881eb
AZ
1719 /* Use kernel netdev's packet and byte counts since vport's counters
1720 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1721 * enabled. */
1722 stats->rx_packets = dev_stats.rx_packets;
1723 stats->rx_bytes = dev_stats.rx_bytes;
1724 stats->tx_packets = dev_stats.tx_packets;
1725 stats->tx_bytes = dev_stats.tx_bytes;
1726
f613a0d7
PS
1727 stats->rx_errors += dev_stats.rx_errors;
1728 stats->tx_errors += dev_stats.tx_errors;
1729 stats->rx_dropped += dev_stats.rx_dropped;
1730 stats->tx_dropped += dev_stats.tx_dropped;
1731 stats->multicast += dev_stats.multicast;
1732 stats->collisions += dev_stats.collisions;
1733 stats->rx_length_errors += dev_stats.rx_length_errors;
1734 stats->rx_over_errors += dev_stats.rx_over_errors;
1735 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1736 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1737 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1738 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1739 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1740 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1741 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1742 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1743 stats->tx_window_errors += dev_stats.tx_window_errors;
1744 }
86383816
BP
1745 ovs_mutex_unlock(&netdev->mutex);
1746
1747 return error;
f613a0d7
PS
1748}
1749
1750/* Retrieves current device stats for 'netdev-tap' netdev or
1751 * netdev-internal. */
1752static int
15aee116 1753netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1754{
b5d57fc8 1755 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1756 struct netdev_stats dev_stats;
1757 int error;
1758
86383816 1759 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1760 get_stats_via_vport(netdev_, stats);
35eef899 1761 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1762 if (error) {
86383816
BP
1763 if (!netdev->vport_stats_error) {
1764 error = 0;
8b61709d 1765 }
86383816
BP
1766 } else if (netdev->vport_stats_error) {
1767 /* Transmit and receive stats will appear to be swapped relative to the
1768 * other ports since we are the one sending the data, not a remote
1769 * computer. For consistency, we swap them back here. This does not
1770 * apply if we are getting stats from the vport layer because it always
1771 * tracks stats from the perspective of the switch. */
fe6b0e03 1772
f613a0d7 1773 *stats = dev_stats;
92df599c
JG
1774 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1775 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1776 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1777 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1778 stats->rx_length_errors = 0;
1779 stats->rx_over_errors = 0;
1780 stats->rx_crc_errors = 0;
1781 stats->rx_frame_errors = 0;
1782 stats->rx_fifo_errors = 0;
1783 stats->rx_missed_errors = 0;
1784 stats->tx_aborted_errors = 0;
1785 stats->tx_carrier_errors = 0;
1786 stats->tx_fifo_errors = 0;
1787 stats->tx_heartbeat_errors = 0;
1788 stats->tx_window_errors = 0;
f613a0d7 1789 } else {
04c881eb
AZ
1790 /* Use kernel netdev's packet and byte counts since vport counters
1791 * do not reflect packet counts on the wire when GSO, TSO or GRO
1792 * are enabled. */
1793 stats->rx_packets = dev_stats.tx_packets;
1794 stats->rx_bytes = dev_stats.tx_bytes;
1795 stats->tx_packets = dev_stats.rx_packets;
1796 stats->tx_bytes = dev_stats.rx_bytes;
1797
f613a0d7
PS
1798 stats->rx_dropped += dev_stats.tx_dropped;
1799 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1800
f613a0d7
PS
1801 stats->rx_errors += dev_stats.tx_errors;
1802 stats->tx_errors += dev_stats.rx_errors;
1803
1804 stats->multicast += dev_stats.multicast;
1805 stats->collisions += dev_stats.collisions;
1806 }
86383816
BP
1807 ovs_mutex_unlock(&netdev->mutex);
1808
1809 return error;
8b61709d
BP
1810}
1811
bba1e6f3
PS
1812static int
1813netdev_internal_get_stats(const struct netdev *netdev_,
1814 struct netdev_stats *stats)
1815{
b5d57fc8 1816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1817 int error;
bba1e6f3 1818
86383816 1819 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1820 get_stats_via_vport(netdev_, stats);
86383816
BP
1821 error = netdev->vport_stats_error;
1822 ovs_mutex_unlock(&netdev->mutex);
1823
1824 return error;
bba1e6f3
PS
1825}
1826
51f87458 1827static void
b5d57fc8 1828netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1829{
1830 struct ethtool_cmd ecmd;
6c038611 1831 uint32_t speed;
8b61709d
BP
1832 int error;
1833
b5d57fc8 1834 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1835 return;
1836 }
1837
ab985a77 1838 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1839 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1840 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1841 ETHTOOL_GSET, "ETHTOOL_GSET");
1842 if (error) {
51f87458 1843 goto out;
8b61709d
BP
1844 }
1845
1846 /* Supported features. */
b5d57fc8 1847 netdev->supported = 0;
8b61709d 1848 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1849 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1850 }
1851 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1852 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1853 }
1854 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1855 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1856 }
1857 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1858 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1859 }
1860 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1861 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 1862 }
67bed84c
SH
1863 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1864 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 1865 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 1866 }
67bed84c
SH
1867 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1868 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1869 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1870 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 1871 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 1872 }
67bed84c
SH
1873 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1874 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1875 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1876 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1877 netdev->supported |= NETDEV_F_40GB_FD;
1878 }
8b61709d 1879 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1880 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1881 }
1882 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1883 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1884 }
1885 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1886 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1887 }
1888 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1889 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1890 }
1891 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1892 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1893 }
1894
1895 /* Advertised features. */
b5d57fc8 1896 netdev->advertised = 0;
8b61709d 1897 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1898 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1899 }
1900 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1901 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1902 }
1903 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1904 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1905 }
1906 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1907 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1908 }
1909 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1910 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 1911 }
67bed84c
SH
1912 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1913 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 1914 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 1915 }
67bed84c
SH
1916 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1917 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1918 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1919 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 1920 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 1921 }
67bed84c
SH
1922 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1923 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1924 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1925 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1926 netdev->advertised |= NETDEV_F_40GB_FD;
1927 }
8b61709d 1928 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1929 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1930 }
1931 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1932 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1933 }
1934 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1935 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1936 }
1937 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1938 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1939 }
1940 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1941 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1942 }
1943
1944 /* Current settings. */
0c615356 1945 speed = ethtool_cmd_speed(&ecmd);
6c038611 1946 if (speed == SPEED_10) {
b5d57fc8 1947 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1948 } else if (speed == SPEED_100) {
b5d57fc8 1949 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1950 } else if (speed == SPEED_1000) {
b5d57fc8 1951 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1952 } else if (speed == SPEED_10000) {
b5d57fc8 1953 netdev->current = NETDEV_F_10GB_FD;
6c038611 1954 } else if (speed == 40000) {
b5d57fc8 1955 netdev->current = NETDEV_F_40GB_FD;
6c038611 1956 } else if (speed == 100000) {
b5d57fc8 1957 netdev->current = NETDEV_F_100GB_FD;
6c038611 1958 } else if (speed == 1000000) {
b5d57fc8 1959 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1960 } else {
b5d57fc8 1961 netdev->current = 0;
8b61709d
BP
1962 }
1963
1964 if (ecmd.port == PORT_TP) {
b5d57fc8 1965 netdev->current |= NETDEV_F_COPPER;
8b61709d 1966 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1967 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1968 }
1969
1970 if (ecmd.autoneg) {
b5d57fc8 1971 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1972 }
1973
51f87458 1974out:
b5d57fc8
BP
1975 netdev->cache_valid |= VALID_FEATURES;
1976 netdev->get_features_error = error;
51f87458
PS
1977}
1978
887ed8b2
BP
1979/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1980 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1981 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1982static int
1983netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1984 enum netdev_features *current,
1985 enum netdev_features *advertised,
1986 enum netdev_features *supported,
1987 enum netdev_features *peer)
51f87458 1988{
b5d57fc8 1989 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1990 int error;
51f87458 1991
86383816 1992 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1993 netdev_linux_read_features(netdev);
b5d57fc8
BP
1994 if (!netdev->get_features_error) {
1995 *current = netdev->current;
1996 *advertised = netdev->advertised;
1997 *supported = netdev->supported;
887ed8b2 1998 *peer = 0; /* XXX */
51f87458 1999 }
86383816
BP
2000 error = netdev->get_features_error;
2001 ovs_mutex_unlock(&netdev->mutex);
2002
2003 return error;
8b61709d
BP
2004}
2005
2006/* Set the features advertised by 'netdev' to 'advertise'. */
2007static int
86383816 2008netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2009 enum netdev_features advertise)
8b61709d 2010{
86383816 2011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2012 struct ethtool_cmd ecmd;
2013 int error;
2014
86383816
BP
2015 ovs_mutex_lock(&netdev->mutex);
2016
ab985a77 2017 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2018 memset(&ecmd, 0, sizeof ecmd);
86383816 2019 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2020 ETHTOOL_GSET, "ETHTOOL_GSET");
2021 if (error) {
86383816 2022 goto exit;
8b61709d
BP
2023 }
2024
2025 ecmd.advertising = 0;
6c038611 2026 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2027 ecmd.advertising |= ADVERTISED_10baseT_Half;
2028 }
6c038611 2029 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2030 ecmd.advertising |= ADVERTISED_10baseT_Full;
2031 }
6c038611 2032 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2033 ecmd.advertising |= ADVERTISED_100baseT_Half;
2034 }
6c038611 2035 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2036 ecmd.advertising |= ADVERTISED_100baseT_Full;
2037 }
6c038611 2038 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2039 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2040 }
6c038611 2041 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2042 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2043 }
6c038611 2044 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2045 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2046 }
6c038611 2047 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2048 ecmd.advertising |= ADVERTISED_TP;
2049 }
6c038611 2050 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2051 ecmd.advertising |= ADVERTISED_FIBRE;
2052 }
6c038611 2053 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2054 ecmd.advertising |= ADVERTISED_Autoneg;
2055 }
6c038611 2056 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2057 ecmd.advertising |= ADVERTISED_Pause;
2058 }
6c038611 2059 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2060 ecmd.advertising |= ADVERTISED_Asym_Pause;
2061 }
ab985a77 2062 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2063 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2064 ETHTOOL_SSET, "ETHTOOL_SSET");
2065
2066exit:
2067 ovs_mutex_unlock(&netdev->mutex);
2068 return error;
8b61709d
BP
2069}
2070
f8500004
JP
2071/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2072 * successful, otherwise a positive errno value. */
8b61709d 2073static int
b5d57fc8 2074netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2075 uint32_t kbits_rate, uint32_t kbits_burst)
2076{
b5d57fc8
BP
2077 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2078 const char *netdev_name = netdev_get_name(netdev_);
f8500004 2079 int error;
8b61709d 2080
80a86fbe 2081 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2082 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2083 : kbits_burst); /* Stick with user-specified value. */
2084
86383816 2085 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2086 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2087 error = netdev->netdev_policing_error;
2088 if (error || (netdev->kbits_rate == kbits_rate &&
2089 netdev->kbits_burst == kbits_burst)) {
c9f71668 2090 /* Assume that settings haven't changed since we last set them. */
86383816 2091 goto out;
c9f71668 2092 }
b5d57fc8 2093 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2094 }
2095
ac8c3412 2096 COVERAGE_INC(netdev_set_policing);
f8500004 2097 /* Remove any existing ingress qdisc. */
b5d57fc8 2098 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
2099 if (error) {
2100 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2101 netdev_name, ovs_strerror(error));
c9f71668 2102 goto out;
f8500004
JP
2103 }
2104
8b61709d 2105 if (kbits_rate) {
b5d57fc8 2106 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
2107 if (error) {
2108 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2109 netdev_name, ovs_strerror(error));
c9f71668 2110 goto out;
8b61709d
BP
2111 }
2112
b5d57fc8 2113 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2114 if (error){
2115 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2116 netdev_name, ovs_strerror(error));
c9f71668 2117 goto out;
8b61709d 2118 }
8b61709d
BP
2119 }
2120
b5d57fc8
BP
2121 netdev->kbits_rate = kbits_rate;
2122 netdev->kbits_burst = kbits_burst;
f8500004 2123
c9f71668
PS
2124out:
2125 if (!error || error == ENODEV) {
b5d57fc8
BP
2126 netdev->netdev_policing_error = error;
2127 netdev->cache_valid |= VALID_POLICING;
c9f71668 2128 }
86383816 2129 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2130 return error;
8b61709d
BP
2131}
2132
c1c9c9c4
BP
2133static int
2134netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2135 struct sset *types)
c1c9c9c4 2136{
559eb230 2137 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2138 for (opsp = tcs; *opsp != NULL; opsp++) {
2139 const struct tc_ops *ops = *opsp;
2140 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2141 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2142 }
2143 }
2144 return 0;
2145}
2146
2147static const struct tc_ops *
2148tc_lookup_ovs_name(const char *name)
2149{
559eb230 2150 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2151
2152 for (opsp = tcs; *opsp != NULL; opsp++) {
2153 const struct tc_ops *ops = *opsp;
2154 if (!strcmp(name, ops->ovs_name)) {
2155 return ops;
2156 }
2157 }
2158 return NULL;
2159}
2160
2161static const struct tc_ops *
2162tc_lookup_linux_name(const char *name)
2163{
559eb230 2164 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2165
2166 for (opsp = tcs; *opsp != NULL; opsp++) {
2167 const struct tc_ops *ops = *opsp;
2168 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2169 return ops;
2170 }
2171 }
2172 return NULL;
2173}
2174
93b13be8 2175static struct tc_queue *
b5d57fc8 2176tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2177 size_t hash)
2178{
b5d57fc8 2179 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2180 struct tc_queue *queue;
2181
b5d57fc8 2182 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2183 if (queue->queue_id == queue_id) {
2184 return queue;
2185 }
2186 }
2187 return NULL;
2188}
2189
/* Returns the queue with ID 'queue_id' on 'netdev', or NULL if there is none.
 * The bucket is chosen by hashing 'queue_id' with hash_int() and basis 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2195
c1c9c9c4
BP
2196static int
2197netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2198 const char *type,
2199 struct netdev_qos_capabilities *caps)
2200{
2201 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2202 if (!ops) {
2203 return EOPNOTSUPP;
2204 }
2205 caps->n_queues = ops->n_queues;
2206 return 0;
2207}
2208
2209static int
b5d57fc8 2210netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2211 const char **typep, struct smap *details)
c1c9c9c4 2212{
b5d57fc8 2213 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2214 int error;
2215
86383816 2216 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2217 error = tc_query_qdisc(netdev_);
86383816
BP
2218 if (!error) {
2219 *typep = netdev->tc->ops->ovs_name;
2220 error = (netdev->tc->ops->qdisc_get
2221 ? netdev->tc->ops->qdisc_get(netdev_, details)
2222 : 0);
c1c9c9c4 2223 }
86383816 2224 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2225
86383816 2226 return error;
c1c9c9c4
BP
2227}
2228
2229static int
b5d57fc8 2230netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2231 const char *type, const struct smap *details)
c1c9c9c4 2232{
b5d57fc8 2233 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2234 const struct tc_ops *new_ops;
2235 int error;
2236
2237 new_ops = tc_lookup_ovs_name(type);
2238 if (!new_ops || !new_ops->tc_install) {
2239 return EOPNOTSUPP;
2240 }
2241
6cf888b8
BS
2242 if (new_ops == &tc_ops_noop) {
2243 return new_ops->tc_install(netdev_, details);
2244 }
2245
86383816 2246 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2247 error = tc_query_qdisc(netdev_);
c1c9c9c4 2248 if (error) {
86383816 2249 goto exit;
c1c9c9c4
BP
2250 }
2251
b5d57fc8 2252 if (new_ops == netdev->tc->ops) {
86383816 2253 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2254 } else {
2255 /* Delete existing qdisc. */
b5d57fc8 2256 error = tc_del_qdisc(netdev_);
c1c9c9c4 2257 if (error) {
86383816 2258 goto exit;
c1c9c9c4 2259 }
b5d57fc8 2260 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2261
2262 /* Install new qdisc. */
b5d57fc8
BP
2263 error = new_ops->tc_install(netdev_, details);
2264 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2265 }
86383816
BP
2266
2267exit:
2268 ovs_mutex_unlock(&netdev->mutex);
2269 return error;
c1c9c9c4
BP
2270}
2271
2272static int
b5d57fc8 2273netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2274 unsigned int queue_id, struct smap *details)
c1c9c9c4 2275{
b5d57fc8 2276 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2277 int error;
2278
86383816 2279 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2280 error = tc_query_qdisc(netdev_);
86383816 2281 if (!error) {
b5d57fc8 2282 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2283 error = (queue
b5d57fc8 2284 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2285 : ENOENT);
c1c9c9c4 2286 }
86383816
BP
2287 ovs_mutex_unlock(&netdev->mutex);
2288
2289 return error;
c1c9c9c4
BP
2290}
2291
2292static int
b5d57fc8 2293netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2294 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2295{
b5d57fc8 2296 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2297 int error;
2298
86383816 2299 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2300 error = tc_query_qdisc(netdev_);
86383816
BP
2301 if (!error) {
2302 error = (queue_id < netdev->tc->ops->n_queues
2303 && netdev->tc->ops->class_set
2304 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2305 : EINVAL);
c1c9c9c4 2306 }
86383816 2307 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2308
86383816 2309 return error;
c1c9c9c4
BP
2310}
2311
2312static int
b5d57fc8 2313netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2314{
b5d57fc8 2315 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2316 int error;
2317
86383816 2318 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2319 error = tc_query_qdisc(netdev_);
86383816
BP
2320 if (!error) {
2321 if (netdev->tc->ops->class_delete) {
2322 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2323 error = (queue
2324 ? netdev->tc->ops->class_delete(netdev_, queue)
2325 : ENOENT);
2326 } else {
2327 error = EINVAL;
2328 }
c1c9c9c4 2329 }
86383816
BP
2330 ovs_mutex_unlock(&netdev->mutex);
2331
2332 return error;
c1c9c9c4
BP
2333}
2334
2335static int
b5d57fc8 2336netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2337 unsigned int queue_id,
2338 struct netdev_queue_stats *stats)
2339{
b5d57fc8 2340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2341 int error;
2342
86383816 2343 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2344 error = tc_query_qdisc(netdev_);
86383816
BP
2345 if (!error) {
2346 if (netdev->tc->ops->class_get_stats) {
2347 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2348 if (queue) {
2349 stats->created = queue->created;
2350 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2351 stats);
2352 } else {
2353 error = ENOENT;
2354 }
2355 } else {
2356 error = EOPNOTSUPP;
6dc34a0d 2357 }
c1c9c9c4 2358 }
86383816
BP
2359 ovs_mutex_unlock(&netdev->mutex);
2360
2361 return error;
c1c9c9c4
BP
2362}
2363
d57695d7
JS
2364struct queue_dump_state {
2365 struct nl_dump dump;
2366 struct ofpbuf buf;
2367};
2368
23a98ffe 2369static bool
d57695d7 2370start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2371{
2372 struct ofpbuf request;
2373 struct tcmsg *tcmsg;
2374
2375 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2376 if (!tcmsg) {
2377 return false;
2378 }
3c4de644 2379 tcmsg->tcm_parent = 0;
d57695d7 2380 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2381 ofpbuf_uninit(&request);
d57695d7
JS
2382
2383 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2384 return true;
c1c9c9c4
BP
2385}
2386
d57695d7
JS
2387static int
2388finish_queue_dump(struct queue_dump_state *state)
2389{
2390 ofpbuf_uninit(&state->buf);
2391 return nl_dump_done(&state->dump);
2392}
2393
89454bf4
BP
/* Iteration state for netdev_linux_queue_dump_start(), _next() and _done():
 * a snapshot of the queue IDs that existed when the dump started. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of 'n_queues' queue IDs. */
    size_t cur_queue;           /* Index in 'queues' of next ID to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2399
c1c9c9c4 2400static int
89454bf4 2401netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2402{
89454bf4 2403 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2404 int error;
2405
86383816 2406 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2407 error = tc_query_qdisc(netdev_);
86383816
BP
2408 if (!error) {
2409 if (netdev->tc->ops->class_get) {
89454bf4
BP
2410 struct netdev_linux_queue_state *state;
2411 struct tc_queue *queue;
2412 size_t i;
2413
2414 *statep = state = xmalloc(sizeof *state);
2415 state->n_queues = hmap_count(&netdev->tc->queues);
2416 state->cur_queue = 0;
2417 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2418
2419 i = 0;
2420 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2421 state->queues[i++] = queue->queue_id;
86383816 2422 }
c1c9c9c4 2423 } else {
86383816 2424 error = EOPNOTSUPP;
c1c9c9c4
BP
2425 }
2426 }
86383816 2427 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2428
86383816 2429 return error;
c1c9c9c4
BP
2430}
2431
89454bf4
BP
2432static int
2433netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2434 unsigned int *queue_idp, struct smap *details)
2435{
2436 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2437 struct netdev_linux_queue_state *state = state_;
2438 int error = EOF;
2439
2440 ovs_mutex_lock(&netdev->mutex);
2441 while (state->cur_queue < state->n_queues) {
2442 unsigned int queue_id = state->queues[state->cur_queue++];
2443 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2444
2445 if (queue) {
2446 *queue_idp = queue_id;
2447 error = netdev->tc->ops->class_get(netdev_, queue, details);
2448 break;
2449 }
2450 }
2451 ovs_mutex_unlock(&netdev->mutex);
2452
2453 return error;
2454}
2455
2456static int
2457netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2458 void *state_)
2459{
2460 struct netdev_linux_queue_state *state = state_;
2461
2462 free(state->queues);
2463 free(state);
2464 return 0;
2465}
2466
c1c9c9c4 2467static int
b5d57fc8 2468netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2469 netdev_dump_queue_stats_cb *cb, void *aux)
2470{
b5d57fc8 2471 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2472 int error;
2473
86383816 2474 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2475 error = tc_query_qdisc(netdev_);
86383816 2476 if (!error) {
d57695d7 2477 struct queue_dump_state state;
c1c9c9c4 2478
86383816
BP
2479 if (!netdev->tc->ops->class_dump_stats) {
2480 error = EOPNOTSUPP;
d57695d7 2481 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2482 error = ENODEV;
2483 } else {
2484 struct ofpbuf msg;
2485 int retval;
2486
d57695d7 2487 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2488 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2489 cb, aux);
2490 if (retval) {
2491 error = retval;
2492 }
2493 }
2494
d57695d7 2495 retval = finish_queue_dump(&state);
86383816
BP
2496 if (retval) {
2497 error = retval;
2498 }
c1c9c9c4
BP
2499 }
2500 }
86383816 2501 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2502
86383816 2503 return error;
c1c9c9c4
BP
2504}
2505
8b61709d 2506static int
f1acd62b
BP
2507netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2508 struct in_addr netmask)
8b61709d 2509{
b5d57fc8 2510 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2511 int error;
2512
86383816 2513 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2514 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2515 if (!error) {
f1acd62b 2516 if (address.s_addr != INADDR_ANY) {
8b61709d 2517 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2518 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2519 }
2520 }
49af9a3d 2521
86383816
BP
2522 ovs_mutex_unlock(&netdev->mutex);
2523
8b61709d
BP
2524 return error;
2525}
2526
7df6932e
AW
2527/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2528 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2529 * error. */
8b61709d 2530static int
a8704b50
PS
2531netdev_linux_get_addr_list(const struct netdev *netdev_,
2532 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2533{
b5d57fc8 2534 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2535 int error;
86383816
BP
2536
2537 ovs_mutex_lock(&netdev->mutex);
a8704b50 2538 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816
BP
2539 ovs_mutex_unlock(&netdev->mutex);
2540
7df6932e 2541 return error;
8b61709d
BP
2542}
2543
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * Every byte of '*sa' not covered by the IPv4 address structure is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the address in a properly typed temporary, then copy its bytes
     * into the generic structure. */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2556
2557static int
2558do_set_addr(struct netdev *netdev,
2559 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2560{
2561 struct ifreq ifr;
149f577a 2562
259e0b1a
BP
2563 make_in4_sockaddr(&ifr.ifr_addr, addr);
2564 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2565 ioctl_name);
8b61709d
BP
2566}
2567
2568/* Adds 'router' as a default IP gateway. */
2569static int
67a4917b 2570netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2571{
2572 struct in_addr any = { INADDR_ANY };
2573 struct rtentry rt;
2574 int error;
2575
2576 memset(&rt, 0, sizeof rt);
2577 make_in4_sockaddr(&rt.rt_dst, any);
2578 make_in4_sockaddr(&rt.rt_gateway, router);
2579 make_in4_sockaddr(&rt.rt_genmask, any);
2580 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2581 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2582 if (error) {
10a89ef0 2583 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2584 }
2585 return error;
2586}
2587
f1acd62b
BP
2588static int
2589netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2590 char **netdev_name)
2591{
2592 static const char fn[] = "/proc/net/route";
2593 FILE *stream;
2594 char line[256];
2595 int ln;
2596
2597 *netdev_name = NULL;
2598 stream = fopen(fn, "r");
2599 if (stream == NULL) {
10a89ef0 2600 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2601 return errno;
2602 }
2603
2604 ln = 0;
2605 while (fgets(line, sizeof line, stream)) {
2606 if (++ln >= 2) {
2607 char iface[17];
dbba996b 2608 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2609 int refcnt, metric, mtu;
2610 unsigned int flags, use, window, irtt;
2611
c2c28dfd
BP
2612 if (!ovs_scan(line,
2613 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2614 " %d %u %u\n",
2615 iface, &dest, &gateway, &flags, &refcnt,
2616 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2617 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2618 fn, ln, line);
2619 continue;
2620 }
2621 if (!(flags & RTF_UP)) {
2622 /* Skip routes that aren't up. */
2623 continue;
2624 }
2625
2626 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2627 * network byte order, so we don't need need any endian
f1acd62b
BP
2628 * conversions here. */
2629 if ((dest & mask) == (host->s_addr & mask)) {
2630 if (!gateway) {
2631 /* The host is directly reachable. */
2632 next_hop->s_addr = 0;
2633 } else {
2634 /* To reach the host, we must go through a gateway. */
2635 next_hop->s_addr = gateway;
2636 }
2637 *netdev_name = xstrdup(iface);
2638 fclose(stream);
2639 return 0;
2640 }
2641 }
2642 }
2643
2644 fclose(stream);
2645 return ENXIO;
2646}
2647
e210037e 2648static int
b5d57fc8 2649netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2650{
b5d57fc8 2651 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2652 int error = 0;
2653
86383816 2654 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2655 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2656 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2657
2658 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2659 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2660 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2661 cmd,
2662 ETHTOOL_GDRVINFO,
2663 "ETHTOOL_GDRVINFO");
2664 if (!error) {
b5d57fc8 2665 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2666 }
2667 }
e210037e 2668
e210037e 2669 if (!error) {
b5d57fc8
BP
2670 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2671 smap_add(smap, "driver_version", netdev->drvinfo.version);
2672 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2673 }
86383816
BP
2674 ovs_mutex_unlock(&netdev->mutex);
2675
e210037e
AE
2676 return error;
2677}
2678
4f925bd3 2679static int
275707c3
EJ
2680netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2681 struct smap *smap)
4f925bd3 2682{
79f1cbe9 2683 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2684 return 0;
2685}
2686
8b61709d
BP
2687/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2688 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2689 * returns 0. Otherwise, it returns a positive errno value; in particular,
2690 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2691static int
2692netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2693 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2694{
2695 struct arpreq r;
c100e025 2696 struct sockaddr_in sin;
8b61709d
BP
2697 int retval;
2698
2699 memset(&r, 0, sizeof r);
f2cc621b 2700 memset(&sin, 0, sizeof sin);
c100e025
BP
2701 sin.sin_family = AF_INET;
2702 sin.sin_addr.s_addr = ip;
2703 sin.sin_port = 0;
2704 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2705 r.arp_ha.sa_family = ARPHRD_ETHER;
2706 r.arp_flags = 0;
71d7c22f 2707 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2708 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2709 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2710 if (!retval) {
2711 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2712 } else if (retval != ENXIO) {
2713 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2714 netdev_get_name(netdev), IP_ARGS(ip),
2715 ovs_strerror(retval));
8b61709d
BP
2716 }
2717 return retval;
2718}
2719
2720static int
2721nd_to_iff_flags(enum netdev_flags nd)
2722{
2723 int iff = 0;
2724 if (nd & NETDEV_UP) {
2725 iff |= IFF_UP;
2726 }
2727 if (nd & NETDEV_PROMISC) {
2728 iff |= IFF_PROMISC;
2729 }
7ba19d41
AC
2730 if (nd & NETDEV_LOOPBACK) {
2731 iff |= IFF_LOOPBACK;
2732 }
8b61709d
BP
2733 return iff;
2734}
2735
2736static int
2737iff_to_nd_flags(int iff)
2738{
2739 enum netdev_flags nd = 0;
2740 if (iff & IFF_UP) {
2741 nd |= NETDEV_UP;
2742 }
2743 if (iff & IFF_PROMISC) {
2744 nd |= NETDEV_PROMISC;
2745 }
7ba19d41
AC
2746 if (iff & IFF_LOOPBACK) {
2747 nd |= NETDEV_LOOPBACK;
2748 }
8b61709d
BP
2749 return nd;
2750}
2751
2752static int
4f9f3f21
BP
2753update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2754 enum netdev_flags on, enum netdev_flags *old_flagsp)
2755 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2756{
2757 int old_flags, new_flags;
c37d4da4
EJ
2758 int error = 0;
2759
b5d57fc8 2760 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2761 *old_flagsp = iff_to_nd_flags(old_flags);
2762 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2763 if (new_flags != old_flags) {
4f9f3f21
BP
2764 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2765 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2766 }
4f9f3f21
BP
2767
2768 return error;
2769}
2770
2771static int
2772netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2773 enum netdev_flags on, enum netdev_flags *old_flagsp)
2774{
2775 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2776 int error;
2777
2778 ovs_mutex_lock(&netdev->mutex);
2779 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2780 ovs_mutex_unlock(&netdev->mutex);
2781
8b61709d
BP
2782 return error;
2783}
2784
/* Expands to a positional 'struct netdev_class' initializer.  It is used
 * below for the "system", "tap" and "internal" classes, which differ only in
 * NAME and in the CONSTRUCT, GET_STATS, GET_FEATURES and GET_STATUS
 * callbacks; every other slot is filled with the same netdev_linux_*
 * function or NULL.  The order of entries must follow the member order of
 * struct netdev_class (declared outside this chunk). */
2f9dd77f 2785#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
51f87458 2786 GET_FEATURES, GET_STATUS) \
c3827f61
BP
2787{ \
2788 NAME, \
118c77b1 2789 false, /* is_pmd */ \
c3827f61 2790 \
259e0b1a 2791 NULL, \
c3827f61
BP
2792 netdev_linux_run, \
2793 netdev_linux_wait, \
2794 \
9dc63482
BP
2795 netdev_linux_alloc, \
2796 CONSTRUCT, \
2797 netdev_linux_destruct, \
2798 netdev_linux_dealloc, \
de5cdb90 2799 NULL, /* get_config */ \
6d9e6eb4 2800 NULL, /* set_config */ \
f431bf7d 2801 NULL, /* get_tunnel_config */ \
a36de779
PS
2802 NULL, /* build header */ \
2803 NULL, /* push header */ \
2804 NULL, /* pop header */ \
7dec44fe 2805 NULL, /* get_numa_id */ \
050c60bf 2806 NULL, /* set_tx_multiq */ \
c3827f61 2807 \
c3827f61
BP
2808 netdev_linux_send, \
2809 netdev_linux_send_wait, \
2810 \
2811 netdev_linux_set_etheraddr, \
2812 netdev_linux_get_etheraddr, \
2813 netdev_linux_get_mtu, \
9b020780 2814 netdev_linux_set_mtu, \
c3827f61
BP
2815 netdev_linux_get_ifindex, \
2816 netdev_linux_get_carrier, \
65c3058c 2817 netdev_linux_get_carrier_resets, \
1670c579 2818 netdev_linux_set_miimon_interval, \
f613a0d7 2819 GET_STATS, \
c3827f61 2820 \
51f87458 2821 GET_FEATURES, \
c3827f61 2822 netdev_linux_set_advertisements, \
c3827f61
BP
2823 \
2824 netdev_linux_set_policing, \
2825 netdev_linux_get_qos_types, \
2826 netdev_linux_get_qos_capabilities, \
2827 netdev_linux_get_qos, \
2828 netdev_linux_set_qos, \
2829 netdev_linux_get_queue, \
2830 netdev_linux_set_queue, \
2831 netdev_linux_delete_queue, \
2832 netdev_linux_get_queue_stats, \
89454bf4
BP
2833 netdev_linux_queue_dump_start, \
2834 netdev_linux_queue_dump_next, \
2835 netdev_linux_queue_dump_done, \
c3827f61
BP
2836 netdev_linux_dump_queue_stats, \
2837 \
c3827f61 2838 netdev_linux_set_in4, \
a8704b50 2839 netdev_linux_get_addr_list, \
c3827f61
BP
2840 netdev_linux_add_router, \
2841 netdev_linux_get_next_hop, \
4f925bd3 2842 GET_STATUS, \
c3827f61
BP
2843 netdev_linux_arp_lookup, \
2844 \
2845 netdev_linux_update_flags, \
790fb3b7 2846 NULL, /* reconfigure */ \
c3827f61 2847 \
f7791740
PS
2848 netdev_linux_rxq_alloc, \
2849 netdev_linux_rxq_construct, \
2850 netdev_linux_rxq_destruct, \
2851 netdev_linux_rxq_dealloc, \
2852 netdev_linux_rxq_recv, \
2853 netdev_linux_rxq_wait, \
2854 netdev_linux_rxq_drain, \
c3827f61
BP
2855}
2856
/* "system" class: ordinary Linux network devices, using the generic
 * construct, stats, features and ethtool-based status callbacks. */
2857const struct netdev_class netdev_linux_class =
2858 NETDEV_LINUX_CLASS(
2859 "system",
9dc63482 2860 netdev_linux_construct,
f613a0d7 2861 netdev_linux_get_stats,
51f87458 2862 netdev_linux_get_features,
275707c3 2863 netdev_linux_get_status);
c3827f61
BP
2864
/* "tap" class: like "system" but with its own constructor and statistics
 * callback. */
2865const struct netdev_class netdev_tap_class =
2866 NETDEV_LINUX_CLASS(
2867 "tap",
9dc63482 2868 netdev_linux_construct_tap,
bba1e6f3 2869 netdev_tap_get_stats,
51f87458 2870 netdev_linux_get_features,
275707c3 2871 netdev_linux_get_status);
c3827f61
BP
2872
/* "internal" class: uses the generic constructor but has its own statistics
 * callback, no get_features callback, and a status callback that reports the
 * fixed driver name "openvswitch". */
2873const struct netdev_class netdev_internal_class =
2874 NETDEV_LINUX_CLASS(
2875 "internal",
9dc63482 2876 netdev_linux_construct,
bba1e6f3 2877 netdev_internal_get_stats,
51f87458 2878 NULL, /* get_features */
275707c3 2879 netdev_internal_get_status);
8b61709d 2880\f
677d9158
JV
2881
2882#define CODEL_N_QUEUES 0x0000
2883
2f4298ce
BP
2884/* In sufficiently new kernel headers these are defined as enums in
2885 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2886 * kernels. (This overrides any enum definition in the header file but that's
2887 * harmless.) */
2888#define TCA_CODEL_TARGET 1
2889#define TCA_CODEL_LIMIT 2
2890#define TCA_CODEL_INTERVAL 3
2891
677d9158
JV
/* Userspace record of a CoDel root qdisc: the generic 'tc' header followed by
 * the three values exchanged with the kernel as TCA_CODEL_TARGET,
 * TCA_CODEL_LIMIT and TCA_CODEL_INTERVAL.
 * NOTE(review): units are not stated in this file; presumably the kernel's
 * codel units (microseconds for target/interval, packets for limit) --
 * confirm. */
2892struct codel {
2893 struct tc tc;
2894 uint32_t target;
2895 uint32_t limit;
2896 uint32_t interval;
2897};
2898
2899static struct codel *
2900codel_get__(const struct netdev *netdev_)
2901{
2902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2903 return CONTAINER_OF(netdev->tc, struct codel, tc);
2904}
2905
2906static void
2907codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2908 uint32_t interval)
2909{
2910 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2911 struct codel *codel;
2912
2913 codel = xmalloc(sizeof *codel);
2914 tc_init(&codel->tc, &tc_ops_codel);
2915 codel->target = target;
2916 codel->limit = limit;
2917 codel->interval = interval;
2918
2919 netdev->tc = &codel->tc;
2920}
2921
2922static int
2923codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2924 uint32_t interval)
2925{
2926 size_t opt_offset;
2927 struct ofpbuf request;
2928 struct tcmsg *tcmsg;
2929 uint32_t otarget, olimit, ointerval;
2930 int error;
2931
2932 tc_del_qdisc(netdev);
2933
2934 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2935 NLM_F_EXCL | NLM_F_CREATE, &request);
2936 if (!tcmsg) {
2937 return ENODEV;
2938 }
2939 tcmsg->tcm_handle = tc_make_handle(1, 0);
2940 tcmsg->tcm_parent = TC_H_ROOT;
2941
2942 otarget = target ? target : 5000;
2943 olimit = limit ? limit : 10240;
2944 ointerval = interval ? interval : 100000;
2945
2946 nl_msg_put_string(&request, TCA_KIND, "codel");
2947 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2948 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2949 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2950 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2951 nl_msg_end_nested(&request, opt_offset);
2952
2953 error = tc_transact(&request, NULL);
2954 if (error) {
2955 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2956 "target %u, limit %u, interval %u error %d(%s)",
2957 netdev_get_name(netdev),
2958 otarget, olimit, ointerval,
2959 error, ovs_strerror(error));
2960 }
2961 return error;
2962}
2963
2964static void
2965codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2966 const struct smap *details, struct codel *codel)
2967{
13c1637f
BP
2968 codel->target = smap_get_ullong(details, "target", 0);
2969 codel->limit = smap_get_ullong(details, "limit", 0);
2970 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
2971
2972 if (!codel->target) {
2973 codel->target = 5000;
2974 }
2975 if (!codel->limit) {
2976 codel->limit = 10240;
2977 }
2978 if (!codel->interval) {
2979 codel->interval = 100000;
2980 }
2981}
2982
2983static int
2984codel_tc_install(struct netdev *netdev, const struct smap *details)
2985{
2986 int error;
2987 struct codel codel;
2988
2989 codel_parse_qdisc_details__(netdev, details, &codel);
2990 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2991 codel.interval);
2992 if (!error) {
2993 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2994 }
2995 return error;
2996}
2997
2998static int
2999codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3000{
3001 static const struct nl_policy tca_codel_policy[] = {
3002 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3003 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3004 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3005 };
3006
3007 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3008
3009 if (!nl_parse_nested(nl_options, tca_codel_policy,
3010 attrs, ARRAY_SIZE(tca_codel_policy))) {
3011 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3012 return EPROTO;
3013 }
3014
3015 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3016 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3017 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3018 return 0;
3019}
3020
3021static int
3022codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3023{
3024 struct nlattr *nlattr;
3025 const char * kind;
3026 int error;
3027 struct codel codel;
3028
3029 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3030 if (error != 0) {
3031 return error;
3032 }
3033
3034 error = codel_parse_tca_options__(nlattr, &codel);
3035 if (error != 0) {
3036 return error;
3037 }
3038
3039 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3040 return 0;
3041}
3042
3043
3044static void
3045codel_tc_destroy(struct tc *tc)
3046{
3047 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3048 tc_destroy(tc);
3049 free(codel);
3050}
3051
3052static int
3053codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3054{
3055 const struct codel *codel = codel_get__(netdev);
3056 smap_add_format(details, "target", "%u", codel->target);
3057 smap_add_format(details, "limit", "%u", codel->limit);
3058 smap_add_format(details, "interval", "%u", codel->interval);
3059 return 0;
3060}
3061
3062static int
3063codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3064{
3065 struct codel codel;
3066
3067 codel_parse_qdisc_details__(netdev, details, &codel);
3068 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3069 codel_get__(netdev)->target = codel.target;
3070 codel_get__(netdev)->limit = codel.limit;
3071 codel_get__(netdev)->interval = codel.interval;
3072 return 0;
3073}
3074
/* Operations table for the CoDel qdisc.  Only the qdisc-level install, load,
 * destroy, get and set callbacks are provided; the five trailing slots are
 * left NULL.
 * NOTE(review): those slots are presumably the per-class callbacks of
 * struct tc_ops, unused because CODEL_N_QUEUES is 0 -- confirm against the
 * struct definition. */
3075static const struct tc_ops tc_ops_codel = {
3076 "codel", /* linux_name */
3077 "linux-codel", /* ovs_name */
3078 CODEL_N_QUEUES, /* n_queues */
3079 codel_tc_install,
3080 codel_tc_load,
3081 codel_tc_destroy,
3082 codel_qdisc_get,
3083 codel_qdisc_set,
3084 NULL,
3085 NULL,
3086 NULL,
3087 NULL,
3088 NULL
3089};
3090\f
3091/* FQ-CoDel traffic control class. */
3092
3093#define FQCODEL_N_QUEUES 0x0000
3094
2f4298ce
BP
3095/* In sufficiently new kernel headers these are defined as enums in
3096 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3097 * kernels. (This overrides any enum definition in the header file but that's
3098 * harmless.) */
3099#define TCA_FQ_CODEL_TARGET 1
3100#define TCA_FQ_CODEL_LIMIT 2
3101#define TCA_FQ_CODEL_INTERVAL 3
3102#define TCA_FQ_CODEL_ECN 4
3103#define TCA_FQ_CODEL_FLOWS 5
3104#define TCA_FQ_CODEL_QUANTUM 6
3105
677d9158
JV
/* Userspace record of an FQ-CoDel root qdisc: the generic 'tc' header
 * followed by the five values exchanged with the kernel as
 * TCA_FQ_CODEL_TARGET, _LIMIT, _INTERVAL, _FLOWS and _QUANTUM.
 * NOTE(review): units are not stated in this file; presumably the kernel's
 * fq_codel units -- confirm. */
3106struct fqcodel {
3107 struct tc tc;
3108 uint32_t target;
3109 uint32_t limit;
3110 uint32_t interval;
3111 uint32_t flows;
3112 uint32_t quantum;
3113};
3114
3115static struct fqcodel *
3116fqcodel_get__(const struct netdev *netdev_)
3117{
3118 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3119 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3120}
3121
3122static void
3123fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3124 uint32_t interval, uint32_t flows, uint32_t quantum)
3125{
3126 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3127 struct fqcodel *fqcodel;
3128
3129 fqcodel = xmalloc(sizeof *fqcodel);
3130 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3131 fqcodel->target = target;
3132 fqcodel->limit = limit;
3133 fqcodel->interval = interval;
3134 fqcodel->flows = flows;
3135 fqcodel->quantum = quantum;
3136
3137 netdev->tc = &fqcodel->tc;
3138}
3139
3140static int
3141fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3142 uint32_t interval, uint32_t flows, uint32_t quantum)
3143{
3144 size_t opt_offset;
3145 struct ofpbuf request;
3146 struct tcmsg *tcmsg;
3147 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3148 int error;
3149
3150 tc_del_qdisc(netdev);
3151
3152 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3153 NLM_F_EXCL | NLM_F_CREATE, &request);
3154 if (!tcmsg) {
3155 return ENODEV;
3156 }
3157 tcmsg->tcm_handle = tc_make_handle(1, 0);
3158 tcmsg->tcm_parent = TC_H_ROOT;
3159
3160 otarget = target ? target : 5000;
3161 olimit = limit ? limit : 10240;
3162 ointerval = interval ? interval : 100000;
3163 oflows = flows ? flows : 1024;
3164 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3165 not mtu */
3166
3167 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3168 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3169 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3170 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3171 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3172 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3173 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3174 nl_msg_end_nested(&request, opt_offset);
3175
3176 error = tc_transact(&request, NULL);
3177 if (error) {
3178 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3179 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3180 netdev_get_name(netdev),
3181 otarget, olimit, ointerval, oflows, oquantum,
3182 error, ovs_strerror(error));
3183 }
3184 return error;
3185}
3186
3187static void
3188fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3189 const struct smap *details, struct fqcodel *fqcodel)
3190{
13c1637f
BP
3191 fqcodel->target = smap_get_ullong(details, "target", 0);
3192 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3193 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3194 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3195 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3196
677d9158
JV
3197 if (!fqcodel->target) {
3198 fqcodel->target = 5000;
3199 }
3200 if (!fqcodel->limit) {
3201 fqcodel->limit = 10240;
3202 }
3203 if (!fqcodel->interval) {
3204 fqcodel->interval = 1000000;
3205 }
3206 if (!fqcodel->flows) {
3207 fqcodel->flows = 1024;
3208 }
3209 if (!fqcodel->quantum) {
3210 fqcodel->quantum = 1514;
3211 }
3212}
3213
3214static int
3215fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3216{
3217 int error;
3218 struct fqcodel fqcodel;
3219
3220 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3221 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3222 fqcodel.interval, fqcodel.flows,
3223 fqcodel.quantum);
3224 if (!error) {
3225 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3226 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3227 }
3228 return error;
3229}
3230
3231static int
3232fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3233{
3234 static const struct nl_policy tca_fqcodel_policy[] = {
3235 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3236 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3237 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3238 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3239 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3240 };
3241
3242 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3243
3244 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3245 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3246 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3247 return EPROTO;
3248 }
3249
3250 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3251 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3252 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3253 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3254 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3255 return 0;
3256}
3257
3258static int
3259fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3260{
3261 struct nlattr *nlattr;
3262 const char * kind;
3263 int error;
3264 struct fqcodel fqcodel;
3265
3266 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3267 if (error != 0) {
3268 return error;
3269 }
3270
3271 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3272 if (error != 0) {
3273 return error;
3274 }
3275
3276 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3277 fqcodel.flows, fqcodel.quantum);
3278 return 0;
3279}
3280
3281static void
3282fqcodel_tc_destroy(struct tc *tc)
3283{
3284 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3285 tc_destroy(tc);
3286 free(fqcodel);
3287}
3288
3289static int
3290fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3291{
3292 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3293 smap_add_format(details, "target", "%u", fqcodel->target);
3294 smap_add_format(details, "limit", "%u", fqcodel->limit);
3295 smap_add_format(details, "interval", "%u", fqcodel->interval);
3296 smap_add_format(details, "flows", "%u", fqcodel->flows);
3297 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3298 return 0;
3299}
3300
3301static int
3302fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3303{
3304 struct fqcodel fqcodel;
3305
3306 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3307 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3308 fqcodel.flows, fqcodel.quantum);
3309 fqcodel_get__(netdev)->target = fqcodel.target;
3310 fqcodel_get__(netdev)->limit = fqcodel.limit;
3311 fqcodel_get__(netdev)->interval = fqcodel.interval;
3312 fqcodel_get__(netdev)->flows = fqcodel.flows;
3313 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3314 return 0;
3315}
3316
/* Operations table for the FQ-CoDel qdisc.  Only the qdisc-level install,
 * load, destroy, get and set callbacks are provided; the five trailing slots
 * are left NULL.
 * NOTE(review): those slots are presumably the per-class callbacks of
 * struct tc_ops, unused because FQCODEL_N_QUEUES is 0 -- confirm against the
 * struct definition. */
3317static const struct tc_ops tc_ops_fqcodel = {
3318 "fq_codel", /* linux_name */
3319 "linux-fq_codel", /* ovs_name */
3320 FQCODEL_N_QUEUES, /* n_queues */
3321 fqcodel_tc_install,
3322 fqcodel_tc_load,
3323 fqcodel_tc_destroy,
3324 fqcodel_qdisc_get,
3325 fqcodel_qdisc_set,
3326 NULL,
3327 NULL,
3328 NULL,
3329 NULL,
3330 NULL
3331};
3332\f
3333/* SFQ traffic control class. */
3334
3335#define SFQ_N_QUEUES 0x0000
3336
/* Userspace record of an SFQ root qdisc: the generic 'tc' header followed by
 * the 'quantum' and 'perturb' values that are sent to the kernel in
 * 'struct tc_sfq_qopt' (as 'quantum' and 'perturb_period'). */
3337struct sfq {
3338 struct tc tc;
3339 uint32_t quantum;
3340 uint32_t perturb;
3341};
3342
3343static struct sfq *
3344sfq_get__(const struct netdev *netdev_)
3345{
3346 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3347 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3348}
3349
3350static void
3351sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3352{
3353 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3354 struct sfq *sfq;
3355
3356 sfq = xmalloc(sizeof *sfq);
3357 tc_init(&sfq->tc, &tc_ops_sfq);
3358 sfq->perturb = perturb;
3359 sfq->quantum = quantum;
3360
3361 netdev->tc = &sfq->tc;
3362}
3363
3364static int
3365sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3366{
3367 struct tc_sfq_qopt opt;
3368 struct ofpbuf request;
3369 struct tcmsg *tcmsg;
3370 int mtu;
3371 int mtu_error, error;
3372 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3373
3374 tc_del_qdisc(netdev);
3375
3376 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3377 NLM_F_EXCL | NLM_F_CREATE, &request);
3378 if (!tcmsg) {
3379 return ENODEV;
3380 }
3381 tcmsg->tcm_handle = tc_make_handle(1, 0);
3382 tcmsg->tcm_parent = TC_H_ROOT;
3383
3384 memset(&opt, 0, sizeof opt);
3385 if (!quantum) {
3386 if (!mtu_error) {
3387 opt.quantum = mtu; /* if we cannot find mtu, use default */
3388 }
3389 } else {
3390 opt.quantum = quantum;
3391 }
3392
3393 if (!perturb) {
3394 opt.perturb_period = 10;
3395 } else {
3396 opt.perturb_period = perturb;
3397 }
3398
3399 nl_msg_put_string(&request, TCA_KIND, "sfq");
3400 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3401
3402 error = tc_transact(&request, NULL);
3403 if (error) {
3404 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3405 "quantum %u, perturb %u error %d(%s)",
3406 netdev_get_name(netdev),
3407 opt.quantum, opt.perturb_period,
3408 error, ovs_strerror(error));
3409 }
3410 return error;
3411}
3412
3413static void
3414sfq_parse_qdisc_details__(struct netdev *netdev,
3415 const struct smap *details, struct sfq *sfq)
3416{
13c1637f
BP
3417 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3418 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3419
677d9158
JV
3420 if (!sfq->perturb) {
3421 sfq->perturb = 10;
3422 }
3423
3424 if (!sfq->quantum) {
13c1637f
BP
3425 int mtu;
3426 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3427 sfq->quantum = mtu;
3428 } else {
3429 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3430 "device without mtu");
677d9158
JV
3431 }
3432 }
3433}
3434
3435static int
3436sfq_tc_install(struct netdev *netdev, const struct smap *details)
3437{
3438 int error;
3439 struct sfq sfq;
3440
3441 sfq_parse_qdisc_details__(netdev, details, &sfq);
3442 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3443 if (!error) {
3444 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3445 }
3446 return error;
3447}
3448
3449static int
3450sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3451{
3452 const struct tc_sfq_qopt *sfq;
3453 struct nlattr *nlattr;
3454 const char * kind;
3455 int error;
3456
3457 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3458 if (error == 0) {
3459 sfq = nl_attr_get(nlattr);
3460 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3461 return 0;
3462 }
3463
3464 return error;
3465}
3466
3467static void
3468sfq_tc_destroy(struct tc *tc)
3469{
3470 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3471 tc_destroy(tc);
3472 free(sfq);
3473}
3474
3475static int
3476sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3477{
3478 const struct sfq *sfq = sfq_get__(netdev);
3479 smap_add_format(details, "quantum", "%u", sfq->quantum);
3480 smap_add_format(details, "perturb", "%u", sfq->perturb);
3481 return 0;
3482}
3483
3484static int
3485sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3486{
3487 struct sfq sfq;
3488
3489 sfq_parse_qdisc_details__(netdev, details, &sfq);
3490 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3491 sfq_get__(netdev)->quantum = sfq.quantum;
3492 sfq_get__(netdev)->perturb = sfq.perturb;
3493 return 0;
3494}
3495
/* Operations table for the SFQ qdisc.  Only the qdisc-level install, load,
 * destroy, get and set callbacks are provided; the five trailing slots are
 * left NULL.
 * NOTE(review): those slots are presumably the per-class callbacks of
 * struct tc_ops, unused because SFQ_N_QUEUES is 0 -- confirm against the
 * struct definition. */
3496static const struct tc_ops tc_ops_sfq = {
3497 "sfq", /* linux_name */
3498 "linux-sfq", /* ovs_name */
3499 SFQ_N_QUEUES, /* n_queues */
3500 sfq_tc_install,
3501 sfq_tc_load,
3502 sfq_tc_destroy,
3503 sfq_qdisc_get,
3504 sfq_qdisc_set,
3505 NULL,
3506 NULL,
3507 NULL,
3508 NULL,
3509 NULL
3510};
3511\f
c1c9c9c4 3512/* HTB traffic control class. */
559843ed 3513
c1c9c9c4 3514#define HTB_N_QUEUES 0xf000
4f631ccd 3515#define HTB_RATE2QUANTUM 10
8b61709d 3516
c1c9c9c4
BP
/* Userspace record of an HTB root qdisc: the generic 'tc' header plus the
 * qdisc-wide maximum rate, which htb_parse_class_details__() uses as the
 * upper bound for per-class rates. */
3517struct htb {
3518 struct tc tc;
3519 unsigned int max_rate; /* In bytes/s. */
3520};
8b61709d 3521
/* Parameters of one HTB class.  Embeds the generic 'tc_queue' header so that
 * a class can be tracked as a queue of the qdisc. */
c1c9c9c4 3522struct htb_class {
93b13be8 3523 struct tc_queue tc_queue;
c1c9c9c4
BP
3524 unsigned int min_rate; /* In bytes/s. */
3525 unsigned int max_rate; /* In bytes/s. */
3526 unsigned int burst; /* In bytes. */
3527 unsigned int priority; /* Lower values are higher priorities. */
3528};
8b61709d 3529
c1c9c9c4 3530static struct htb *
b5d57fc8 3531htb_get__(const struct netdev *netdev_)
c1c9c9c4 3532{
b5d57fc8
BP
3533 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3534 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3535}
3536
24045e35 3537static void
b5d57fc8 3538htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3539{
b5d57fc8 3540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3541 struct htb *htb;
3542
3543 htb = xmalloc(sizeof *htb);
3544 tc_init(&htb->tc, &tc_ops_htb);
3545 htb->max_rate = max_rate;
3546
b5d57fc8 3547 netdev->tc = &htb->tc;
c1c9c9c4
BP
3548}
3549
3550/* Create an HTB qdisc.
3551 *
a339aa81 3552 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3553static int
3554htb_setup_qdisc__(struct netdev *netdev)
3555{
3556 size_t opt_offset;
3557 struct tc_htb_glob opt;
3558 struct ofpbuf request;
3559 struct tcmsg *tcmsg;
3560
3561 tc_del_qdisc(netdev);
3562
3563 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3564 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3565 if (!tcmsg) {
3566 return ENODEV;
3567 }
c1c9c9c4
BP
3568 tcmsg->tcm_handle = tc_make_handle(1, 0);
3569 tcmsg->tcm_parent = TC_H_ROOT;
3570
3571 nl_msg_put_string(&request, TCA_KIND, "htb");
3572
3573 memset(&opt, 0, sizeof opt);
4f631ccd 3574 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3575 opt.version = 3;
4ecf12d5 3576 opt.defcls = 1;
c1c9c9c4
BP
3577
3578 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3579 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3580 nl_msg_end_nested(&request, opt_offset);
3581
3582 return tc_transact(&request, NULL);
3583}
3584
3585/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3586 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3587static int
3588htb_setup_class__(struct netdev *netdev, unsigned int handle,
3589 unsigned int parent, struct htb_class *class)
3590{
3591 size_t opt_offset;
3592 struct tc_htb_opt opt;
3593 struct ofpbuf request;
3594 struct tcmsg *tcmsg;
3595 int error;
3596 int mtu;
3597
73371c09 3598 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3599 if (error) {
f915f1a8
BP
3600 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3601 netdev_get_name(netdev));
9b020780 3602 return error;
f915f1a8 3603 }
c1c9c9c4
BP
3604
3605 memset(&opt, 0, sizeof opt);
3606 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3607 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3608 /* Makes sure the quantum is at least MTU. Setting quantum will
3609 * make htb ignore the r2q for this class. */
3610 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3611 opt.quantum = mtu;
3612 }
c1c9c9c4
BP
3613 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3614 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3615 opt.prio = class->priority;
3616
3617 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
3618 if (!tcmsg) {
3619 return ENODEV;
3620 }
c1c9c9c4
BP
3621 tcmsg->tcm_handle = handle;
3622 tcmsg->tcm_parent = parent;
3623
3624 nl_msg_put_string(&request, TCA_KIND, "htb");
3625 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3626 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3627 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3628 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3629 nl_msg_end_nested(&request, opt_offset);
3630
3631 error = tc_transact(&request, NULL);
3632 if (error) {
3633 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3634 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3635 netdev_get_name(netdev),
3636 tc_get_major(handle), tc_get_minor(handle),
3637 tc_get_major(parent), tc_get_minor(parent),
3638 class->min_rate, class->max_rate,
10a89ef0 3639 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3640 }
3641 return error;
3642}
3643
3644/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3645 * description of them into 'details'. The description complies with the
3646 * specification given in the vswitch database documentation for linux-htb
3647 * queue details. */
3648static int
3649htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3650{
3651 static const struct nl_policy tca_htb_policy[] = {
3652 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3653 .min_len = sizeof(struct tc_htb_opt) },
3654 };
3655
3656 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3657 const struct tc_htb_opt *htb;
3658
3659 if (!nl_parse_nested(nl_options, tca_htb_policy,
3660 attrs, ARRAY_SIZE(tca_htb_policy))) {
3661 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3662 return EPROTO;
3663 }
3664
3665 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3666 class->min_rate = htb->rate.rate;
3667 class->max_rate = htb->ceil.rate;
3668 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3669 class->priority = htb->prio;
3670 return 0;
3671}
3672
3673static int
3674htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3675 struct htb_class *options,
3676 struct netdev_queue_stats *stats)
3677{
3678 struct nlattr *nl_options;
3679 unsigned int handle;
3680 int error;
3681
3682 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3683 if (!error && queue_id) {
17ee3c1f
BP
3684 unsigned int major = tc_get_major(handle);
3685 unsigned int minor = tc_get_minor(handle);
3686 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3687 *queue_id = minor - 1;
c1c9c9c4
BP
3688 } else {
3689 error = EPROTO;
3690 }
3691 }
3692 if (!error && options) {
3693 error = htb_parse_tca_options__(nl_options, options);
3694 }
3695 return error;
3696}
3697
3698static void
73371c09 3699htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3700 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3701{
73371c09 3702 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 3703
13c1637f 3704 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 3705 if (!hc->max_rate) {
a00ca915 3706 enum netdev_features current;
c1c9c9c4 3707
73371c09
BP
3708 netdev_linux_read_features(netdev);
3709 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3710 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3711 }
3712 hc->min_rate = hc->max_rate;
3713 hc->burst = 0;
3714 hc->priority = 0;
3715}
3716
3717static int
3718htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3719 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3720{
3721 const struct htb *htb = htb_get__(netdev);
9b020780 3722 int mtu, error;
c1c9c9c4 3723
73371c09 3724 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3725 if (error) {
f915f1a8
BP
3726 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3727 netdev_get_name(netdev));
9b020780 3728 return error;
f915f1a8
BP
3729 }
3730
4f104611
EJ
3731 /* HTB requires at least an mtu sized min-rate to send any traffic even
3732 * on uncongested links. */
13c1637f 3733 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 3734 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3735 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3736
3737 /* max-rate */
13c1637f
BP
3738 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
3739 if (!hc->max_rate) {
3740 hc->max_rate = htb->max_rate;
3741 }
c1c9c9c4
BP
3742 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3743 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3744
3745 /* burst
3746 *
3747 * According to hints in the documentation that I've read, it is important
3748 * that 'burst' be at least as big as the largest frame that might be
3749 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3750 * but having it a bit too small is a problem. Since netdev_get_mtu()
3751 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3752 * the MTU. We actually add 64, instead of 14, as a guard against
3753 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 3754 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
3755 hc->burst = MAX(hc->burst, mtu + 64);
3756
3757 /* priority */
13c1637f 3758 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
3759
3760 return 0;
3761}
3762
3763static int
3764htb_query_class__(const struct netdev *netdev, unsigned int handle,
3765 unsigned int parent, struct htb_class *options,
3766 struct netdev_queue_stats *stats)
3767{
3768 struct ofpbuf *reply;
3769 int error;
3770
3771 error = tc_query_class(netdev, handle, parent, &reply);
3772 if (!error) {
3773 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3774 ofpbuf_delete(reply);
3775 }
3776 return error;
3777}
3778
3779static int
79f1cbe9 3780htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3781{
3782 int error;
3783
3784 error = htb_setup_qdisc__(netdev);
3785 if (!error) {
3786 struct htb_class hc;
3787
3788 htb_parse_qdisc_details__(netdev, details, &hc);
3789 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3790 tc_make_handle(1, 0), &hc);
3791 if (!error) {
3792 htb_install__(netdev, hc.max_rate);
3793 }
3794 }
3795 return error;
3796}
3797
93b13be8
BP
3798static struct htb_class *
3799htb_class_cast__(const struct tc_queue *queue)
3800{
3801 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3802}
3803
c1c9c9c4
BP
3804static void
3805htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3806 const struct htb_class *hc)
3807{
3808 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3809 size_t hash = hash_int(queue_id, 0);
3810 struct tc_queue *queue;
c1c9c9c4
BP
3811 struct htb_class *hcp;
3812
93b13be8
BP
3813 queue = tc_find_queue__(netdev, queue_id, hash);
3814 if (queue) {
3815 hcp = htb_class_cast__(queue);
3816 } else {
c1c9c9c4 3817 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3818 queue = &hcp->tc_queue;
3819 queue->queue_id = queue_id;
6dc34a0d 3820 queue->created = time_msec();
93b13be8 3821 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3822 }
93b13be8
BP
3823
3824 hcp->min_rate = hc->min_rate;
3825 hcp->max_rate = hc->max_rate;
3826 hcp->burst = hc->burst;
3827 hcp->priority = hc->priority;
c1c9c9c4
BP
3828}
3829
3830static int
3831htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3832{
c1c9c9c4 3833 struct ofpbuf msg;
d57695d7 3834 struct queue_dump_state state;
c1c9c9c4 3835 struct htb_class hc;
c1c9c9c4
BP
3836
3837 /* Get qdisc options. */
3838 hc.max_rate = 0;
3839 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3840 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3841
3842 /* Get queues. */
d57695d7 3843 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3844 return ENODEV;
3845 }
d57695d7 3846 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3847 unsigned int queue_id;
3848
3849 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3850 htb_update_queue__(netdev, queue_id, &hc);
3851 }
3852 }
d57695d7 3853 finish_queue_dump(&state);
c1c9c9c4
BP
3854
3855 return 0;
3856}
3857
3858static void
3859htb_tc_destroy(struct tc *tc)
3860{
3861 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 3862 struct htb_class *hc;
c1c9c9c4 3863
4ec3d7c7 3864 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
3865 free(hc);
3866 }
3867 tc_destroy(tc);
3868 free(htb);
3869}
3870
3871static int
79f1cbe9 3872htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3873{
3874 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3875 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3876 return 0;
3877}
3878
3879static int
79f1cbe9 3880htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3881{
3882 struct htb_class hc;
3883 int error;
3884
3885 htb_parse_qdisc_details__(netdev, details, &hc);
3886 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3887 tc_make_handle(1, 0), &hc);
3888 if (!error) {
3889 htb_get__(netdev)->max_rate = hc.max_rate;
3890 }
3891 return error;
3892}
3893
3894static int
93b13be8 3895htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3896 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3897{
93b13be8 3898 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3899
79f1cbe9 3900 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3901 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3902 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3903 }
79f1cbe9 3904 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3905 if (hc->priority) {
79f1cbe9 3906 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3907 }
3908 return 0;
3909}
3910
3911static int
3912htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3913 const struct smap *details)
c1c9c9c4
BP
3914{
3915 struct htb_class hc;
3916 int error;
3917
3918 error = htb_parse_class_details__(netdev, details, &hc);
3919 if (error) {
3920 return error;
3921 }
3922
17ee3c1f 3923 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3924 tc_make_handle(1, 0xfffe), &hc);
3925 if (error) {
3926 return error;
3927 }
3928
3929 htb_update_queue__(netdev, queue_id, &hc);
3930 return 0;
3931}
3932
3933static int
93b13be8 3934htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3935{
93b13be8 3936 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3937 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3938 int error;
3939
93b13be8 3940 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3941 if (!error) {
93b13be8 3942 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3943 free(hc);
c1c9c9c4
BP
3944 }
3945 return error;
3946}
3947
3948static int
93b13be8 3949htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3950 struct netdev_queue_stats *stats)
3951{
93b13be8 3952 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3953 tc_make_handle(1, 0xfffe), NULL, stats);
3954}
3955
3956static int
3957htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3958 const struct ofpbuf *nlmsg,
3959 netdev_dump_queue_stats_cb *cb, void *aux)
3960{
3961 struct netdev_queue_stats stats;
17ee3c1f 3962 unsigned int handle, major, minor;
c1c9c9c4
BP
3963 int error;
3964
3965 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3966 if (error) {
3967 return error;
3968 }
3969
17ee3c1f
BP
3970 major = tc_get_major(handle);
3971 minor = tc_get_minor(handle);
3972 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3973 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3974 }
3975 return 0;
3976}
3977
/* Operations table for the "linux-htb" traffic control class.  The slot
 * names in the comments follow the order used by the other tc_ops tables in
 * this file. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
3993\f
a339aa81
EJ
3994/* "linux-hfsc" traffic control class. */
3995
/* Number of queues advertised by "linux-hfsc" (the n_queues slot of
 * tc_ops_hfsc).  Class minors 1...HFSC_N_QUEUES under major 1 are accepted
 * as queues; queue id N corresponds to class minor N + 1. */
#define HFSC_N_QUEUES 0xf000
3997
/* Per-netdev state of an installed "linux-hfsc" qdisc.  Recovered from a
 * netdev's 'tc' pointer by hfsc_get__(). */
struct hfsc {
    struct tc tc;               /* Embedded generic tc state. */
    uint32_t max_rate;          /* Qdisc-wide maximum rate; reported as
                                 * "max-rate" after multiplying by 8, so
                                 * presumably bytes/s (confirm). */
};
4002
/* One "linux-hfsc" queue.  Recovered from its embedded tc_queue by
 * hfsc_class_cast__().  Both rates are reported multiplied by 8 by
 * hfsc_class_get(), so they are presumably in bytes/s (confirm). */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Embedded generic queue state. */
    uint32_t min_rate;          /* Reported as "min-rate". */
    uint32_t max_rate;          /* Reported as "max-rate". */
};
4008
4009static struct hfsc *
b5d57fc8 4010hfsc_get__(const struct netdev *netdev_)
a339aa81 4011{
b5d57fc8
BP
4012 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4013 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4014}
4015
4016static struct hfsc_class *
4017hfsc_class_cast__(const struct tc_queue *queue)
4018{
4019 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4020}
4021
24045e35 4022static void
b5d57fc8 4023hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4024{
b5d57fc8 4025 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4026 struct hfsc *hfsc;
4027
a339aa81
EJ
4028 hfsc = xmalloc(sizeof *hfsc);
4029 tc_init(&hfsc->tc, &tc_ops_hfsc);
4030 hfsc->max_rate = max_rate;
b5d57fc8 4031 netdev->tc = &hfsc->tc;
a339aa81
EJ
4032}
4033
4034static void
4035hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4036 const struct hfsc_class *hc)
4037{
4038 size_t hash;
4039 struct hfsc *hfsc;
4040 struct hfsc_class *hcp;
4041 struct tc_queue *queue;
4042
4043 hfsc = hfsc_get__(netdev);
4044 hash = hash_int(queue_id, 0);
4045
4046 queue = tc_find_queue__(netdev, queue_id, hash);
4047 if (queue) {
4048 hcp = hfsc_class_cast__(queue);
4049 } else {
4050 hcp = xmalloc(sizeof *hcp);
4051 queue = &hcp->tc_queue;
4052 queue->queue_id = queue_id;
6dc34a0d 4053 queue->created = time_msec();
a339aa81
EJ
4054 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4055 }
4056
4057 hcp->min_rate = hc->min_rate;
4058 hcp->max_rate = hc->max_rate;
4059}
4060
4061static int
4062hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4063{
4064 const struct tc_service_curve *rsc, *fsc, *usc;
4065 static const struct nl_policy tca_hfsc_policy[] = {
4066 [TCA_HFSC_RSC] = {
4067 .type = NL_A_UNSPEC,
4068 .optional = false,
4069 .min_len = sizeof(struct tc_service_curve),
4070 },
4071 [TCA_HFSC_FSC] = {
4072 .type = NL_A_UNSPEC,
4073 .optional = false,
4074 .min_len = sizeof(struct tc_service_curve),
4075 },
4076 [TCA_HFSC_USC] = {
4077 .type = NL_A_UNSPEC,
4078 .optional = false,
4079 .min_len = sizeof(struct tc_service_curve),
4080 },
4081 };
4082 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4083
4084 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4085 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4086 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4087 return EPROTO;
4088 }
4089
4090 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4091 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4092 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4093
4094 if (rsc->m1 != 0 || rsc->d != 0 ||
4095 fsc->m1 != 0 || fsc->d != 0 ||
4096 usc->m1 != 0 || usc->d != 0) {
4097 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4098 "Non-linear service curves are not supported.");
4099 return EPROTO;
4100 }
4101
4102 if (rsc->m2 != fsc->m2) {
4103 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4104 "Real-time service curves are not supported ");
4105 return EPROTO;
4106 }
4107
4108 if (rsc->m2 > usc->m2) {
4109 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4110 "Min-rate service curve is greater than "
4111 "the max-rate service curve.");
4112 return EPROTO;
4113 }
4114
4115 class->min_rate = fsc->m2;
4116 class->max_rate = usc->m2;
4117 return 0;
4118}
4119
4120static int
4121hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4122 struct hfsc_class *options,
4123 struct netdev_queue_stats *stats)
4124{
4125 int error;
4126 unsigned int handle;
4127 struct nlattr *nl_options;
4128
4129 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4130 if (error) {
4131 return error;
4132 }
4133
4134 if (queue_id) {
4135 unsigned int major, minor;
4136
4137 major = tc_get_major(handle);
4138 minor = tc_get_minor(handle);
4139 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4140 *queue_id = minor - 1;
4141 } else {
4142 return EPROTO;
4143 }
4144 }
4145
4146 if (options) {
4147 error = hfsc_parse_tca_options__(nl_options, options);
4148 }
4149
4150 return error;
4151}
4152
/* Queries the kernel for class 'handle' (child of 'parent') on 'netdev' and
 * parses the reply with hfsc_parse_tcmsg__() into 'options' and 'stats'.
 *
 * Returns 0 on success, otherwise the error from the query or the parse. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
4170
4171static void
73371c09 4172hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4173 struct hfsc_class *class)
4174{
73371c09 4175 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4176
13c1637f 4177 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4178 if (!max_rate) {
a00ca915 4179 enum netdev_features current;
a339aa81 4180
73371c09
BP
4181 netdev_linux_read_features(netdev);
4182 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4183 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4184 }
4185
4186 class->min_rate = max_rate;
4187 class->max_rate = max_rate;
4188}
4189
4190static int
4191hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4192 const struct smap *details,
a339aa81
EJ
4193 struct hfsc_class * class)
4194{
4195 const struct hfsc *hfsc;
4196 uint32_t min_rate, max_rate;
a339aa81
EJ
4197
4198 hfsc = hfsc_get__(netdev);
a339aa81 4199
13c1637f 4200 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4201 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4202 min_rate = MIN(min_rate, hfsc->max_rate);
4203
13c1637f 4204 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4205 max_rate = MAX(max_rate, min_rate);
4206 max_rate = MIN(max_rate, hfsc->max_rate);
4207
4208 class->min_rate = min_rate;
4209 class->max_rate = max_rate;
4210
4211 return 0;
4212}
4213
4214/* Create an HFSC qdisc.
4215 *
4216 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4217static int
4218hfsc_setup_qdisc__(struct netdev * netdev)
4219{
4220 struct tcmsg *tcmsg;
4221 struct ofpbuf request;
4222 struct tc_hfsc_qopt opt;
4223
4224 tc_del_qdisc(netdev);
4225
4226 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4227 NLM_F_EXCL | NLM_F_CREATE, &request);
4228
4229 if (!tcmsg) {
4230 return ENODEV;
4231 }
4232
4233 tcmsg->tcm_handle = tc_make_handle(1, 0);
4234 tcmsg->tcm_parent = TC_H_ROOT;
4235
4236 memset(&opt, 0, sizeof opt);
4237 opt.defcls = 1;
4238
4239 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4240 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4241
4242 return tc_transact(&request, NULL);
4243}
4244
4245/* Create an HFSC class.
4246 *
4247 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4248 * sc rate <min_rate> ul rate <max_rate>" */
4249static int
4250hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4251 unsigned int parent, struct hfsc_class *class)
4252{
4253 int error;
4254 size_t opt_offset;
4255 struct tcmsg *tcmsg;
4256 struct ofpbuf request;
4257 struct tc_service_curve min, max;
4258
4259 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4260
4261 if (!tcmsg) {
4262 return ENODEV;
4263 }
4264
4265 tcmsg->tcm_handle = handle;
4266 tcmsg->tcm_parent = parent;
4267
4268 min.m1 = 0;
4269 min.d = 0;
4270 min.m2 = class->min_rate;
4271
4272 max.m1 = 0;
4273 max.d = 0;
4274 max.m2 = class->max_rate;
4275
4276 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4277 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4278 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4279 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4280 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4281 nl_msg_end_nested(&request, opt_offset);
4282
4283 error = tc_transact(&request, NULL);
4284 if (error) {
4285 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4286 "min-rate %ubps, max-rate %ubps (%s)",
4287 netdev_get_name(netdev),
4288 tc_get_major(handle), tc_get_minor(handle),
4289 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4290 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4291 }
4292
4293 return error;
4294}
4295
4296static int
79f1cbe9 4297hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4298{
4299 int error;
4300 struct hfsc_class class;
4301
4302 error = hfsc_setup_qdisc__(netdev);
4303
4304 if (error) {
4305 return error;
4306 }
4307
4308 hfsc_parse_qdisc_details__(netdev, details, &class);
4309 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4310 tc_make_handle(1, 0), &class);
4311
4312 if (error) {
4313 return error;
4314 }
4315
4316 hfsc_install__(netdev, class.max_rate);
4317 return 0;
4318}
4319
4320static int
4321hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4322{
4323 struct ofpbuf msg;
d57695d7 4324 struct queue_dump_state state;
a339aa81
EJ
4325 struct hfsc_class hc;
4326
4327 hc.max_rate = 0;
4328 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4329 hfsc_install__(netdev, hc.max_rate);
a339aa81 4330
d57695d7 4331 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4332 return ENODEV;
4333 }
4334
d57695d7 4335 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4336 unsigned int queue_id;
4337
4338 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4339 hfsc_update_queue__(netdev, queue_id, &hc);
4340 }
4341 }
4342
d57695d7 4343 finish_queue_dump(&state);
a339aa81
EJ
4344 return 0;
4345}
4346
4347static void
4348hfsc_tc_destroy(struct tc *tc)
4349{
4350 struct hfsc *hfsc;
4351 struct hfsc_class *hc, *next;
4352
4353 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4354
4355 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4356 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4357 free(hc);
4358 }
4359
4360 tc_destroy(tc);
4361 free(hfsc);
4362}
4363
4364static int
79f1cbe9 4365hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4366{
4367 const struct hfsc *hfsc;
4368 hfsc = hfsc_get__(netdev);
79f1cbe9 4369 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4370 return 0;
4371}
4372
4373static int
79f1cbe9 4374hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4375{
4376 int error;
4377 struct hfsc_class class;
4378
4379 hfsc_parse_qdisc_details__(netdev, details, &class);
4380 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4381 tc_make_handle(1, 0), &class);
4382
4383 if (!error) {
4384 hfsc_get__(netdev)->max_rate = class.max_rate;
4385 }
4386
4387 return error;
4388}
4389
4390static int
4391hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4392 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4393{
4394 const struct hfsc_class *hc;
4395
4396 hc = hfsc_class_cast__(queue);
79f1cbe9 4397 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4398 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4399 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4400 }
4401 return 0;
4402}
4403
4404static int
4405hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4406 const struct smap *details)
a339aa81
EJ
4407{
4408 int error;
4409 struct hfsc_class class;
4410
4411 error = hfsc_parse_class_details__(netdev, details, &class);
4412 if (error) {
4413 return error;
4414 }
4415
4416 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4417 tc_make_handle(1, 0xfffe), &class);
4418 if (error) {
4419 return error;
4420 }
4421
4422 hfsc_update_queue__(netdev, queue_id, &class);
4423 return 0;
4424}
4425
4426static int
4427hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4428{
4429 int error;
4430 struct hfsc *hfsc;
4431 struct hfsc_class *hc;
4432
4433 hc = hfsc_class_cast__(queue);
4434 hfsc = hfsc_get__(netdev);
4435
4436 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4437 if (!error) {
4438 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4439 free(hc);
4440 }
4441 return error;
4442}
4443
4444static int
4445hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4446 struct netdev_queue_stats *stats)
4447{
4448 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4449 tc_make_handle(1, 0xfffe), NULL, stats);
4450}
4451
4452static int
4453hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4454 const struct ofpbuf *nlmsg,
4455 netdev_dump_queue_stats_cb *cb, void *aux)
4456{
4457 struct netdev_queue_stats stats;
4458 unsigned int handle, major, minor;
4459 int error;
4460
4461 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4462 if (error) {
4463 return error;
4464 }
4465
4466 major = tc_get_major(handle);
4467 minor = tc_get_minor(handle);
4468 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4469 (*cb)(minor - 1, &stats, aux);
4470 }
4471 return 0;
4472}
4473
/* Operations table for the "linux-hfsc" traffic control class. */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4489\f
6cf888b8
BS
4490/* "linux-noop" traffic control class. */
4491
/* Attaches a shared, statically allocated tc object to 'netdev_'.
 *
 * NOTE(review): the static tc is initialized with &tc_ops_default rather
 * than &tc_ops_noop, so the attached tc reports the default class's ops.
 * This may be intentional or a copy-paste slip from default_install__();
 * confirm against the callers that compare 'netdev->tc->ops'. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* The tc object is const; the cast only satisfies the field's type. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
4500
4501static int
4502noop_tc_install(struct netdev *netdev,
4503 const struct smap *details OVS_UNUSED)
4504{
4505 noop_install__(netdev);
4506 return 0;
4507}
4508
4509static int
4510noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4511{
4512 noop_install__(netdev);
4513 return 0;
4514}
4515
/* Operations table for the "linux-noop" traffic control class.  It has no
 * Linux qdisc name and no queues, and provides only the install and load
 * callbacks. */
static const struct tc_ops tc_ops_noop = {
    NULL,                       /* linux_name */
    "linux-noop",               /* ovs_name */
    0,                          /* n_queues */
    noop_tc_install,            /* tc_install */
    noop_tc_load,               /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4531\f
c1c9c9c4
BP
4532/* "linux-default" traffic control class.
4533 *
4534 * This class represents the default, unnamed Linux qdisc. It corresponds to
4535 * the "" (empty string) QoS type in the OVS database. */
4536
4537static void
b5d57fc8 4538default_install__(struct netdev *netdev_)
c1c9c9c4 4539{
b5d57fc8 4540 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4541 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4542
559eb230
BP
4543 /* Nothing but a tc class implementation is allowed to write to a tc. This
4544 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4545 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4546}
4547
4548static int
4549default_tc_install(struct netdev *netdev,
79f1cbe9 4550 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4551{
4552 default_install__(netdev);
4553 return 0;
4554}
4555
4556static int
4557default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4558{
4559 default_install__(netdev);
4560 return 0;
4561}
4562
/* Operations table for the "linux-default" traffic control class, whose OVS
 * name is the empty string.  It has no Linux qdisc name and no queues, and
 * provides only the install and load callbacks. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4578\f
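/* "linux-other" traffic control class.
 *
 * This class can only be loaded, never installed: its ops table has no
 * tc_install callback, and other_tc_load() merely attaches a shared,
 * constant tc object to the device. */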
4582
4583static int
b5d57fc8 4584other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4585{
b5d57fc8 4586 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4587 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4588
559eb230
BP
4589 /* Nothing but a tc class implementation is allowed to write to a tc. This
4590 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4591 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4592 return 0;
4593}
4594
/* Operations table for the "linux-other" traffic control class.  Only the
 * load callback is provided. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4610\f
4611/* Traffic control. */
4612
/* Number of kernel "tc" ticks per second.  read_psched() resets this to 1.0
 * before it tries to read /proc/net/psched. */
static double ticks_per_s;
4615
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * read_psched() resets this to 100 before it tries to read
 * /proc/net/psched. */
static unsigned int buffer_hz;
4634
/* Returns tc handle 'major':'minor', with 'major' placed in the upper 16
 * bits and combined with 'minor' through TC_H_MAKE(). */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
4641
/* Returns the major number from 'handle', i.e. the part selected by
 * TC_H_MAJ() shifted down by 16 bits. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
4648
/* Returns the minor number from 'handle', i.e. the part selected by
 * TC_H_MIN(). */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4655
4656static struct tcmsg *
4657tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4658 struct ofpbuf *request)
4659{
4660 struct tcmsg *tcmsg;
4661 int ifindex;
4662 int error;
4663
4664 error = get_ifindex(netdev, &ifindex);
4665 if (error) {
4666 return NULL;
4667 }
4668
4669 ofpbuf_init(request, 512);
4670 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4671 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4672 tcmsg->tcm_family = AF_UNSPEC;
4673 tcmsg->tcm_ifindex = ifindex;
4674 /* Caller should fill in tcmsg->tcm_handle. */
4675 /* Caller should fill in tcmsg->tcm_parent. */
4676
4677 return tcmsg;
4678}
4679
4680static int
4681tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4682{
a88b4e04 4683 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
4684 ofpbuf_uninit(request);
4685 return error;
4686}
4687
f8500004
JP
4688/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4689 * policing configuration.
4690 *
4691 * This function is equivalent to running the following when 'add' is true:
4692 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4693 *
4694 * This function is equivalent to running the following when 'add' is false:
4695 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4696 *
4697 * The configuration and stats may be seen with the following command:
4698 * /sbin/tc -s qdisc show dev <devname>
4699 *
4700 * Returns 0 if successful, otherwise a positive errno value.
4701 */
4702static int
4703tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4704{
4705 struct ofpbuf request;
4706 struct tcmsg *tcmsg;
4707 int error;
4708 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4709 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4710
4711 tcmsg = tc_make_request(netdev, type, flags, &request);
4712 if (!tcmsg) {
4713 return ENODEV;
4714 }
4715 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4716 tcmsg->tcm_parent = TC_H_INGRESS;
4717 nl_msg_put_string(&request, TCA_KIND, "ingress");
4718 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4719
4720 error = tc_transact(&request, NULL);
4721 if (error) {
4722 /* If we're deleting the qdisc, don't worry about some of the
4723 * error conditions. */
4724 if (!add && (error == ENOENT || error == EINVAL)) {
4725 return 0;
4726 }
4727 return error;
4728 }
4729
4730 return 0;
4731}
4732
4733/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4734 * of 'kbits_burst'.
4735 *
4736 * This function is equivalent to running:
4737 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4738 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4739 * mtu 65535 drop
4740 *
4741 * The configuration and stats may be seen with the following command:
c7952afb 4742 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4743 *
4744 * Returns 0 if successful, otherwise a positive errno value.
4745 */
4746static int
c7952afb
BP
4747tc_add_policer(struct netdev *netdev,
4748 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4749{
4750 struct tc_police tc_police;
4751 struct ofpbuf request;
4752 struct tcmsg *tcmsg;
4753 size_t basic_offset;
4754 size_t police_offset;
4755 int error;
4756 int mtu = 65535;
4757
4758 memset(&tc_police, 0, sizeof tc_police);
4759 tc_police.action = TC_POLICE_SHOT;
4760 tc_police.mtu = mtu;
1aca400c 4761 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4762
79abacc8
MAA
4763 /* The following appears wrong in one way: In networking a kilobit is
4764 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4765 *
4766 * However if you "fix" those problems then "tc filter show ..." shows
4767 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4768 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4769 * tc's point of view. Whatever. */
4770 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4771 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004
JP
4772
4773 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4774 NLM_F_EXCL | NLM_F_CREATE, &request);
4775 if (!tcmsg) {
4776 return ENODEV;
4777 }
4778 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4779 tcmsg->tcm_info = tc_make_handle(49,
4780 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4781
4782 nl_msg_put_string(&request, TCA_KIND, "basic");
4783 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4784 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4785 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4786 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4787 nl_msg_end_nested(&request, police_offset);
4788 nl_msg_end_nested(&request, basic_offset);
4789
4790 error = tc_transact(&request, NULL);
4791 if (error) {
4792 return error;
4793 }
4794
4795 return 0;
4796}
4797
c1c9c9c4
BP
4798static void
4799read_psched(void)
4800{
4801 /* The values in psched are not individually very meaningful, but they are
4802 * important. The tables below show some values seen in the wild.
4803 *
4804 * Some notes:
4805 *
4806 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4807 * (Before that, there are hints that it was 1000000000.)
4808 *
4809 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4810 * above.
4811 *
4812 * /proc/net/psched
4813 * -----------------------------------
4814 * [1] 000c8000 000f4240 000f4240 00000064
4815 * [2] 000003e8 00000400 000f4240 3b9aca00
4816 * [3] 000003e8 00000400 000f4240 3b9aca00
4817 * [4] 000003e8 00000400 000f4240 00000064
4818 * [5] 000003e8 00000040 000f4240 3b9aca00
4819 * [6] 000003e8 00000040 000f4240 000000f9
4820 *
4821 * a b c d ticks_per_s buffer_hz
4822 * ------- --------- ---------- ------------- ----------- -------------
4823 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4824 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4825 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4826 * [4] 1,000 1,024 1,000,000 100 976,562 100
4827 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4828 * [6] 1,000 64 1,000,000 249 15,625,000 249
4829 *
4830 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4831 * [2] 2.6.26-1-686-bigmem from Debian lenny
4832 * [3] 2.6.26-2-sparc64 from Debian lenny
4833 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4834 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4835 * [6] 2.6.34 from kernel.org on KVM
4836 */
23882115 4837 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4838 static const char fn[] = "/proc/net/psched";
4839 unsigned int a, b, c, d;
4840 FILE *stream;
4841
23882115
BP
4842 if (!ovsthread_once_start(&once)) {
4843 return;
4844 }
4845
c1c9c9c4
BP
4846 ticks_per_s = 1.0;
4847 buffer_hz = 100;
4848
4849 stream = fopen(fn, "r");
4850 if (!stream) {
10a89ef0 4851 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4852 goto exit;
c1c9c9c4
BP
4853 }
4854
4855 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4856 VLOG_WARN("%s: read failed", fn);
4857 fclose(stream);
23882115 4858 goto exit;
c1c9c9c4
BP
4859 }
4860 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4861 fclose(stream);
4862
4863 if (!a || !c) {
4864 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4865 goto exit;
c1c9c9c4
BP
4866 }
4867
4868 ticks_per_s = (double) a * c / b;
4869 if (c == 1000000) {
4870 buffer_hz = d;
4871 } else {
4872 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4873 fn, a, b, c, d);
4874 }
4875 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4876
4877exit:
4878 ovsthread_once_done(&once);
c1c9c9c4
BP
4879}
4880
4881/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4882 * rate of 'rate' bytes per second. */
4883static unsigned int
4884tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4885{
23882115 4886 read_psched();
c1c9c9c4
BP
4887 return (rate * ticks) / ticks_per_s;
4888}
4889
4890/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4891 * rate of 'rate' bytes per second. */
4892static unsigned int
4893tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4894{
23882115 4895 read_psched();
015c93a4 4896 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4897}
4898
4899/* Returns the number of bytes that need to be reserved for qdisc buffering at
4900 * a transmission rate of 'rate' bytes per second. */
4901static unsigned int
4902tc_buffer_per_jiffy(unsigned int rate)
4903{
23882115 4904 read_psched();
c1c9c9c4
BP
4905 return rate / buffer_hz;
4906}
4907
4908/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4909 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4910 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4911 * stores NULL into it if it is absent.
4912 *
4913 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4914 * 'msg'.
4915 *
4916 * Returns 0 if successful, otherwise a positive errno value. */
4917static int
4918tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4919 struct nlattr **options)
4920{
4921 static const struct nl_policy tca_policy[] = {
4922 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4923 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4924 };
4925 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4926
4927 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4928 tca_policy, ta, ARRAY_SIZE(ta))) {
4929 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4930 goto error;
4931 }
4932
4933 if (kind) {
4934 *kind = nl_attr_get_string(ta[TCA_KIND]);
4935 }
4936
4937 if (options) {
4938 *options = ta[TCA_OPTIONS];
4939 }
4940
4941 return 0;
4942
4943error:
4944 if (kind) {
4945 *kind = NULL;
4946 }
4947 if (options) {
4948 *options = NULL;
4949 }
4950 return EPROTO;
4951}
4952
4953/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4954 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4955 * into '*options', and its queue statistics into '*stats'. Any of the output
4956 * arguments may be null.
4957 *
4958 * Returns 0 if successful, otherwise a positive errno value. */
4959static int
4960tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4961 struct nlattr **options, struct netdev_queue_stats *stats)
4962{
4963 static const struct nl_policy tca_policy[] = {
4964 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4965 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4966 };
4967 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4968
4969 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4970 tca_policy, ta, ARRAY_SIZE(ta))) {
4971 VLOG_WARN_RL(&rl, "failed to parse class message");
4972 goto error;
4973 }
4974
4975 if (handlep) {
4976 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4977 *handlep = tc->tcm_handle;
4978 }
4979
4980 if (options) {
4981 *options = ta[TCA_OPTIONS];
4982 }
4983
4984 if (stats) {
4985 const struct gnet_stats_queue *gsq;
4986 struct gnet_stats_basic gsb;
4987
4988 static const struct nl_policy stats_policy[] = {
4989 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4990 .min_len = sizeof gsb },
4991 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4992 .min_len = sizeof *gsq },
4993 };
4994 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4995
4996 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4997 sa, ARRAY_SIZE(sa))) {
4998 VLOG_WARN_RL(&rl, "failed to parse class stats");
4999 goto error;
5000 }
5001
5002 /* Alignment issues screw up the length of struct gnet_stats_basic on
5003 * some arch/bitsize combinations. Newer versions of Linux have a
5004 * struct gnet_stats_basic_packed, but we can't depend on that. The
5005 * easiest thing to do is just to make a copy. */
5006 memset(&gsb, 0, sizeof gsb);
5007 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5008 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5009 stats->tx_bytes = gsb.bytes;
5010 stats->tx_packets = gsb.packets;
5011
5012 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5013 stats->tx_errors = gsq->drops;
5014 }
5015
5016 return 0;
5017
5018error:
5019 if (options) {
5020 *options = NULL;
5021 }
5022 if (stats) {
5023 memset(stats, 0, sizeof *stats);
5024 }
5025 return EPROTO;
5026}
5027
5028/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5029 * on 'netdev'. */
5030static int
5031tc_query_class(const struct netdev *netdev,
5032 unsigned int handle, unsigned int parent,
5033 struct ofpbuf **replyp)
5034{
5035 struct ofpbuf request;
5036 struct tcmsg *tcmsg;
5037 int error;
5038
5039 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
5040 if (!tcmsg) {
5041 return ENODEV;
5042 }
c1c9c9c4
BP
5043 tcmsg->tcm_handle = handle;
5044 tcmsg->tcm_parent = parent;
5045
5046 error = tc_transact(&request, replyp);
5047 if (error) {
5048 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5049 netdev_get_name(netdev),
5050 tc_get_major(handle), tc_get_minor(handle),
5051 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5052 ovs_strerror(error));
c1c9c9c4
BP
5053 }
5054 return error;
5055}
5056
5057/* Equivalent to "tc class del dev <name> handle <handle>". */
5058static int
5059tc_delete_class(const struct netdev *netdev, unsigned int handle)
5060{
5061 struct ofpbuf request;
5062 struct tcmsg *tcmsg;
5063 int error;
5064
5065 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5066 if (!tcmsg) {
5067 return ENODEV;
5068 }
c1c9c9c4
BP
5069 tcmsg->tcm_handle = handle;
5070 tcmsg->tcm_parent = 0;
5071
5072 error = tc_transact(&request, NULL);
5073 if (error) {
5074 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5075 netdev_get_name(netdev),
5076 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5077 ovs_strerror(error));
c1c9c9c4
BP
5078 }
5079 return error;
5080}
5081
5082/* Equivalent to "tc qdisc del dev <name> root". */
5083static int
b5d57fc8 5084tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5085{
b5d57fc8 5086 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5087 struct ofpbuf request;
5088 struct tcmsg *tcmsg;
5089 int error;
5090
b5d57fc8 5091 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5092 if (!tcmsg) {
5093 return ENODEV;
5094 }
c1c9c9c4
BP
5095 tcmsg->tcm_handle = tc_make_handle(1, 0);
5096 tcmsg->tcm_parent = TC_H_ROOT;
5097
5098 error = tc_transact(&request, NULL);
5099 if (error == EINVAL) {
5100 /* EINVAL probably means that the default qdisc was in use, in which
5101 * case we've accomplished our purpose. */
5102 error = 0;
5103 }
b5d57fc8
BP
5104 if (!error && netdev->tc) {
5105 if (netdev->tc->ops->tc_destroy) {
5106 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5107 }
b5d57fc8 5108 netdev->tc = NULL;
c1c9c9c4
BP
5109 }
5110 return error;
5111}
5112
ac3e3aaa
BP
5113static bool
5114getqdisc_is_safe(void)
5115{
5116 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5117 static bool safe = false;
5118
5119 if (ovsthread_once_start(&once)) {
5120 struct utsname utsname;
5121 int major, minor;
5122
5123 if (uname(&utsname) == -1) {
5124 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5125 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5126 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5127 } else if (major < 2 || (major == 2 && minor < 35)) {
5128 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5129 utsname.release);
5130 } else {
5131 safe = true;
5132 }
5133 ovsthread_once_done(&once);
5134 }
5135 return safe;
5136}
5137
c1c9c9c4
BP
5138/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5139 * kernel to determine what they are. Returns 0 if successful, otherwise a
5140 * positive errno value. */
5141static int
b5d57fc8 5142tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5143{
b5d57fc8 5144 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5145 struct ofpbuf request, *qdisc;
5146 const struct tc_ops *ops;
5147 struct tcmsg *tcmsg;
5148 int load_error;
5149 int error;
5150
b5d57fc8 5151 if (netdev->tc) {
c1c9c9c4
BP
5152 return 0;
5153 }
5154
5155 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5156 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5157 * 2.6.35 without that fix backported to it.
5158 *
5159 * To avoid the OOPS, we must not make a request that would attempt to dump
5160 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5161 * few others. There are a few ways that I can see to do this, but most of
5162 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5163 * technique chosen here is to assume that any non-default qdisc that we
5164 * create will have a class with handle 1:0. The built-in qdiscs only have
5165 * a class with handle 0:0.
5166 *
ac3e3aaa
BP
5167 * On Linux 2.6.35+ we use the straightforward method because it allows us
5168 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5169 * in such a case we get no response at all from the kernel (!) if a
5170 * builtin qdisc is in use (which is later caught by "!error &&
5171 * !qdisc->size"). */
b5d57fc8 5172 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
5173 if (!tcmsg) {
5174 return ENODEV;
5175 }
ac3e3aaa
BP
5176 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5177 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5178
5179 /* Figure out what tc class to instantiate. */
5180 error = tc_transact(&request, &qdisc);
ac3e3aaa 5181 if (!error && qdisc->size) {
c1c9c9c4
BP
5182 const char *kind;
5183
5184 error = tc_parse_qdisc(qdisc, &kind, NULL);
5185 if (error) {
5186 ops = &tc_ops_other;
5187 } else {
5188 ops = tc_lookup_linux_name(kind);
5189 if (!ops) {
5190 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5191 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5192
5193 ops = &tc_ops_other;
5194 }
5195 }
ac3e3aaa
BP
5196 } else if ((!error && !qdisc->size) || error == ENOENT) {
5197 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5198 * set up by some other entity that doesn't have a handle 1:0. We will
5199 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5200 ops = &tc_ops_default;
5201 error = 0;
5202 } else {
5203 /* Who knows? Maybe the device got deleted. */
5204 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5205 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5206 ops = &tc_ops_other;
5207 }
5208
5209 /* Instantiate it. */
b5d57fc8
BP
5210 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5211 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5212 ofpbuf_delete(qdisc);
5213
5214 return error ? error : load_error;
5215}
5216
5217/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5218 approximate the time to transmit packets of various lengths. For an MTU of
5219 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5220 represents two possible packet lengths; for a MTU of 513 through 1024, four
5221 possible lengths; and so on.
5222
5223 Returns, for the specified 'mtu', the number of bits that packet lengths
5224 need to be shifted right to fit within such a 256-entry table. */
5225static int
5226tc_calc_cell_log(unsigned int mtu)
5227{
5228 int cell_log;
5229
5230 if (!mtu) {
5231 mtu = ETH_PAYLOAD_MAX;
5232 }
5233 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5234
5235 for (cell_log = 0; mtu >= 256; cell_log++) {
5236 mtu >>= 1;
5237 }
5238
5239 return cell_log;
5240}
5241
5242/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5243 * of 'mtu'. */
5244static void
5245tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5246{
5247 memset(rate, 0, sizeof *rate);
5248 rate->cell_log = tc_calc_cell_log(mtu);
5249 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5250 /* rate->cell_align = 0; */ /* distro headers. */
5251 rate->mpu = ETH_TOTAL_MIN;
5252 rate->rate = Bps;
5253}
5254
5255/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5256 * attribute of the specified "type".
5257 *
5258 * See tc_calc_cell_log() above for a description of "rtab"s. */
5259static void
5260tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5261{
5262 uint32_t *rtab;
5263 unsigned int i;
5264
5265 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5266 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5267 unsigned packet_size = (i + 1) << rate->cell_log;
5268 if (packet_size < rate->mpu) {
5269 packet_size = rate->mpu;
5270 }
5271 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5272 }
5273}
5274
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  The burst actually used is never smaller than
 * tc_buffer_per_jiffy(Bps) + mtu, so a 'burst_bytes' of 0 is fine when no
 * value was requested. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst < floor_bytes) {
        burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 5285\f
aaf2fb1a
BP
5286/* Linux-only functions declared in netdev-linux.h */
5287
5288/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5289 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5290int
5291netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5292 const char *flag_name, bool enable)
5293{
5294 const char *netdev_name = netdev_get_name(netdev);
5295 struct ethtool_value evalue;
5296 uint32_t new_flags;
5297 int error;
5298
ab985a77 5299 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5300 memset(&evalue, 0, sizeof evalue);
5301 error = netdev_linux_do_ethtool(netdev_name,
5302 (struct ethtool_cmd *)&evalue,
5303 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5304 if (error) {
5305 return error;
5306 }
5307
ab985a77 5308 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5309 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5310 if (new_flags == evalue.data) {
5311 return 0;
5312 }
5313 evalue.data = new_flags;
aaf2fb1a
BP
5314 error = netdev_linux_do_ethtool(netdev_name,
5315 (struct ethtool_cmd *)&evalue,
5316 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5317 if (error) {
5318 return error;
5319 }
5320
ab985a77 5321 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5322 memset(&evalue, 0, sizeof evalue);
5323 error = netdev_linux_do_ethtool(netdev_name,
5324 (struct ethtool_cmd *)&evalue,
5325 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5326 if (error) {
5327 return error;
5328 }
5329
5330 if (new_flags != evalue.data) {
5331 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5332 "device %s failed", enable ? "enable" : "disable",
5333 flag_name, netdev_name);
5334 return EOPNOTSUPP;
5335 }
5336
5337 return 0;
5338}
5339\f
5340/* Utility functions. */
5341
d3980822 5342/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5343static void
d3980822
BP
5344netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5345 const struct rtnl_link_stats *src)
5346{
f613a0d7
PS
5347 dst->rx_packets = src->rx_packets;
5348 dst->tx_packets = src->tx_packets;
5349 dst->rx_bytes = src->rx_bytes;
5350 dst->tx_bytes = src->tx_bytes;
5351 dst->rx_errors = src->rx_errors;
5352 dst->tx_errors = src->tx_errors;
5353 dst->rx_dropped = src->rx_dropped;
5354 dst->tx_dropped = src->tx_dropped;
5355 dst->multicast = src->multicast;
5356 dst->collisions = src->collisions;
5357 dst->rx_length_errors = src->rx_length_errors;
5358 dst->rx_over_errors = src->rx_over_errors;
5359 dst->rx_crc_errors = src->rx_crc_errors;
5360 dst->rx_frame_errors = src->rx_frame_errors;
5361 dst->rx_fifo_errors = src->rx_fifo_errors;
5362 dst->rx_missed_errors = src->rx_missed_errors;
5363 dst->tx_aborted_errors = src->tx_aborted_errors;
5364 dst->tx_carrier_errors = src->tx_carrier_errors;
5365 dst->tx_fifo_errors = src->tx_fifo_errors;
5366 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5367 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5368}
5369
337c9b99
BP
5370/* Copies 'src' into 'dst', performing format conversion in the process. */
5371static void
5372netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5373 const struct rtnl_link_stats64 *src)
5374{
5375 dst->rx_packets = src->rx_packets;
5376 dst->tx_packets = src->tx_packets;
5377 dst->rx_bytes = src->rx_bytes;
5378 dst->tx_bytes = src->tx_bytes;
5379 dst->rx_errors = src->rx_errors;
5380 dst->tx_errors = src->tx_errors;
5381 dst->rx_dropped = src->rx_dropped;
5382 dst->tx_dropped = src->tx_dropped;
5383 dst->multicast = src->multicast;
5384 dst->collisions = src->collisions;
5385 dst->rx_length_errors = src->rx_length_errors;
5386 dst->rx_over_errors = src->rx_over_errors;
5387 dst->rx_crc_errors = src->rx_crc_errors;
5388 dst->rx_frame_errors = src->rx_frame_errors;
5389 dst->rx_fifo_errors = src->rx_fifo_errors;
5390 dst->rx_missed_errors = src->rx_missed_errors;
5391 dst->tx_aborted_errors = src->tx_aborted_errors;
5392 dst->tx_carrier_errors = src->tx_carrier_errors;
5393 dst->tx_fifo_errors = src->tx_fifo_errors;
5394 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5395 dst->tx_window_errors = src->tx_window_errors;
5396}
5397
c1c9c9c4 5398static int
35eef899 5399get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5400{
c1c9c9c4
BP
5401 struct ofpbuf request;
5402 struct ofpbuf *reply;
c1c9c9c4
BP
5403 int error;
5404
d6e3feb5 5405 /* Filtering all counters by default */
5406 memset(stats, 0xFF, sizeof(struct netdev_stats));
5407
c1c9c9c4 5408 ofpbuf_init(&request, 0);
13a24df8
BP
5409 nl_msg_put_nlmsghdr(&request,
5410 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5411 RTM_GETLINK, NLM_F_REQUEST);
5412 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5413 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5414 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5415 ofpbuf_uninit(&request);
5416 if (error) {
5417 return error;
5418 }
5419
13a24df8 5420 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5421 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5422 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5423 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5424 error = 0;
5425 } else {
337c9b99
BP
5426 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5427 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5428 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5429 error = 0;
5430 } else {
5431 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5432 error = EPROTO;
5433 }
13a24df8
BP
5434 }
5435 } else {
5436 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5437 error = EPROTO;
c1c9c9c4 5438 }
8b61709d 5439
8b61709d 5440
576e26d7 5441 ofpbuf_delete(reply);
35eef899 5442 return error;
8b61709d 5443}
c1c9c9c4 5444
3a183124 5445static int
b5d57fc8 5446get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5447{
5448 struct ifreq ifr;
5449 int error;
5450
755be9ea 5451 *flags = 0;
259e0b1a 5452 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5453 if (!error) {
5454 *flags = ifr.ifr_flags;
5455 }
8b61709d
BP
5456 return error;
5457}
5458
5459static int
4b609110 5460set_flags(const char *name, unsigned int flags)
8b61709d
BP
5461{
5462 struct ifreq ifr;
5463
5464 ifr.ifr_flags = flags;
259e0b1a 5465 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5466}
5467
5468static int
5469do_get_ifindex(const char *netdev_name)
5470{
5471 struct ifreq ifr;
259e0b1a 5472 int error;
8b61709d 5473
71d7c22f 5474 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5475 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5476
5477 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5478 if (error) {
8b61709d 5479 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
5480 netdev_name, ovs_strerror(error));
5481 return -error;
8b61709d
BP
5482 }
5483 return ifr.ifr_ifindex;
5484}
5485
5486static int
5487get_ifindex(const struct netdev *netdev_, int *ifindexp)
5488{
b5d57fc8 5489 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5490
b5d57fc8 5491 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 5492 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5493
8b61709d 5494 if (ifindex < 0) {
b5d57fc8
BP
5495 netdev->get_ifindex_error = -ifindex;
5496 netdev->ifindex = 0;
c7b1b0a5 5497 } else {
b5d57fc8
BP
5498 netdev->get_ifindex_error = 0;
5499 netdev->ifindex = ifindex;
8b61709d 5500 }
b5d57fc8 5501 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5502 }
c7b1b0a5 5503
b5d57fc8
BP
5504 *ifindexp = netdev->ifindex;
5505 return netdev->get_ifindex_error;
8b61709d
BP
5506}
5507
5508static int
74ff3298 5509get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5510{
5511 struct ifreq ifr;
5512 int hwaddr_family;
259e0b1a 5513 int error;
8b61709d
BP
5514
5515 memset(&ifr, 0, sizeof ifr);
71d7c22f 5516 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5517 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5518 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5519 if (error) {
78857dfb
BP
5520 /* ENODEV probably means that a vif disappeared asynchronously and
5521 * hasn't been removed from the database yet, so reduce the log level
5522 * to INFO for that case. */
259e0b1a 5523 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5524 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5525 netdev_name, ovs_strerror(error));
5526 return error;
8b61709d
BP
5527 }
5528 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
5529 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5530 hwaddr_family != ARPHRD_NONE) {
c9697f35 5531 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5532 netdev_name, hwaddr_family);
c9697f35 5533 return EINVAL;
8b61709d
BP
5534 }
5535 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5536 return 0;
5537}
5538
5539static int
74ff3298 5540set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5541{
5542 struct ifreq ifr;
259e0b1a 5543 int error;
8b61709d
BP
5544
5545 memset(&ifr, 0, sizeof ifr);
71d7c22f 5546 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5547 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5548 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5549 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5550 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5551 if (error) {
8b61709d 5552 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5553 netdev_name, ovs_strerror(error));
8b61709d 5554 }
259e0b1a 5555 return error;
8b61709d
BP
5556}
5557
5558static int
0b0544d7 5559netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5560 int cmd, const char *cmd_name)
5561{
5562 struct ifreq ifr;
259e0b1a 5563 int error;
8b61709d
BP
5564
5565 memset(&ifr, 0, sizeof ifr);
71d7c22f 5566 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5567 ifr.ifr_data = (caddr_t) ecmd;
5568
5569 ecmd->cmd = cmd;
259e0b1a
BP
5570 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5571 if (error) {
5572 if (error != EOPNOTSUPP) {
8b61709d 5573 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5574 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5575 } else {
5576 /* The device doesn't support this operation. That's pretty
5577 * common, so there's no point in logging anything. */
5578 }
8b61709d 5579 }
259e0b1a 5580 return error;
8b61709d 5581}
f1acd62b 5582
488d734d
BP
5583/* Returns an AF_PACKET raw socket or a negative errno value. */
5584static int
5585af_packet_sock(void)
5586{
23882115
BP
5587 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5588 static int sock;
488d734d 5589
23882115 5590 if (ovsthread_once_start(&once)) {
488d734d
BP
5591 sock = socket(AF_PACKET, SOCK_RAW, 0);
5592 if (sock >= 0) {
8450059e
BP
5593 int error = set_nonblocking(sock);
5594 if (error) {
5595 close(sock);
5596 sock = -error;
5597 }
488d734d
BP
5598 } else {
5599 sock = -errno;
10a89ef0
BP
5600 VLOG_ERR("failed to create packet socket: %s",
5601 ovs_strerror(errno));
488d734d 5602 }
23882115 5603 ovsthread_once_done(&once);
488d734d
BP
5604 }
5605
5606 return sock;
5607}