]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
ovs-tc: allow offloading of ingress mirred TC actions to datapath
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
b2befd5b
BP
23#include <sys/types.h>
24#include <netinet/in.h>
55bc98d6 25#include <arpa/inet.h>
8b61709d 26#include <inttypes.h>
2f564bb1 27#include <math.h>
32383c3b 28#include <linux/filter.h>
c1c9c9c4 29#include <linux/gen_stats.h>
bb7d0e22 30#include <linux/if_ether.h>
8b61709d
BP
31#include <linux/if_tun.h>
32#include <linux/types.h>
33#include <linux/ethtool.h>
63331829 34#include <linux/mii.h>
ef3767f5 35#include <linux/rtnetlink.h>
8b61709d 36#include <linux/sockios.h>
8b61709d
BP
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
8b61709d 43#include <net/route.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
e14deea0 50#include "dp-packet.h"
93451a0a 51#include "dpif-netlink.h"
df1e5a3b 52#include "dpif-netdev.h"
3e8a2ad1 53#include "openvswitch/dynamic-string.h"
8b61709d 54#include "fatal-signal.h"
93b13be8 55#include "hash.h"
ee89ea7b 56#include "openvswitch/hmap.h"
8b61709d 57#include "netdev-provider.h"
18ebd48c 58#include "netdev-tc-offloads.h"
7fbef77a 59#include "netdev-vport.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
c060c4cf 62#include "netlink.h"
bfda5239 63#include "netnsid.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d 67#include "packets.h"
fd016ae3 68#include "openvswitch/poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
ee89ea7b 70#include "openvswitch/shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
c1c5c723 73#include "tc.h"
1670c579 74#include "timer.h"
c060c4cf 75#include "unaligned.h"
e6211adc 76#include "openvswitch/vlog.h"
ee89ea7b 77#include "util.h"
5136ce49 78
d98e6007 79VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 80
d76f09ea
BP
81COVERAGE_DEFINE(netdev_set_policing);
82COVERAGE_DEFINE(netdev_arp_lookup);
83COVERAGE_DEFINE(netdev_get_ifindex);
84COVERAGE_DEFINE(netdev_get_hwaddr);
85COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
86COVERAGE_DEFINE(netdev_get_ethtool);
87COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 88
8b61709d 89\f
756819dd
FL
90#ifndef IFLA_IF_NETNSID
91#define IFLA_IF_NETNSID 0x45
92#endif
8b61709d
BP
93/* These were introduced in Linux 2.6.14, so they might be missing if we have
94 * old headers. */
95#ifndef ADVERTISED_Pause
96#define ADVERTISED_Pause (1 << 13)
97#endif
98#ifndef ADVERTISED_Asym_Pause
99#define ADVERTISED_Asym_Pause (1 << 14)
100#endif
101
e47bd51a
JP
102/* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104#ifndef ETHTOOL_GFLAGS
105#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
106#endif
107#ifndef ETHTOOL_SFLAGS
108#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
109#endif
110
c1c9c9c4
BP
111/* This was introduced in Linux 2.6.25, so it might be missing if we have old
112 * headers. */
113#ifndef TC_RTAB_SIZE
114#define TC_RTAB_SIZE 1024
115#endif
116
e7f6ba22
PJV
117#ifndef TCM_IFINDEX_MAGIC_BLOCK
118#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
119#endif
120
b73c8518
SH
121/* Linux 2.6.21 introduced struct tpacket_auxdata.
122 * Linux 2.6.27 added the tp_vlan_tci member.
123 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
124 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
125 * TP_STATUS_VLAN_TPID_VALID.
126 *
127 * With all this churn it's easiest to unconditionally define a replacement
128 * structure that has everything we want.
129 */
55bc98d6
BP
130#ifndef PACKET_AUXDATA
131#define PACKET_AUXDATA 8
132#endif
b73c8518
SH
133#ifndef TP_STATUS_VLAN_VALID
134#define TP_STATUS_VLAN_VALID (1 << 4)
135#endif
136#ifndef TP_STATUS_VLAN_TPID_VALID
137#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
138#endif
/* Local replacement for the kernel's struct tpacket_auxdata.  Any existing
 * macro of that name is dropped and the tag is redirected to a private
 * "rpl_" name, so this definition is used instead of the system header's. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* Member added in Linux 2.6.27. */
    uint16_t tp_vlan_tpid;      /* Was a padding member before Linux 3.13. */
};
150
0c615356
SH
/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed.
 *
 * Returns the speed recorded in 'ep', with 'ep->speed_hi' placed in the 16
 * bits above 'ep->speed'.
 *
 * 'speed_hi' is widened to uint32_t before shifting: left to the default
 * integer promotion it would be shifted as a signed int, and any value with
 * its top bit set would overflow into the sign bit (undefined behavior). */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return ((uint32_t) ep->speed_hi << 16) | ep->speed;
}
162
67bed84c
SH
163/* Linux 2.6.30 introduced supported and advertised flags for
164 * 1G base KX, and 10G base KX4, KR and R. */
165#ifndef SUPPORTED_1000baseKX_Full
166#define SUPPORTED_1000baseKX_Full (1 << 17)
167#define SUPPORTED_10000baseKX4_Full (1 << 18)
168#define SUPPORTED_10000baseKR_Full (1 << 19)
169#define SUPPORTED_10000baseR_FEC (1 << 20)
170#define ADVERTISED_1000baseKX_Full (1 << 17)
171#define ADVERTISED_10000baseKX4_Full (1 << 18)
172#define ADVERTISED_10000baseKR_Full (1 << 19)
173#define ADVERTISED_10000baseR_FEC (1 << 20)
174#endif
175
176/* Linux 3.5 introduced supported and advertised flags for
177 * 40G base KR4, CR4, SR4 and LR4. */
178#ifndef SUPPORTED_40000baseKR4_Full
179#define SUPPORTED_40000baseKR4_Full (1 << 23)
180#define SUPPORTED_40000baseCR4_Full (1 << 24)
181#define SUPPORTED_40000baseSR4_Full (1 << 25)
182#define SUPPORTED_40000baseLR4_Full (1 << 26)
183#define ADVERTISED_40000baseKR4_Full (1 << 23)
184#define ADVERTISED_40000baseCR4_Full (1 << 24)
185#define ADVERTISED_40000baseSR4_Full (1 << 25)
186#define ADVERTISED_40000baseLR4_Full (1 << 26)
187#endif
188
fa373af4
BP
189/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
190 *
191 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
192 * 2.6.32-431.29.2.el6.x86_64 (see report at
8a7903c6
JP
193 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
194 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
fa373af4
BP
195 * unconditionally define a replacement. */
196#ifndef IFLA_STATS64
337c9b99 197#define IFLA_STATS64 23
fa373af4
BP
198#endif
/* Private replacement for struct rtnl_link_stats64; the tag is redirected to
 * an "rpl_" name so that it cannot clash with a system definition. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* rx_*_errors counters. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* tx_*_errors counters. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* *_compressed counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 228
/* Bits for 'cache_valid' in struct netdev_linux.  Each bit, when set, marks
 * the corresponding cached member(s) of the device as valid. */
enum {
    VALID_IFINDEX           = 0x01,
    VALID_ETHERADDR         = 0x02,
    VALID_IN                = 0x04,
    VALID_MTU               = 0x08,
    VALID_POLICING          = 0x10,
    VALID_VPORT_STAT_ERROR  = 0x20,
    VALID_DRVINFO           = 0x40,
    VALID_FEATURES          = 0x80,
};
c1c9c9c4 239\f
d22f8927
JH
240struct linux_lag_slave {
241 uint32_t block_id;
242 struct shash_node *node;
243};
244
245/* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
246static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
247
248/* All slaves whose LAG masters are network devices in OvS. */
249static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
250 = SHASH_INITIALIZER(&lag_shash);
251
c1c9c9c4
BP
252/* Traffic control. */
253
254/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
255 * network device.
256 *
257 * Each TC implementation subclasses this with whatever additional data it
258 * needs. */
c1c9c9c4
BP
259struct tc {
260 const struct tc_ops *ops;
93b13be8
BP
261 struct hmap queues; /* Contains "struct tc_queue"s.
262 * Read by generic TC layer.
263 * Written only by TC implementation. */
264};
c1c9c9c4 265
559eb230
BP
266#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
267
93b13be8
BP
268/* One traffic control queue.
269 *
270 * Each TC implementation subclasses this with whatever additional data it
271 * needs. */
272struct tc_queue {
273 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
274 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 275 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
276};
277
278/* A particular kind of traffic control. Each implementation generally maps to
279 * one particular Linux qdisc class.
280 *
281 * The functions below return 0 if successful or a positive errno value on
282 * failure, except where otherwise noted. All of them must be provided, except
283 * where otherwise noted. */
284struct tc_ops {
285 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
286 * This is null for tc_ops_default and tc_ops_other, for which there are no
287 * appropriate values. */
288 const char *linux_name;
289
290 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
291 const char *ovs_name;
292
293 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
294 * queues. The queues are numbered 0 through n_queues - 1. */
295 unsigned int n_queues;
296
297 /* Called to install this TC class on 'netdev'. The implementation should
298 * make the Netlink calls required to set up 'netdev' with the right qdisc
299 * and configure it according to 'details'. The implementation may assume
300 * that the current qdisc is the default; that is, there is no need for it
301 * to delete the current qdisc before installing itself.
302 *
303 * The contents of 'details' should be documented as valid for 'ovs_name'
304 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
305 * (which is built as ovs-vswitchd.conf.db(8)).
306 *
307 * This function must return 0 if and only if it sets 'netdev->tc' to an
308 * initialized 'struct tc'.
309 *
310 * (This function is null for tc_ops_other, which cannot be installed. For
311 * other TC classes it should always be nonnull.) */
79f1cbe9 312 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
313
314 /* Called when the netdev code determines (through a Netlink query) that
315 * this TC class's qdisc is installed on 'netdev', but we didn't install
316 * it ourselves and so don't know any of the details.
317 *
318 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
319 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
320 * implementation should parse the other attributes of 'nlmsg' as
321 * necessary to determine its configuration. If necessary it should also
322 * use Netlink queries to determine the configuration of queues on
323 * 'netdev'.
324 *
325 * This function must return 0 if and only if it sets 'netdev->tc' to an
326 * initialized 'struct tc'. */
327 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
328
329 /* Destroys the data structures allocated by the implementation as part of
330 * 'tc'. (This includes destroying 'tc->queues' by calling
331 * tc_destroy(tc).)
332 *
333 * The implementation should not need to perform any Netlink calls. If
334 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
335 * (But it may not be desirable.)
336 *
337 * This function may be null if 'tc' is trivial. */
338 void (*tc_destroy)(struct tc *tc);
339
340 /* Retrieves details of 'netdev->tc' configuration into 'details'.
341 *
342 * The implementation should not need to perform any Netlink calls, because
343 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
344 * cached the configuration.
345 *
346 * The contents of 'details' should be documented as valid for 'ovs_name'
347 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
348 * (which is built as ovs-vswitchd.conf.db(8)).
349 *
350 * This function may be null if 'tc' is not configurable.
351 */
79f1cbe9 352 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
353
354 /* Reconfigures 'netdev->tc' according to 'details', performing any
355 * required Netlink calls to complete the reconfiguration.
356 *
357 * The contents of 'details' should be documented as valid for 'ovs_name'
358 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
359 * (which is built as ovs-vswitchd.conf.db(8)).
360 *
361 * This function may be null if 'tc' is not configurable.
362 */
79f1cbe9 363 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 364
93b13be8
BP
365 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
366 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
367 *
368 * The contents of 'details' should be documented as valid for 'ovs_name'
369 * in the "other_config" column in the "Queue" table in
370 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
371 *
372 * The implementation should not need to perform any Netlink calls, because
373 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
374 * cached the queue configuration.
375 *
376 * This function may be null if 'tc' does not have queues ('n_queues' is
377 * 0). */
93b13be8 378 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 379 struct smap *details);
c1c9c9c4
BP
380
381 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
382 * 'details', performing any required Netlink calls to complete the
383 * reconfiguration. The caller ensures that 'queue_id' is less than
384 * 'n_queues'.
385 *
386 * The contents of 'details' should be documented as valid for 'ovs_name'
387 * in the "other_config" column in the "Queue" table in
388 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
389 *
390 * This function may be null if 'tc' does not have queues or its queues are
391 * not configurable. */
392 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 393 const struct smap *details);
c1c9c9c4 394
93b13be8
BP
395 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
396 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
397 *
398 * This function may be null if 'tc' does not have queues or its queues
399 * cannot be deleted. */
93b13be8 400 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 401
93b13be8
BP
402 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
403 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
404 *
405 * On success, initializes '*stats'.
406 *
407 * This function may be null if 'tc' does not have queues or if it cannot
408 * report queue statistics. */
93b13be8
BP
409 int (*class_get_stats)(const struct netdev *netdev,
410 const struct tc_queue *queue,
c1c9c9c4
BP
411 struct netdev_queue_stats *stats);
412
413 /* Extracts queue stats from 'nlmsg', which is a response to a
414 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
415 *
416 * This function may be null if 'tc' does not have queues or if it cannot
417 * report queue statistics. */
418 int (*class_dump_stats)(const struct netdev *netdev,
419 const struct ofpbuf *nlmsg,
420 netdev_dump_queue_stats_cb *cb, void *aux);
421};
422
423static void
424tc_init(struct tc *tc, const struct tc_ops *ops)
425{
426 tc->ops = ops;
93b13be8 427 hmap_init(&tc->queues);
c1c9c9c4
BP
428}
429
430static void
431tc_destroy(struct tc *tc)
432{
93b13be8 433 hmap_destroy(&tc->queues);
c1c9c9c4
BP
434}
435
436static const struct tc_ops tc_ops_htb;
a339aa81 437static const struct tc_ops tc_ops_hfsc;
677d9158
JV
438static const struct tc_ops tc_ops_codel;
439static const struct tc_ops tc_ops_fqcodel;
440static const struct tc_ops tc_ops_sfq;
2f564bb1 441static const struct tc_ops tc_ops_netem;
c1c9c9c4 442static const struct tc_ops tc_ops_default;
6cf888b8 443static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
444static const struct tc_ops tc_ops_other;
445
559eb230 446static const struct tc_ops *const tcs[] = {
c1c9c9c4 447 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 448 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
449 &tc_ops_codel, /* Controlled delay */
450 &tc_ops_fqcodel, /* Fair queue controlled delay */
451 &tc_ops_sfq, /* Stochastic fair queueing */
2f564bb1 452 &tc_ops_netem, /* Network Emulator */
6cf888b8 453 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
454 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
455 &tc_ops_other, /* Some other qdisc. */
456 NULL
457};
149f577a 458
c1c9c9c4
BP
459static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
460static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
461static unsigned int tc_buffer_per_jiffy(unsigned int rate);
2f564bb1 462static uint32_t tc_time_to_ticks(uint32_t time);
c1c9c9c4 463
7874bdff
RD
464static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
465 int type,
466 unsigned int flags,
467 struct ofpbuf *);
c7952afb
BP
468static int tc_add_policer(struct netdev *,
469 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
470
471static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
472 struct nlattr **options);
473static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
474 struct nlattr **options,
475 struct netdev_queue_stats *);
476static int tc_query_class(const struct netdev *,
477 unsigned int handle, unsigned int parent,
478 struct ofpbuf **replyp);
479static int tc_delete_class(const struct netdev *, unsigned int handle);
480
481static int tc_del_qdisc(struct netdev *netdev);
482static int tc_query_qdisc(const struct netdev *netdev);
483
e7f6ba22
PJV
484void
485tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
c1c9c9c4
BP
486static int tc_calc_cell_log(unsigned int mtu);
487static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
c1c9c9c4
BP
488static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
489\f
b5d57fc8
BP
490struct netdev_linux {
491 struct netdev up;
149f577a 492
86383816
BP
493 /* Protects all members below. */
494 struct ovs_mutex mutex;
495
149f577a 496 unsigned int cache_valid;
8b61709d 497
1670c579
EJ
498 bool miimon; /* Link status of last poll. */
499 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
500 struct timer miimon_timer;
501
bfda5239 502 int netnsid; /* Network namespace ID. */
8722022c
BP
503 /* The following are figured out "on demand" only. They are only valid
504 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 505 int ifindex;
74ff3298 506 struct eth_addr etheraddr;
8b61709d 507 int mtu;
059e5f4f 508 unsigned int ifi_flags;
65c3058c 509 long long int carrier_resets;
80a86fbe
BP
510 uint32_t kbits_rate; /* Policing data. */
511 uint32_t kbits_burst;
bba1e6f3
PS
512 int vport_stats_error; /* Cached error code from vport_get_stats().
513 0 or an errno value. */
90a6637d 514 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 515 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 516 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 517 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 518 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 519
a00ca915
EJ
520 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
521 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
522 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 523
4f925bd3 524 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 525 struct tc *tc;
149f577a 526
d0d08f8a
BP
527 /* For devices of class netdev_tap_class only. */
528 int tap_fd;
22dcb534
FL
529 bool present; /* If the device is present in the namespace */
530 uint64_t tx_dropped; /* tap device can drop if the iface is down */
3d9c99ab
JH
531
532 /* LAG information. */
533 bool is_lag_master; /* True if the netdev is a LAG master. */
8b61709d
BP
534};
535
f7791740
PS
536struct netdev_rxq_linux {
537 struct netdev_rxq up;
796223f5 538 bool is_tap;
5b7448ed 539 int fd;
149f577a 540};
8b61709d 541
8b61709d
BP
542/* This is set pretty low because we probably won't learn anything from the
543 * additional log messages. */
544static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
545
19c8e9c1
JS
546/* Polling miimon status for all ports causes performance degradation when
547 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
548 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
549 *
550 * Readers do not depend on this variable synchronizing with the related
551 * changes in the device miimon status, so we can use atomic_count. */
552static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 553
1c33f0c3 554static void netdev_linux_run(const struct netdev_class *);
6f643e49 555
0b0544d7 556static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 557 int cmd, const char *cmd_name);
b5d57fc8 558static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 559static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
560static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
561 enum netdev_flags on, enum netdev_flags *old_flagsp)
562 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
563static int get_ifindex(const struct netdev *, int *ifindexp);
564static int do_set_addr(struct netdev *netdev,
565 int ioctl_nr, const char *ioctl_name,
566 struct in_addr addr);
74ff3298
JR
567static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
568static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 569static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 570static int af_packet_sock(void);
19c8e9c1 571static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
572static void netdev_linux_miimon_run(void);
573static void netdev_linux_miimon_wait(void);
df1e5a3b 574static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 575
15b3596a
JG
576static bool
577is_netdev_linux_class(const struct netdev_class *netdev_class)
578{
259e0b1a 579 return netdev_class->run == netdev_linux_run;
15b3596a
JG
580}
581
796223f5
BP
582static bool
583is_tap_netdev(const struct netdev *netdev)
584{
b5d57fc8 585 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
586}
587
8b61709d
BP
588static struct netdev_linux *
589netdev_linux_cast(const struct netdev *netdev)
590{
b5d57fc8 591 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 592
180c6d0b 593 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 594}
796223f5 595
f7791740
PS
596static struct netdev_rxq_linux *
597netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 598{
9dc63482 599 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 600 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 601}
ff4ed3c9 602\f
bfda5239
FL
603static int
604netdev_linux_netnsid_update__(struct netdev_linux *netdev)
605{
606 struct dpif_netlink_vport reply;
607 struct ofpbuf *buf;
608 int error;
609
610 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
611 if (error) {
629e1476
FL
612 if (error == ENOENT) {
613 /* Assume it is local if there is no API (e.g. if the openvswitch
614 * kernel module is not loaded). */
615 netnsid_set_local(&netdev->netnsid);
616 } else {
617 netnsid_unset(&netdev->netnsid);
618 }
bfda5239
FL
619 return error;
620 }
621
622 netnsid_set(&netdev->netnsid, reply.netnsid);
623 ofpbuf_delete(buf);
624 return 0;
625}
626
627static int
628netdev_linux_netnsid_update(struct netdev_linux *netdev)
629{
630 if (netnsid_is_unset(netdev->netnsid)) {
3dbcbfe4
FL
631 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
632 netnsid_set_local(&netdev->netnsid);
633 } else {
634 return netdev_linux_netnsid_update__(netdev);
635 }
bfda5239
FL
636 }
637
638 return 0;
639}
640
641static bool
642netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
643{
644 netdev_linux_netnsid_update(netdev);
645 return netnsid_eq(netdev->netnsid, nsid);
646}
647
756819dd
FL
648static bool
649netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
650{
651 netdev_linux_netnsid_update(netdev);
652 return netnsid_is_remote(netdev->netnsid);
653}
654
655static int netdev_linux_update_via_netlink(struct netdev_linux *);
bfda5239 656static void netdev_linux_update(struct netdev_linux *netdev, int,
7e9dcc0f 657 const struct rtnetlink_change *)
86383816 658 OVS_REQUIRES(netdev->mutex);
cee87338 659static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
660 unsigned int ifi_flags, unsigned int mask)
661 OVS_REQUIRES(netdev->mutex);
cee87338 662
d6384a3a
AW
663/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
664 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
665 * if no such socket could be created. */
666static struct nl_sock *
667netdev_linux_notify_sock(void)
668{
669 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
670 static struct nl_sock *sock;
989d7135
PS
671 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
672 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
673
674 if (ovsthread_once_start(&once)) {
675 int error;
676
677 error = nl_sock_create(NETLINK_ROUTE, &sock);
678 if (!error) {
d6384a3a
AW
679 size_t i;
680
681 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
682 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
683 if (error) {
684 nl_sock_destroy(sock);
685 sock = NULL;
686 break;
687 }
cee87338
BP
688 }
689 }
cf114a7f 690 nl_sock_listen_all_nsid(sock, true);
cee87338
BP
691 ovsthread_once_done(&once);
692 }
693
694 return sock;
695}
696
19c8e9c1
JS
697static bool
698netdev_linux_miimon_enabled(void)
699{
812c272c 700 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
701}
702
3d9c99ab
JH
/* Returns true if link kind 'kind' names a LAG device, that is, if it is
 * exactly "bond" or "team".  'kind' must not be null. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return strcmp(kind, "bond") == 0 || strcmp(kind, "team") == 0;
}
712
d22f8927
JH
713static void
714netdev_linux_update_lag(struct rtnetlink_change *change)
715 OVS_REQUIRES(lag_mutex)
716{
717 struct linux_lag_slave *lag;
718
719 if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
720 return;
721 }
722
723 if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
724 lag = shash_find_data(&lag_shash, change->ifname);
725
726 if (!lag) {
727 struct netdev *master_netdev;
728 char master_name[IFNAMSIZ];
729 uint32_t block_id;
730 int error = 0;
731
732 if_indextoname(change->master_ifindex, master_name);
733 master_netdev = netdev_from_name(master_name);
e3b5d7c5
TL
734 if (!master_netdev) {
735 return;
736 }
d22f8927
JH
737
738 if (is_netdev_linux_class(master_netdev->netdev_class)) {
739 block_id = netdev_get_block_id(master_netdev);
740 if (!block_id) {
e3b5d7c5
TL
741 netdev_close(master_netdev);
742 return;
d22f8927
JH
743 }
744
745 lag = xmalloc(sizeof *lag);
746 lag->block_id = block_id;
747 lag->node = shash_add(&lag_shash, change->ifname, lag);
748
cae64353
RD
749 /* delete ingress block in case it exists */
750 tc_add_del_ingress_qdisc(change->if_index, false, 0);
d22f8927
JH
751 /* LAG master is linux netdev so add slave to same block. */
752 error = tc_add_del_ingress_qdisc(change->if_index, true,
753 block_id);
754 if (error) {
cae64353
RD
755 VLOG_WARN("failed to bind LAG slave %s to master's block",
756 change->ifname);
d22f8927
JH
757 shash_delete(&lag_shash, lag->node);
758 free(lag);
759 }
760 }
e3b5d7c5
TL
761
762 netdev_close(master_netdev);
d22f8927
JH
763 }
764 } else if (change->master_ifindex == 0) {
765 /* Check if this was a lag slave that has been freed. */
766 lag = shash_find_data(&lag_shash, change->ifname);
767
768 if (lag) {
769 tc_add_del_ingress_qdisc(change->if_index, false,
770 lag->block_id);
771 shash_delete(&lag_shash, lag->node);
772 free(lag);
773 }
774 }
775}
776
8b61709d 777static void
1c33f0c3 778netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 779{
cee87338
BP
780 struct nl_sock *sock;
781 int error;
782
19c8e9c1
JS
783 if (netdev_linux_miimon_enabled()) {
784 netdev_linux_miimon_run();
785 }
cee87338
BP
786
787 sock = netdev_linux_notify_sock();
788 if (!sock) {
789 return;
790 }
791
792 do {
cee87338 793 uint64_t buf_stub[4096 / 8];
bfda5239 794 int nsid;
cee87338
BP
795 struct ofpbuf buf;
796
797 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
bfda5239 798 error = nl_sock_recv(sock, &buf, &nsid, false);
cee87338 799 if (!error) {
7e9dcc0f 800 struct rtnetlink_change change;
cee87338 801
7e9dcc0f 802 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
803 struct netdev *netdev_ = NULL;
804 char dev_name[IFNAMSIZ];
805
806 if (!change.ifname) {
807 change.ifname = if_indextoname(change.if_index, dev_name);
808 }
809
810 if (change.ifname) {
811 netdev_ = netdev_from_name(change.ifname);
812 }
cee87338
BP
813 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
814 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
815
816 ovs_mutex_lock(&netdev->mutex);
bfda5239 817 netdev_linux_update(netdev, nsid, &change);
86383816 818 ovs_mutex_unlock(&netdev->mutex);
cee87338 819 }
d22f8927
JH
820 else if (!netdev_ && change.ifname) {
821 /* Netdev is not present in OvS but its master could be. */
822 ovs_mutex_lock(&lag_mutex);
823 netdev_linux_update_lag(&change);
824 ovs_mutex_unlock(&lag_mutex);
825 }
38e0065b 826 netdev_close(netdev_);
cee87338
BP
827 }
828 } else if (error == ENOBUFS) {
829 struct shash device_shash;
830 struct shash_node *node;
831
832 nl_sock_drain(sock);
833
834 shash_init(&device_shash);
835 netdev_get_devices(&netdev_linux_class, &device_shash);
836 SHASH_FOR_EACH (node, &device_shash) {
837 struct netdev *netdev_ = node->data;
838 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
839 unsigned int flags;
840
86383816 841 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
842 get_flags(netdev_, &flags);
843 netdev_linux_changed(netdev, flags, 0);
86383816
BP
844 ovs_mutex_unlock(&netdev->mutex);
845
cee87338
BP
846 netdev_close(netdev_);
847 }
848 shash_destroy(&device_shash);
849 } else if (error != EAGAIN) {
7ed58d4a
JP
850 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
851 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
cee87338
BP
852 ovs_strerror(error));
853 }
854 ofpbuf_uninit(&buf);
855 } while (!error);
8b61709d
BP
856}
857
858static void
1c33f0c3 859netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 860{
cee87338
BP
861 struct nl_sock *sock;
862
19c8e9c1
JS
863 if (netdev_linux_miimon_enabled()) {
864 netdev_linux_miimon_wait();
865 }
cee87338
BP
866 sock = netdev_linux_notify_sock();
867 if (sock) {
868 nl_sock_wait(sock, POLLIN);
869 }
8b61709d
BP
870}
871
ac4d3bcb 872static void
b5d57fc8
BP
873netdev_linux_changed(struct netdev_linux *dev,
874 unsigned int ifi_flags, unsigned int mask)
86383816 875 OVS_REQUIRES(dev->mutex)
ac4d3bcb 876{
3e912ffc 877 netdev_change_seq_changed(&dev->up);
8aa77183
BP
878
879 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
880 dev->carrier_resets++;
881 }
882 dev->ifi_flags = ifi_flags;
883
4f925bd3 884 dev->cache_valid &= mask;
6b6e1329 885 if (!(mask & VALID_IN)) {
a8704b50
PS
886 netdev_get_addrs_list_flush();
887 }
4f925bd3
PS
888}
889
890static void
bfda5239
FL
891netdev_linux_update__(struct netdev_linux *dev,
892 const struct rtnetlink_change *change)
86383816 893 OVS_REQUIRES(dev->mutex)
4f925bd3 894{
bfda5239 895 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
d6384a3a 896 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 897 /* Keep drv-info, and ip addresses. */
d6384a3a 898 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 899 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
900
901 /* Update netdev from rtnl-change msg. */
902 if (change->mtu) {
903 dev->mtu = change->mtu;
904 dev->cache_valid |= VALID_MTU;
905 dev->netdev_mtu_error = 0;
906 }
90a6637d 907
74ff3298
JR
908 if (!eth_addr_is_zero(change->mac)) {
909 dev->etheraddr = change->mac;
d6384a3a
AW
910 dev->cache_valid |= VALID_ETHERADDR;
911 dev->ether_addr_error = 0;
e8e1a409
TZ
912
913 /* The mac addr has been changed, report it now. */
914 rtnetlink_report_link();
d6384a3a 915 }
44445cac 916
3d9c99ab
JH
917 if (change->master && netdev_linux_kind_is_lag(change->master)) {
918 dev->is_lag_master = true;
919 }
920
d6384a3a
AW
921 dev->ifindex = change->if_index;
922 dev->cache_valid |= VALID_IFINDEX;
923 dev->get_ifindex_error = 0;
22dcb534 924 dev->present = true;
d6384a3a 925 } else {
bfda5239 926 /* FIXME */
d6384a3a 927 netdev_linux_changed(dev, change->ifi_flags, 0);
22dcb534 928 dev->present = false;
bfda5239 929 netnsid_unset(&dev->netnsid);
d6384a3a
AW
930 }
931 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
932 /* Invalidates in4, in6. */
6b6e1329 933 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 934 } else {
d6384a3a 935 OVS_NOT_REACHED();
4f925bd3 936 }
ac4d3bcb
EJ
937}
938
bfda5239
FL
939static void
940netdev_linux_update(struct netdev_linux *dev, int nsid,
941 const struct rtnetlink_change *change)
942 OVS_REQUIRES(dev->mutex)
943{
944 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
945 netdev_linux_update__(dev, change);
946 }
947}
948
9dc63482
BP
949static struct netdev *
950netdev_linux_alloc(void)
951{
952 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
953 return &netdev->up;
954}
955
48c6733c
WT
956static int
957netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 958{
48c6733c
WT
959 /* Prevent any attempt to create (or open) a network device named "default"
960 * or "all". These device names are effectively reserved on Linux because
961 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
962 * itself this wouldn't call for any special treatment, but in practice if
963 * a program tries to create devices with these names, it causes the kernel
964 * to fire a "new device" notification event even though creation failed,
965 * and in turn that causes OVS to wake up and try to create them again,
966 * which ends up as a 100% CPU loop. */
967 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
968 const char *name = netdev_->name;
969 if (!strcmp(name, "default") || !strcmp(name, "all")) {
7ed58d4a
JP
970 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
971 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
48c6733c
WT
972 name);
973 return EINVAL;
974 }
975
bfda5239
FL
976 /* The device could be in the same network namespace or in another one. */
977 netnsid_unset(&netdev->netnsid);
834d6caf 978 ovs_mutex_init(&netdev->mutex);
48c6733c 979 return 0;
9dc63482
BP
980}
981
1f6e0fbd
BP
982/* Creates system and internal devices. */
983static int
9dc63482 984netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 985{
9dc63482 986 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
987 int error = netdev_linux_common_construct(netdev_);
988 if (error) {
989 return error;
990 }
1f6e0fbd 991
b5d57fc8
BP
992 error = get_flags(&netdev->up, &netdev->ifi_flags);
993 if (error == ENODEV) {
9dc63482 994 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 995 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
996 return ENODEV;
997 } else {
998 /* "Internal" netdevs have to be created as netdev objects before
999 * they exist in the kernel, because creating them in the kernel
1000 * happens by passing a netdev object to dpif_port_add().
1001 * Therefore, ignore the error. */
1002 }
1003 }
46415c90 1004
a740f0de
JG
1005 return 0;
1006}
1007
5b7448ed
JG
1008/* For most types of netdevs we open the device for each call of
1009 * netdev_open(). However, this is not the case with tap devices,
1010 * since it is only possible to open the device once. In this
1011 * situation we share a single file descriptor, and consequently
1012 * buffers, across all readers. Therefore once data is read it will
1013 * be unavailable to other reads for tap devices. */
a740f0de 1014static int
9dc63482 1015netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 1016{
9dc63482 1017 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 1018 static const char tap_dev[] = "/dev/net/tun";
9dc63482 1019 const char *name = netdev_->name;
a740f0de 1020 struct ifreq ifr;
a740f0de 1021
48c6733c
WT
1022 int error = netdev_linux_common_construct(netdev_);
1023 if (error) {
1024 return error;
1025 }
1f6e0fbd 1026
6c88d577 1027 /* Open tap device. */
d0d08f8a
BP
1028 netdev->tap_fd = open(tap_dev, O_RDWR);
1029 if (netdev->tap_fd < 0) {
6c88d577 1030 error = errno;
10a89ef0 1031 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 1032 return error;
6c88d577
JP
1033 }
1034
1035 /* Create tap device. */
61b9d078 1036 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 1037 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 1038 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 1039 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 1040 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 1041 ovs_strerror(errno));
6c88d577 1042 error = errno;
f61d8d29 1043 goto error_close;
6c88d577
JP
1044 }
1045
1046 /* Make non-blocking. */
d0d08f8a 1047 error = set_nonblocking(netdev->tap_fd);
a740f0de 1048 if (error) {
f61d8d29 1049 goto error_close;
a740f0de
JG
1050 }
1051
0f28164b
FL
1052 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1053 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1054 ovs_strerror(errno));
1055 error = errno;
1056 goto error_close;
1057 }
1058
19aac14a 1059 netdev->present = true;
a740f0de
JG
1060 return 0;
1061
f61d8d29 1062error_close:
d0d08f8a 1063 close(netdev->tap_fd);
a740f0de
JG
1064 return error;
1065}
1066
6c88d577 1067static void
9dc63482 1068netdev_linux_destruct(struct netdev *netdev_)
6c88d577 1069{
b5d57fc8 1070 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 1071
b5d57fc8
BP
1072 if (netdev->tc && netdev->tc->ops->tc_destroy) {
1073 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
1074 }
1075
d0d08f8a
BP
1076 if (netdev_get_class(netdev_) == &netdev_tap_class
1077 && netdev->tap_fd >= 0)
1078 {
0f28164b 1079 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
d0d08f8a 1080 close(netdev->tap_fd);
6c88d577 1081 }
86383816 1082
19c8e9c1 1083 if (netdev->miimon_interval > 0) {
812c272c 1084 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1085 }
1086
86383816 1087 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
1088}
1089
9dc63482
BP
/* Frees the 'struct netdev_linux' that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1096
f7791740
PS
1097static struct netdev_rxq *
1098netdev_linux_rxq_alloc(void)
9dc63482 1099{
f7791740 1100 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
1101 return &rx->up;
1102}
1103
7b6b0ef4 1104static int
f7791740 1105netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 1106{
f7791740 1107 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 1108 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 1109 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 1110 int error;
7b6b0ef4 1111
86383816 1112 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
1113 rx->is_tap = is_tap_netdev(netdev_);
1114 if (rx->is_tap) {
1115 rx->fd = netdev->tap_fd;
796223f5
BP
1116 } else {
1117 struct sockaddr_ll sll;
b73c8518 1118 int ifindex, val;
32383c3b 1119 /* Result of tcpdump -dd inbound */
259e0b1a 1120 static const struct sock_filter filt[] = {
32383c3b
MM
1121 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1122 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1123 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1124 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1125 };
259e0b1a
BP
1126 static const struct sock_fprog fprog = {
1127 ARRAY_SIZE(filt), (struct sock_filter *) filt
1128 };
7b6b0ef4 1129
796223f5 1130 /* Create file descriptor. */
9dc63482
BP
1131 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1132 if (rx->fd < 0) {
796223f5 1133 error = errno;
10a89ef0 1134 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
1135 goto error;
1136 }
33d82a56 1137
b73c8518
SH
1138 val = 1;
1139 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1140 error = errno;
1141 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1142 netdev_get_name(netdev_), ovs_strerror(error));
1143 goto error;
1144 }
1145
796223f5 1146 /* Set non-blocking mode. */
9dc63482 1147 error = set_nonblocking(rx->fd);
796223f5
BP
1148 if (error) {
1149 goto error;
1150 }
7b6b0ef4 1151
796223f5 1152 /* Get ethernet device index. */
180c6d0b 1153 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
1154 if (error) {
1155 goto error;
1156 }
7b6b0ef4 1157
796223f5
BP
1158 /* Bind to specific ethernet device. */
1159 memset(&sll, 0, sizeof sll);
1160 sll.sll_family = AF_PACKET;
1161 sll.sll_ifindex = ifindex;
b73c8518 1162 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 1163 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
1164 error = errno;
1165 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 1166 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
1167 goto error;
1168 }
32383c3b
MM
1169
1170 /* Filter for only inbound packets. */
9dc63482 1171 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
1172 sizeof fprog);
1173 if (error) {
1174 error = errno;
259e0b1a 1175 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 1176 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
1177 goto error;
1178 }
7b6b0ef4 1179 }
86383816 1180 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 1181
7b6b0ef4
BP
1182 return 0;
1183
1184error:
9dc63482
BP
1185 if (rx->fd >= 0) {
1186 close(rx->fd);
7b6b0ef4 1187 }
86383816 1188 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1189 return error;
1190}
1191
796223f5 1192static void
f7791740 1193netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1194{
f7791740 1195 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1196
796223f5
BP
1197 if (!rx->is_tap) {
1198 close(rx->fd);
8b61709d 1199 }
9dc63482
BP
1200}
1201
/* Frees the 'struct netdev_rxq_linux' that contains 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1209
b73c8518 1210static ovs_be16
1ebdc7eb 1211auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1212{
1213 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1214 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1215 } else if (double_tagged) {
1216 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1217 } else {
1ebdc7eb 1218 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1219 }
1220}
1221
/* Returns true if 'aux' carries a VLAN TCI: either the TCI is nonzero or the
 * status word has TP_STATUS_VLAN_VALID set. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1227
796223f5 1228static int
cf62fa4c 1229netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1230{
b73c8518 1231 size_t size;
796223f5 1232 ssize_t retval;
b73c8518
SH
1233 struct iovec iov;
1234 struct cmsghdr *cmsg;
1235 union {
1236 struct cmsghdr cmsg;
1237 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1238 } cmsg_buffer;
1239 struct msghdr msgh;
1240
1241 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1242 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1243 size = dp_packet_tailroom(buffer);
b73c8518 1244
cf62fa4c 1245 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1246 iov.iov_len = size;
1247 msgh.msg_name = NULL;
1248 msgh.msg_namelen = 0;
1249 msgh.msg_iov = &iov;
1250 msgh.msg_iovlen = 1;
1251 msgh.msg_control = &cmsg_buffer;
1252 msgh.msg_controllen = sizeof cmsg_buffer;
1253 msgh.msg_flags = 0;
8e8cddf7 1254
796223f5 1255 do {
b73c8518 1256 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1257 } while (retval < 0 && errno == EINTR);
1258
bfd3367b 1259 if (retval < 0) {
b73c8518
SH
1260 return errno;
1261 } else if (retval > size) {
1262 return EMSGSIZE;
1263 }
1264
cf62fa4c 1265 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1266
1267 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1268 const struct tpacket_auxdata *aux;
1269
1270 if (cmsg->cmsg_level != SOL_PACKET
1271 || cmsg->cmsg_type != PACKET_AUXDATA
1272 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1273 continue;
8b61709d 1274 }
b73c8518
SH
1275
1276 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1277 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1278 struct eth_header *eth;
1279 bool double_tagged;
1280
b73c8518
SH
1281 if (retval < ETH_HEADER_LEN) {
1282 return EINVAL;
1283 }
1284
1ebdc7eb
EG
1285 eth = dp_packet_data(buffer);
1286 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1287
1288 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1289 htons(aux->tp_vlan_tci));
1290 break;
1291 }
1292 }
1293
1294 return 0;
1295}
1296
/* Reads one packet from tap descriptor 'fd' into the tailroom of 'buffer',
 * retrying on EINTR.  Returns 0 and grows 'buffer' by the number of bytes
 * read on success, otherwise the errno from read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n_read < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1314
1315static int
8492adc2
JS
1316netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1317 int *qfill)
b73c8518 1318{
f7791740 1319 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1320 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1321 struct dp_packet *buffer;
df1e5a3b
PS
1322 ssize_t retval;
1323 int mtu;
1324
1325 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1326 mtu = ETH_PAYLOAD_MAX;
1327 }
1328
2482b0b0 1329 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1330 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1331 DP_NETDEV_HEADROOM);
b73c8518 1332 retval = (rx->is_tap
f7791740
PS
1333 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1334 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1335
1336 if (retval) {
1337 if (retval != EAGAIN && retval != EMSGSIZE) {
1338 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1339 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1340 }
cf62fa4c 1341 dp_packet_delete(buffer);
df1e5a3b 1342 } else {
72c84bc2 1343 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1344 }
1345
8492adc2
JS
1346 if (qfill) {
1347 *qfill = -ENOTSUP;
1348 }
1349
b73c8518 1350 return retval;
8b61709d
BP
1351}
1352
8b61709d 1353static void
f7791740 1354netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1355{
f7791740 1356 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1357 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1358}
1359
8b61709d 1360static int
f7791740 1361netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1362{
f7791740 1363 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1364 if (rx->is_tap) {
8b61709d 1365 struct ifreq ifr;
f7791740 1366 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1367 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1368 if (error) {
1369 return error;
1370 }
796223f5 1371 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1372 return 0;
1373 } else {
796223f5 1374 return drain_rcvbuf(rx->fd);
8b61709d
BP
1375 }
1376}
1377
d19cf8bb
ZG
1378static int
1379netdev_linux_sock_batch_send(int sock, int ifindex,
1380 struct dp_packet_batch *batch)
1381{
e0a00cee 1382 const size_t size = dp_packet_batch_size(batch);
d19cf8bb
ZG
1383 /* We don't bother setting most fields in sockaddr_ll because the
1384 * kernel ignores them for SOCK_RAW. */
1385 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1386 .sll_ifindex = ifindex };
1387
e0a00cee
BB
1388 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1389 struct iovec *iov = xmalloc(sizeof(*iov) * size);
d19cf8bb 1390
e0a00cee 1391 struct dp_packet *packet;
e883448e 1392 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
d19cf8bb 1393 iov[i].iov_base = dp_packet_data(packet);
ad8b0b4f 1394 iov[i].iov_len = dp_packet_size(packet);
d19cf8bb
ZG
1395 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1396 .msg_namelen = sizeof sll,
1397 .msg_iov = &iov[i],
1398 .msg_iovlen = 1 };
1399 }
1400
1401 int error = 0;
e0a00cee 1402 for (uint32_t ofs = 0; ofs < size; ) {
d19cf8bb
ZG
1403 ssize_t retval;
1404 do {
e0a00cee 1405 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
d19cf8bb
ZG
1406 error = retval < 0 ? errno : 0;
1407 } while (error == EINTR);
1408 if (error) {
1409 break;
1410 }
1411 ofs += retval;
1412 }
1413
1414 free(mmsg);
1415 free(iov);
1416 return error;
1417}
1418
1419/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1420 * essential, because packets sent to a tap device with an AF_PACKET socket
1421 * will loop back to be *received* again on the tap device. This doesn't occur
1422 * on other interface types because we attach a socket filter to the rx
1423 * socket. */
1424static int
1425netdev_linux_tap_batch_send(struct netdev *netdev_,
1426 struct dp_packet_batch *batch)
1427{
1428 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
13708b21 1429 struct dp_packet *packet;
22dcb534
FL
1430
1431 /* The Linux tap driver returns EIO if the device is not up,
1432 * so if the device is not up, don't waste time sending it.
1433 * However, if the device is in another network namespace
1434 * then OVS can't retrieve the state. In that case, send the
1435 * packets anyway. */
1436 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1437 netdev->tx_dropped += dp_packet_batch_size(batch);
1438 return 0;
1439 }
1440
e883448e 1441 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
ad8b0b4f 1442 size_t size = dp_packet_size(packet);
d19cf8bb
ZG
1443 ssize_t retval;
1444 int error;
1445
1446 do {
1447 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1448 error = retval < 0 ? errno : 0;
1449 } while (error == EINTR);
1450
1451 if (error) {
1452 /* The Linux tap driver returns EIO if the device is not up. From
1453 * the OVS side this is not an error, so we ignore it; otherwise,
1454 * return the erro. */
1455 if (error != EIO) {
1456 return error;
1457 }
1458 } else if (retval != size) {
1459 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1460 "bytes of %"PRIuSIZE") on %s",
1461 retval, size, netdev_get_name(netdev_));
1462 return EMSGSIZE;
1463 }
1464 }
1465 return 0;
1466}
1467
1468/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
8b61709d
BP
1469 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1470 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1471 * the packet is too big or too small to transmit on the device.
1472 *
8b61709d
BP
1473 * The kernel maintains a packet transmission queue, so the caller is not
1474 * expected to do additional queuing of packets. */
1475static int
f00fa8cb 1476netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
b30896c9 1477 struct dp_packet_batch *batch,
324c8374 1478 bool concurrent_txq OVS_UNUSED)
8b61709d 1479{
f4fd623c 1480 int error = 0;
0a62ae2c
ZG
1481 int sock = 0;
1482
0a62ae2c 1483 if (!is_tap_netdev(netdev_)) {
e0e2410d
FL
1484 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
1485 error = EOPNOTSUPP;
1486 goto free_batch;
1487 }
1488
0a62ae2c
ZG
1489 sock = af_packet_sock();
1490 if (sock < 0) {
1491 error = -sock;
1492 goto free_batch;
1493 }
1494
1495 int ifindex = netdev_get_ifindex(netdev_);
1496 if (ifindex < 0) {
1497 error = -ifindex;
1498 goto free_batch;
1499 }
1500
d19cf8bb
ZG
1501 error = netdev_linux_sock_batch_send(sock, ifindex, batch);
1502 } else {
1503 error = netdev_linux_tap_batch_send(netdev_, batch);
0a62ae2c 1504 }
d19cf8bb
ZG
1505 if (error) {
1506 if (error == ENOBUFS) {
1507 /* The Linux AF_PACKET implementation never blocks waiting
1508 * for room for packets, instead returning ENOBUFS.
1509 * Translate this into EAGAIN for the caller. */
1510 error = EAGAIN;
f23347ea 1511 } else {
f4fd623c
DDP
1512 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1513 netdev_get_name(netdev_), ovs_strerror(error));
d19cf8bb 1514 }
f4fd623c
DDP
1515 }
1516
0a62ae2c 1517free_batch:
b30896c9 1518 dp_packet_delete_batch(batch, true);
f4fd623c 1519 return error;
8b61709d
BP
1520}
1521
1522/* Registers with the poll loop to wake up from the next call to poll_block()
1523 * when the packet transmission queue has sufficient room to transmit a packet
1524 * with netdev_send().
1525 *
1526 * The kernel maintains a packet transmission queue, so the client is not
1527 * expected to do additional queuing of packets. Thus, this function is
1528 * unlikely to ever be used. It is included for completeness. */
1529static void
f00fa8cb 1530netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1531{
796223f5 1532 if (is_tap_netdev(netdev)) {
8b61709d
BP
1533 /* TAP device always accepts packets.*/
1534 poll_immediate_wake();
1535 }
1536}
1537
1538/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1539 * otherwise a positive errno value. */
1540static int
74ff3298 1541netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1542{
b5d57fc8 1543 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1544 enum netdev_flags old_flags = 0;
eb395f2e
BP
1545 int error;
1546
86383816 1547 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1548 if (netdev_linux_netnsid_is_remote(netdev)) {
1549 error = EOPNOTSUPP;
1550 goto exit;
1551 }
86383816 1552
b5d57fc8 1553 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1554 error = netdev->ether_addr_error;
1555 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1556 goto exit;
44445cac 1557 }
b5d57fc8 1558 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1559 }
1560
7eb1bd81 1561 /* Tap devices must be brought down before setting the address. */
796223f5 1562 if (is_tap_netdev(netdev_)) {
4f9f3f21 1563 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1564 }
44445cac
PS
1565 error = set_etheraddr(netdev_get_name(netdev_), mac);
1566 if (!error || error == ENODEV) {
b5d57fc8
BP
1567 netdev->ether_addr_error = error;
1568 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1569 if (!error) {
74ff3298 1570 netdev->etheraddr = mac;
eb395f2e 1571 }
8b61709d 1572 }
44445cac 1573
4f9f3f21
BP
1574 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1575 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1576 }
7eb1bd81 1577
86383816
BP
1578exit:
1579 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1580 return error;
1581}
1582
44445cac 1583/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1584static int
74ff3298 1585netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1586{
b5d57fc8 1587 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1588 int error;
44445cac 1589
86383816 1590 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1591 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
756819dd
FL
1592 netdev_linux_update_via_netlink(netdev);
1593 }
1594
1595 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1596 /* Fall back to ioctl if netlink fails */
86383816 1597 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1598 &netdev->etheraddr);
b5d57fc8 1599 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1600 }
44445cac 1601
86383816
BP
1602 error = netdev->ether_addr_error;
1603 if (!error) {
74ff3298 1604 *mac = netdev->etheraddr;
44445cac 1605 }
86383816 1606 ovs_mutex_unlock(&netdev->mutex);
44445cac 1607
86383816 1608 return error;
8b61709d
BP
1609}
1610
8b61709d 1611static int
73371c09 1612netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1613{
86383816
BP
1614 int error;
1615
b5d57fc8 1616 if (!(netdev->cache_valid & VALID_MTU)) {
756819dd
FL
1617 netdev_linux_update_via_netlink(netdev);
1618 }
1619
1620 if (!(netdev->cache_valid & VALID_MTU)) {
1621 /* Fall back to ioctl if netlink fails */
8b61709d 1622 struct ifreq ifr;
90a6637d 1623
86383816 1624 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1625 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1626 netdev->mtu = ifr.ifr_mtu;
1627 netdev->cache_valid |= VALID_MTU;
8b61709d 1628 }
90a6637d 1629
86383816
BP
1630 error = netdev->netdev_mtu_error;
1631 if (!error) {
b5d57fc8 1632 *mtup = netdev->mtu;
90a6637d 1633 }
73371c09
BP
1634
1635 return error;
1636}
1637
1638/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1639 * in bytes, not including the hardware header; thus, this is typically 1500
1640 * bytes for Ethernet devices. */
1641static int
1642netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1643{
1644 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1645 int error;
1646
1647 ovs_mutex_lock(&netdev->mutex);
1648 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1649 ovs_mutex_unlock(&netdev->mutex);
1650
1651 return error;
8b61709d
BP
1652}
1653
9b020780
PS
1654/* Sets the maximum size of transmitted (MTU) for given device using linux
1655 * networking ioctl interface.
1656 */
1657static int
4124cb12 1658netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1659{
b5d57fc8 1660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1661 struct ifreq ifr;
1662 int error;
1663
86383816 1664 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1665 if (netdev_linux_netnsid_is_remote(netdev)) {
1666 error = EOPNOTSUPP;
1667 goto exit;
1668 }
1669
b5d57fc8 1670 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1671 error = netdev->netdev_mtu_error;
1672 if (error || netdev->mtu == mtu) {
1673 goto exit;
90a6637d 1674 }
b5d57fc8 1675 netdev->cache_valid &= ~VALID_MTU;
153e5481 1676 }
9b020780 1677 ifr.ifr_mtu = mtu;
259e0b1a
BP
1678 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1679 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1680 if (!error || error == ENODEV) {
b5d57fc8
BP
1681 netdev->netdev_mtu_error = error;
1682 netdev->mtu = ifr.ifr_mtu;
1683 netdev->cache_valid |= VALID_MTU;
9b020780 1684 }
86383816
BP
1685exit:
1686 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1687 return error;
9b020780
PS
1688}
1689
9ab3d9a3
BP
1690/* Returns the ifindex of 'netdev', if successful, as a positive number.
1691 * On failure, returns a negative errno value. */
1692static int
86383816 1693netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1694{
86383816 1695 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1696 int ifindex, error;
1697
86383816 1698 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1699 if (netdev_linux_netnsid_is_remote(netdev)) {
1700 error = EOPNOTSUPP;
1701 goto exit;
1702 }
86383816 1703 error = get_ifindex(netdev_, &ifindex);
86383816 1704
e0e2410d
FL
1705exit:
1706 ovs_mutex_unlock(&netdev->mutex);
9ab3d9a3
BP
1707 return error ? -error : ifindex;
1708}
1709
8b61709d
BP
1710static int
1711netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1712{
b5d57fc8 1713 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1714
86383816 1715 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1716 if (netdev->miimon_interval > 0) {
1717 *carrier = netdev->miimon;
3a183124 1718 } else {
b5d57fc8 1719 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1720 }
86383816 1721 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1722
3a183124 1723 return 0;
8b61709d
BP
1724}
1725
65c3058c 1726static long long int
86383816 1727netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1728{
86383816
BP
1729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1730 long long int carrier_resets;
1731
1732 ovs_mutex_lock(&netdev->mutex);
1733 carrier_resets = netdev->carrier_resets;
1734 ovs_mutex_unlock(&netdev->mutex);
1735
1736 return carrier_resets;
65c3058c
EJ
1737}
1738
63331829 1739static int
1670c579
EJ
1740netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1741 struct mii_ioctl_data *data)
63331829 1742{
63331829 1743 struct ifreq ifr;
782e6111 1744 int error;
63331829 1745
63331829 1746 memset(&ifr, 0, sizeof ifr);
782e6111 1747 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1748 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1749 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1750
782e6111
EJ
1751 return error;
1752}
1753
1754static int
1670c579 1755netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1756{
782e6111
EJ
1757 struct mii_ioctl_data data;
1758 int error;
63331829 1759
782e6111
EJ
1760 *miimon = false;
1761
1762 memset(&data, 0, sizeof data);
1670c579 1763 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1764 if (!error) {
1765 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1766 data.reg_num = MII_BMSR;
1670c579 1767 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1768 &data);
63331829
EJ
1769
1770 if (!error) {
782e6111 1771 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1772 }
9120cfc0
DH
1773 }
1774 if (error) {
63331829 1775 struct ethtool_cmd ecmd;
63331829
EJ
1776
1777 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1778 name);
1779
ab985a77 1780 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1781 memset(&ecmd, 0, sizeof ecmd);
1782 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1783 "ETHTOOL_GLINK");
1784 if (!error) {
782e6111
EJ
1785 struct ethtool_value eval;
1786
1787 memcpy(&eval, &ecmd, sizeof eval);
1788 *miimon = !!eval.data;
63331829
EJ
1789 } else {
1790 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1791 }
1792 }
1793
1794 return error;
1795}
1796
1670c579
EJ
1797static int
1798netdev_linux_set_miimon_interval(struct netdev *netdev_,
1799 long long int interval)
1800{
b5d57fc8 1801 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1802
86383816 1803 ovs_mutex_lock(&netdev->mutex);
1670c579 1804 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1805 if (netdev->miimon_interval != interval) {
19c8e9c1 1806 if (interval && !netdev->miimon_interval) {
812c272c 1807 atomic_count_inc(&miimon_cnt);
19c8e9c1 1808 } else if (!interval && netdev->miimon_interval) {
812c272c 1809 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1810 }
1811
b5d57fc8
BP
1812 netdev->miimon_interval = interval;
1813 timer_set_expired(&netdev->miimon_timer);
1670c579 1814 }
86383816 1815 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1816
1817 return 0;
1818}
1819
1820static void
1821netdev_linux_miimon_run(void)
1822{
1823 struct shash device_shash;
1824 struct shash_node *node;
1825
1826 shash_init(&device_shash);
b5d57fc8 1827 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1828 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1829 struct netdev *netdev = node->data;
1830 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1831 bool miimon;
1832
86383816
BP
1833 ovs_mutex_lock(&dev->mutex);
1834 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1835 netdev_linux_get_miimon(dev->up.name, &miimon);
1836 if (miimon != dev->miimon) {
1837 dev->miimon = miimon;
1838 netdev_linux_changed(dev, dev->ifi_flags, 0);
1839 }
1670c579 1840
86383816 1841 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1842 }
86383816 1843 ovs_mutex_unlock(&dev->mutex);
2f980d74 1844 netdev_close(netdev);
1670c579
EJ
1845 }
1846
1847 shash_destroy(&device_shash);
1848}
1849
1850static void
1851netdev_linux_miimon_wait(void)
1852{
1853 struct shash device_shash;
1854 struct shash_node *node;
1855
1856 shash_init(&device_shash);
b5d57fc8 1857 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1858 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1859 struct netdev *netdev = node->data;
1860 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1861
86383816 1862 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1863 if (dev->miimon_interval > 0) {
1864 timer_wait(&dev->miimon_timer);
1865 }
86383816 1866 ovs_mutex_unlock(&dev->mutex);
2f980d74 1867 netdev_close(netdev);
1670c579
EJ
1868 }
1869 shash_destroy(&device_shash);
1870}
1871
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'.  If 'a' and 'b' point to the
 * same object, its value is left unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1879
c060c4cf
EJ
1880/* Copies 'src' into 'dst', performing format conversion in the process.
1881 *
1882 * 'src' is allowed to be misaligned. */
1883static void
1884netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1885 const struct ovs_vport_stats *src)
1886{
6a54dedc
BP
1887 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1888 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1889 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1890 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1891 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1892 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1893 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1894 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1895 dst->multicast = 0;
1896 dst->collisions = 0;
1897 dst->rx_length_errors = 0;
1898 dst->rx_over_errors = 0;
1899 dst->rx_crc_errors = 0;
1900 dst->rx_frame_errors = 0;
1901 dst->rx_fifo_errors = 0;
1902 dst->rx_missed_errors = 0;
1903 dst->tx_aborted_errors = 0;
1904 dst->tx_carrier_errors = 0;
1905 dst->tx_fifo_errors = 0;
1906 dst->tx_heartbeat_errors = 0;
1907 dst->tx_window_errors = 0;
1908}
1909
1910static int
1911get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1912{
93451a0a 1913 struct dpif_netlink_vport reply;
c060c4cf
EJ
1914 struct ofpbuf *buf;
1915 int error;
1916
93451a0a 1917 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1918 if (error) {
1919 return error;
1920 } else if (!reply.stats) {
1921 ofpbuf_delete(buf);
1922 return EOPNOTSUPP;
1923 }
1924
1925 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1926
1927 ofpbuf_delete(buf);
1928
1929 return 0;
1930}
1931
f613a0d7
PS
1932static void
1933get_stats_via_vport(const struct netdev *netdev_,
1934 struct netdev_stats *stats)
8b61709d 1935{
b5d57fc8 1936 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1937
b5d57fc8
BP
1938 if (!netdev->vport_stats_error ||
1939 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1940 int error;
7fbef77a 1941
c060c4cf 1942 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1943 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1944 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1945 "(%s)",
1946 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1947 }
b5d57fc8
BP
1948 netdev->vport_stats_error = error;
1949 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1950 }
f613a0d7 1951}
8b61709d 1952
f613a0d7
PS
1953/* Retrieves current device stats for 'netdev-linux'. */
1954static int
1955netdev_linux_get_stats(const struct netdev *netdev_,
1956 struct netdev_stats *stats)
1957{
b5d57fc8 1958 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1959 struct netdev_stats dev_stats;
1960 int error;
1961
86383816 1962 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1963 get_stats_via_vport(netdev_, stats);
35eef899 1964 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1965 if (error) {
86383816
BP
1966 if (!netdev->vport_stats_error) {
1967 error = 0;
f613a0d7 1968 }
86383816 1969 } else if (netdev->vport_stats_error) {
04c881eb 1970 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1971 *stats = dev_stats;
1972 } else {
04c881eb
AZ
1973 /* Use kernel netdev's packet and byte counts since vport's counters
1974 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1975 * enabled. */
1976 stats->rx_packets = dev_stats.rx_packets;
1977 stats->rx_bytes = dev_stats.rx_bytes;
1978 stats->tx_packets = dev_stats.tx_packets;
1979 stats->tx_bytes = dev_stats.tx_bytes;
1980
f613a0d7
PS
1981 stats->rx_errors += dev_stats.rx_errors;
1982 stats->tx_errors += dev_stats.tx_errors;
1983 stats->rx_dropped += dev_stats.rx_dropped;
1984 stats->tx_dropped += dev_stats.tx_dropped;
1985 stats->multicast += dev_stats.multicast;
1986 stats->collisions += dev_stats.collisions;
1987 stats->rx_length_errors += dev_stats.rx_length_errors;
1988 stats->rx_over_errors += dev_stats.rx_over_errors;
1989 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1990 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1991 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1992 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1993 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1994 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1995 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1996 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1997 stats->tx_window_errors += dev_stats.tx_window_errors;
1998 }
86383816
BP
1999 ovs_mutex_unlock(&netdev->mutex);
2000
2001 return error;
f613a0d7
PS
2002}
2003
2004/* Retrieves current device stats for 'netdev-tap' netdev or
2005 * netdev-internal. */
2006static int
15aee116 2007netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 2008{
b5d57fc8 2009 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
2010 struct netdev_stats dev_stats;
2011 int error;
2012
86383816 2013 ovs_mutex_lock(&netdev->mutex);
f613a0d7 2014 get_stats_via_vport(netdev_, stats);
35eef899 2015 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 2016 if (error) {
86383816
BP
2017 if (!netdev->vport_stats_error) {
2018 error = 0;
8b61709d 2019 }
86383816
BP
2020 } else if (netdev->vport_stats_error) {
2021 /* Transmit and receive stats will appear to be swapped relative to the
2022 * other ports since we are the one sending the data, not a remote
2023 * computer. For consistency, we swap them back here. This does not
2024 * apply if we are getting stats from the vport layer because it always
2025 * tracks stats from the perspective of the switch. */
fe6b0e03 2026
f613a0d7 2027 *stats = dev_stats;
92df599c
JG
2028 swap_uint64(&stats->rx_packets, &stats->tx_packets);
2029 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
2030 swap_uint64(&stats->rx_errors, &stats->tx_errors);
2031 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
2032 stats->rx_length_errors = 0;
2033 stats->rx_over_errors = 0;
2034 stats->rx_crc_errors = 0;
2035 stats->rx_frame_errors = 0;
2036 stats->rx_fifo_errors = 0;
2037 stats->rx_missed_errors = 0;
2038 stats->tx_aborted_errors = 0;
2039 stats->tx_carrier_errors = 0;
2040 stats->tx_fifo_errors = 0;
2041 stats->tx_heartbeat_errors = 0;
2042 stats->tx_window_errors = 0;
f613a0d7 2043 } else {
04c881eb
AZ
2044 /* Use kernel netdev's packet and byte counts since vport counters
2045 * do not reflect packet counts on the wire when GSO, TSO or GRO
2046 * are enabled. */
2047 stats->rx_packets = dev_stats.tx_packets;
2048 stats->rx_bytes = dev_stats.tx_bytes;
2049 stats->tx_packets = dev_stats.rx_packets;
2050 stats->tx_bytes = dev_stats.rx_bytes;
2051
f613a0d7
PS
2052 stats->rx_dropped += dev_stats.tx_dropped;
2053 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 2054
f613a0d7
PS
2055 stats->rx_errors += dev_stats.tx_errors;
2056 stats->tx_errors += dev_stats.rx_errors;
2057
2058 stats->multicast += dev_stats.multicast;
2059 stats->collisions += dev_stats.collisions;
2060 }
22dcb534 2061 stats->tx_dropped += netdev->tx_dropped;
86383816
BP
2062 ovs_mutex_unlock(&netdev->mutex);
2063
2064 return error;
8b61709d
BP
2065}
2066
bba1e6f3
PS
2067static int
2068netdev_internal_get_stats(const struct netdev *netdev_,
2069 struct netdev_stats *stats)
2070{
b5d57fc8 2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2072 int error;
bba1e6f3 2073
86383816 2074 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 2075 get_stats_via_vport(netdev_, stats);
86383816
BP
2076 error = netdev->vport_stats_error;
2077 ovs_mutex_unlock(&netdev->mutex);
2078
2079 return error;
bba1e6f3
PS
2080}
2081
51f87458 2082static void
b5d57fc8 2083netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
2084{
2085 struct ethtool_cmd ecmd;
6c038611 2086 uint32_t speed;
8b61709d
BP
2087 int error;
2088
b5d57fc8 2089 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
2090 return;
2091 }
2092
ab985a77 2093 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2094 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 2095 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
2096 ETHTOOL_GSET, "ETHTOOL_GSET");
2097 if (error) {
51f87458 2098 goto out;
8b61709d
BP
2099 }
2100
2101 /* Supported features. */
b5d57fc8 2102 netdev->supported = 0;
8b61709d 2103 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 2104 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
2105 }
2106 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 2107 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
2108 }
2109 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 2110 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
2111 }
2112 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 2113 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
2114 }
2115 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 2116 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 2117 }
67bed84c
SH
2118 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
2119 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 2120 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 2121 }
67bed84c
SH
2122 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
2123 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
2124 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
2125 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 2126 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 2127 }
67bed84c
SH
2128 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
2129 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
2130 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
2131 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
2132 netdev->supported |= NETDEV_F_40GB_FD;
2133 }
8b61709d 2134 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 2135 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
2136 }
2137 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 2138 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
2139 }
2140 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 2141 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
2142 }
2143 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 2144 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
2145 }
2146 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 2147 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2148 }
2149
2150 /* Advertised features. */
b5d57fc8 2151 netdev->advertised = 0;
8b61709d 2152 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 2153 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
2154 }
2155 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 2156 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
2157 }
2158 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 2159 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
2160 }
2161 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 2162 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
2163 }
2164 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 2165 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 2166 }
67bed84c
SH
2167 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2168 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 2169 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 2170 }
67bed84c
SH
2171 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2172 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2173 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2174 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 2175 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 2176 }
67bed84c
SH
2177 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2178 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2179 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2180 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2181 netdev->advertised |= NETDEV_F_40GB_FD;
2182 }
8b61709d 2183 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 2184 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
2185 }
2186 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 2187 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
2188 }
2189 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 2190 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
2191 }
2192 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 2193 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
2194 }
2195 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 2196 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2197 }
2198
2199 /* Current settings. */
0c615356 2200 speed = ethtool_cmd_speed(&ecmd);
6c038611 2201 if (speed == SPEED_10) {
b5d57fc8 2202 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 2203 } else if (speed == SPEED_100) {
b5d57fc8 2204 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 2205 } else if (speed == SPEED_1000) {
b5d57fc8 2206 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 2207 } else if (speed == SPEED_10000) {
b5d57fc8 2208 netdev->current = NETDEV_F_10GB_FD;
6c038611 2209 } else if (speed == 40000) {
b5d57fc8 2210 netdev->current = NETDEV_F_40GB_FD;
6c038611 2211 } else if (speed == 100000) {
b5d57fc8 2212 netdev->current = NETDEV_F_100GB_FD;
6c038611 2213 } else if (speed == 1000000) {
b5d57fc8 2214 netdev->current = NETDEV_F_1TB_FD;
8b61709d 2215 } else {
b5d57fc8 2216 netdev->current = 0;
8b61709d
BP
2217 }
2218
2219 if (ecmd.port == PORT_TP) {
b5d57fc8 2220 netdev->current |= NETDEV_F_COPPER;
8b61709d 2221 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 2222 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
2223 }
2224
2225 if (ecmd.autoneg) {
b5d57fc8 2226 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
2227 }
2228
51f87458 2229out:
b5d57fc8
BP
2230 netdev->cache_valid |= VALID_FEATURES;
2231 netdev->get_features_error = error;
51f87458
PS
2232}
2233
887ed8b2
BP
2234/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2235 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2236 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2237static int
2238netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2239 enum netdev_features *current,
2240 enum netdev_features *advertised,
2241 enum netdev_features *supported,
2242 enum netdev_features *peer)
51f87458 2243{
b5d57fc8 2244 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2245 int error;
51f87458 2246
86383816 2247 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2248 if (netdev_linux_netnsid_is_remote(netdev)) {
2249 error = EOPNOTSUPP;
2250 goto exit;
2251 }
2252
b5d57fc8 2253 netdev_linux_read_features(netdev);
b5d57fc8
BP
2254 if (!netdev->get_features_error) {
2255 *current = netdev->current;
2256 *advertised = netdev->advertised;
2257 *supported = netdev->supported;
887ed8b2 2258 *peer = 0; /* XXX */
51f87458 2259 }
86383816 2260 error = netdev->get_features_error;
86383816 2261
e0e2410d
FL
2262exit:
2263 ovs_mutex_unlock(&netdev->mutex);
86383816 2264 return error;
8b61709d
BP
2265}
2266
2267/* Set the features advertised by 'netdev' to 'advertise'. */
2268static int
86383816 2269netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2270 enum netdev_features advertise)
8b61709d 2271{
86383816 2272 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2273 struct ethtool_cmd ecmd;
2274 int error;
2275
86383816
BP
2276 ovs_mutex_lock(&netdev->mutex);
2277
ab985a77 2278 COVERAGE_INC(netdev_get_ethtool);
e0e2410d
FL
2279
2280 if (netdev_linux_netnsid_is_remote(netdev)) {
2281 error = EOPNOTSUPP;
2282 goto exit;
2283 }
2284
8b61709d 2285 memset(&ecmd, 0, sizeof ecmd);
86383816 2286 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2287 ETHTOOL_GSET, "ETHTOOL_GSET");
2288 if (error) {
86383816 2289 goto exit;
8b61709d
BP
2290 }
2291
2292 ecmd.advertising = 0;
6c038611 2293 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2294 ecmd.advertising |= ADVERTISED_10baseT_Half;
2295 }
6c038611 2296 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2297 ecmd.advertising |= ADVERTISED_10baseT_Full;
2298 }
6c038611 2299 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2300 ecmd.advertising |= ADVERTISED_100baseT_Half;
2301 }
6c038611 2302 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2303 ecmd.advertising |= ADVERTISED_100baseT_Full;
2304 }
6c038611 2305 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2306 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2307 }
6c038611 2308 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2309 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2310 }
6c038611 2311 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2312 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2313 }
6c038611 2314 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2315 ecmd.advertising |= ADVERTISED_TP;
2316 }
6c038611 2317 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2318 ecmd.advertising |= ADVERTISED_FIBRE;
2319 }
6c038611 2320 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2321 ecmd.advertising |= ADVERTISED_Autoneg;
2322 }
6c038611 2323 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2324 ecmd.advertising |= ADVERTISED_Pause;
2325 }
6c038611 2326 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2327 ecmd.advertising |= ADVERTISED_Asym_Pause;
2328 }
ab985a77 2329 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2330 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2331 ETHTOOL_SSET, "ETHTOOL_SSET");
2332
2333exit:
2334 ovs_mutex_unlock(&netdev->mutex);
2335 return error;
8b61709d
BP
2336}
2337
e7f6ba22
PJV
2338static struct tc_police
2339tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2340{
2341 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2342 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2343 struct tc_police police;
2344 struct tc_ratespec rate;
2345 int mtu = 65535;
2346
2347 memset(&rate, 0, sizeof rate);
2348 rate.rate = bps;
2349 rate.cell_log = tc_calc_cell_log(mtu);
2350 rate.mpu = ETH_TOTAL_MIN;
2351
2352 memset(&police, 0, sizeof police);
2353 police.burst = tc_bytes_to_ticks(bps, bsize);
2354 police.action = TC_POLICE_SHOT;
2355 police.rate = rate;
2356 police.mtu = mtu;
2357
2358 return police;
2359}
2360
/* Appends a "police" action built from 'police' to 'request'.
 *
 * The action kind is followed by a nested TCA_ACT_OPTIONS attribute holding
 * the policer parameters (TCA_POLICE_TBF), the rate table for 'police.rate'
 * (TCA_POLICE_RATE) and a TCA_POLICE_RESULT of TC_ACT_UNSPEC. */
static void
nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
{
    size_t options_ofs;

    nl_msg_put_string(request, TCA_ACT_KIND, "police");

    options_ofs = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
    {
        nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
        tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
        nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
    }
    nl_msg_end_nested(request, options_ofs);
}
2373
2374static int
2375tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
2376 uint32_t kbits_burst)
2377{
2378 uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
2379 size_t basic_offset, action_offset, inner_offset;
2380 uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
2381 int ifindex, index, err = 0;
2382 struct tc_police pol_act;
2383 uint32_t block_id = 0;
2384 struct ofpbuf request;
2385 struct ofpbuf *reply;
2386 struct tcmsg *tcmsg;
2387 uint32_t handle = 1;
2388
2389 err = get_ifindex(netdev, &ifindex);
2390 if (err) {
2391 return err;
2392 }
2393
2394 index = block_id ? TCM_IFINDEX_MAGIC_BLOCK : ifindex;
2395 tcmsg = tc_make_request(index, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
2396 &request);
2397 tcmsg->tcm_parent = block_id ? : TC_INGRESS_PARENT;
2398 tcmsg->tcm_info = tc_make_handle(prio, eth_type);
2399 tcmsg->tcm_handle = handle;
2400
2401 pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
2402 nl_msg_put_string(&request, TCA_KIND, "matchall");
2403 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2404 action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
2405 inner_offset = nl_msg_start_nested(&request, 1);
2406 nl_msg_put_act_police(&request, pol_act);
2407 nl_msg_end_nested(&request, inner_offset);
2408 nl_msg_end_nested(&request, action_offset);
2409 nl_msg_end_nested(&request, basic_offset);
2410
2411 err = tc_transact(&request, &reply);
2412 if (!err) {
2413 struct tcmsg *tc =
2414 ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
2415 ofpbuf_delete(reply);
2416 }
2417
2418 return err;
2419}
2420
2421static int
2422tc_del_matchall_policer(struct netdev *netdev)
2423{
2424 uint32_t block_id = 0;
2425 int ifindex;
2426 int err;
2427
2428 err = get_ifindex(netdev, &ifindex);
2429 if (err) {
2430 return err;
2431 }
2432
2433 err = tc_del_filter(ifindex, TC_RESERVED_PRIORITY_POLICE, 1, block_id);
2434 if (err) {
2435 return err;
2436 }
2437
2438 return 0;
2439}
2440
f8500004
JP
2441/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2442 * successful, otherwise a positive errno value. */
8b61709d 2443static int
b5d57fc8 2444netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2445 uint32_t kbits_rate, uint32_t kbits_burst)
2446{
b5d57fc8
BP
2447 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2448 const char *netdev_name = netdev_get_name(netdev_);
7874bdff 2449 int ifindex;
f8500004 2450 int error;
8b61709d 2451
80a86fbe 2452 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2453 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2454 : kbits_burst); /* Stick with user-specified value. */
2455
86383816 2456 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2457 if (netdev_linux_netnsid_is_remote(netdev)) {
2458 error = EOPNOTSUPP;
2459 goto out;
2460 }
2461
b5d57fc8 2462 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2463 error = netdev->netdev_policing_error;
2464 if (error || (netdev->kbits_rate == kbits_rate &&
2465 netdev->kbits_burst == kbits_burst)) {
c9f71668 2466 /* Assume that settings haven't changed since we last set them. */
86383816 2467 goto out;
c9f71668 2468 }
b5d57fc8 2469 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2470 }
2471
7874bdff
RD
2472 error = get_ifindex(netdev_, &ifindex);
2473 if (error) {
2474 goto out;
2475 }
2476
e7f6ba22
PJV
2477 /* Use matchall for policing when offloadling ovs with tc-flower. */
2478 if (netdev_is_flow_api_enabled()) {
2479 error = tc_del_matchall_policer(netdev_);
2480 if (kbits_rate) {
2481 error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
2482 }
2483 ovs_mutex_unlock(&netdev->mutex);
2484 return error;
2485 }
2486
ac8c3412 2487 COVERAGE_INC(netdev_set_policing);
f8500004 2488 /* Remove any existing ingress qdisc. */
093c9458 2489 error = tc_add_del_ingress_qdisc(ifindex, false, 0);
f8500004
JP
2490 if (error) {
2491 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2492 netdev_name, ovs_strerror(error));
c9f71668 2493 goto out;
f8500004
JP
2494 }
2495
8b61709d 2496 if (kbits_rate) {
093c9458 2497 error = tc_add_del_ingress_qdisc(ifindex, true, 0);
f8500004
JP
2498 if (error) {
2499 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2500 netdev_name, ovs_strerror(error));
c9f71668 2501 goto out;
8b61709d
BP
2502 }
2503
b5d57fc8 2504 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2505 if (error){
2506 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2507 netdev_name, ovs_strerror(error));
c9f71668 2508 goto out;
8b61709d 2509 }
8b61709d
BP
2510 }
2511
b5d57fc8
BP
2512 netdev->kbits_rate = kbits_rate;
2513 netdev->kbits_burst = kbits_burst;
f8500004 2514
c9f71668
PS
2515out:
2516 if (!error || error == ENODEV) {
b5d57fc8
BP
2517 netdev->netdev_policing_error = error;
2518 netdev->cache_valid |= VALID_POLICING;
c9f71668 2519 }
86383816 2520 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2521 return error;
8b61709d
BP
2522}
2523
c1c9c9c4
BP
2524static int
2525netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2526 struct sset *types)
c1c9c9c4 2527{
559eb230 2528 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2529 for (opsp = tcs; *opsp != NULL; opsp++) {
2530 const struct tc_ops *ops = *opsp;
2531 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2532 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2533 }
2534 }
2535 return 0;
2536}
2537
2538static const struct tc_ops *
2539tc_lookup_ovs_name(const char *name)
2540{
559eb230 2541 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2542
2543 for (opsp = tcs; *opsp != NULL; opsp++) {
2544 const struct tc_ops *ops = *opsp;
2545 if (!strcmp(name, ops->ovs_name)) {
2546 return ops;
2547 }
2548 }
2549 return NULL;
2550}
2551
2552static const struct tc_ops *
2553tc_lookup_linux_name(const char *name)
2554{
559eb230 2555 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2556
2557 for (opsp = tcs; *opsp != NULL; opsp++) {
2558 const struct tc_ops *ops = *opsp;
2559 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2560 return ops;
2561 }
2562 }
2563 return NULL;
2564}
2565
93b13be8 2566static struct tc_queue *
b5d57fc8 2567tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2568 size_t hash)
2569{
b5d57fc8 2570 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2571 struct tc_queue *queue;
2572
b5d57fc8 2573 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2574 if (queue->queue_id == queue_id) {
2575 return queue;
2576 }
2577 }
2578 return NULL;
2579}
2580
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The bucket hash is hash_int() of 'queue_id' with basis 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2586
c1c9c9c4
BP
2587static int
2588netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2589 const char *type,
2590 struct netdev_qos_capabilities *caps)
2591{
2592 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2593 if (!ops) {
2594 return EOPNOTSUPP;
2595 }
2596 caps->n_queues = ops->n_queues;
2597 return 0;
2598}
2599
2600static int
b5d57fc8 2601netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2602 const char **typep, struct smap *details)
c1c9c9c4 2603{
b5d57fc8 2604 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2605 int error;
2606
86383816 2607 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2608 if (netdev_linux_netnsid_is_remote(netdev)) {
2609 error = EOPNOTSUPP;
2610 goto exit;
2611 }
2612
b5d57fc8 2613 error = tc_query_qdisc(netdev_);
86383816
BP
2614 if (!error) {
2615 *typep = netdev->tc->ops->ovs_name;
2616 error = (netdev->tc->ops->qdisc_get
2617 ? netdev->tc->ops->qdisc_get(netdev_, details)
2618 : 0);
c1c9c9c4
BP
2619 }
2620
e0e2410d
FL
2621exit:
2622 ovs_mutex_unlock(&netdev->mutex);
86383816 2623 return error;
c1c9c9c4
BP
2624}
2625
2626static int
b5d57fc8 2627netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2628 const char *type, const struct smap *details)
c1c9c9c4 2629{
b5d57fc8 2630 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2631 const struct tc_ops *new_ops;
2632 int error;
2633
2634 new_ops = tc_lookup_ovs_name(type);
2635 if (!new_ops || !new_ops->tc_install) {
2636 return EOPNOTSUPP;
2637 }
2638
6cf888b8
BS
2639 if (new_ops == &tc_ops_noop) {
2640 return new_ops->tc_install(netdev_, details);
2641 }
2642
86383816 2643 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2644 if (netdev_linux_netnsid_is_remote(netdev)) {
2645 error = EOPNOTSUPP;
2646 goto exit;
2647 }
2648
b5d57fc8 2649 error = tc_query_qdisc(netdev_);
c1c9c9c4 2650 if (error) {
86383816 2651 goto exit;
c1c9c9c4
BP
2652 }
2653
b5d57fc8 2654 if (new_ops == netdev->tc->ops) {
86383816 2655 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2656 } else {
2657 /* Delete existing qdisc. */
b5d57fc8 2658 error = tc_del_qdisc(netdev_);
c1c9c9c4 2659 if (error) {
86383816 2660 goto exit;
c1c9c9c4 2661 }
b5d57fc8 2662 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2663
2664 /* Install new qdisc. */
b5d57fc8
BP
2665 error = new_ops->tc_install(netdev_, details);
2666 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2667 }
86383816
BP
2668
2669exit:
2670 ovs_mutex_unlock(&netdev->mutex);
2671 return error;
c1c9c9c4
BP
2672}
2673
2674static int
b5d57fc8 2675netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2676 unsigned int queue_id, struct smap *details)
c1c9c9c4 2677{
b5d57fc8 2678 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2679 int error;
2680
86383816 2681 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2682 if (netdev_linux_netnsid_is_remote(netdev)) {
2683 error = EOPNOTSUPP;
2684 goto exit;
2685 }
2686
b5d57fc8 2687 error = tc_query_qdisc(netdev_);
86383816 2688 if (!error) {
b5d57fc8 2689 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2690 error = (queue
b5d57fc8 2691 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2692 : ENOENT);
c1c9c9c4 2693 }
86383816 2694
e0e2410d
FL
2695exit:
2696 ovs_mutex_unlock(&netdev->mutex);
86383816 2697 return error;
c1c9c9c4
BP
2698}
2699
2700static int
b5d57fc8 2701netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2702 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2703{
b5d57fc8 2704 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2705 int error;
2706
86383816 2707 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2708 if (netdev_linux_netnsid_is_remote(netdev)) {
2709 error = EOPNOTSUPP;
2710 goto exit;
2711 }
2712
b5d57fc8 2713 error = tc_query_qdisc(netdev_);
86383816
BP
2714 if (!error) {
2715 error = (queue_id < netdev->tc->ops->n_queues
2716 && netdev->tc->ops->class_set
2717 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2718 : EINVAL);
c1c9c9c4
BP
2719 }
2720
e0e2410d
FL
2721exit:
2722 ovs_mutex_unlock(&netdev->mutex);
86383816 2723 return error;
c1c9c9c4
BP
2724}
2725
2726static int
b5d57fc8 2727netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2728{
b5d57fc8 2729 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2730 int error;
2731
86383816 2732 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2733 if (netdev_linux_netnsid_is_remote(netdev)) {
2734 error = EOPNOTSUPP;
2735 goto exit;
2736 }
2737
b5d57fc8 2738 error = tc_query_qdisc(netdev_);
86383816
BP
2739 if (!error) {
2740 if (netdev->tc->ops->class_delete) {
2741 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2742 error = (queue
2743 ? netdev->tc->ops->class_delete(netdev_, queue)
2744 : ENOENT);
2745 } else {
2746 error = EINVAL;
2747 }
c1c9c9c4 2748 }
86383816 2749
e0e2410d
FL
2750exit:
2751 ovs_mutex_unlock(&netdev->mutex);
86383816 2752 return error;
c1c9c9c4
BP
2753}
2754
2755static int
b5d57fc8 2756netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2757 unsigned int queue_id,
2758 struct netdev_queue_stats *stats)
2759{
b5d57fc8 2760 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2761 int error;
2762
86383816 2763 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2764 if (netdev_linux_netnsid_is_remote(netdev)) {
2765 error = EOPNOTSUPP;
2766 goto exit;
2767 }
2768
b5d57fc8 2769 error = tc_query_qdisc(netdev_);
86383816
BP
2770 if (!error) {
2771 if (netdev->tc->ops->class_get_stats) {
2772 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2773 if (queue) {
2774 stats->created = queue->created;
2775 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2776 stats);
2777 } else {
2778 error = ENOENT;
2779 }
2780 } else {
2781 error = EOPNOTSUPP;
6dc34a0d 2782 }
c1c9c9c4 2783 }
86383816 2784
e0e2410d
FL
2785exit:
2786 ovs_mutex_unlock(&netdev->mutex);
86383816 2787 return error;
c1c9c9c4
BP
2788}
2789
d57695d7
JS
2790struct queue_dump_state {
2791 struct nl_dump dump;
2792 struct ofpbuf buf;
2793};
2794
23a98ffe 2795static bool
d57695d7 2796start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2797{
2798 struct ofpbuf request;
2799 struct tcmsg *tcmsg;
2800
7874bdff 2801 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2802 if (!tcmsg) {
2803 return false;
2804 }
3c4de644 2805 tcmsg->tcm_parent = 0;
d57695d7 2806 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2807 ofpbuf_uninit(&request);
d57695d7
JS
2808
2809 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2810 return true;
c1c9c9c4
BP
2811}
2812
d57695d7
JS
2813static int
2814finish_queue_dump(struct queue_dump_state *state)
2815{
2816 ofpbuf_uninit(&state->buf);
2817 return nl_dump_done(&state->dump);
2818}
2819
89454bf4
BP
/* Iteration state shared by netdev_linux_queue_dump_start(), _next() and
 * _done(): a snapshot of the queue IDs taken when the dump started. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue IDs (heap-allocated). */
    size_t cur_queue;           /* Index of the next entry to visit. */
    size_t n_queues;            /* Number of entries in 'queues'. */
};
2825
c1c9c9c4 2826static int
89454bf4 2827netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2828{
e0e2410d 2829 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2830 int error;
2831
86383816 2832 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2833 if (netdev_linux_netnsid_is_remote(netdev)) {
2834 error = EOPNOTSUPP;
2835 goto exit;
2836 }
2837
b5d57fc8 2838 error = tc_query_qdisc(netdev_);
86383816
BP
2839 if (!error) {
2840 if (netdev->tc->ops->class_get) {
89454bf4
BP
2841 struct netdev_linux_queue_state *state;
2842 struct tc_queue *queue;
2843 size_t i;
2844
2845 *statep = state = xmalloc(sizeof *state);
2846 state->n_queues = hmap_count(&netdev->tc->queues);
2847 state->cur_queue = 0;
2848 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2849
2850 i = 0;
2851 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2852 state->queues[i++] = queue->queue_id;
86383816 2853 }
c1c9c9c4 2854 } else {
86383816 2855 error = EOPNOTSUPP;
c1c9c9c4
BP
2856 }
2857 }
c1c9c9c4 2858
e0e2410d
FL
2859exit:
2860 ovs_mutex_unlock(&netdev->mutex);
86383816 2861 return error;
c1c9c9c4
BP
2862}
2863
89454bf4
BP
2864static int
2865netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2866 unsigned int *queue_idp, struct smap *details)
2867{
e0e2410d 2868 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
89454bf4
BP
2869 struct netdev_linux_queue_state *state = state_;
2870 int error = EOF;
2871
2872 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2873 if (netdev_linux_netnsid_is_remote(netdev)) {
2874 error = EOPNOTSUPP;
2875 goto exit;
2876 }
2877
89454bf4
BP
2878 while (state->cur_queue < state->n_queues) {
2879 unsigned int queue_id = state->queues[state->cur_queue++];
2880 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2881
2882 if (queue) {
2883 *queue_idp = queue_id;
2884 error = netdev->tc->ops->class_get(netdev_, queue, details);
2885 break;
2886 }
2887 }
89454bf4 2888
e0e2410d
FL
2889exit:
2890 ovs_mutex_unlock(&netdev->mutex);
89454bf4
BP
2891 return error;
2892}
2893
2894static int
2895netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2896 void *state_)
2897{
2898 struct netdev_linux_queue_state *state = state_;
2899
2900 free(state->queues);
2901 free(state);
2902 return 0;
2903}
2904
c1c9c9c4 2905static int
b5d57fc8 2906netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2907 netdev_dump_queue_stats_cb *cb, void *aux)
2908{
b5d57fc8 2909 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2910 int error;
2911
86383816 2912 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2913 if (netdev_linux_netnsid_is_remote(netdev)) {
2914 error = EOPNOTSUPP;
2915 goto exit;
2916 }
2917
b5d57fc8 2918 error = tc_query_qdisc(netdev_);
86383816 2919 if (!error) {
d57695d7 2920 struct queue_dump_state state;
c1c9c9c4 2921
86383816
BP
2922 if (!netdev->tc->ops->class_dump_stats) {
2923 error = EOPNOTSUPP;
d57695d7 2924 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2925 error = ENODEV;
2926 } else {
2927 struct ofpbuf msg;
2928 int retval;
2929
d57695d7 2930 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2931 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2932 cb, aux);
2933 if (retval) {
2934 error = retval;
2935 }
2936 }
2937
d57695d7 2938 retval = finish_queue_dump(&state);
86383816
BP
2939 if (retval) {
2940 error = retval;
2941 }
c1c9c9c4
BP
2942 }
2943 }
2944
e0e2410d
FL
2945exit:
2946 ovs_mutex_unlock(&netdev->mutex);
86383816 2947 return error;
c1c9c9c4
BP
2948}
2949
8b61709d 2950static int
f1acd62b
BP
2951netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2952 struct in_addr netmask)
8b61709d 2953{
b5d57fc8 2954 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2955 int error;
2956
86383816 2957 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2958 if (netdev_linux_netnsid_is_remote(netdev)) {
2959 error = EOPNOTSUPP;
2960 goto exit;
2961 }
2962
f1acd62b 2963 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2964 if (!error) {
f1acd62b 2965 if (address.s_addr != INADDR_ANY) {
8b61709d 2966 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2967 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2968 }
2969 }
49af9a3d 2970
e0e2410d 2971exit:
86383816 2972 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
2973 return error;
2974}
2975
7df6932e
AW
2976/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2977 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2978 * error. */
8b61709d 2979static int
a8704b50
PS
2980netdev_linux_get_addr_list(const struct netdev *netdev_,
2981 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2982{
b5d57fc8 2983 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2984 int error;
86383816
BP
2985
2986 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2987 if (netdev_linux_netnsid_is_remote(netdev)) {
2988 error = EOPNOTSUPP;
2989 goto exit;
2990 }
2991
a8704b50 2992 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816 2993
e0e2410d
FL
2994exit:
2995 ovs_mutex_unlock(&netdev->mutex);
7df6932e 2996 return error;
8b61709d
BP
2997}
2998
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  '*sa'
 * is zeroed first so no stale bytes remain. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
3011
3012static int
3013do_set_addr(struct netdev *netdev,
3014 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3015{
3016 struct ifreq ifr;
149f577a 3017
259e0b1a
BP
3018 make_in4_sockaddr(&ifr.ifr_addr, addr);
3019 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3020 ioctl_name);
8b61709d
BP
3021}
3022
3023/* Adds 'router' as a default IP gateway. */
3024static int
67a4917b 3025netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
3026{
3027 struct in_addr any = { INADDR_ANY };
3028 struct rtentry rt;
3029 int error;
3030
3031 memset(&rt, 0, sizeof rt);
3032 make_in4_sockaddr(&rt.rt_dst, any);
3033 make_in4_sockaddr(&rt.rt_gateway, router);
3034 make_in4_sockaddr(&rt.rt_genmask, any);
3035 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 3036 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 3037 if (error) {
10a89ef0 3038 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
3039 }
3040 return error;
3041}
3042
f1acd62b
BP
3043static int
3044netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
3045 char **netdev_name)
3046{
3047 static const char fn[] = "/proc/net/route";
3048 FILE *stream;
3049 char line[256];
3050 int ln;
3051
3052 *netdev_name = NULL;
3053 stream = fopen(fn, "r");
3054 if (stream == NULL) {
10a89ef0 3055 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
3056 return errno;
3057 }
3058
3059 ln = 0;
3060 while (fgets(line, sizeof line, stream)) {
3061 if (++ln >= 2) {
3062 char iface[17];
dbba996b 3063 ovs_be32 dest, gateway, mask;
f1acd62b
BP
3064 int refcnt, metric, mtu;
3065 unsigned int flags, use, window, irtt;
3066
c2c28dfd
BP
3067 if (!ovs_scan(line,
3068 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
3069 " %d %u %u\n",
3070 iface, &dest, &gateway, &flags, &refcnt,
3071 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 3072 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
3073 fn, ln, line);
3074 continue;
3075 }
3076 if (!(flags & RTF_UP)) {
3077 /* Skip routes that aren't up. */
3078 continue;
3079 }
3080
3081 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 3082 * network byte order, so we don't need need any endian
f1acd62b
BP
3083 * conversions here. */
3084 if ((dest & mask) == (host->s_addr & mask)) {
3085 if (!gateway) {
3086 /* The host is directly reachable. */
3087 next_hop->s_addr = 0;
3088 } else {
3089 /* To reach the host, we must go through a gateway. */
3090 next_hop->s_addr = gateway;
3091 }
3092 *netdev_name = xstrdup(iface);
3093 fclose(stream);
3094 return 0;
3095 }
3096 }
3097 }
3098
3099 fclose(stream);
3100 return ENXIO;
3101}
3102
e210037e 3103static int
b5d57fc8 3104netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 3105{
b5d57fc8 3106 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
3107 int error = 0;
3108
86383816 3109 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
3110 if (!(netdev->cache_valid & VALID_DRVINFO)) {
3111 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
3112
3113 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
3114 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
3115 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
3116 cmd,
3117 ETHTOOL_GDRVINFO,
3118 "ETHTOOL_GDRVINFO");
3119 if (!error) {
b5d57fc8 3120 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
3121 }
3122 }
e210037e 3123
e210037e 3124 if (!error) {
b5d57fc8
BP
3125 smap_add(smap, "driver_name", netdev->drvinfo.driver);
3126 smap_add(smap, "driver_version", netdev->drvinfo.version);
3127 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 3128 }
86383816
BP
3129 ovs_mutex_unlock(&netdev->mutex);
3130
e210037e
AE
3131 return error;
3132}
3133
4f925bd3 3134static int
275707c3
EJ
3135netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3136 struct smap *smap)
4f925bd3 3137{
79f1cbe9 3138 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
3139 return 0;
3140}
3141
25db83be
JH
3142static uint32_t
3143netdev_linux_get_block_id(struct netdev *netdev_)
3144{
3145 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3146 uint32_t block_id = 0;
3147
3148 ovs_mutex_lock(&netdev->mutex);
3149 /* Ensure the linux netdev has had its fields populated. */
3150 if (!(netdev->cache_valid & VALID_IFINDEX)) {
3151 netdev_linux_update_via_netlink(netdev);
3152 }
3153
3154 /* Only assigning block ids to linux netdevs that are LAG masters. */
3155 if (netdev->is_lag_master) {
3156 block_id = netdev->ifindex;
3157 }
3158 ovs_mutex_unlock(&netdev->mutex);
3159
3160 return block_id;
3161}
3162
8b61709d
BP
3163/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3164 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3165 * returns 0. Otherwise, it returns a positive errno value; in particular,
3166 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3167static int
3168netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 3169 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
3170{
3171 struct arpreq r;
c100e025 3172 struct sockaddr_in sin;
8b61709d
BP
3173 int retval;
3174
3175 memset(&r, 0, sizeof r);
f2cc621b 3176 memset(&sin, 0, sizeof sin);
c100e025
BP
3177 sin.sin_family = AF_INET;
3178 sin.sin_addr.s_addr = ip;
3179 sin.sin_port = 0;
3180 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
3181 r.arp_ha.sa_family = ARPHRD_ETHER;
3182 r.arp_flags = 0;
71d7c22f 3183 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 3184 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 3185 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
3186 if (!retval) {
3187 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
3188 } else if (retval != ENXIO) {
3189 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
3190 netdev_get_name(netdev), IP_ARGS(ip),
3191 ovs_strerror(retval));
8b61709d
BP
3192 }
3193 return retval;
3194}
3195
b24751ff 3196static unsigned int
8b61709d
BP
3197nd_to_iff_flags(enum netdev_flags nd)
3198{
b24751ff 3199 unsigned int iff = 0;
8b61709d
BP
3200 if (nd & NETDEV_UP) {
3201 iff |= IFF_UP;
3202 }
3203 if (nd & NETDEV_PROMISC) {
3204 iff |= IFF_PROMISC;
3205 }
7ba19d41
AC
3206 if (nd & NETDEV_LOOPBACK) {
3207 iff |= IFF_LOOPBACK;
3208 }
8b61709d
BP
3209 return iff;
3210}
3211
3212static int
b24751ff 3213iff_to_nd_flags(unsigned int iff)
8b61709d
BP
3214{
3215 enum netdev_flags nd = 0;
3216 if (iff & IFF_UP) {
3217 nd |= NETDEV_UP;
3218 }
3219 if (iff & IFF_PROMISC) {
3220 nd |= NETDEV_PROMISC;
3221 }
7ba19d41
AC
3222 if (iff & IFF_LOOPBACK) {
3223 nd |= NETDEV_LOOPBACK;
3224 }
8b61709d
BP
3225 return nd;
3226}
3227
3228static int
4f9f3f21
BP
3229update_flags(struct netdev_linux *netdev, enum netdev_flags off,
3230 enum netdev_flags on, enum netdev_flags *old_flagsp)
3231 OVS_REQUIRES(netdev->mutex)
8b61709d 3232{
b24751ff 3233 unsigned int old_flags, new_flags;
c37d4da4
EJ
3234 int error = 0;
3235
b5d57fc8 3236 old_flags = netdev->ifi_flags;
c37d4da4
EJ
3237 *old_flagsp = iff_to_nd_flags(old_flags);
3238 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3239 if (new_flags != old_flags) {
4f9f3f21
BP
3240 error = set_flags(netdev_get_name(&netdev->up), new_flags);
3241 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 3242 }
4f9f3f21
BP
3243
3244 return error;
3245}
3246
3247static int
3248netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
3249 enum netdev_flags on, enum netdev_flags *old_flagsp)
3250{
3251 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756819dd 3252 int error = 0;
4f9f3f21
BP
3253
3254 ovs_mutex_lock(&netdev->mutex);
756819dd
FL
3255 if (on || off) {
3256 /* Changing flags over netlink isn't support yet. */
e0e2410d
FL
3257 if (netdev_linux_netnsid_is_remote(netdev)) {
3258 error = EOPNOTSUPP;
3259 goto exit;
3260 }
756819dd
FL
3261 error = update_flags(netdev, off, on, old_flagsp);
3262 } else {
3263 /* Try reading flags over netlink, or fall back to ioctl. */
3264 if (!netdev_linux_update_via_netlink(netdev)) {
3265 *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
3266 } else {
3267 error = update_flags(netdev, off, on, old_flagsp);
3268 }
3269 }
e0e2410d
FL
3270
3271exit:
86383816 3272 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
3273 return error;
3274}
3275
89c09c1c
BP
/* Designated initializers shared by the netdev classes defined in this file.
 * Each class adds its own '.type', '.construct' and any class-specific
 * callbacks after expanding this macro. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .destruct = netdev_linux_destruct,                          \
    .dealloc = netdev_linux_dealloc,                            \
    .send = netdev_linux_send,                                  \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_construct = netdev_linux_rxq_construct,                \
    .rxq_destruct = netdev_linux_rxq_destruct,                  \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_recv = netdev_linux_rxq_recv,                          \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3319
3320const struct netdev_class netdev_linux_class = {
3321 NETDEV_LINUX_CLASS_COMMON,
3322 LINUX_FLOW_OFFLOAD_API,
3323 .type = "system",
3324 .construct = netdev_linux_construct,
3325 .get_stats = netdev_linux_get_stats,
3326 .get_features = netdev_linux_get_features,
3327 .get_status = netdev_linux_get_status,
3328 .get_block_id = netdev_linux_get_block_id
3329};
3330
3331const struct netdev_class netdev_tap_class = {
3332 NETDEV_LINUX_CLASS_COMMON,
3333 .type = "tap",
3334 .construct = netdev_linux_construct_tap,
3335 .get_stats = netdev_tap_get_stats,
3336 .get_features = netdev_linux_get_features,
3337 .get_status = netdev_linux_get_status,
3338};
3339
3340const struct netdev_class netdev_internal_class = {
3341 NETDEV_LINUX_CLASS_COMMON,
3342 .type = "internal",
3343 .construct = netdev_linux_construct,
3344 .get_stats = netdev_internal_get_stats,
3345 .get_status = netdev_internal_get_status,
3346};
8b61709d 3347\f
677d9158
JV
3348
/* CoDel traffic control class. */

/* Value stored in tc_ops_codel.n_queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET   1
#define TCA_CODEL_LIMIT    2
#define TCA_CODEL_INTERVAL 3
3358
677d9158
JV
3359struct codel {
3360 struct tc tc;
3361 uint32_t target;
3362 uint32_t limit;
3363 uint32_t interval;
3364};
3365
3366static struct codel *
3367codel_get__(const struct netdev *netdev_)
3368{
3369 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3370 return CONTAINER_OF(netdev->tc, struct codel, tc);
3371}
3372
3373static void
3374codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3375 uint32_t interval)
3376{
3377 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3378 struct codel *codel;
3379
3380 codel = xmalloc(sizeof *codel);
3381 tc_init(&codel->tc, &tc_ops_codel);
3382 codel->target = target;
3383 codel->limit = limit;
3384 codel->interval = interval;
3385
3386 netdev->tc = &codel->tc;
3387}
3388
3389static int
3390codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3391 uint32_t interval)
3392{
3393 size_t opt_offset;
3394 struct ofpbuf request;
3395 struct tcmsg *tcmsg;
3396 uint32_t otarget, olimit, ointerval;
3397 int error;
3398
3399 tc_del_qdisc(netdev);
3400
7874bdff
RD
3401 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3402 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3403 if (!tcmsg) {
3404 return ENODEV;
3405 }
3406 tcmsg->tcm_handle = tc_make_handle(1, 0);
3407 tcmsg->tcm_parent = TC_H_ROOT;
3408
3409 otarget = target ? target : 5000;
3410 olimit = limit ? limit : 10240;
3411 ointerval = interval ? interval : 100000;
3412
3413 nl_msg_put_string(&request, TCA_KIND, "codel");
3414 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3415 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3416 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3417 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3418 nl_msg_end_nested(&request, opt_offset);
3419
3420 error = tc_transact(&request, NULL);
3421 if (error) {
3422 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3423 "target %u, limit %u, interval %u error %d(%s)",
3424 netdev_get_name(netdev),
3425 otarget, olimit, ointerval,
3426 error, ovs_strerror(error));
3427 }
3428 return error;
3429}
3430
3431static void
3432codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3433 const struct smap *details, struct codel *codel)
3434{
13c1637f
BP
3435 codel->target = smap_get_ullong(details, "target", 0);
3436 codel->limit = smap_get_ullong(details, "limit", 0);
3437 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3438
3439 if (!codel->target) {
3440 codel->target = 5000;
3441 }
3442 if (!codel->limit) {
3443 codel->limit = 10240;
3444 }
3445 if (!codel->interval) {
3446 codel->interval = 100000;
3447 }
3448}
3449
3450static int
3451codel_tc_install(struct netdev *netdev, const struct smap *details)
3452{
3453 int error;
3454 struct codel codel;
3455
3456 codel_parse_qdisc_details__(netdev, details, &codel);
3457 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3458 codel.interval);
3459 if (!error) {
3460 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3461 }
3462 return error;
3463}
3464
3465static int
3466codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3467{
3468 static const struct nl_policy tca_codel_policy[] = {
3469 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3470 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3471 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3472 };
3473
3474 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3475
3476 if (!nl_parse_nested(nl_options, tca_codel_policy,
3477 attrs, ARRAY_SIZE(tca_codel_policy))) {
3478 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3479 return EPROTO;
3480 }
3481
3482 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3483 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3484 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3485 return 0;
3486}
3487
3488static int
3489codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3490{
3491 struct nlattr *nlattr;
3492 const char * kind;
3493 int error;
3494 struct codel codel;
3495
3496 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3497 if (error != 0) {
3498 return error;
3499 }
3500
3501 error = codel_parse_tca_options__(nlattr, &codel);
3502 if (error != 0) {
3503 return error;
3504 }
3505
3506 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3507 return 0;
3508}
3509
3510
3511static void
3512codel_tc_destroy(struct tc *tc)
3513{
3514 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3515 tc_destroy(tc);
3516 free(codel);
3517}
3518
3519static int
3520codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3521{
3522 const struct codel *codel = codel_get__(netdev);
3523 smap_add_format(details, "target", "%u", codel->target);
3524 smap_add_format(details, "limit", "%u", codel->limit);
3525 smap_add_format(details, "interval", "%u", codel->interval);
3526 return 0;
3527}
3528
3529static int
3530codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3531{
3532 struct codel codel;
3533
3534 codel_parse_qdisc_details__(netdev, details, &codel);
3535 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3536 codel_get__(netdev)->target = codel.target;
3537 codel_get__(netdev)->limit = codel.limit;
3538 codel_get__(netdev)->interval = codel.interval;
3539 return 0;
3540}
3541
3542static const struct tc_ops tc_ops_codel = {
89c09c1c
BP
3543 .linux_name = "codel",
3544 .ovs_name = "linux-codel",
3545 .n_queues = CODEL_N_QUEUES,
3546 .tc_install = codel_tc_install,
3547 .tc_load = codel_tc_load,
3548 .tc_destroy = codel_tc_destroy,
3549 .qdisc_get = codel_qdisc_get,
3550 .qdisc_set = codel_qdisc_set,
677d9158
JV
3551};
3552\f
/* FQ-CoDel traffic control class. */

/* Value stored in tc_ops_fqcodel.n_queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET     1
#define TCA_FQ_CODEL_LIMIT      2
#define TCA_FQ_CODEL_INTERVAL   3
#define TCA_FQ_CODEL_ECN        4
#define TCA_FQ_CODEL_FLOWS      5
#define TCA_FQ_CODEL_QUANTUM    6
3567
677d9158
JV
3568struct fqcodel {
3569 struct tc tc;
3570 uint32_t target;
3571 uint32_t limit;
3572 uint32_t interval;
3573 uint32_t flows;
3574 uint32_t quantum;
3575};
3576
3577static struct fqcodel *
3578fqcodel_get__(const struct netdev *netdev_)
3579{
3580 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3581 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3582}
3583
3584static void
3585fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3586 uint32_t interval, uint32_t flows, uint32_t quantum)
3587{
3588 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3589 struct fqcodel *fqcodel;
3590
3591 fqcodel = xmalloc(sizeof *fqcodel);
3592 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3593 fqcodel->target = target;
3594 fqcodel->limit = limit;
3595 fqcodel->interval = interval;
3596 fqcodel->flows = flows;
3597 fqcodel->quantum = quantum;
3598
3599 netdev->tc = &fqcodel->tc;
3600}
3601
3602static int
3603fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3604 uint32_t interval, uint32_t flows, uint32_t quantum)
3605{
3606 size_t opt_offset;
3607 struct ofpbuf request;
3608 struct tcmsg *tcmsg;
3609 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3610 int error;
3611
3612 tc_del_qdisc(netdev);
3613
7874bdff
RD
3614 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3615 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3616 if (!tcmsg) {
3617 return ENODEV;
3618 }
3619 tcmsg->tcm_handle = tc_make_handle(1, 0);
3620 tcmsg->tcm_parent = TC_H_ROOT;
3621
3622 otarget = target ? target : 5000;
3623 olimit = limit ? limit : 10240;
3624 ointerval = interval ? interval : 100000;
3625 oflows = flows ? flows : 1024;
3626 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3627 not mtu */
3628
3629 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3630 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3631 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3632 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3633 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3634 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3635 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3636 nl_msg_end_nested(&request, opt_offset);
3637
3638 error = tc_transact(&request, NULL);
3639 if (error) {
3640 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3641 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3642 netdev_get_name(netdev),
3643 otarget, olimit, ointerval, oflows, oquantum,
3644 error, ovs_strerror(error));
3645 }
3646 return error;
3647}
3648
3649static void
3650fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3651 const struct smap *details, struct fqcodel *fqcodel)
3652{
13c1637f
BP
3653 fqcodel->target = smap_get_ullong(details, "target", 0);
3654 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3655 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3656 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3657 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3658
677d9158
JV
3659 if (!fqcodel->target) {
3660 fqcodel->target = 5000;
3661 }
3662 if (!fqcodel->limit) {
3663 fqcodel->limit = 10240;
3664 }
3665 if (!fqcodel->interval) {
3666 fqcodel->interval = 1000000;
3667 }
3668 if (!fqcodel->flows) {
3669 fqcodel->flows = 1024;
3670 }
3671 if (!fqcodel->quantum) {
3672 fqcodel->quantum = 1514;
3673 }
3674}
3675
3676static int
3677fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3678{
3679 int error;
3680 struct fqcodel fqcodel;
3681
3682 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3683 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3684 fqcodel.interval, fqcodel.flows,
3685 fqcodel.quantum);
3686 if (!error) {
3687 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3688 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3689 }
3690 return error;
3691}
3692
3693static int
3694fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3695{
3696 static const struct nl_policy tca_fqcodel_policy[] = {
3697 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3698 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3699 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3700 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3701 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3702 };
3703
3704 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3705
3706 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3707 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3708 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3709 return EPROTO;
3710 }
3711
3712 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3713 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3714 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3715 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3716 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3717 return 0;
3718}
3719
3720static int
3721fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3722{
3723 struct nlattr *nlattr;
3724 const char * kind;
3725 int error;
3726 struct fqcodel fqcodel;
3727
3728 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3729 if (error != 0) {
3730 return error;
3731 }
3732
3733 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3734 if (error != 0) {
3735 return error;
3736 }
3737
3738 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3739 fqcodel.flows, fqcodel.quantum);
3740 return 0;
3741}
3742
3743static void
3744fqcodel_tc_destroy(struct tc *tc)
3745{
3746 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3747 tc_destroy(tc);
3748 free(fqcodel);
3749}
3750
3751static int
3752fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3753{
3754 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3755 smap_add_format(details, "target", "%u", fqcodel->target);
3756 smap_add_format(details, "limit", "%u", fqcodel->limit);
3757 smap_add_format(details, "interval", "%u", fqcodel->interval);
3758 smap_add_format(details, "flows", "%u", fqcodel->flows);
3759 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3760 return 0;
3761}
3762
3763static int
3764fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3765{
3766 struct fqcodel fqcodel;
3767
3768 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3769 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3770 fqcodel.flows, fqcodel.quantum);
3771 fqcodel_get__(netdev)->target = fqcodel.target;
3772 fqcodel_get__(netdev)->limit = fqcodel.limit;
3773 fqcodel_get__(netdev)->interval = fqcodel.interval;
3774 fqcodel_get__(netdev)->flows = fqcodel.flows;
3775 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3776 return 0;
3777}
3778
3779static const struct tc_ops tc_ops_fqcodel = {
89c09c1c
BP
3780 .linux_name = "fq_codel",
3781 .ovs_name = "linux-fq_codel",
3782 .n_queues = FQCODEL_N_QUEUES,
3783 .tc_install = fqcodel_tc_install,
3784 .tc_load = fqcodel_tc_load,
3785 .tc_destroy = fqcodel_tc_destroy,
3786 .qdisc_get = fqcodel_qdisc_get,
3787 .qdisc_set = fqcodel_qdisc_set,
677d9158
JV
3788};
3789\f
3790/* SFQ traffic control class. */
3791
3792#define SFQ_N_QUEUES 0x0000
3793
3794struct sfq {
3795 struct tc tc;
3796 uint32_t quantum;
3797 uint32_t perturb;
3798};
3799
3800static struct sfq *
3801sfq_get__(const struct netdev *netdev_)
3802{
3803 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3804 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3805}
3806
3807static void
3808sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3809{
3810 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3811 struct sfq *sfq;
3812
3813 sfq = xmalloc(sizeof *sfq);
3814 tc_init(&sfq->tc, &tc_ops_sfq);
3815 sfq->perturb = perturb;
3816 sfq->quantum = quantum;
3817
3818 netdev->tc = &sfq->tc;
3819}
3820
3821static int
3822sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3823{
3824 struct tc_sfq_qopt opt;
3825 struct ofpbuf request;
3826 struct tcmsg *tcmsg;
3827 int mtu;
3828 int mtu_error, error;
3829 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3830
3831 tc_del_qdisc(netdev);
3832
7874bdff
RD
3833 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3834 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3835 if (!tcmsg) {
3836 return ENODEV;
3837 }
3838 tcmsg->tcm_handle = tc_make_handle(1, 0);
3839 tcmsg->tcm_parent = TC_H_ROOT;
3840
3841 memset(&opt, 0, sizeof opt);
3842 if (!quantum) {
3843 if (!mtu_error) {
3844 opt.quantum = mtu; /* if we cannot find mtu, use default */
3845 }
3846 } else {
3847 opt.quantum = quantum;
3848 }
3849
3850 if (!perturb) {
3851 opt.perturb_period = 10;
3852 } else {
3853 opt.perturb_period = perturb;
3854 }
3855
3856 nl_msg_put_string(&request, TCA_KIND, "sfq");
3857 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3858
3859 error = tc_transact(&request, NULL);
3860 if (error) {
3861 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3862 "quantum %u, perturb %u error %d(%s)",
3863 netdev_get_name(netdev),
3864 opt.quantum, opt.perturb_period,
3865 error, ovs_strerror(error));
3866 }
3867 return error;
3868}
3869
3870static void
3871sfq_parse_qdisc_details__(struct netdev *netdev,
3872 const struct smap *details, struct sfq *sfq)
3873{
13c1637f
BP
3874 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3875 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3876
677d9158
JV
3877 if (!sfq->perturb) {
3878 sfq->perturb = 10;
3879 }
3880
3881 if (!sfq->quantum) {
13c1637f
BP
3882 int mtu;
3883 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3884 sfq->quantum = mtu;
3885 } else {
3886 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3887 "device without mtu");
677d9158
JV
3888 }
3889 }
3890}
3891
3892static int
3893sfq_tc_install(struct netdev *netdev, const struct smap *details)
3894{
3895 int error;
3896 struct sfq sfq;
3897
3898 sfq_parse_qdisc_details__(netdev, details, &sfq);
3899 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3900 if (!error) {
3901 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3902 }
3903 return error;
3904}
3905
3906static int
3907sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3908{
3909 const struct tc_sfq_qopt *sfq;
3910 struct nlattr *nlattr;
3911 const char * kind;
3912 int error;
3913
3914 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3915 if (error == 0) {
3916 sfq = nl_attr_get(nlattr);
61265c03 3917 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
677d9158
JV
3918 return 0;
3919 }
3920
3921 return error;
3922}
3923
3924static void
3925sfq_tc_destroy(struct tc *tc)
3926{
3927 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3928 tc_destroy(tc);
3929 free(sfq);
3930}
3931
3932static int
3933sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3934{
3935 const struct sfq *sfq = sfq_get__(netdev);
3936 smap_add_format(details, "quantum", "%u", sfq->quantum);
3937 smap_add_format(details, "perturb", "%u", sfq->perturb);
3938 return 0;
3939}
3940
3941static int
3942sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3943{
3944 struct sfq sfq;
3945
3946 sfq_parse_qdisc_details__(netdev, details, &sfq);
3947 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3948 sfq_get__(netdev)->quantum = sfq.quantum;
3949 sfq_get__(netdev)->perturb = sfq.perturb;
3950 return 0;
3951}
3952
3953static const struct tc_ops tc_ops_sfq = {
89c09c1c
BP
3954 .linux_name = "sfq",
3955 .ovs_name = "linux-sfq",
3956 .n_queues = SFQ_N_QUEUES,
3957 .tc_install = sfq_tc_install,
3958 .tc_load = sfq_tc_load,
3959 .tc_destroy = sfq_tc_destroy,
3960 .qdisc_get = sfq_qdisc_get,
3961 .qdisc_set = sfq_qdisc_set,
677d9158
JV
3962};
3963\f
2f564bb1
S
3964/* netem traffic control class. */
3965
3966struct netem {
3967 struct tc tc;
3968 uint32_t latency;
3969 uint32_t limit;
3970 uint32_t loss;
3971};
3972
3973static struct netem *
3974netem_get__(const struct netdev *netdev_)
3975{
3976 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3977 return CONTAINER_OF(netdev->tc, struct netem, tc);
3978}
3979
3980static void
3981netem_install__(struct netdev *netdev_, uint32_t latency,
3982 uint32_t limit, uint32_t loss)
3983{
3984 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3985 struct netem *netem;
3986
3987 netem = xmalloc(sizeof *netem);
3988 tc_init(&netem->tc, &tc_ops_netem);
3989 netem->latency = latency;
3990 netem->limit = limit;
3991 netem->loss = loss;
3992
3993 netdev->tc = &netem->tc;
3994}
3995
3996static int
3997netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
3998 uint32_t limit, uint32_t loss)
3999{
4000 struct tc_netem_qopt opt;
4001 struct ofpbuf request;
4002 struct tcmsg *tcmsg;
4003 int error;
4004
4005 tc_del_qdisc(netdev);
4006
4007 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4008 NLM_F_EXCL | NLM_F_CREATE, &request);
4009 if (!tcmsg) {
4010 return ENODEV;
4011 }
4012 tcmsg->tcm_handle = tc_make_handle(1, 0);
4013 tcmsg->tcm_parent = TC_H_ROOT;
4014
4015 memset(&opt, 0, sizeof opt);
4016
4017 if (!limit) {
4018 opt.limit = 1000;
4019 } else {
4020 opt.limit = limit;
4021 }
4022
4023 if (loss) {
4024 if (loss > 100) {
4025 VLOG_WARN_RL(&rl,
4026 "loss should be a percentage value between 0 to 100, "
4027 "loss was %u", loss);
4028 return EINVAL;
4029 }
4030 opt.loss = floor(UINT32_MAX * (loss / 100.0));
4031 }
4032
4033 opt.latency = tc_time_to_ticks(latency);
4034
4035 nl_msg_put_string(&request, TCA_KIND, "netem");
4036 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4037
4038 error = tc_transact(&request, NULL);
4039 if (error) {
4040 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4041 "latency %u, limit %u, loss %u error %d(%s)",
4042 netdev_get_name(netdev),
4043 opt.latency, opt.limit, opt.loss,
4044 error, ovs_strerror(error));
4045 }
4046 return error;
4047}
4048
4049static void
4050netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4051 const struct smap *details, struct netem *netem)
4052{
4053 netem->latency = smap_get_ullong(details, "latency", 0);
4054 netem->limit = smap_get_ullong(details, "limit", 0);
4055 netem->loss = smap_get_ullong(details, "loss", 0);
4056
4057 if (!netem->limit) {
4058 netem->limit = 1000;
4059 }
4060}
4061
4062static int
4063netem_tc_install(struct netdev *netdev, const struct smap *details)
4064{
4065 int error;
4066 struct netem netem;
4067
4068 netem_parse_qdisc_details__(netdev, details, &netem);
4069 error = netem_setup_qdisc__(netdev, netem.latency,
4070 netem.limit, netem.loss);
4071 if (!error) {
4072 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4073 }
4074 return error;
4075}
4076
4077static int
4078netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4079{
4080 const struct tc_netem_qopt *netem;
4081 struct nlattr *nlattr;
4082 const char *kind;
4083 int error;
4084
4085 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4086 if (error == 0) {
4087 netem = nl_attr_get(nlattr);
4088 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4089 return 0;
4090 }
4091
4092 return error;
4093}
4094
4095static void
4096netem_tc_destroy(struct tc *tc)
4097{
4098 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4099 tc_destroy(tc);
4100 free(netem);
4101}
4102
4103static int
4104netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4105{
4106 const struct netem *netem = netem_get__(netdev);
4107 smap_add_format(details, "latency", "%u", netem->latency);
4108 smap_add_format(details, "limit", "%u", netem->limit);
4109 smap_add_format(details, "loss", "%u", netem->loss);
4110 return 0;
4111}
4112
4113static int
4114netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4115{
4116 struct netem netem;
4117
4118 netem_parse_qdisc_details__(netdev, details, &netem);
4119 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4120 netem_get__(netdev)->latency = netem.latency;
4121 netem_get__(netdev)->limit = netem.limit;
4122 netem_get__(netdev)->loss = netem.loss;
4123 return 0;
4124}
4125
4126static const struct tc_ops tc_ops_netem = {
4127 .linux_name = "netem",
4128 .ovs_name = "linux-netem",
4129 .n_queues = 0,
4130 .tc_install = netem_tc_install,
4131 .tc_load = netem_tc_load,
4132 .tc_destroy = netem_tc_destroy,
4133 .qdisc_get = netem_qdisc_get,
4134 .qdisc_set = netem_qdisc_set,
4135};
4136\f
c1c9c9c4 4137/* HTB traffic control class. */
559843ed 4138
c1c9c9c4 4139#define HTB_N_QUEUES 0xf000
4f631ccd 4140#define HTB_RATE2QUANTUM 10
8b61709d 4141
c1c9c9c4
BP
4142struct htb {
4143 struct tc tc;
4144 unsigned int max_rate; /* In bytes/s. */
4145};
8b61709d 4146
c1c9c9c4 4147struct htb_class {
93b13be8 4148 struct tc_queue tc_queue;
c1c9c9c4
BP
4149 unsigned int min_rate; /* In bytes/s. */
4150 unsigned int max_rate; /* In bytes/s. */
4151 unsigned int burst; /* In bytes. */
4152 unsigned int priority; /* Lower values are higher priorities. */
4153};
8b61709d 4154
c1c9c9c4 4155static struct htb *
b5d57fc8 4156htb_get__(const struct netdev *netdev_)
c1c9c9c4 4157{
b5d57fc8
BP
4158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4159 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
4160}
4161
24045e35 4162static void
b5d57fc8 4163htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 4164{
b5d57fc8 4165 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4166 struct htb *htb;
4167
4168 htb = xmalloc(sizeof *htb);
4169 tc_init(&htb->tc, &tc_ops_htb);
4170 htb->max_rate = max_rate;
4171
b5d57fc8 4172 netdev->tc = &htb->tc;
c1c9c9c4
BP
4173}
4174
4175/* Create an HTB qdisc.
4176 *
a339aa81 4177 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
4178static int
4179htb_setup_qdisc__(struct netdev *netdev)
4180{
4181 size_t opt_offset;
4182 struct tc_htb_glob opt;
4183 struct ofpbuf request;
4184 struct tcmsg *tcmsg;
4185
4186 tc_del_qdisc(netdev);
4187
7874bdff
RD
4188 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4189 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
4190 if (!tcmsg) {
4191 return ENODEV;
4192 }
c1c9c9c4
BP
4193 tcmsg->tcm_handle = tc_make_handle(1, 0);
4194 tcmsg->tcm_parent = TC_H_ROOT;
4195
4196 nl_msg_put_string(&request, TCA_KIND, "htb");
4197
4198 memset(&opt, 0, sizeof opt);
4f631ccd 4199 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 4200 opt.version = 3;
4ecf12d5 4201 opt.defcls = 1;
c1c9c9c4
BP
4202
4203 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4204 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
4205 nl_msg_end_nested(&request, opt_offset);
4206
4207 return tc_transact(&request, NULL);
4208}
4209
4210/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4211 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4212static int
4213htb_setup_class__(struct netdev *netdev, unsigned int handle,
4214 unsigned int parent, struct htb_class *class)
4215{
4216 size_t opt_offset;
4217 struct tc_htb_opt opt;
4218 struct ofpbuf request;
4219 struct tcmsg *tcmsg;
4220 int error;
4221 int mtu;
4222
73371c09 4223 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 4224 if (error) {
f915f1a8
BP
4225 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
4226 netdev_get_name(netdev));
9b020780 4227 return error;
f915f1a8 4228 }
c1c9c9c4
BP
4229
4230 memset(&opt, 0, sizeof opt);
4231 tc_fill_rate(&opt.rate, class->min_rate, mtu);
4232 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
4233 /* Makes sure the quantum is at least MTU. Setting quantum will
4234 * make htb ignore the r2q for this class. */
4235 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
4236 opt.quantum = mtu;
4237 }
c1c9c9c4
BP
4238 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
4239 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
4240 opt.prio = class->priority;
4241
7874bdff
RD
4242 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4243 &request);
23a98ffe
BP
4244 if (!tcmsg) {
4245 return ENODEV;
4246 }
c1c9c9c4
BP
4247 tcmsg->tcm_handle = handle;
4248 tcmsg->tcm_parent = parent;
4249
4250 nl_msg_put_string(&request, TCA_KIND, "htb");
4251 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4252 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
4253 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
4254 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
4255 nl_msg_end_nested(&request, opt_offset);
4256
4257 error = tc_transact(&request, NULL);
4258 if (error) {
4259 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4260 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4261 netdev_get_name(netdev),
4262 tc_get_major(handle), tc_get_minor(handle),
4263 tc_get_major(parent), tc_get_minor(parent),
4264 class->min_rate, class->max_rate,
10a89ef0 4265 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
4266 }
4267 return error;
4268}
4269
4270/* Parses Netlink attributes in 'options' for HTB parameters and stores a
4271 * description of them into 'details'. The description complies with the
4272 * specification given in the vswitch database documentation for linux-htb
4273 * queue details. */
4274static int
4275htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
4276{
4277 static const struct nl_policy tca_htb_policy[] = {
4278 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
4279 .min_len = sizeof(struct tc_htb_opt) },
4280 };
4281
4282 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
4283 const struct tc_htb_opt *htb;
4284
4285 if (!nl_parse_nested(nl_options, tca_htb_policy,
4286 attrs, ARRAY_SIZE(tca_htb_policy))) {
4287 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
4288 return EPROTO;
4289 }
4290
4291 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
4292 class->min_rate = htb->rate.rate;
4293 class->max_rate = htb->ceil.rate;
4294 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
4295 class->priority = htb->prio;
4296 return 0;
4297}
4298
4299static int
4300htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4301 struct htb_class *options,
4302 struct netdev_queue_stats *stats)
4303{
4304 struct nlattr *nl_options;
4305 unsigned int handle;
4306 int error;
4307
4308 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4309 if (!error && queue_id) {
17ee3c1f
BP
4310 unsigned int major = tc_get_major(handle);
4311 unsigned int minor = tc_get_minor(handle);
4312 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4313 *queue_id = minor - 1;
c1c9c9c4
BP
4314 } else {
4315 error = EPROTO;
4316 }
4317 }
4318 if (!error && options) {
4319 error = htb_parse_tca_options__(nl_options, options);
4320 }
4321 return error;
4322}
4323
4324static void
73371c09 4325htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 4326 const struct smap *details, struct htb_class *hc)
c1c9c9c4 4327{
73371c09 4328 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 4329
13c1637f 4330 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 4331 if (!hc->max_rate) {
a00ca915 4332 enum netdev_features current;
c1c9c9c4 4333
73371c09
BP
4334 netdev_linux_read_features(netdev);
4335 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4336 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
4337 }
4338 hc->min_rate = hc->max_rate;
4339 hc->burst = 0;
4340 hc->priority = 0;
4341}
4342
4343static int
4344htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 4345 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
4346{
4347 const struct htb *htb = htb_get__(netdev);
9b020780 4348 int mtu, error;
214117fd 4349 unsigned long long int max_rate_bit;
c1c9c9c4 4350
73371c09 4351 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 4352 if (error) {
f915f1a8
BP
4353 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
4354 netdev_get_name(netdev));
9b020780 4355 return error;
f915f1a8
BP
4356 }
4357
4f104611
EJ
4358 /* HTB requires at least an mtu sized min-rate to send any traffic even
4359 * on uncongested links. */
13c1637f 4360 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 4361 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
4362 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
4363
4364 /* max-rate */
214117fd
KF
4365 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
4366 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
c1c9c9c4
BP
4367 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
4368 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
4369
4370 /* burst
4371 *
4372 * According to hints in the documentation that I've read, it is important
4373 * that 'burst' be at least as big as the largest frame that might be
4374 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4375 * but having it a bit too small is a problem. Since netdev_get_mtu()
4376 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4377 * the MTU. We actually add 64, instead of 14, as a guard against
4378 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 4379 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
4380 hc->burst = MAX(hc->burst, mtu + 64);
4381
4382 /* priority */
13c1637f 4383 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
4384
4385 return 0;
4386}
4387
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (either may be NULL).
 *
 * Returns 0 on success, otherwise the error from the query or from parsing
 * the reply.  The reply buffer is freed whenever the query itself
 * succeeded. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
4403
4404static int
79f1cbe9 4405htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4406{
4407 int error;
4408
4409 error = htb_setup_qdisc__(netdev);
4410 if (!error) {
4411 struct htb_class hc;
4412
4413 htb_parse_qdisc_details__(netdev, details, &hc);
4414 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4415 tc_make_handle(1, 0), &hc);
4416 if (!error) {
4417 htb_install__(netdev, hc.max_rate);
4418 }
4419 }
4420 return error;
4421}
4422
93b13be8
BP
4423static struct htb_class *
4424htb_class_cast__(const struct tc_queue *queue)
4425{
4426 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4427}
4428
c1c9c9c4
BP
4429static void
4430htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
4431 const struct htb_class *hc)
4432{
4433 struct htb *htb = htb_get__(netdev);
93b13be8
BP
4434 size_t hash = hash_int(queue_id, 0);
4435 struct tc_queue *queue;
c1c9c9c4
BP
4436 struct htb_class *hcp;
4437
93b13be8
BP
4438 queue = tc_find_queue__(netdev, queue_id, hash);
4439 if (queue) {
4440 hcp = htb_class_cast__(queue);
4441 } else {
c1c9c9c4 4442 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
4443 queue = &hcp->tc_queue;
4444 queue->queue_id = queue_id;
6dc34a0d 4445 queue->created = time_msec();
93b13be8 4446 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 4447 }
93b13be8
BP
4448
4449 hcp->min_rate = hc->min_rate;
4450 hcp->max_rate = hc->max_rate;
4451 hcp->burst = hc->burst;
4452 hcp->priority = hc->priority;
c1c9c9c4
BP
4453}
4454
4455static int
4456htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4457{
c1c9c9c4 4458 struct ofpbuf msg;
d57695d7 4459 struct queue_dump_state state;
c1c9c9c4 4460 struct htb_class hc;
c1c9c9c4
BP
4461
4462 /* Get qdisc options. */
4463 hc.max_rate = 0;
4464 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4465 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
4466
4467 /* Get queues. */
d57695d7 4468 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
4469 return ENODEV;
4470 }
d57695d7 4471 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
4472 unsigned int queue_id;
4473
4474 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4475 htb_update_queue__(netdev, queue_id, &hc);
4476 }
4477 }
d57695d7 4478 finish_queue_dump(&state);
c1c9c9c4
BP
4479
4480 return 0;
4481}
4482
4483static void
4484htb_tc_destroy(struct tc *tc)
4485{
4486 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 4487 struct htb_class *hc;
c1c9c9c4 4488
4ec3d7c7 4489 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
4490 free(hc);
4491 }
4492 tc_destroy(tc);
4493 free(htb);
4494}
4495
4496static int
79f1cbe9 4497htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
4498{
4499 const struct htb *htb = htb_get__(netdev);
79f1cbe9 4500 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
4501 return 0;
4502}
4503
4504static int
79f1cbe9 4505htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4506{
4507 struct htb_class hc;
4508 int error;
4509
4510 htb_parse_qdisc_details__(netdev, details, &hc);
4511 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4512 tc_make_handle(1, 0), &hc);
4513 if (!error) {
4514 htb_get__(netdev)->max_rate = hc.max_rate;
4515 }
4516 return error;
4517}
4518
4519static int
93b13be8 4520htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4521 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 4522{
93b13be8 4523 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4524
79f1cbe9 4525 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 4526 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4527 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 4528 }
79f1cbe9 4529 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 4530 if (hc->priority) {
79f1cbe9 4531 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
4532 }
4533 return 0;
4534}
4535
4536static int
4537htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4538 const struct smap *details)
c1c9c9c4
BP
4539{
4540 struct htb_class hc;
4541 int error;
4542
4543 error = htb_parse_class_details__(netdev, details, &hc);
4544 if (error) {
4545 return error;
4546 }
4547
17ee3c1f 4548 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
4549 tc_make_handle(1, 0xfffe), &hc);
4550 if (error) {
4551 return error;
4552 }
4553
4554 htb_update_queue__(netdev, queue_id, &hc);
4555 return 0;
4556}
4557
4558static int
93b13be8 4559htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 4560{
93b13be8 4561 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4562 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4563 int error;
4564
93b13be8 4565 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4566 if (!error) {
93b13be8 4567 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4568 free(hc);
c1c9c9c4
BP
4569 }
4570 return error;
4571}
4572
4573static int
93b13be8 4574htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4575 struct netdev_queue_stats *stats)
4576{
93b13be8 4577 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4578 tc_make_handle(1, 0xfffe), NULL, stats);
4579}
4580
4581static int
4582htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4583 const struct ofpbuf *nlmsg,
4584 netdev_dump_queue_stats_cb *cb, void *aux)
4585{
4586 struct netdev_queue_stats stats;
17ee3c1f 4587 unsigned int handle, major, minor;
c1c9c9c4
BP
4588 int error;
4589
4590 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4591 if (error) {
4592 return error;
4593 }
4594
17ee3c1f
BP
4595 major = tc_get_major(handle);
4596 minor = tc_get_minor(handle);
4597 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4598 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4599 }
4600 return 0;
4601}
4602
4603static const struct tc_ops tc_ops_htb = {
89c09c1c
BP
4604 .linux_name = "htb",
4605 .ovs_name = "linux-htb",
4606 .n_queues = HTB_N_QUEUES,
4607 .tc_install = htb_tc_install,
4608 .tc_load = htb_tc_load,
4609 .tc_destroy = htb_tc_destroy,
4610 .qdisc_get = htb_qdisc_get,
4611 .qdisc_set = htb_qdisc_set,
4612 .class_get = htb_class_get,
4613 .class_set = htb_class_set,
4614 .class_delete = htb_class_delete,
4615 .class_get_stats = htb_class_get_stats,
4616 .class_dump_stats = htb_class_dump_stats
c1c9c9c4
BP
4617};
4618\f
a339aa81
EJ
4619/* "linux-hfsc" traffic control class. */
4620
4621#define HFSC_N_QUEUES 0xf000
4622
4623struct hfsc {
4624 struct tc tc;
4625 uint32_t max_rate;
4626};
4627
4628struct hfsc_class {
4629 struct tc_queue tc_queue;
4630 uint32_t min_rate;
4631 uint32_t max_rate;
4632};
4633
4634static struct hfsc *
b5d57fc8 4635hfsc_get__(const struct netdev *netdev_)
a339aa81 4636{
b5d57fc8
BP
4637 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4638 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4639}
4640
4641static struct hfsc_class *
4642hfsc_class_cast__(const struct tc_queue *queue)
4643{
4644 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4645}
4646
24045e35 4647static void
b5d57fc8 4648hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4649{
b5d57fc8 4650 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4651 struct hfsc *hfsc;
4652
a339aa81
EJ
4653 hfsc = xmalloc(sizeof *hfsc);
4654 tc_init(&hfsc->tc, &tc_ops_hfsc);
4655 hfsc->max_rate = max_rate;
b5d57fc8 4656 netdev->tc = &hfsc->tc;
a339aa81
EJ
4657}
4658
4659static void
4660hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4661 const struct hfsc_class *hc)
4662{
4663 size_t hash;
4664 struct hfsc *hfsc;
4665 struct hfsc_class *hcp;
4666 struct tc_queue *queue;
4667
4668 hfsc = hfsc_get__(netdev);
4669 hash = hash_int(queue_id, 0);
4670
4671 queue = tc_find_queue__(netdev, queue_id, hash);
4672 if (queue) {
4673 hcp = hfsc_class_cast__(queue);
4674 } else {
4675 hcp = xmalloc(sizeof *hcp);
4676 queue = &hcp->tc_queue;
4677 queue->queue_id = queue_id;
6dc34a0d 4678 queue->created = time_msec();
a339aa81
EJ
4679 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4680 }
4681
4682 hcp->min_rate = hc->min_rate;
4683 hcp->max_rate = hc->max_rate;
4684}
4685
4686static int
4687hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4688{
4689 const struct tc_service_curve *rsc, *fsc, *usc;
4690 static const struct nl_policy tca_hfsc_policy[] = {
4691 [TCA_HFSC_RSC] = {
4692 .type = NL_A_UNSPEC,
4693 .optional = false,
4694 .min_len = sizeof(struct tc_service_curve),
4695 },
4696 [TCA_HFSC_FSC] = {
4697 .type = NL_A_UNSPEC,
4698 .optional = false,
4699 .min_len = sizeof(struct tc_service_curve),
4700 },
4701 [TCA_HFSC_USC] = {
4702 .type = NL_A_UNSPEC,
4703 .optional = false,
4704 .min_len = sizeof(struct tc_service_curve),
4705 },
4706 };
4707 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4708
4709 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4710 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4711 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4712 return EPROTO;
4713 }
4714
4715 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4716 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4717 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4718
4719 if (rsc->m1 != 0 || rsc->d != 0 ||
4720 fsc->m1 != 0 || fsc->d != 0 ||
4721 usc->m1 != 0 || usc->d != 0) {
4722 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4723 "Non-linear service curves are not supported.");
4724 return EPROTO;
4725 }
4726
4727 if (rsc->m2 != fsc->m2) {
4728 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4729 "Real-time service curves are not supported ");
4730 return EPROTO;
4731 }
4732
4733 if (rsc->m2 > usc->m2) {
4734 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4735 "Min-rate service curve is greater than "
4736 "the max-rate service curve.");
4737 return EPROTO;
4738 }
4739
4740 class->min_rate = fsc->m2;
4741 class->max_rate = usc->m2;
4742 return 0;
4743}
4744
4745static int
4746hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4747 struct hfsc_class *options,
4748 struct netdev_queue_stats *stats)
4749{
4750 int error;
4751 unsigned int handle;
4752 struct nlattr *nl_options;
4753
4754 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4755 if (error) {
4756 return error;
4757 }
4758
4759 if (queue_id) {
4760 unsigned int major, minor;
4761
4762 major = tc_get_major(handle);
4763 minor = tc_get_minor(handle);
4764 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4765 *queue_id = minor - 1;
4766 } else {
4767 return EPROTO;
4768 }
4769 }
4770
4771 if (options) {
4772 error = hfsc_parse_tca_options__(nl_options, options);
4773 }
4774
4775 return error;
4776}
4777
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (either may be NULL).
 *
 * Returns 0 on success, otherwise the error from the query or from parsing
 * the reply.  The reply buffer is freed whenever the query itself
 * succeeded. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
4795
4796static void
73371c09 4797hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4798 struct hfsc_class *class)
4799{
73371c09 4800 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4801
13c1637f 4802 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4803 if (!max_rate) {
a00ca915 4804 enum netdev_features current;
a339aa81 4805
73371c09
BP
4806 netdev_linux_read_features(netdev);
4807 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4808 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4809 }
4810
4811 class->min_rate = max_rate;
4812 class->max_rate = max_rate;
4813}
4814
4815static int
4816hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4817 const struct smap *details,
a339aa81
EJ
4818 struct hfsc_class * class)
4819{
4820 const struct hfsc *hfsc;
4821 uint32_t min_rate, max_rate;
a339aa81
EJ
4822
4823 hfsc = hfsc_get__(netdev);
a339aa81 4824
13c1637f 4825 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4826 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4827 min_rate = MIN(min_rate, hfsc->max_rate);
4828
13c1637f 4829 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4830 max_rate = MAX(max_rate, min_rate);
4831 max_rate = MIN(max_rate, hfsc->max_rate);
4832
4833 class->min_rate = min_rate;
4834 class->max_rate = max_rate;
4835
4836 return 0;
4837}
4838
4839/* Create an HFSC qdisc.
4840 *
4841 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4842static int
4843hfsc_setup_qdisc__(struct netdev * netdev)
4844{
4845 struct tcmsg *tcmsg;
4846 struct ofpbuf request;
4847 struct tc_hfsc_qopt opt;
4848
4849 tc_del_qdisc(netdev);
4850
7874bdff
RD
4851 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4852 NLM_F_EXCL | NLM_F_CREATE, &request);
a339aa81
EJ
4853
4854 if (!tcmsg) {
4855 return ENODEV;
4856 }
4857
4858 tcmsg->tcm_handle = tc_make_handle(1, 0);
4859 tcmsg->tcm_parent = TC_H_ROOT;
4860
4861 memset(&opt, 0, sizeof opt);
4862 opt.defcls = 1;
4863
4864 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4865 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4866
4867 return tc_transact(&request, NULL);
4868}
4869
4870/* Create an HFSC class.
4871 *
4872 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4873 * sc rate <min_rate> ul rate <max_rate>" */
4874static int
4875hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4876 unsigned int parent, struct hfsc_class *class)
4877{
4878 int error;
4879 size_t opt_offset;
4880 struct tcmsg *tcmsg;
4881 struct ofpbuf request;
4882 struct tc_service_curve min, max;
4883
7874bdff
RD
4884 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4885 &request);
a339aa81
EJ
4886
4887 if (!tcmsg) {
4888 return ENODEV;
4889 }
4890
4891 tcmsg->tcm_handle = handle;
4892 tcmsg->tcm_parent = parent;
4893
4894 min.m1 = 0;
4895 min.d = 0;
4896 min.m2 = class->min_rate;
4897
4898 max.m1 = 0;
4899 max.d = 0;
4900 max.m2 = class->max_rate;
4901
4902 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4903 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4904 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4905 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4906 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4907 nl_msg_end_nested(&request, opt_offset);
4908
4909 error = tc_transact(&request, NULL);
4910 if (error) {
4911 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4912 "min-rate %ubps, max-rate %ubps (%s)",
4913 netdev_get_name(netdev),
4914 tc_get_major(handle), tc_get_minor(handle),
4915 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4916 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4917 }
4918
4919 return error;
4920}
4921
4922static int
79f1cbe9 4923hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4924{
4925 int error;
4926 struct hfsc_class class;
4927
4928 error = hfsc_setup_qdisc__(netdev);
4929
4930 if (error) {
4931 return error;
4932 }
4933
4934 hfsc_parse_qdisc_details__(netdev, details, &class);
4935 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4936 tc_make_handle(1, 0), &class);
4937
4938 if (error) {
4939 return error;
4940 }
4941
4942 hfsc_install__(netdev, class.max_rate);
4943 return 0;
4944}
4945
4946static int
4947hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4948{
4949 struct ofpbuf msg;
d57695d7 4950 struct queue_dump_state state;
a339aa81
EJ
4951 struct hfsc_class hc;
4952
4953 hc.max_rate = 0;
4954 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4955 hfsc_install__(netdev, hc.max_rate);
a339aa81 4956
d57695d7 4957 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4958 return ENODEV;
4959 }
4960
d57695d7 4961 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4962 unsigned int queue_id;
4963
4964 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4965 hfsc_update_queue__(netdev, queue_id, &hc);
4966 }
4967 }
4968
d57695d7 4969 finish_queue_dump(&state);
a339aa81
EJ
4970 return 0;
4971}
4972
4973static void
4974hfsc_tc_destroy(struct tc *tc)
4975{
4976 struct hfsc *hfsc;
4977 struct hfsc_class *hc, *next;
4978
4979 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4980
4981 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4982 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4983 free(hc);
4984 }
4985
4986 tc_destroy(tc);
4987 free(hfsc);
4988}
4989
4990static int
79f1cbe9 4991hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4992{
4993 const struct hfsc *hfsc;
4994 hfsc = hfsc_get__(netdev);
79f1cbe9 4995 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4996 return 0;
4997}
4998
4999static int
79f1cbe9 5000hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
5001{
5002 int error;
5003 struct hfsc_class class;
5004
5005 hfsc_parse_qdisc_details__(netdev, details, &class);
5006 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5007 tc_make_handle(1, 0), &class);
5008
5009 if (!error) {
5010 hfsc_get__(netdev)->max_rate = class.max_rate;
5011 }
5012
5013 return error;
5014}
5015
5016static int
5017hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 5018 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
5019{
5020 const struct hfsc_class *hc;
5021
5022 hc = hfsc_class_cast__(queue);
79f1cbe9 5023 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 5024 if (hc->min_rate != hc->max_rate) {
79f1cbe9 5025 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
5026 }
5027 return 0;
5028}
5029
5030static int
5031hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 5032 const struct smap *details)
a339aa81
EJ
5033{
5034 int error;
5035 struct hfsc_class class;
5036
5037 error = hfsc_parse_class_details__(netdev, details, &class);
5038 if (error) {
5039 return error;
5040 }
5041
5042 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5043 tc_make_handle(1, 0xfffe), &class);
5044 if (error) {
5045 return error;
5046 }
5047
5048 hfsc_update_queue__(netdev, queue_id, &class);
5049 return 0;
5050}
5051
5052static int
5053hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5054{
5055 int error;
5056 struct hfsc *hfsc;
5057 struct hfsc_class *hc;
5058
5059 hc = hfsc_class_cast__(queue);
5060 hfsc = hfsc_get__(netdev);
5061
5062 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5063 if (!error) {
5064 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5065 free(hc);
5066 }
5067 return error;
5068}
5069
5070static int
5071hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5072 struct netdev_queue_stats *stats)
5073{
5074 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5075 tc_make_handle(1, 0xfffe), NULL, stats);
5076}
5077
5078static int
5079hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5080 const struct ofpbuf *nlmsg,
5081 netdev_dump_queue_stats_cb *cb, void *aux)
5082{
5083 struct netdev_queue_stats stats;
5084 unsigned int handle, major, minor;
5085 int error;
5086
5087 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5088 if (error) {
5089 return error;
5090 }
5091
5092 major = tc_get_major(handle);
5093 minor = tc_get_minor(handle);
5094 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5095 (*cb)(minor - 1, &stats, aux);
5096 }
5097 return 0;
5098}
5099
5100static const struct tc_ops tc_ops_hfsc = {
89c09c1c
BP
5101 .linux_name = "hfsc",
5102 .ovs_name = "linux-hfsc",
5103 .n_queues = HFSC_N_QUEUES, /* n_queues */
5104 .tc_install = hfsc_tc_install,
5105 .tc_load = hfsc_tc_load,
5106 .tc_destroy = hfsc_tc_destroy,
5107 .qdisc_get = hfsc_qdisc_get,
5108 .qdisc_set = hfsc_qdisc_set,
5109 .class_get = hfsc_class_get,
5110 .class_set = hfsc_class_set,
5111 .class_delete = hfsc_class_delete,
5112 .class_get_stats = hfsc_class_get_stats,
5113 .class_dump_stats = hfsc_class_dump_stats,
a339aa81
EJ
5114};
5115\f
6cf888b8
BS
5116/* "linux-noop" traffic control class. */
5117
5118static void
5119noop_install__(struct netdev *netdev_)
5120{
5121 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5122 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
5123
5124 netdev->tc = CONST_CAST(struct tc *, &tc);
5125}
5126
5127static int
5128noop_tc_install(struct netdev *netdev,
5129 const struct smap *details OVS_UNUSED)
5130{
5131 noop_install__(netdev);
5132 return 0;
5133}
5134
5135static int
5136noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5137{
5138 noop_install__(netdev);
5139 return 0;
5140}
5141
5142static const struct tc_ops tc_ops_noop = {
89c09c1c
BP
5143 .ovs_name = "linux-noop", /* ovs_name */
5144 .tc_install = noop_tc_install,
5145 .tc_load = noop_tc_load,
6cf888b8
BS
5146};
5147\f
c1c9c9c4
BP
5148/* "linux-default" traffic control class.
5149 *
5150 * This class represents the default, unnamed Linux qdisc. It corresponds to
5151 * the "" (empty string) QoS type in the OVS database. */
5152
5153static void
b5d57fc8 5154default_install__(struct netdev *netdev_)
c1c9c9c4 5155{
b5d57fc8 5156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 5157 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 5158
559eb230
BP
5159 /* Nothing but a tc class implementation is allowed to write to a tc. This
5160 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 5161 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
5162}
5163
5164static int
5165default_tc_install(struct netdev *netdev,
79f1cbe9 5166 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
5167{
5168 default_install__(netdev);
5169 return 0;
5170}
5171
5172static int
5173default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5174{
5175 default_install__(netdev);
5176 return 0;
5177}
5178
5179static const struct tc_ops tc_ops_default = {
89c09c1c
BP
5180 .ovs_name = "", /* ovs_name */
5181 .tc_install = default_tc_install,
5182 .tc_load = default_tc_load,
c1c9c9c4
BP
5183};
5184\f
5185/* "linux-other" traffic control class.
5186 *
5187 * */
5188
5189static int
b5d57fc8 5190other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 5191{
b5d57fc8 5192 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 5193 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 5194
559eb230
BP
5195 /* Nothing but a tc class implementation is allowed to write to a tc. This
5196 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 5197 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
5198 return 0;
5199}
5200
5201static const struct tc_ops tc_ops_other = {
89c09c1c
BP
5202 .ovs_name = "linux-other",
5203 .tc_load = other_tc_load,
c1c9c9c4
BP
5204};
5205\f
5206/* Traffic control. */
5207
/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used when computing buffer sizes:
 * kernel qdiscs generally need to be able to buffer one jiffy's worth of
 * data.  Set by read_psched().
 *
 * The value falls into one of two regimes:
 *
 *    - It is the kernel's real timer tick rate, a small number roughly in the
 *      range 100 to 1024.  Then we really must make sure that the qdisc can
 *      buffer that much data.
 *
 *    - It is an absurdly large number, which means that the kernel has
 *      finely granular timers and no extra buffer room needs to be fudged.
 *      Nothing special is required to implement this: 'buffer_hz' is used as
 *      a divisor, so practically any rate divided by it comes out as 0, and
 *      the small integer results produced by really high dividends have no
 *      real effect anyhow.
 */
static unsigned int buffer_hz;
5229
7874bdff
RD
/* Looks up the ifindex of 'netdev' and starts a tc Netlink request of the
 * given 'type' and 'flags' in 'request' via tc_make_request().
 *
 * Returns the result of tc_make_request(), or NULL if the ifindex could not
 * be obtained (in which case tc_make_request() is not called). */
static struct tcmsg *
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
                             unsigned int flags, struct ofpbuf *request)
{
    int ifindex;

    if (get_ifindex(netdev, &ifindex)) {
        return NULL;
    }
    return tc_make_request(ifindex, type, flags, request);
}
5244
f8500004
JP
5245/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5246 * of 'kbits_burst'.
5247 *
5248 * This function is equivalent to running:
5249 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5250 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5251 * mtu 65535 drop
5252 *
5253 * The configuration and stats may be seen with the following command:
c7952afb 5254 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
5255 *
5256 * Returns 0 if successful, otherwise a positive errno value.
5257 */
5258static int
c7952afb
BP
5259tc_add_policer(struct netdev *netdev,
5260 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
5261{
5262 struct tc_police tc_police;
5263 struct ofpbuf request;
5264 struct tcmsg *tcmsg;
5265 size_t basic_offset;
5266 size_t police_offset;
5267 int error;
5268 int mtu = 65535;
5269
5270 memset(&tc_police, 0, sizeof tc_police);
5271 tc_police.action = TC_POLICE_SHOT;
5272 tc_police.mtu = mtu;
1aca400c 5273 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 5274
79abacc8
MAA
5275 /* The following appears wrong in one way: In networking a kilobit is
5276 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
5277 *
5278 * However if you "fix" those problems then "tc filter show ..." shows
5279 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5280 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5281 * tc's point of view. Whatever. */
5282 tc_police.burst = tc_bytes_to_ticks(
79abacc8 5283 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 5284
7874bdff
RD
5285 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5286 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
5287 if (!tcmsg) {
5288 return ENODEV;
5289 }
5290 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5291 tcmsg->tcm_info = tc_make_handle(49,
5292 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5293
5294 nl_msg_put_string(&request, TCA_KIND, "basic");
5295 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5296 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5297 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5298 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5299 nl_msg_end_nested(&request, police_offset);
5300 nl_msg_end_nested(&request, basic_offset);
5301
5302 error = tc_transact(&request, NULL);
5303 if (error) {
5304 return error;
5305 }
5306
5307 return 0;
5308}
5309
c1c9c9c4
BP
5310static void
5311read_psched(void)
5312{
5313 /* The values in psched are not individually very meaningful, but they are
5314 * important. The tables below show some values seen in the wild.
5315 *
5316 * Some notes:
5317 *
5318 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5319 * (Before that, there are hints that it was 1000000000.)
5320 *
5321 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5322 * above.
5323 *
5324 * /proc/net/psched
5325 * -----------------------------------
5326 * [1] 000c8000 000f4240 000f4240 00000064
5327 * [2] 000003e8 00000400 000f4240 3b9aca00
5328 * [3] 000003e8 00000400 000f4240 3b9aca00
5329 * [4] 000003e8 00000400 000f4240 00000064
5330 * [5] 000003e8 00000040 000f4240 3b9aca00
5331 * [6] 000003e8 00000040 000f4240 000000f9
5332 *
5333 * a b c d ticks_per_s buffer_hz
5334 * ------- --------- ---------- ------------- ----------- -------------
5335 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5336 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5337 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5338 * [4] 1,000 1,024 1,000,000 100 976,562 100
5339 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5340 * [6] 1,000 64 1,000,000 249 15,625,000 249
5341 *
5342 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5343 * [2] 2.6.26-1-686-bigmem from Debian lenny
5344 * [3] 2.6.26-2-sparc64 from Debian lenny
5345 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5346 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5347 * [6] 2.6.34 from kernel.org on KVM
5348 */
23882115 5349 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
5350 static const char fn[] = "/proc/net/psched";
5351 unsigned int a, b, c, d;
5352 FILE *stream;
5353
23882115
BP
5354 if (!ovsthread_once_start(&once)) {
5355 return;
5356 }
5357
c1c9c9c4
BP
5358 ticks_per_s = 1.0;
5359 buffer_hz = 100;
5360
5361 stream = fopen(fn, "r");
5362 if (!stream) {
10a89ef0 5363 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 5364 goto exit;
c1c9c9c4
BP
5365 }
5366
5367 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
5368 VLOG_WARN("%s: read failed", fn);
5369 fclose(stream);
23882115 5370 goto exit;
c1c9c9c4
BP
5371 }
5372 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
5373 fclose(stream);
5374
1bab4901 5375 if (!a || !b || !c) {
c1c9c9c4 5376 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 5377 goto exit;
c1c9c9c4
BP
5378 }
5379
5380 ticks_per_s = (double) a * c / b;
5381 if (c == 1000000) {
5382 buffer_hz = d;
5383 } else {
5384 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5385 fn, a, b, c, d);
5386 }
5387 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
5388
5389exit:
5390 ovsthread_once_done(&once);
c1c9c9c4
BP
5391}
5392
5393/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5394 * rate of 'rate' bytes per second. */
5395static unsigned int
5396tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5397{
23882115 5398 read_psched();
c1c9c9c4
BP
5399 return (rate * ticks) / ticks_per_s;
5400}
5401
5402/* Returns the number of ticks that it would take to transmit 'size' bytes at a
5403 * rate of 'rate' bytes per second. */
5404static unsigned int
5405tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5406{
23882115 5407 read_psched();
015c93a4 5408 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
5409}
5410
5411/* Returns the number of bytes that need to be reserved for qdisc buffering at
5412 * a transmission rate of 'rate' bytes per second. */
5413static unsigned int
5414tc_buffer_per_jiffy(unsigned int rate)
5415{
23882115 5416 read_psched();
c1c9c9c4
BP
5417 return rate / buffer_hz;
5418}
5419
2f564bb1
S
5420static uint32_t
5421tc_time_to_ticks(uint32_t time) {
5422 read_psched();
5423 return time * (ticks_per_s / 1000000);
5424}
5425
c1c9c9c4
BP
5426/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5427 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5428 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5429 * stores NULL into it if it is absent.
5430 *
5431 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5432 * 'msg'.
5433 *
5434 * Returns 0 if successful, otherwise a positive errno value. */
5435static int
5436tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5437 struct nlattr **options)
5438{
5439 static const struct nl_policy tca_policy[] = {
5440 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5441 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5442 };
5443 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5444
5445 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5446 tca_policy, ta, ARRAY_SIZE(ta))) {
5447 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5448 goto error;
5449 }
5450
5451 if (kind) {
5452 *kind = nl_attr_get_string(ta[TCA_KIND]);
5453 }
5454
5455 if (options) {
5456 *options = ta[TCA_OPTIONS];
5457 }
5458
5459 return 0;
5460
5461error:
5462 if (kind) {
5463 *kind = NULL;
5464 }
5465 if (options) {
5466 *options = NULL;
5467 }
5468 return EPROTO;
5469}
5470
5471/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5472 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5473 * into '*options', and its queue statistics into '*stats'. Any of the output
5474 * arguments may be null.
5475 *
5476 * Returns 0 if successful, otherwise a positive errno value. */
5477static int
5478tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5479 struct nlattr **options, struct netdev_queue_stats *stats)
5480{
5481 static const struct nl_policy tca_policy[] = {
5482 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5483 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5484 };
5485 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5486
5487 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5488 tca_policy, ta, ARRAY_SIZE(ta))) {
5489 VLOG_WARN_RL(&rl, "failed to parse class message");
5490 goto error;
5491 }
5492
5493 if (handlep) {
5494 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5495 *handlep = tc->tcm_handle;
5496 }
5497
5498 if (options) {
5499 *options = ta[TCA_OPTIONS];
5500 }
5501
5502 if (stats) {
5503 const struct gnet_stats_queue *gsq;
5504 struct gnet_stats_basic gsb;
5505
5506 static const struct nl_policy stats_policy[] = {
5507 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5508 .min_len = sizeof gsb },
5509 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5510 .min_len = sizeof *gsq },
5511 };
5512 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5513
5514 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5515 sa, ARRAY_SIZE(sa))) {
5516 VLOG_WARN_RL(&rl, "failed to parse class stats");
5517 goto error;
5518 }
5519
5520 /* Alignment issues screw up the length of struct gnet_stats_basic on
5521 * some arch/bitsize combinations. Newer versions of Linux have a
5522 * struct gnet_stats_basic_packed, but we can't depend on that. The
5523 * easiest thing to do is just to make a copy. */
5524 memset(&gsb, 0, sizeof gsb);
5525 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5526 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5527 stats->tx_bytes = gsb.bytes;
5528 stats->tx_packets = gsb.packets;
5529
5530 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5531 stats->tx_errors = gsq->drops;
5532 }
5533
5534 return 0;
5535
5536error:
5537 if (options) {
5538 *options = NULL;
5539 }
5540 if (stats) {
5541 memset(stats, 0, sizeof *stats);
5542 }
5543 return EPROTO;
5544}
5545
5546/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5547 * on 'netdev'. */
5548static int
5549tc_query_class(const struct netdev *netdev,
5550 unsigned int handle, unsigned int parent,
5551 struct ofpbuf **replyp)
5552{
5553 struct ofpbuf request;
5554 struct tcmsg *tcmsg;
5555 int error;
5556
7874bdff
RD
5557 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5558 &request);
23a98ffe
BP
5559 if (!tcmsg) {
5560 return ENODEV;
5561 }
c1c9c9c4
BP
5562 tcmsg->tcm_handle = handle;
5563 tcmsg->tcm_parent = parent;
5564
5565 error = tc_transact(&request, replyp);
5566 if (error) {
5567 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5568 netdev_get_name(netdev),
5569 tc_get_major(handle), tc_get_minor(handle),
5570 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5571 ovs_strerror(error));
c1c9c9c4
BP
5572 }
5573 return error;
5574}
5575
5576/* Equivalent to "tc class del dev <name> handle <handle>". */
5577static int
5578tc_delete_class(const struct netdev *netdev, unsigned int handle)
5579{
5580 struct ofpbuf request;
5581 struct tcmsg *tcmsg;
5582 int error;
5583
7874bdff 5584 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5585 if (!tcmsg) {
5586 return ENODEV;
5587 }
c1c9c9c4
BP
5588 tcmsg->tcm_handle = handle;
5589 tcmsg->tcm_parent = 0;
5590
5591 error = tc_transact(&request, NULL);
5592 if (error) {
5593 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5594 netdev_get_name(netdev),
5595 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5596 ovs_strerror(error));
c1c9c9c4
BP
5597 }
5598 return error;
5599}
5600
5601/* Equivalent to "tc qdisc del dev <name> root". */
5602static int
b5d57fc8 5603tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5604{
b5d57fc8 5605 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5606 struct ofpbuf request;
5607 struct tcmsg *tcmsg;
5608 int error;
5609
7874bdff 5610 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5611 if (!tcmsg) {
5612 return ENODEV;
5613 }
c1c9c9c4
BP
5614 tcmsg->tcm_handle = tc_make_handle(1, 0);
5615 tcmsg->tcm_parent = TC_H_ROOT;
5616
5617 error = tc_transact(&request, NULL);
5618 if (error == EINVAL) {
5619 /* EINVAL probably means that the default qdisc was in use, in which
5620 * case we've accomplished our purpose. */
5621 error = 0;
5622 }
b5d57fc8
BP
5623 if (!error && netdev->tc) {
5624 if (netdev->tc->ops->tc_destroy) {
5625 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5626 }
b5d57fc8 5627 netdev->tc = NULL;
c1c9c9c4
BP
5628 }
5629 return error;
5630}
5631
ac3e3aaa
BP
5632static bool
5633getqdisc_is_safe(void)
5634{
5635 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5636 static bool safe = false;
5637
5638 if (ovsthread_once_start(&once)) {
5639 struct utsname utsname;
5640 int major, minor;
5641
5642 if (uname(&utsname) == -1) {
5643 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5644 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5645 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5646 } else if (major < 2 || (major == 2 && minor < 35)) {
5647 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5648 utsname.release);
5649 } else {
5650 safe = true;
5651 }
5652 ovsthread_once_done(&once);
5653 }
5654 return safe;
5655}
5656
c1c9c9c4
BP
5657/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5658 * kernel to determine what they are. Returns 0 if successful, otherwise a
5659 * positive errno value. */
5660static int
b5d57fc8 5661tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5662{
b5d57fc8 5663 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5664 struct ofpbuf request, *qdisc;
5665 const struct tc_ops *ops;
5666 struct tcmsg *tcmsg;
5667 int load_error;
5668 int error;
5669
b5d57fc8 5670 if (netdev->tc) {
c1c9c9c4
BP
5671 return 0;
5672 }
5673
5674 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5675 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5676 * 2.6.35 without that fix backported to it.
5677 *
5678 * To avoid the OOPS, we must not make a request that would attempt to dump
5679 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5680 * few others. There are a few ways that I can see to do this, but most of
5681 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5682 * technique chosen here is to assume that any non-default qdisc that we
5683 * create will have a class with handle 1:0. The built-in qdiscs only have
5684 * a class with handle 0:0.
5685 *
ac3e3aaa
BP
5686 * On Linux 2.6.35+ we use the straightforward method because it allows us
5687 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5688 * in such a case we get no response at all from the kernel (!) if a
5689 * builtin qdisc is in use (which is later caught by "!error &&
5690 * !qdisc->size"). */
7874bdff
RD
5691 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5692 &request);
23a98ffe
BP
5693 if (!tcmsg) {
5694 return ENODEV;
5695 }
ac3e3aaa
BP
5696 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5697 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5698
5699 /* Figure out what tc class to instantiate. */
5700 error = tc_transact(&request, &qdisc);
ac3e3aaa 5701 if (!error && qdisc->size) {
c1c9c9c4
BP
5702 const char *kind;
5703
5704 error = tc_parse_qdisc(qdisc, &kind, NULL);
5705 if (error) {
5706 ops = &tc_ops_other;
5707 } else {
5708 ops = tc_lookup_linux_name(kind);
5709 if (!ops) {
5710 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5711 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5712
5713 ops = &tc_ops_other;
5714 }
5715 }
ac3e3aaa
BP
5716 } else if ((!error && !qdisc->size) || error == ENOENT) {
5717 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5718 * set up by some other entity that doesn't have a handle 1:0. We will
5719 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5720 ops = &tc_ops_default;
5721 error = 0;
5722 } else {
5723 /* Who knows? Maybe the device got deleted. */
5724 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5725 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5726 ops = &tc_ops_other;
5727 }
5728
5729 /* Instantiate it. */
b5d57fc8
BP
5730 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5731 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5732 ofpbuf_delete(qdisc);
5733
5734 return error ? error : load_error;
5735}
5736
5737/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5738 approximate the time to transmit packets of various lengths. For an MTU of
5739 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5740 represents two possible packet lengths; for a MTU of 513 through 1024, four
5741 possible lengths; and so on.
5742
5743 Returns, for the specified 'mtu', the number of bits that packet lengths
5744 need to be shifted right to fit within such a 256-entry table. */
5745static int
5746tc_calc_cell_log(unsigned int mtu)
5747{
5748 int cell_log;
5749
5750 if (!mtu) {
5751 mtu = ETH_PAYLOAD_MAX;
5752 }
5753 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5754
5755 for (cell_log = 0; mtu >= 256; cell_log++) {
5756 mtu >>= 1;
5757 }
5758
5759 return cell_log;
5760}
5761
5762/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5763 * of 'mtu'. */
5764static void
5765tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5766{
5767 memset(rate, 0, sizeof *rate);
5768 rate->cell_log = tc_calc_cell_log(mtu);
5769 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5770 /* rate->cell_align = 0; */ /* distro headers. */
5771 rate->mpu = ETH_TOTAL_MIN;
5772 rate->rate = Bps;
5773}
5774
5775/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5776 * attribute of the specified "type".
5777 *
5778 * See tc_calc_cell_log() above for a description of "rtab"s. */
e7f6ba22 5779void
c1c9c9c4
BP
5780tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5781{
5782 uint32_t *rtab;
5783 unsigned int i;
5784
5785 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5786 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5787 unsigned packet_size = (i + 1) << rate->cell_log;
5788 if (packet_size < rate->mpu) {
5789 packet_size = rate->mpu;
5790 }
5791 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5792 }
5793}
5794
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and the burst size
 * 'burst_bytes' requested by the user.  Pass 0 for 'burst_bytes' if no value
 * was requested.
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps) plus
 * 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t bytes = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, bytes);
}
d3980822 5805\f
aaf2fb1a
BP
5806/* Linux-only functions declared in netdev-linux.h */
5807
5808/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5809 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5810int
5811netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5812 const char *flag_name, bool enable)
5813{
5814 const char *netdev_name = netdev_get_name(netdev);
5815 struct ethtool_value evalue;
5816 uint32_t new_flags;
5817 int error;
5818
ab985a77 5819 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5820 memset(&evalue, 0, sizeof evalue);
5821 error = netdev_linux_do_ethtool(netdev_name,
5822 (struct ethtool_cmd *)&evalue,
5823 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5824 if (error) {
5825 return error;
5826 }
5827
ab985a77 5828 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5829 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5830 if (new_flags == evalue.data) {
5831 return 0;
5832 }
5833 evalue.data = new_flags;
aaf2fb1a
BP
5834 error = netdev_linux_do_ethtool(netdev_name,
5835 (struct ethtool_cmd *)&evalue,
5836 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5837 if (error) {
5838 return error;
5839 }
5840
ab985a77 5841 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5842 memset(&evalue, 0, sizeof evalue);
5843 error = netdev_linux_do_ethtool(netdev_name,
5844 (struct ethtool_cmd *)&evalue,
5845 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5846 if (error) {
5847 return error;
5848 }
5849
5850 if (new_flags != evalue.data) {
5851 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5852 "device %s failed", enable ? "enable" : "disable",
5853 flag_name, netdev_name);
5854 return EOPNOTSUPP;
5855 }
5856
5857 return 0;
5858}
5859\f
5860/* Utility functions. */
5861
d3980822 5862/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5863static void
d3980822
BP
5864netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5865 const struct rtnl_link_stats *src)
5866{
f613a0d7
PS
5867 dst->rx_packets = src->rx_packets;
5868 dst->tx_packets = src->tx_packets;
5869 dst->rx_bytes = src->rx_bytes;
5870 dst->tx_bytes = src->tx_bytes;
5871 dst->rx_errors = src->rx_errors;
5872 dst->tx_errors = src->tx_errors;
5873 dst->rx_dropped = src->rx_dropped;
5874 dst->tx_dropped = src->tx_dropped;
5875 dst->multicast = src->multicast;
5876 dst->collisions = src->collisions;
5877 dst->rx_length_errors = src->rx_length_errors;
5878 dst->rx_over_errors = src->rx_over_errors;
5879 dst->rx_crc_errors = src->rx_crc_errors;
5880 dst->rx_frame_errors = src->rx_frame_errors;
5881 dst->rx_fifo_errors = src->rx_fifo_errors;
5882 dst->rx_missed_errors = src->rx_missed_errors;
5883 dst->tx_aborted_errors = src->tx_aborted_errors;
5884 dst->tx_carrier_errors = src->tx_carrier_errors;
5885 dst->tx_fifo_errors = src->tx_fifo_errors;
5886 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5887 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5888}
5889
337c9b99
BP
5890/* Copies 'src' into 'dst', performing format conversion in the process. */
5891static void
5892netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5893 const struct rtnl_link_stats64 *src)
5894{
5895 dst->rx_packets = src->rx_packets;
5896 dst->tx_packets = src->tx_packets;
5897 dst->rx_bytes = src->rx_bytes;
5898 dst->tx_bytes = src->tx_bytes;
5899 dst->rx_errors = src->rx_errors;
5900 dst->tx_errors = src->tx_errors;
5901 dst->rx_dropped = src->rx_dropped;
5902 dst->tx_dropped = src->tx_dropped;
5903 dst->multicast = src->multicast;
5904 dst->collisions = src->collisions;
5905 dst->rx_length_errors = src->rx_length_errors;
5906 dst->rx_over_errors = src->rx_over_errors;
5907 dst->rx_crc_errors = src->rx_crc_errors;
5908 dst->rx_frame_errors = src->rx_frame_errors;
5909 dst->rx_fifo_errors = src->rx_fifo_errors;
5910 dst->rx_missed_errors = src->rx_missed_errors;
5911 dst->tx_aborted_errors = src->tx_aborted_errors;
5912 dst->tx_carrier_errors = src->tx_carrier_errors;
5913 dst->tx_fifo_errors = src->tx_fifo_errors;
5914 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5915 dst->tx_window_errors = src->tx_window_errors;
5916}
5917
c1c9c9c4 5918static int
35eef899 5919get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5920{
c1c9c9c4
BP
5921 struct ofpbuf request;
5922 struct ofpbuf *reply;
c1c9c9c4
BP
5923 int error;
5924
d6e3feb5 5925 /* Filtering all counters by default */
5926 memset(stats, 0xFF, sizeof(struct netdev_stats));
5927
c1c9c9c4 5928 ofpbuf_init(&request, 0);
13a24df8
BP
5929 nl_msg_put_nlmsghdr(&request,
5930 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5931 RTM_GETLINK, NLM_F_REQUEST);
5932 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5933 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5934 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5935 ofpbuf_uninit(&request);
5936 if (error) {
5937 return error;
5938 }
5939
13a24df8 5940 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5941 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5942 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5943 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5944 error = 0;
5945 } else {
71f21279 5946 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
5947 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5948 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5949 error = 0;
5950 } else {
5951 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5952 error = EPROTO;
5953 }
13a24df8
BP
5954 }
5955 } else {
5956 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5957 error = EPROTO;
c1c9c9c4 5958 }
8b61709d 5959
8b61709d 5960
576e26d7 5961 ofpbuf_delete(reply);
35eef899 5962 return error;
8b61709d 5963}
c1c9c9c4 5964
3a183124 5965static int
b5d57fc8 5966get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5967{
5968 struct ifreq ifr;
5969 int error;
5970
755be9ea 5971 *flags = 0;
259e0b1a 5972 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5973 if (!error) {
5974 *flags = ifr.ifr_flags;
5975 }
8b61709d
BP
5976 return error;
5977}
5978
5979static int
4b609110 5980set_flags(const char *name, unsigned int flags)
8b61709d
BP
5981{
5982 struct ifreq ifr;
5983
5984 ifr.ifr_flags = flags;
259e0b1a 5985 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5986}
5987
01b25786
PB
5988int
5989linux_get_ifindex(const char *netdev_name)
8b61709d
BP
5990{
5991 struct ifreq ifr;
259e0b1a 5992 int error;
8b61709d 5993
71d7c22f 5994 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5995 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5996
5997 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5998 if (error) {
580e1152
RD
5999 /* ENODEV probably means that a vif disappeared asynchronously and
6000 * hasn't been removed from the database yet, so reduce the log level
6001 * to INFO for that case. */
6002 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6003 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6004 netdev_name, ovs_strerror(error));
259e0b1a 6005 return -error;
8b61709d
BP
6006 }
6007 return ifr.ifr_ifindex;
6008}
6009
6010static int
6011get_ifindex(const struct netdev *netdev_, int *ifindexp)
6012{
b5d57fc8 6013 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 6014
b5d57fc8 6015 if (!(netdev->cache_valid & VALID_IFINDEX)) {
756819dd
FL
6016 netdev_linux_update_via_netlink(netdev);
6017 }
6018
6019 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6020 /* Fall back to ioctl if netlink fails */
01b25786 6021 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 6022
8b61709d 6023 if (ifindex < 0) {
b5d57fc8
BP
6024 netdev->get_ifindex_error = -ifindex;
6025 netdev->ifindex = 0;
c7b1b0a5 6026 } else {
b5d57fc8
BP
6027 netdev->get_ifindex_error = 0;
6028 netdev->ifindex = ifindex;
8b61709d 6029 }
b5d57fc8 6030 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 6031 }
c7b1b0a5 6032
b5d57fc8
BP
6033 *ifindexp = netdev->ifindex;
6034 return netdev->get_ifindex_error;
8b61709d
BP
6035}
6036
6037static int
756819dd
FL
6038netdev_linux_update_via_netlink(struct netdev_linux *netdev)
6039{
6040 struct ofpbuf request;
6041 struct ofpbuf *reply;
6042 struct rtnetlink_change chg;
6043 struct rtnetlink_change *change = &chg;
6044 int error;
6045
6046 ofpbuf_init(&request, 0);
6047 nl_msg_put_nlmsghdr(&request,
6048 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
6049 RTM_GETLINK, NLM_F_REQUEST);
6050 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6051
6052 /* The correct identifiers for a Linux device are netnsid and ifindex,
6053 * but ifindex changes as the port is moved to another network namespace
6054 * and the interface name statically stored in ovsdb. */
6055 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
6056 if (netdev_linux_netnsid_is_remote(netdev)) {
6057 nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
6058 }
6059 error = nl_transact(NETLINK_ROUTE, &request, &reply);
6060 ofpbuf_uninit(&request);
6061 if (error) {
6062 ofpbuf_delete(reply);
6063 return error;
6064 }
6065
6066 if (rtnetlink_parse(reply, change)
6067 && change->nlmsg_type == RTM_NEWLINK) {
6068 bool changed = false;
6069 error = 0;
6070
6071 /* Update netdev from rtnl msg and increment its seq if needed. */
6072 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
6073 netdev->carrier_resets++;
6074 changed = true;
6075 }
6076 if (change->ifi_flags != netdev->ifi_flags) {
6077 netdev->ifi_flags = change->ifi_flags;
6078 changed = true;
6079 }
6080 if (change->mtu && change->mtu != netdev->mtu) {
6081 netdev->mtu = change->mtu;
6082 netdev->cache_valid |= VALID_MTU;
6083 netdev->netdev_mtu_error = 0;
6084 changed = true;
6085 }
6086 if (!eth_addr_is_zero(change->mac)
6087 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
6088 netdev->etheraddr = change->mac;
6089 netdev->cache_valid |= VALID_ETHERADDR;
6090 netdev->ether_addr_error = 0;
6091 changed = true;
6092 }
6093 if (change->if_index != netdev->ifindex) {
6094 netdev->ifindex = change->if_index;
6095 netdev->cache_valid |= VALID_IFINDEX;
6096 netdev->get_ifindex_error = 0;
6097 changed = true;
6098 }
3d9c99ab
JH
6099 if (change->master && netdev_linux_kind_is_lag(change->master)) {
6100 netdev->is_lag_master = true;
6101 }
756819dd
FL
6102 if (changed) {
6103 netdev_change_seq_changed(&netdev->up);
6104 }
6105 } else {
6106 error = EINVAL;
6107 }
6108
6109 ofpbuf_delete(reply);
6110 return error;
6111}
6112
6113static int
74ff3298 6114get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
6115{
6116 struct ifreq ifr;
6117 int hwaddr_family;
259e0b1a 6118 int error;
8b61709d
BP
6119
6120 memset(&ifr, 0, sizeof ifr);
71d7c22f 6121 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 6122 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
6123 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6124 if (error) {
78857dfb
BP
6125 /* ENODEV probably means that a vif disappeared asynchronously and
6126 * hasn't been removed from the database yet, so reduce the log level
6127 * to INFO for that case. */
259e0b1a 6128 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 6129 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
6130 netdev_name, ovs_strerror(error));
6131 return error;
8b61709d
BP
6132 }
6133 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
6134 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6135 hwaddr_family != ARPHRD_NONE) {
c9697f35 6136 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 6137 netdev_name, hwaddr_family);
c9697f35 6138 return EINVAL;
8b61709d
BP
6139 }
6140 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6141 return 0;
6142}
6143
6144static int
74ff3298 6145set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
6146{
6147 struct ifreq ifr;
259e0b1a 6148 int error;
8b61709d
BP
6149
6150 memset(&ifr, 0, sizeof ifr);
71d7c22f 6151 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 6152 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 6153 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 6154 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
6155 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6156 if (error) {
8b61709d 6157 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 6158 netdev_name, ovs_strerror(error));
8b61709d 6159 }
259e0b1a 6160 return error;
8b61709d
BP
6161}
6162
6163static int
0b0544d7 6164netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
6165 int cmd, const char *cmd_name)
6166{
6167 struct ifreq ifr;
259e0b1a 6168 int error;
8b61709d
BP
6169
6170 memset(&ifr, 0, sizeof ifr);
71d7c22f 6171 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
6172 ifr.ifr_data = (caddr_t) ecmd;
6173
6174 ecmd->cmd = cmd;
259e0b1a
BP
6175 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6176 if (error) {
6177 if (error != EOPNOTSUPP) {
8b61709d 6178 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 6179 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
6180 } else {
6181 /* The device doesn't support this operation. That's pretty
6182 * common, so there's no point in logging anything. */
6183 }
8b61709d 6184 }
259e0b1a 6185 return error;
8b61709d 6186}
f1acd62b 6187
488d734d
BP
6188/* Returns an AF_PACKET raw socket or a negative errno value. */
6189static int
6190af_packet_sock(void)
6191{
23882115
BP
6192 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6193 static int sock;
488d734d 6194
23882115 6195 if (ovsthread_once_start(&once)) {
488d734d
BP
6196 sock = socket(AF_PACKET, SOCK_RAW, 0);
6197 if (sock >= 0) {
8450059e
BP
6198 int error = set_nonblocking(sock);
6199 if (error) {
6200 close(sock);
6201 sock = -error;
6202 }
488d734d
BP
6203 } else {
6204 sock = -errno;
10a89ef0
BP
6205 VLOG_ERR("failed to create packet socket: %s",
6206 ovs_strerror(errno));
488d734d 6207 }
23882115 6208 ovsthread_once_done(&once);
488d734d
BP
6209 }
6210
6211 return sock;
6212}