]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
netdev-linux: indicate if netdev is a LAG master
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
b2befd5b
BP
23#include <sys/types.h>
24#include <netinet/in.h>
55bc98d6 25#include <arpa/inet.h>
8b61709d 26#include <inttypes.h>
32383c3b 27#include <linux/filter.h>
c1c9c9c4 28#include <linux/gen_stats.h>
bb7d0e22 29#include <linux/if_ether.h>
8b61709d
BP
30#include <linux/if_tun.h>
31#include <linux/types.h>
32#include <linux/ethtool.h>
63331829 33#include <linux/mii.h>
ef3767f5 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/ioctl.h>
37#include <sys/socket.h>
ac3e3aaa 38#include <sys/utsname.h>
55bc98d6 39#include <netpacket/packet.h>
8b61709d
BP
40#include <net/if.h>
41#include <net/if_arp.h>
55bc98d6 42#include <net/if_packet.h>
8b61709d 43#include <net/route.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
e14deea0 50#include "dp-packet.h"
93451a0a 51#include "dpif-netlink.h"
df1e5a3b 52#include "dpif-netdev.h"
3e8a2ad1 53#include "openvswitch/dynamic-string.h"
8b61709d 54#include "fatal-signal.h"
93b13be8 55#include "hash.h"
ee89ea7b 56#include "openvswitch/hmap.h"
8b61709d 57#include "netdev-provider.h"
18ebd48c 58#include "netdev-tc-offloads.h"
7fbef77a 59#include "netdev-vport.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
c060c4cf 62#include "netlink.h"
bfda5239 63#include "netnsid.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d 67#include "packets.h"
fd016ae3 68#include "openvswitch/poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
ee89ea7b 70#include "openvswitch/shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
c1c5c723 73#include "tc.h"
1670c579 74#include "timer.h"
c060c4cf 75#include "unaligned.h"
e6211adc 76#include "openvswitch/vlog.h"
ee89ea7b 77#include "util.h"
5136ce49 78
/* Logging module for this file. */
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters defined by this module. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 88
8b61709d 89\f
756819dd
FL
/* Fallback used when the included headers do not define IFLA_IF_NETNSID. */
#if !defined(IFLA_IF_NETNSID)
#define IFLA_IF_NETNSID 0x45
#endif
8b61709d
BP
/* Pause advertisement bits.  They first appeared in Linux 2.6.14, so older
 * headers might not provide them. */
#if !defined(ADVERTISED_Pause)
#define ADVERTISED_Pause (1 << 13)
#endif
#if !defined(ADVERTISED_Asym_Pause)
#define ADVERTISED_Asym_Pause (1 << 14)
#endif
101
e47bd51a
JP
/* ethtool flag commands.  They first appeared in Linux 2.6.24, so older
 * headers might not provide them. */
#if !defined(ETHTOOL_GFLAGS)
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#if !defined(ETHTOOL_SFLAGS)
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
#endif
110
c1c9c9c4
BP
/* TC_RTAB_SIZE first appeared in Linux 2.6.25, so older headers might not
 * provide it. */
#if !defined(TC_RTAB_SIZE)
#define TC_RTAB_SIZE 1024
#endif
116
b73c8518
SH
/* History of struct tpacket_auxdata in the kernel headers:
 *
 *   - Linux 2.6.21 introduced the structure.
 *   - Linux 2.6.27 added the tp_vlan_tci member.
 *   - Linux 3.0 defined TP_STATUS_VLAN_VALID.
 *   - Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 *     TP_STATUS_VLAN_TPID_VALID.
 *
 * Given that much churn, the simplest approach is to always define our own
 * replacement structure containing everything we want. */
#if !defined(PACKET_AUXDATA)
#define PACKET_AUXDATA 8
#endif
#if !defined(TP_STATUS_VLAN_VALID)
#define TP_STATUS_VLAN_VALID (1 << 4)
#endif
#if !defined(TP_STATUS_VLAN_TPID_VALID)
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#endif

/* Redirect every later use of the tag to the replacement structure. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* Member added in Linux 2.6.27. */
    uint16_t tp_vlan_tpid;      /* Was padding before Linux 3.13. */
};
146
0c615356
SH
/* Linux 2.6.27 introduced ethtool_cmd_speed().
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see the report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html),
 * unconditionally replace ethtool_cmd_speed().
 *
 * Returns the 32-bit link speed stored in 'ep': 'ep->speed' supplies the low
 * 16 bits and 'ep->speed_hi' the high 16 bits. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Widen 'speed_hi' before shifting.  Left as is, the 16-bit value is
     * promoted to signed int, and shifting a value that has bit 15 set (as
     * when the speed is reported as unknown, i.e. all-ones) left by 16
     * overflows int, which is undefined behavior. */
    return ep->speed | ((uint32_t) ep->speed_hi << 16);
}
158
67bed84c
SH
/* Supported/advertised flags for 1G base KX and 10G base KX4, KR and R
 * arrived in Linux 2.6.30.  The presence of the first one decides whether the
 * whole group needs defining. */
#if !defined(SUPPORTED_1000baseKX_Full)
#define SUPPORTED_1000baseKX_Full (1 << 17)
#define SUPPORTED_10000baseKX4_Full (1 << 18)
#define SUPPORTED_10000baseKR_Full (1 << 19)
#define SUPPORTED_10000baseR_FEC (1 << 20)
#define ADVERTISED_1000baseKX_Full (1 << 17)
#define ADVERTISED_10000baseKX4_Full (1 << 18)
#define ADVERTISED_10000baseKR_Full (1 << 19)
#define ADVERTISED_10000baseR_FEC (1 << 20)
#endif

/* Supported/advertised flags for 40G base KR4, CR4, SR4 and LR4 arrived in
 * Linux 3.5.  Again the first one stands in for the whole group. */
#if !defined(SUPPORTED_40000baseKR4_Full)
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#define ADVERTISED_40000baseKR4_Full (1 << 23)
#define ADVERTISED_40000baseCR4_Full (1 << 24)
#define ADVERTISED_40000baseSR4_Full (1 << 25)
#define ADVERTISED_40000baseLR4_Full (1 << 26)
#endif
184
fa373af4
BP
/* IFLA_STATS64 and rtnl_link_stats64 appeared in Linux 2.6.35.
 *
 * Tests for rtnl_link_stats64 don't seem to work consistently, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see the report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
 * Maybe if_link.h is not self-contained on those kernels.  The easiest way
 * out is to always define a replacement. */
#if !defined(IFLA_STATS64)
#define IFLA_STATS64 23
#endif

/* Redirect every later use of the tag to the replacement structure. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compressed-traffic counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 224
/* Bits for the 'cache_valid' bitmap of struct netdev_linux.  Each enumerator
 * occupies its own bit, so they can be OR'ed together freely. */
enum {
    VALID_IFINDEX          = 0x01,
    VALID_ETHERADDR        = 0x02,
    VALID_IN               = 0x04,
    VALID_MTU              = 0x08,
    VALID_POLICING         = 0x10,
    VALID_VPORT_STAT_ERROR = 0x20,
    VALID_DRVINFO          = 0x40,
    VALID_FEATURES         = 0x80,
};
c1c9c9c4
BP
235\f
236/* Traffic control. */
237
238/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
239 * network device.
240 *
241 * Each TC implementation subclasses this with whatever additional data it
242 * needs. */
c1c9c9c4
BP
243struct tc {
244 const struct tc_ops *ops;
93b13be8
BP
245 struct hmap queues; /* Contains "struct tc_queue"s.
246 * Read by generic TC layer.
247 * Written only by TC implementation. */
248};
c1c9c9c4 249
559eb230
BP
/* Static initializer for the 'struct tc' pointed to by TC: stores OPS as its
 * operations and initializes its 'queues' map with HMAP_INITIALIZER. */
#define TC_INITIALIZER(TC, OPS) \
    { OPS, HMAP_INITIALIZER(&(TC)->queues) }
251
93b13be8
BP
252/* One traffic control queue.
253 *
254 * Each TC implementation subclasses this with whatever additional data it
255 * needs. */
256struct tc_queue {
257 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
258 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 259 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
260};
261
262/* A particular kind of traffic control. Each implementation generally maps to
263 * one particular Linux qdisc class.
264 *
265 * The functions below return 0 if successful or a positive errno value on
266 * failure, except where otherwise noted. All of them must be provided, except
267 * where otherwise noted. */
268struct tc_ops {
269 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
270 * This is null for tc_ops_default and tc_ops_other, for which there are no
271 * appropriate values. */
272 const char *linux_name;
273
274 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
275 const char *ovs_name;
276
277 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
278 * queues. The queues are numbered 0 through n_queues - 1. */
279 unsigned int n_queues;
280
281 /* Called to install this TC class on 'netdev'. The implementation should
282 * make the Netlink calls required to set up 'netdev' with the right qdisc
283 * and configure it according to 'details'. The implementation may assume
284 * that the current qdisc is the default; that is, there is no need for it
285 * to delete the current qdisc before installing itself.
286 *
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
289 * (which is built as ovs-vswitchd.conf.db(8)).
290 *
291 * This function must return 0 if and only if it sets 'netdev->tc' to an
292 * initialized 'struct tc'.
293 *
294 * (This function is null for tc_ops_other, which cannot be installed. For
295 * other TC classes it should always be nonnull.) */
79f1cbe9 296 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
297
298 /* Called when the netdev code determines (through a Netlink query) that
299 * this TC class's qdisc is installed on 'netdev', but we didn't install
300 * it ourselves and so don't know any of the details.
301 *
302 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
303 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
304 * implementation should parse the other attributes of 'nlmsg' as
305 * necessary to determine its configuration. If necessary it should also
306 * use Netlink queries to determine the configuration of queues on
307 * 'netdev'.
308 *
309 * This function must return 0 if and only if it sets 'netdev->tc' to an
310 * initialized 'struct tc'. */
311 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
312
313 /* Destroys the data structures allocated by the implementation as part of
314 * 'tc'. (This includes destroying 'tc->queues' by calling
315 * tc_destroy(tc).
316 *
317 * The implementation should not need to perform any Netlink calls. If
318 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
319 * (But it may not be desirable.)
320 *
321 * This function may be null if 'tc' is trivial. */
322 void (*tc_destroy)(struct tc *tc);
323
324 /* Retrieves details of 'netdev->tc' configuration into 'details'.
325 *
326 * The implementation should not need to perform any Netlink calls, because
327 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
328 * cached the configuration.
329 *
330 * The contents of 'details' should be documented as valid for 'ovs_name'
331 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
332 * (which is built as ovs-vswitchd.conf.db(8)).
333 *
334 * This function may be null if 'tc' is not configurable.
335 */
79f1cbe9 336 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
337
338 /* Reconfigures 'netdev->tc' according to 'details', performing any
339 * required Netlink calls to complete the reconfiguration.
340 *
341 * The contents of 'details' should be documented as valid for 'ovs_name'
342 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
343 * (which is built as ovs-vswitchd.conf.db(8)).
344 *
345 * This function may be null if 'tc' is not configurable.
346 */
79f1cbe9 347 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 348
93b13be8
BP
349 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
350 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
351 *
352 * The contents of 'details' should be documented as valid for 'ovs_name'
353 * in the "other_config" column in the "Queue" table in
354 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
355 *
356 * The implementation should not need to perform any Netlink calls, because
357 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
358 * cached the queue configuration.
359 *
360 * This function may be null if 'tc' does not have queues ('n_queues' is
361 * 0). */
93b13be8 362 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 363 struct smap *details);
c1c9c9c4
BP
364
365 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
366 * 'details', perfoming any required Netlink calls to complete the
367 * reconfiguration. The caller ensures that 'queue_id' is less than
368 * 'n_queues'.
369 *
370 * The contents of 'details' should be documented as valid for 'ovs_name'
371 * in the "other_config" column in the "Queue" table in
372 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
373 *
374 * This function may be null if 'tc' does not have queues or its queues are
375 * not configurable. */
376 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 377 const struct smap *details);
c1c9c9c4 378
93b13be8
BP
379 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
380 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
381 *
382 * This function may be null if 'tc' does not have queues or its queues
383 * cannot be deleted. */
93b13be8 384 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 385
93b13be8
BP
386 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
387 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
388 *
389 * On success, initializes '*stats'.
390 *
391 * This function may be null if 'tc' does not have queues or if it cannot
392 * report queue statistics. */
93b13be8
BP
393 int (*class_get_stats)(const struct netdev *netdev,
394 const struct tc_queue *queue,
c1c9c9c4
BP
395 struct netdev_queue_stats *stats);
396
397 /* Extracts queue stats from 'nlmsg', which is a response to a
398 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
399 *
400 * This function may be null if 'tc' does not have queues or if it cannot
401 * report queue statistics. */
402 int (*class_dump_stats)(const struct netdev *netdev,
403 const struct ofpbuf *nlmsg,
404 netdev_dump_queue_stats_cb *cb, void *aux);
405};
406
407static void
408tc_init(struct tc *tc, const struct tc_ops *ops)
409{
410 tc->ops = ops;
93b13be8 411 hmap_init(&tc->queues);
c1c9c9c4
BP
412}
413
414static void
415tc_destroy(struct tc *tc)
416{
93b13be8 417 hmap_destroy(&tc->queues);
c1c9c9c4
BP
418}
419
420static const struct tc_ops tc_ops_htb;
a339aa81 421static const struct tc_ops tc_ops_hfsc;
677d9158
JV
422static const struct tc_ops tc_ops_codel;
423static const struct tc_ops tc_ops_fqcodel;
424static const struct tc_ops tc_ops_sfq;
c1c9c9c4 425static const struct tc_ops tc_ops_default;
6cf888b8 426static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
427static const struct tc_ops tc_ops_other;
428
559eb230 429static const struct tc_ops *const tcs[] = {
c1c9c9c4 430 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 431 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
432 &tc_ops_codel, /* Controlled delay */
433 &tc_ops_fqcodel, /* Fair queue controlled delay */
434 &tc_ops_sfq, /* Stochastic fair queueing */
6cf888b8 435 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
436 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
437 &tc_ops_other, /* Some other qdisc. */
438 NULL
439};
149f577a 440
c1c9c9c4
BP
441static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
7874bdff
RD
445static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
446 int type,
447 unsigned int flags,
448 struct ofpbuf *);
c7952afb
BP
449static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
451
452static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462static int tc_del_qdisc(struct netdev *netdev);
463static int tc_query_qdisc(const struct netdev *netdev);
464
465static int tc_calc_cell_log(unsigned int mtu);
466static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470\f
b5d57fc8
BP
471struct netdev_linux {
472 struct netdev up;
149f577a 473
86383816
BP
474 /* Protects all members below. */
475 struct ovs_mutex mutex;
476
149f577a 477 unsigned int cache_valid;
8b61709d 478
1670c579
EJ
479 bool miimon; /* Link status of last poll. */
480 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer;
482
bfda5239 483 int netnsid; /* Network namespace ID. */
8722022c
BP
484 /* The following are figured out "on demand" only. They are only valid
485 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 486 int ifindex;
74ff3298 487 struct eth_addr etheraddr;
8b61709d 488 int mtu;
059e5f4f 489 unsigned int ifi_flags;
65c3058c 490 long long int carrier_resets;
80a86fbe
BP
491 uint32_t kbits_rate; /* Policing data. */
492 uint32_t kbits_burst;
bba1e6f3
PS
493 int vport_stats_error; /* Cached error code from vport_get_stats().
494 0 or an errno value. */
90a6637d 495 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 496 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 497 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 498 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 499 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 500
a00ca915
EJ
501 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 504
4f925bd3 505 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 506 struct tc *tc;
149f577a 507
d0d08f8a
BP
508 /* For devices of class netdev_tap_class only. */
509 int tap_fd;
22dcb534
FL
510 bool present; /* If the device is present in the namespace */
511 uint64_t tx_dropped; /* tap device can drop if the iface is down */
3d9c99ab
JH
512
513 /* LAG information. */
514 bool is_lag_master; /* True if the netdev is a LAG master. */
8b61709d
BP
515};
516
f7791740
PS
517struct netdev_rxq_linux {
518 struct netdev_rxq up;
796223f5 519 bool is_tap;
5b7448ed 520 int fd;
149f577a 521};
8b61709d 522
8b61709d
BP
523/* This is set pretty low because we probably won't learn anything from the
524 * additional log messages. */
525static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
526
19c8e9c1
JS
527/* Polling miimon status for all ports causes performance degradation when
528 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
529 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
530 *
531 * Readers do not depend on this variable synchronizing with the related
532 * changes in the device miimon status, so we can use atomic_count. */
533static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 534
1c33f0c3 535static void netdev_linux_run(const struct netdev_class *);
6f643e49 536
0b0544d7 537static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 538 int cmd, const char *cmd_name);
b5d57fc8 539static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 540static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
541static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
542 enum netdev_flags on, enum netdev_flags *old_flagsp)
543 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
544static int get_ifindex(const struct netdev *, int *ifindexp);
545static int do_set_addr(struct netdev *netdev,
546 int ioctl_nr, const char *ioctl_name,
547 struct in_addr addr);
74ff3298
JR
548static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
549static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 550static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 551static int af_packet_sock(void);
19c8e9c1 552static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
553static void netdev_linux_miimon_run(void);
554static void netdev_linux_miimon_wait(void);
df1e5a3b 555static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 556
15b3596a
JG
557static bool
558is_netdev_linux_class(const struct netdev_class *netdev_class)
559{
259e0b1a 560 return netdev_class->run == netdev_linux_run;
15b3596a
JG
561}
562
796223f5
BP
563static bool
564is_tap_netdev(const struct netdev *netdev)
565{
b5d57fc8 566 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
567}
568
8b61709d
BP
569static struct netdev_linux *
570netdev_linux_cast(const struct netdev *netdev)
571{
b5d57fc8 572 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 573
180c6d0b 574 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 575}
796223f5 576
f7791740
PS
577static struct netdev_rxq_linux *
578netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 579{
9dc63482 580 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 581 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 582}
ff4ed3c9 583\f
bfda5239
FL
584static int
585netdev_linux_netnsid_update__(struct netdev_linux *netdev)
586{
587 struct dpif_netlink_vport reply;
588 struct ofpbuf *buf;
589 int error;
590
591 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
592 if (error) {
629e1476
FL
593 if (error == ENOENT) {
594 /* Assume it is local if there is no API (e.g. if the openvswitch
595 * kernel module is not loaded). */
596 netnsid_set_local(&netdev->netnsid);
597 } else {
598 netnsid_unset(&netdev->netnsid);
599 }
bfda5239
FL
600 return error;
601 }
602
603 netnsid_set(&netdev->netnsid, reply.netnsid);
604 ofpbuf_delete(buf);
605 return 0;
606}
607
608static int
609netdev_linux_netnsid_update(struct netdev_linux *netdev)
610{
611 if (netnsid_is_unset(netdev->netnsid)) {
3dbcbfe4
FL
612 if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
613 netnsid_set_local(&netdev->netnsid);
614 } else {
615 return netdev_linux_netnsid_update__(netdev);
616 }
bfda5239
FL
617 }
618
619 return 0;
620}
621
622static bool
623netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
624{
625 netdev_linux_netnsid_update(netdev);
626 return netnsid_eq(netdev->netnsid, nsid);
627}
628
756819dd
FL
629static bool
630netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
631{
632 netdev_linux_netnsid_update(netdev);
633 return netnsid_is_remote(netdev->netnsid);
634}
635
636static int netdev_linux_update_via_netlink(struct netdev_linux *);
bfda5239 637static void netdev_linux_update(struct netdev_linux *netdev, int,
7e9dcc0f 638 const struct rtnetlink_change *)
86383816 639 OVS_REQUIRES(netdev->mutex);
cee87338 640static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
641 unsigned int ifi_flags, unsigned int mask)
642 OVS_REQUIRES(netdev->mutex);
cee87338 643
d6384a3a
AW
644/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
645 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
646 * if no such socket could be created. */
647static struct nl_sock *
648netdev_linux_notify_sock(void)
649{
650 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
651 static struct nl_sock *sock;
989d7135
PS
652 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
653 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
654
655 if (ovsthread_once_start(&once)) {
656 int error;
657
658 error = nl_sock_create(NETLINK_ROUTE, &sock);
659 if (!error) {
d6384a3a
AW
660 size_t i;
661
662 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
663 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
664 if (error) {
665 nl_sock_destroy(sock);
666 sock = NULL;
667 break;
668 }
cee87338
BP
669 }
670 }
cf114a7f 671 nl_sock_listen_all_nsid(sock, true);
cee87338
BP
672 ovsthread_once_done(&once);
673 }
674
675 return sock;
676}
677
19c8e9c1
JS
678static bool
679netdev_linux_miimon_enabled(void)
680{
812c272c 681 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
682}
683
3d9c99ab
JH
/* Returns true if 'kind' names a LAG device type, that is, if it is exactly
 * "bond" or "team".  'kind' must not be null. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return strcmp(kind, "bond") == 0 || strcmp(kind, "team") == 0;
}
693
8b61709d 694static void
1c33f0c3 695netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 696{
cee87338
BP
697 struct nl_sock *sock;
698 int error;
699
19c8e9c1
JS
700 if (netdev_linux_miimon_enabled()) {
701 netdev_linux_miimon_run();
702 }
cee87338
BP
703
704 sock = netdev_linux_notify_sock();
705 if (!sock) {
706 return;
707 }
708
709 do {
cee87338 710 uint64_t buf_stub[4096 / 8];
bfda5239 711 int nsid;
cee87338
BP
712 struct ofpbuf buf;
713
714 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
bfda5239 715 error = nl_sock_recv(sock, &buf, &nsid, false);
cee87338 716 if (!error) {
7e9dcc0f 717 struct rtnetlink_change change;
cee87338 718
7e9dcc0f 719 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
720 struct netdev *netdev_ = NULL;
721 char dev_name[IFNAMSIZ];
722
723 if (!change.ifname) {
724 change.ifname = if_indextoname(change.if_index, dev_name);
725 }
726
727 if (change.ifname) {
728 netdev_ = netdev_from_name(change.ifname);
729 }
cee87338
BP
730 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
731 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
732
733 ovs_mutex_lock(&netdev->mutex);
bfda5239 734 netdev_linux_update(netdev, nsid, &change);
86383816 735 ovs_mutex_unlock(&netdev->mutex);
cee87338 736 }
38e0065b 737 netdev_close(netdev_);
cee87338
BP
738 }
739 } else if (error == ENOBUFS) {
740 struct shash device_shash;
741 struct shash_node *node;
742
743 nl_sock_drain(sock);
744
745 shash_init(&device_shash);
746 netdev_get_devices(&netdev_linux_class, &device_shash);
747 SHASH_FOR_EACH (node, &device_shash) {
748 struct netdev *netdev_ = node->data;
749 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
750 unsigned int flags;
751
86383816 752 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
753 get_flags(netdev_, &flags);
754 netdev_linux_changed(netdev, flags, 0);
86383816
BP
755 ovs_mutex_unlock(&netdev->mutex);
756
cee87338
BP
757 netdev_close(netdev_);
758 }
759 shash_destroy(&device_shash);
760 } else if (error != EAGAIN) {
7ed58d4a
JP
761 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
762 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
cee87338
BP
763 ovs_strerror(error));
764 }
765 ofpbuf_uninit(&buf);
766 } while (!error);
8b61709d
BP
767}
768
769static void
1c33f0c3 770netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 771{
cee87338
BP
772 struct nl_sock *sock;
773
19c8e9c1
JS
774 if (netdev_linux_miimon_enabled()) {
775 netdev_linux_miimon_wait();
776 }
cee87338
BP
777 sock = netdev_linux_notify_sock();
778 if (sock) {
779 nl_sock_wait(sock, POLLIN);
780 }
8b61709d
BP
781}
782
ac4d3bcb 783static void
b5d57fc8
BP
784netdev_linux_changed(struct netdev_linux *dev,
785 unsigned int ifi_flags, unsigned int mask)
86383816 786 OVS_REQUIRES(dev->mutex)
ac4d3bcb 787{
3e912ffc 788 netdev_change_seq_changed(&dev->up);
8aa77183
BP
789
790 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
791 dev->carrier_resets++;
792 }
793 dev->ifi_flags = ifi_flags;
794
4f925bd3 795 dev->cache_valid &= mask;
6b6e1329 796 if (!(mask & VALID_IN)) {
a8704b50
PS
797 netdev_get_addrs_list_flush();
798 }
4f925bd3
PS
799}
800
801static void
bfda5239
FL
802netdev_linux_update__(struct netdev_linux *dev,
803 const struct rtnetlink_change *change)
86383816 804 OVS_REQUIRES(dev->mutex)
4f925bd3 805{
bfda5239 806 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
d6384a3a 807 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 808 /* Keep drv-info, and ip addresses. */
d6384a3a 809 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 810 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
811
812 /* Update netdev from rtnl-change msg. */
813 if (change->mtu) {
814 dev->mtu = change->mtu;
815 dev->cache_valid |= VALID_MTU;
816 dev->netdev_mtu_error = 0;
817 }
90a6637d 818
74ff3298
JR
819 if (!eth_addr_is_zero(change->mac)) {
820 dev->etheraddr = change->mac;
d6384a3a
AW
821 dev->cache_valid |= VALID_ETHERADDR;
822 dev->ether_addr_error = 0;
e8e1a409
TZ
823
824 /* The mac addr has been changed, report it now. */
825 rtnetlink_report_link();
d6384a3a 826 }
44445cac 827
3d9c99ab
JH
828 if (change->master && netdev_linux_kind_is_lag(change->master)) {
829 dev->is_lag_master = true;
830 }
831
d6384a3a
AW
832 dev->ifindex = change->if_index;
833 dev->cache_valid |= VALID_IFINDEX;
834 dev->get_ifindex_error = 0;
22dcb534 835 dev->present = true;
d6384a3a 836 } else {
bfda5239 837 /* FIXME */
d6384a3a 838 netdev_linux_changed(dev, change->ifi_flags, 0);
22dcb534 839 dev->present = false;
bfda5239 840 netnsid_unset(&dev->netnsid);
d6384a3a
AW
841 }
842 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
843 /* Invalidates in4, in6. */
6b6e1329 844 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 845 } else {
d6384a3a 846 OVS_NOT_REACHED();
4f925bd3 847 }
ac4d3bcb
EJ
848}
849
bfda5239
FL
850static void
851netdev_linux_update(struct netdev_linux *dev, int nsid,
852 const struct rtnetlink_change *change)
853 OVS_REQUIRES(dev->mutex)
854{
855 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
856 netdev_linux_update__(dev, change);
857 }
858}
859
9dc63482
BP
860static struct netdev *
861netdev_linux_alloc(void)
862{
863 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
864 return &netdev->up;
865}
866
48c6733c
WT
867static int
868netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 869{
48c6733c
WT
870 /* Prevent any attempt to create (or open) a network device named "default"
871 * or "all". These device names are effectively reserved on Linux because
872 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
873 * itself this wouldn't call for any special treatment, but in practice if
874 * a program tries to create devices with these names, it causes the kernel
875 * to fire a "new device" notification event even though creation failed,
876 * and in turn that causes OVS to wake up and try to create them again,
877 * which ends up as a 100% CPU loop. */
878 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
879 const char *name = netdev_->name;
880 if (!strcmp(name, "default") || !strcmp(name, "all")) {
7ed58d4a
JP
881 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
882 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
48c6733c
WT
883 name);
884 return EINVAL;
885 }
886
bfda5239
FL
887 /* The device could be in the same network namespace or in another one. */
888 netnsid_unset(&netdev->netnsid);
834d6caf 889 ovs_mutex_init(&netdev->mutex);
48c6733c 890 return 0;
9dc63482
BP
891}
892
1f6e0fbd
BP
893/* Creates system and internal devices. */
894static int
9dc63482 895netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 896{
9dc63482 897 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
898 int error = netdev_linux_common_construct(netdev_);
899 if (error) {
900 return error;
901 }
1f6e0fbd 902
b5d57fc8
BP
903 error = get_flags(&netdev->up, &netdev->ifi_flags);
904 if (error == ENODEV) {
9dc63482 905 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 906 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
907 return ENODEV;
908 } else {
909 /* "Internal" netdevs have to be created as netdev objects before
910 * they exist in the kernel, because creating them in the kernel
911 * happens by passing a netdev object to dpif_port_add().
912 * Therefore, ignore the error. */
913 }
914 }
46415c90 915
a740f0de
JG
916 return 0;
917}
918
5b7448ed
JG
919/* For most types of netdevs we open the device for each call of
920 * netdev_open(). However, this is not the case with tap devices,
921 * since it is only possible to open the device once. In this
922 * situation we share a single file descriptor, and consequently
923 * buffers, across all readers. Therefore once data is read it will
924 * be unavailable to other reads for tap devices. */
a740f0de 925static int
9dc63482 926netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 927{
9dc63482 928 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 929 static const char tap_dev[] = "/dev/net/tun";
9dc63482 930 const char *name = netdev_->name;
a740f0de 931 struct ifreq ifr;
a740f0de 932
48c6733c
WT
933 int error = netdev_linux_common_construct(netdev_);
934 if (error) {
935 return error;
936 }
1f6e0fbd 937
6c88d577 938 /* Open tap device. */
d0d08f8a
BP
939 netdev->tap_fd = open(tap_dev, O_RDWR);
940 if (netdev->tap_fd < 0) {
6c88d577 941 error = errno;
10a89ef0 942 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 943 return error;
6c88d577
JP
944 }
945
946 /* Create tap device. */
61b9d078 947 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 948 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 949 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 950 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 951 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 952 ovs_strerror(errno));
6c88d577 953 error = errno;
f61d8d29 954 goto error_close;
6c88d577
JP
955 }
956
957 /* Make non-blocking. */
d0d08f8a 958 error = set_nonblocking(netdev->tap_fd);
a740f0de 959 if (error) {
f61d8d29 960 goto error_close;
a740f0de
JG
961 }
962
0f28164b
FL
963 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
964 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
965 ovs_strerror(errno));
966 error = errno;
967 goto error_close;
968 }
969
19aac14a 970 netdev->present = true;
a740f0de
JG
971 return 0;
972
f61d8d29 973error_close:
d0d08f8a 974 close(netdev->tap_fd);
a740f0de
JG
975 return error;
976}
977
6c88d577 978static void
9dc63482 979netdev_linux_destruct(struct netdev *netdev_)
6c88d577 980{
b5d57fc8 981 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 982
b5d57fc8
BP
983 if (netdev->tc && netdev->tc->ops->tc_destroy) {
984 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
985 }
986
d0d08f8a
BP
987 if (netdev_get_class(netdev_) == &netdev_tap_class
988 && netdev->tap_fd >= 0)
989 {
0f28164b 990 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
d0d08f8a 991 close(netdev->tap_fd);
6c88d577 992 }
86383816 993
19c8e9c1 994 if (netdev->miimon_interval > 0) {
812c272c 995 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
996 }
997
86383816 998 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
999}
1000
9dc63482
BP
/* Frees the 'struct netdev_linux' that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1007
f7791740
PS
1008static struct netdev_rxq *
1009netdev_linux_rxq_alloc(void)
9dc63482 1010{
f7791740 1011 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
1012 return &rx->up;
1013}
1014
7b6b0ef4 1015static int
f7791740 1016netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 1017{
f7791740 1018 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 1019 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 1020 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 1021 int error;
7b6b0ef4 1022
86383816 1023 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
1024 rx->is_tap = is_tap_netdev(netdev_);
1025 if (rx->is_tap) {
1026 rx->fd = netdev->tap_fd;
796223f5
BP
1027 } else {
1028 struct sockaddr_ll sll;
b73c8518 1029 int ifindex, val;
32383c3b 1030 /* Result of tcpdump -dd inbound */
259e0b1a 1031 static const struct sock_filter filt[] = {
32383c3b
MM
1032 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1033 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1034 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1035 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1036 };
259e0b1a
BP
1037 static const struct sock_fprog fprog = {
1038 ARRAY_SIZE(filt), (struct sock_filter *) filt
1039 };
7b6b0ef4 1040
796223f5 1041 /* Create file descriptor. */
9dc63482
BP
1042 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1043 if (rx->fd < 0) {
796223f5 1044 error = errno;
10a89ef0 1045 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
1046 goto error;
1047 }
33d82a56 1048
b73c8518
SH
1049 val = 1;
1050 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1051 error = errno;
1052 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1053 netdev_get_name(netdev_), ovs_strerror(error));
1054 goto error;
1055 }
1056
796223f5 1057 /* Set non-blocking mode. */
9dc63482 1058 error = set_nonblocking(rx->fd);
796223f5
BP
1059 if (error) {
1060 goto error;
1061 }
7b6b0ef4 1062
796223f5 1063 /* Get ethernet device index. */
180c6d0b 1064 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
1065 if (error) {
1066 goto error;
1067 }
7b6b0ef4 1068
796223f5
BP
1069 /* Bind to specific ethernet device. */
1070 memset(&sll, 0, sizeof sll);
1071 sll.sll_family = AF_PACKET;
1072 sll.sll_ifindex = ifindex;
b73c8518 1073 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 1074 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
1075 error = errno;
1076 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 1077 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
1078 goto error;
1079 }
32383c3b
MM
1080
1081 /* Filter for only inbound packets. */
9dc63482 1082 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
1083 sizeof fprog);
1084 if (error) {
1085 error = errno;
259e0b1a 1086 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 1087 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
1088 goto error;
1089 }
7b6b0ef4 1090 }
86383816 1091 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 1092
7b6b0ef4
BP
1093 return 0;
1094
1095error:
9dc63482
BP
1096 if (rx->fd >= 0) {
1097 close(rx->fd);
7b6b0ef4 1098 }
86383816 1099 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1100 return error;
1101}
1102
796223f5 1103static void
f7791740 1104netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1105{
f7791740 1106 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1107
796223f5
BP
1108 if (!rx->is_tap) {
1109 close(rx->fd);
8b61709d 1110 }
9dc63482
BP
1111}
1112
/* Frees the 'struct netdev_rxq_linux' that contains 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1120
b73c8518 1121static ovs_be16
1ebdc7eb 1122auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1123{
1124 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1125 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1126 } else if (double_tagged) {
1127 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1128 } else {
1ebdc7eb 1129 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1130 }
1131}
1132
/* Returns true if 'aux' carries a VLAN TCI: either the TCI is nonzero or the
 * status word has TP_STATUS_VLAN_VALID set. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }

    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1138
796223f5 1139static int
cf62fa4c 1140netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1141{
b73c8518 1142 size_t size;
796223f5 1143 ssize_t retval;
b73c8518
SH
1144 struct iovec iov;
1145 struct cmsghdr *cmsg;
1146 union {
1147 struct cmsghdr cmsg;
1148 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1149 } cmsg_buffer;
1150 struct msghdr msgh;
1151
1152 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1153 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1154 size = dp_packet_tailroom(buffer);
b73c8518 1155
cf62fa4c 1156 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1157 iov.iov_len = size;
1158 msgh.msg_name = NULL;
1159 msgh.msg_namelen = 0;
1160 msgh.msg_iov = &iov;
1161 msgh.msg_iovlen = 1;
1162 msgh.msg_control = &cmsg_buffer;
1163 msgh.msg_controllen = sizeof cmsg_buffer;
1164 msgh.msg_flags = 0;
8e8cddf7 1165
796223f5 1166 do {
b73c8518 1167 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1168 } while (retval < 0 && errno == EINTR);
1169
bfd3367b 1170 if (retval < 0) {
b73c8518
SH
1171 return errno;
1172 } else if (retval > size) {
1173 return EMSGSIZE;
1174 }
1175
cf62fa4c 1176 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1177
1178 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1179 const struct tpacket_auxdata *aux;
1180
1181 if (cmsg->cmsg_level != SOL_PACKET
1182 || cmsg->cmsg_type != PACKET_AUXDATA
1183 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1184 continue;
8b61709d 1185 }
b73c8518
SH
1186
1187 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1188 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1189 struct eth_header *eth;
1190 bool double_tagged;
1191
b73c8518
SH
1192 if (retval < ETH_HEADER_LEN) {
1193 return EINVAL;
1194 }
1195
1ebdc7eb
EG
1196 eth = dp_packet_data(buffer);
1197 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1198
1199 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1200 htons(aux->tp_vlan_tci));
1201 break;
1202 }
1203 }
1204
1205 return 0;
1206}
1207
/* Reads one packet from tap fd 'fd' into the tailroom of 'buffer', retrying
 * when the read is interrupted.  Returns 0 on success, otherwise the errno
 * value from read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n;

    for (;;) {
        n = read(fd, dp_packet_data(buffer), room);
        if (n >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1225
1226static int
8492adc2
JS
1227netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1228 int *qfill)
b73c8518 1229{
f7791740 1230 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1231 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1232 struct dp_packet *buffer;
df1e5a3b
PS
1233 ssize_t retval;
1234 int mtu;
1235
1236 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1237 mtu = ETH_PAYLOAD_MAX;
1238 }
1239
2482b0b0 1240 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1241 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1242 DP_NETDEV_HEADROOM);
b73c8518 1243 retval = (rx->is_tap
f7791740
PS
1244 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1245 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1246
1247 if (retval) {
1248 if (retval != EAGAIN && retval != EMSGSIZE) {
1249 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1250 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1251 }
cf62fa4c 1252 dp_packet_delete(buffer);
df1e5a3b 1253 } else {
72c84bc2 1254 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1255 }
1256
8492adc2
JS
1257 if (qfill) {
1258 *qfill = -ENOTSUP;
1259 }
1260
b73c8518 1261 return retval;
8b61709d
BP
1262}
1263
8b61709d 1264static void
f7791740 1265netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1266{
f7791740 1267 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1268 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1269}
1270
8b61709d 1271static int
f7791740 1272netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1273{
f7791740 1274 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1275 if (rx->is_tap) {
8b61709d 1276 struct ifreq ifr;
f7791740 1277 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1278 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1279 if (error) {
1280 return error;
1281 }
796223f5 1282 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1283 return 0;
1284 } else {
796223f5 1285 return drain_rcvbuf(rx->fd);
8b61709d
BP
1286 }
1287}
1288
d19cf8bb
ZG
1289static int
1290netdev_linux_sock_batch_send(int sock, int ifindex,
1291 struct dp_packet_batch *batch)
1292{
e0a00cee 1293 const size_t size = dp_packet_batch_size(batch);
d19cf8bb
ZG
1294 /* We don't bother setting most fields in sockaddr_ll because the
1295 * kernel ignores them for SOCK_RAW. */
1296 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1297 .sll_ifindex = ifindex };
1298
e0a00cee
BB
1299 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1300 struct iovec *iov = xmalloc(sizeof(*iov) * size);
d19cf8bb 1301
e0a00cee 1302 struct dp_packet *packet;
e883448e 1303 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
d19cf8bb 1304 iov[i].iov_base = dp_packet_data(packet);
ad8b0b4f 1305 iov[i].iov_len = dp_packet_size(packet);
d19cf8bb
ZG
1306 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1307 .msg_namelen = sizeof sll,
1308 .msg_iov = &iov[i],
1309 .msg_iovlen = 1 };
1310 }
1311
1312 int error = 0;
e0a00cee 1313 for (uint32_t ofs = 0; ofs < size; ) {
d19cf8bb
ZG
1314 ssize_t retval;
1315 do {
e0a00cee 1316 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
d19cf8bb
ZG
1317 error = retval < 0 ? errno : 0;
1318 } while (error == EINTR);
1319 if (error) {
1320 break;
1321 }
1322 ofs += retval;
1323 }
1324
1325 free(mmsg);
1326 free(iov);
1327 return error;
1328}
1329
1330/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1331 * essential, because packets sent to a tap device with an AF_PACKET socket
1332 * will loop back to be *received* again on the tap device. This doesn't occur
1333 * on other interface types because we attach a socket filter to the rx
1334 * socket. */
1335static int
1336netdev_linux_tap_batch_send(struct netdev *netdev_,
1337 struct dp_packet_batch *batch)
1338{
1339 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
13708b21 1340 struct dp_packet *packet;
22dcb534
FL
1341
1342 /* The Linux tap driver returns EIO if the device is not up,
1343 * so if the device is not up, don't waste time sending it.
1344 * However, if the device is in another network namespace
1345 * then OVS can't retrieve the state. In that case, send the
1346 * packets anyway. */
1347 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1348 netdev->tx_dropped += dp_packet_batch_size(batch);
1349 return 0;
1350 }
1351
e883448e 1352 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
ad8b0b4f 1353 size_t size = dp_packet_size(packet);
d19cf8bb
ZG
1354 ssize_t retval;
1355 int error;
1356
1357 do {
1358 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1359 error = retval < 0 ? errno : 0;
1360 } while (error == EINTR);
1361
1362 if (error) {
1363 /* The Linux tap driver returns EIO if the device is not up. From
1364 * the OVS side this is not an error, so we ignore it; otherwise,
1365 * return the erro. */
1366 if (error != EIO) {
1367 return error;
1368 }
1369 } else if (retval != size) {
1370 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1371 "bytes of %"PRIuSIZE") on %s",
1372 retval, size, netdev_get_name(netdev_));
1373 return EMSGSIZE;
1374 }
1375 }
1376 return 0;
1377}
1378
1379/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
8b61709d
BP
1380 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1381 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1382 * the packet is too big or too small to transmit on the device.
1383 *
8b61709d
BP
1384 * The kernel maintains a packet transmission queue, so the caller is not
1385 * expected to do additional queuing of packets. */
1386static int
f00fa8cb 1387netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
b30896c9 1388 struct dp_packet_batch *batch,
324c8374 1389 bool concurrent_txq OVS_UNUSED)
8b61709d 1390{
f4fd623c 1391 int error = 0;
0a62ae2c
ZG
1392 int sock = 0;
1393
0a62ae2c 1394 if (!is_tap_netdev(netdev_)) {
e0e2410d
FL
1395 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
1396 error = EOPNOTSUPP;
1397 goto free_batch;
1398 }
1399
0a62ae2c
ZG
1400 sock = af_packet_sock();
1401 if (sock < 0) {
1402 error = -sock;
1403 goto free_batch;
1404 }
1405
1406 int ifindex = netdev_get_ifindex(netdev_);
1407 if (ifindex < 0) {
1408 error = -ifindex;
1409 goto free_batch;
1410 }
1411
d19cf8bb
ZG
1412 error = netdev_linux_sock_batch_send(sock, ifindex, batch);
1413 } else {
1414 error = netdev_linux_tap_batch_send(netdev_, batch);
0a62ae2c 1415 }
d19cf8bb
ZG
1416 if (error) {
1417 if (error == ENOBUFS) {
1418 /* The Linux AF_PACKET implementation never blocks waiting
1419 * for room for packets, instead returning ENOBUFS.
1420 * Translate this into EAGAIN for the caller. */
1421 error = EAGAIN;
f23347ea 1422 } else {
f4fd623c
DDP
1423 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1424 netdev_get_name(netdev_), ovs_strerror(error));
d19cf8bb 1425 }
f4fd623c
DDP
1426 }
1427
0a62ae2c 1428free_batch:
b30896c9 1429 dp_packet_delete_batch(batch, true);
f4fd623c 1430 return error;
8b61709d
BP
1431}
1432
1433/* Registers with the poll loop to wake up from the next call to poll_block()
1434 * when the packet transmission queue has sufficient room to transmit a packet
1435 * with netdev_send().
1436 *
1437 * The kernel maintains a packet transmission queue, so the client is not
1438 * expected to do additional queuing of packets. Thus, this function is
1439 * unlikely to ever be used. It is included for completeness. */
1440static void
f00fa8cb 1441netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1442{
796223f5 1443 if (is_tap_netdev(netdev)) {
8b61709d
BP
1444 /* TAP device always accepts packets.*/
1445 poll_immediate_wake();
1446 }
1447}
1448
1449/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1450 * otherwise a positive errno value. */
1451static int
74ff3298 1452netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1453{
b5d57fc8 1454 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1455 enum netdev_flags old_flags = 0;
eb395f2e
BP
1456 int error;
1457
86383816 1458 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1459 if (netdev_linux_netnsid_is_remote(netdev)) {
1460 error = EOPNOTSUPP;
1461 goto exit;
1462 }
86383816 1463
b5d57fc8 1464 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1465 error = netdev->ether_addr_error;
1466 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1467 goto exit;
44445cac 1468 }
b5d57fc8 1469 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1470 }
1471
7eb1bd81 1472 /* Tap devices must be brought down before setting the address. */
796223f5 1473 if (is_tap_netdev(netdev_)) {
4f9f3f21 1474 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1475 }
44445cac
PS
1476 error = set_etheraddr(netdev_get_name(netdev_), mac);
1477 if (!error || error == ENODEV) {
b5d57fc8
BP
1478 netdev->ether_addr_error = error;
1479 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1480 if (!error) {
74ff3298 1481 netdev->etheraddr = mac;
eb395f2e 1482 }
8b61709d 1483 }
44445cac 1484
4f9f3f21
BP
1485 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1486 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1487 }
7eb1bd81 1488
86383816
BP
1489exit:
1490 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1491 return error;
1492}
1493
44445cac 1494/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1495static int
74ff3298 1496netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1497{
b5d57fc8 1498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1499 int error;
44445cac 1500
86383816 1501 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1502 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
756819dd
FL
1503 netdev_linux_update_via_netlink(netdev);
1504 }
1505
1506 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1507 /* Fall back to ioctl if netlink fails */
86383816 1508 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1509 &netdev->etheraddr);
b5d57fc8 1510 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1511 }
44445cac 1512
86383816
BP
1513 error = netdev->ether_addr_error;
1514 if (!error) {
74ff3298 1515 *mac = netdev->etheraddr;
44445cac 1516 }
86383816 1517 ovs_mutex_unlock(&netdev->mutex);
44445cac 1518
86383816 1519 return error;
8b61709d
BP
1520}
1521
8b61709d 1522static int
73371c09 1523netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1524{
86383816
BP
1525 int error;
1526
b5d57fc8 1527 if (!(netdev->cache_valid & VALID_MTU)) {
756819dd
FL
1528 netdev_linux_update_via_netlink(netdev);
1529 }
1530
1531 if (!(netdev->cache_valid & VALID_MTU)) {
1532 /* Fall back to ioctl if netlink fails */
8b61709d 1533 struct ifreq ifr;
90a6637d 1534
86383816 1535 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1536 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1537 netdev->mtu = ifr.ifr_mtu;
1538 netdev->cache_valid |= VALID_MTU;
8b61709d 1539 }
90a6637d 1540
86383816
BP
1541 error = netdev->netdev_mtu_error;
1542 if (!error) {
b5d57fc8 1543 *mtup = netdev->mtu;
90a6637d 1544 }
73371c09
BP
1545
1546 return error;
1547}
1548
1549/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1550 * in bytes, not including the hardware header; thus, this is typically 1500
1551 * bytes for Ethernet devices. */
1552static int
1553netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1554{
1555 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1556 int error;
1557
1558 ovs_mutex_lock(&netdev->mutex);
1559 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1560 ovs_mutex_unlock(&netdev->mutex);
1561
1562 return error;
8b61709d
BP
1563}
1564
9b020780
PS
1565/* Sets the maximum size of transmitted (MTU) for given device using linux
1566 * networking ioctl interface.
1567 */
1568static int
4124cb12 1569netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1570{
b5d57fc8 1571 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1572 struct ifreq ifr;
1573 int error;
1574
86383816 1575 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1576 if (netdev_linux_netnsid_is_remote(netdev)) {
1577 error = EOPNOTSUPP;
1578 goto exit;
1579 }
1580
b5d57fc8 1581 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1582 error = netdev->netdev_mtu_error;
1583 if (error || netdev->mtu == mtu) {
1584 goto exit;
90a6637d 1585 }
b5d57fc8 1586 netdev->cache_valid &= ~VALID_MTU;
153e5481 1587 }
9b020780 1588 ifr.ifr_mtu = mtu;
259e0b1a
BP
1589 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1590 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1591 if (!error || error == ENODEV) {
b5d57fc8
BP
1592 netdev->netdev_mtu_error = error;
1593 netdev->mtu = ifr.ifr_mtu;
1594 netdev->cache_valid |= VALID_MTU;
9b020780 1595 }
86383816
BP
1596exit:
1597 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1598 return error;
9b020780
PS
1599}
1600
9ab3d9a3
BP
1601/* Returns the ifindex of 'netdev', if successful, as a positive number.
1602 * On failure, returns a negative errno value. */
1603static int
86383816 1604netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1605{
86383816 1606 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1607 int ifindex, error;
1608
86383816 1609 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1610 if (netdev_linux_netnsid_is_remote(netdev)) {
1611 error = EOPNOTSUPP;
1612 goto exit;
1613 }
86383816 1614 error = get_ifindex(netdev_, &ifindex);
86383816 1615
e0e2410d
FL
1616exit:
1617 ovs_mutex_unlock(&netdev->mutex);
9ab3d9a3
BP
1618 return error ? -error : ifindex;
1619}
1620
8b61709d
BP
1621static int
1622netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1623{
b5d57fc8 1624 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1625
86383816 1626 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1627 if (netdev->miimon_interval > 0) {
1628 *carrier = netdev->miimon;
3a183124 1629 } else {
b5d57fc8 1630 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1631 }
86383816 1632 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1633
3a183124 1634 return 0;
8b61709d
BP
1635}
1636
65c3058c 1637static long long int
86383816 1638netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1639{
86383816
BP
1640 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1641 long long int carrier_resets;
1642
1643 ovs_mutex_lock(&netdev->mutex);
1644 carrier_resets = netdev->carrier_resets;
1645 ovs_mutex_unlock(&netdev->mutex);
1646
1647 return carrier_resets;
65c3058c
EJ
1648}
1649
63331829 1650static int
1670c579
EJ
1651netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1652 struct mii_ioctl_data *data)
63331829 1653{
63331829 1654 struct ifreq ifr;
782e6111 1655 int error;
63331829 1656
63331829 1657 memset(&ifr, 0, sizeof ifr);
782e6111 1658 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1659 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1660 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1661
782e6111
EJ
1662 return error;
1663}
1664
1665static int
1670c579 1666netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1667{
782e6111
EJ
1668 struct mii_ioctl_data data;
1669 int error;
63331829 1670
782e6111
EJ
1671 *miimon = false;
1672
1673 memset(&data, 0, sizeof data);
1670c579 1674 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1675 if (!error) {
1676 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1677 data.reg_num = MII_BMSR;
1670c579 1678 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1679 &data);
63331829
EJ
1680
1681 if (!error) {
782e6111 1682 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1683 }
9120cfc0
DH
1684 }
1685 if (error) {
63331829 1686 struct ethtool_cmd ecmd;
63331829
EJ
1687
1688 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1689 name);
1690
ab985a77 1691 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1692 memset(&ecmd, 0, sizeof ecmd);
1693 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1694 "ETHTOOL_GLINK");
1695 if (!error) {
782e6111
EJ
1696 struct ethtool_value eval;
1697
1698 memcpy(&eval, &ecmd, sizeof eval);
1699 *miimon = !!eval.data;
63331829
EJ
1700 } else {
1701 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1702 }
1703 }
1704
1705 return error;
1706}
1707
1670c579
EJ
1708static int
1709netdev_linux_set_miimon_interval(struct netdev *netdev_,
1710 long long int interval)
1711{
b5d57fc8 1712 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1713
86383816 1714 ovs_mutex_lock(&netdev->mutex);
1670c579 1715 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1716 if (netdev->miimon_interval != interval) {
19c8e9c1 1717 if (interval && !netdev->miimon_interval) {
812c272c 1718 atomic_count_inc(&miimon_cnt);
19c8e9c1 1719 } else if (!interval && netdev->miimon_interval) {
812c272c 1720 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1721 }
1722
b5d57fc8
BP
1723 netdev->miimon_interval = interval;
1724 timer_set_expired(&netdev->miimon_timer);
1670c579 1725 }
86383816 1726 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1727
1728 return 0;
1729}
1730
1731static void
1732netdev_linux_miimon_run(void)
1733{
1734 struct shash device_shash;
1735 struct shash_node *node;
1736
1737 shash_init(&device_shash);
b5d57fc8 1738 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1739 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1740 struct netdev *netdev = node->data;
1741 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1742 bool miimon;
1743
86383816
BP
1744 ovs_mutex_lock(&dev->mutex);
1745 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1746 netdev_linux_get_miimon(dev->up.name, &miimon);
1747 if (miimon != dev->miimon) {
1748 dev->miimon = miimon;
1749 netdev_linux_changed(dev, dev->ifi_flags, 0);
1750 }
1670c579 1751
86383816 1752 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1753 }
86383816 1754 ovs_mutex_unlock(&dev->mutex);
2f980d74 1755 netdev_close(netdev);
1670c579
EJ
1756 }
1757
1758 shash_destroy(&device_shash);
1759}
1760
1761static void
1762netdev_linux_miimon_wait(void)
1763{
1764 struct shash device_shash;
1765 struct shash_node *node;
1766
1767 shash_init(&device_shash);
b5d57fc8 1768 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1769 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1770 struct netdev *netdev = node->data;
1771 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1772
86383816 1773 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1774 if (dev->miimon_interval > 0) {
1775 timer_wait(&dev->miimon_timer);
1776 }
86383816 1777 ovs_mutex_unlock(&dev->mutex);
2f980d74 1778 netdev_close(netdev);
1670c579
EJ
1779 }
1780 shash_destroy(&device_shash);
1781}
1782
92df599c
JG
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1790
c060c4cf
EJ
1791/* Copies 'src' into 'dst', performing format conversion in the process.
1792 *
1793 * 'src' is allowed to be misaligned. */
1794static void
1795netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1796 const struct ovs_vport_stats *src)
1797{
6a54dedc
BP
1798 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1799 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1800 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1801 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1802 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1803 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1804 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1805 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1806 dst->multicast = 0;
1807 dst->collisions = 0;
1808 dst->rx_length_errors = 0;
1809 dst->rx_over_errors = 0;
1810 dst->rx_crc_errors = 0;
1811 dst->rx_frame_errors = 0;
1812 dst->rx_fifo_errors = 0;
1813 dst->rx_missed_errors = 0;
1814 dst->tx_aborted_errors = 0;
1815 dst->tx_carrier_errors = 0;
1816 dst->tx_fifo_errors = 0;
1817 dst->tx_heartbeat_errors = 0;
1818 dst->tx_window_errors = 0;
1819}
1820
1821static int
1822get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1823{
93451a0a 1824 struct dpif_netlink_vport reply;
c060c4cf
EJ
1825 struct ofpbuf *buf;
1826 int error;
1827
93451a0a 1828 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1829 if (error) {
1830 return error;
1831 } else if (!reply.stats) {
1832 ofpbuf_delete(buf);
1833 return EOPNOTSUPP;
1834 }
1835
1836 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1837
1838 ofpbuf_delete(buf);
1839
1840 return 0;
1841}
1842
f613a0d7
PS
1843static void
1844get_stats_via_vport(const struct netdev *netdev_,
1845 struct netdev_stats *stats)
8b61709d 1846{
b5d57fc8 1847 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1848
b5d57fc8
BP
1849 if (!netdev->vport_stats_error ||
1850 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1851 int error;
7fbef77a 1852
c060c4cf 1853 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1854 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1855 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1856 "(%s)",
1857 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1858 }
b5d57fc8
BP
1859 netdev->vport_stats_error = error;
1860 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1861 }
f613a0d7 1862}
8b61709d 1863
f613a0d7
PS
1864/* Retrieves current device stats for 'netdev-linux'. */
1865static int
1866netdev_linux_get_stats(const struct netdev *netdev_,
1867 struct netdev_stats *stats)
1868{
b5d57fc8 1869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1870 struct netdev_stats dev_stats;
1871 int error;
1872
86383816 1873 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1874 get_stats_via_vport(netdev_, stats);
35eef899 1875 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1876 if (error) {
86383816
BP
1877 if (!netdev->vport_stats_error) {
1878 error = 0;
f613a0d7 1879 }
86383816 1880 } else if (netdev->vport_stats_error) {
04c881eb 1881 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1882 *stats = dev_stats;
1883 } else {
04c881eb
AZ
1884 /* Use kernel netdev's packet and byte counts since vport's counters
1885 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1886 * enabled. */
1887 stats->rx_packets = dev_stats.rx_packets;
1888 stats->rx_bytes = dev_stats.rx_bytes;
1889 stats->tx_packets = dev_stats.tx_packets;
1890 stats->tx_bytes = dev_stats.tx_bytes;
1891
f613a0d7
PS
1892 stats->rx_errors += dev_stats.rx_errors;
1893 stats->tx_errors += dev_stats.tx_errors;
1894 stats->rx_dropped += dev_stats.rx_dropped;
1895 stats->tx_dropped += dev_stats.tx_dropped;
1896 stats->multicast += dev_stats.multicast;
1897 stats->collisions += dev_stats.collisions;
1898 stats->rx_length_errors += dev_stats.rx_length_errors;
1899 stats->rx_over_errors += dev_stats.rx_over_errors;
1900 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1901 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1902 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1903 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1904 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1905 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1906 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1907 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1908 stats->tx_window_errors += dev_stats.tx_window_errors;
1909 }
86383816
BP
1910 ovs_mutex_unlock(&netdev->mutex);
1911
1912 return error;
f613a0d7
PS
1913}
1914
1915/* Retrieves current device stats for 'netdev-tap' netdev or
1916 * netdev-internal. */
1917static int
15aee116 1918netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1919{
b5d57fc8 1920 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1921 struct netdev_stats dev_stats;
1922 int error;
1923
86383816 1924 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1925 get_stats_via_vport(netdev_, stats);
35eef899 1926 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1927 if (error) {
86383816
BP
1928 if (!netdev->vport_stats_error) {
1929 error = 0;
8b61709d 1930 }
86383816
BP
1931 } else if (netdev->vport_stats_error) {
1932 /* Transmit and receive stats will appear to be swapped relative to the
1933 * other ports since we are the one sending the data, not a remote
1934 * computer. For consistency, we swap them back here. This does not
1935 * apply if we are getting stats from the vport layer because it always
1936 * tracks stats from the perspective of the switch. */
fe6b0e03 1937
f613a0d7 1938 *stats = dev_stats;
92df599c
JG
1939 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1940 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1941 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1942 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1943 stats->rx_length_errors = 0;
1944 stats->rx_over_errors = 0;
1945 stats->rx_crc_errors = 0;
1946 stats->rx_frame_errors = 0;
1947 stats->rx_fifo_errors = 0;
1948 stats->rx_missed_errors = 0;
1949 stats->tx_aborted_errors = 0;
1950 stats->tx_carrier_errors = 0;
1951 stats->tx_fifo_errors = 0;
1952 stats->tx_heartbeat_errors = 0;
1953 stats->tx_window_errors = 0;
f613a0d7 1954 } else {
04c881eb
AZ
1955 /* Use kernel netdev's packet and byte counts since vport counters
1956 * do not reflect packet counts on the wire when GSO, TSO or GRO
1957 * are enabled. */
1958 stats->rx_packets = dev_stats.tx_packets;
1959 stats->rx_bytes = dev_stats.tx_bytes;
1960 stats->tx_packets = dev_stats.rx_packets;
1961 stats->tx_bytes = dev_stats.rx_bytes;
1962
f613a0d7
PS
1963 stats->rx_dropped += dev_stats.tx_dropped;
1964 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1965
f613a0d7
PS
1966 stats->rx_errors += dev_stats.tx_errors;
1967 stats->tx_errors += dev_stats.rx_errors;
1968
1969 stats->multicast += dev_stats.multicast;
1970 stats->collisions += dev_stats.collisions;
1971 }
22dcb534 1972 stats->tx_dropped += netdev->tx_dropped;
86383816
BP
1973 ovs_mutex_unlock(&netdev->mutex);
1974
1975 return error;
8b61709d
BP
1976}
1977
bba1e6f3
PS
1978static int
1979netdev_internal_get_stats(const struct netdev *netdev_,
1980 struct netdev_stats *stats)
1981{
b5d57fc8 1982 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1983 int error;
bba1e6f3 1984
86383816 1985 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1986 get_stats_via_vport(netdev_, stats);
86383816
BP
1987 error = netdev->vport_stats_error;
1988 ovs_mutex_unlock(&netdev->mutex);
1989
1990 return error;
bba1e6f3
PS
1991}
1992
51f87458 1993static void
b5d57fc8 1994netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1995{
1996 struct ethtool_cmd ecmd;
6c038611 1997 uint32_t speed;
8b61709d
BP
1998 int error;
1999
b5d57fc8 2000 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
2001 return;
2002 }
2003
ab985a77 2004 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2005 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 2006 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
2007 ETHTOOL_GSET, "ETHTOOL_GSET");
2008 if (error) {
51f87458 2009 goto out;
8b61709d
BP
2010 }
2011
2012 /* Supported features. */
b5d57fc8 2013 netdev->supported = 0;
8b61709d 2014 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 2015 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
2016 }
2017 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 2018 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
2019 }
2020 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 2021 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
2022 }
2023 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 2024 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
2025 }
2026 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 2027 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 2028 }
67bed84c
SH
2029 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
2030 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 2031 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 2032 }
67bed84c
SH
2033 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
2034 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
2035 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
2036 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 2037 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 2038 }
67bed84c
SH
2039 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
2040 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
2041 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
2042 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
2043 netdev->supported |= NETDEV_F_40GB_FD;
2044 }
8b61709d 2045 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 2046 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
2047 }
2048 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 2049 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
2050 }
2051 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 2052 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
2053 }
2054 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 2055 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
2056 }
2057 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 2058 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2059 }
2060
2061 /* Advertised features. */
b5d57fc8 2062 netdev->advertised = 0;
8b61709d 2063 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 2064 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
2065 }
2066 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 2067 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
2068 }
2069 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 2070 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
2071 }
2072 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 2073 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
2074 }
2075 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 2076 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 2077 }
67bed84c
SH
2078 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2079 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 2080 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 2081 }
67bed84c
SH
2082 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2083 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2084 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2085 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 2086 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 2087 }
67bed84c
SH
2088 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2089 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2090 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2091 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2092 netdev->advertised |= NETDEV_F_40GB_FD;
2093 }
8b61709d 2094 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 2095 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
2096 }
2097 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 2098 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
2099 }
2100 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 2101 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
2102 }
2103 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 2104 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
2105 }
2106 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 2107 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2108 }
2109
2110 /* Current settings. */
0c615356 2111 speed = ethtool_cmd_speed(&ecmd);
6c038611 2112 if (speed == SPEED_10) {
b5d57fc8 2113 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 2114 } else if (speed == SPEED_100) {
b5d57fc8 2115 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 2116 } else if (speed == SPEED_1000) {
b5d57fc8 2117 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 2118 } else if (speed == SPEED_10000) {
b5d57fc8 2119 netdev->current = NETDEV_F_10GB_FD;
6c038611 2120 } else if (speed == 40000) {
b5d57fc8 2121 netdev->current = NETDEV_F_40GB_FD;
6c038611 2122 } else if (speed == 100000) {
b5d57fc8 2123 netdev->current = NETDEV_F_100GB_FD;
6c038611 2124 } else if (speed == 1000000) {
b5d57fc8 2125 netdev->current = NETDEV_F_1TB_FD;
8b61709d 2126 } else {
b5d57fc8 2127 netdev->current = 0;
8b61709d
BP
2128 }
2129
2130 if (ecmd.port == PORT_TP) {
b5d57fc8 2131 netdev->current |= NETDEV_F_COPPER;
8b61709d 2132 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 2133 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
2134 }
2135
2136 if (ecmd.autoneg) {
b5d57fc8 2137 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
2138 }
2139
51f87458 2140out:
b5d57fc8
BP
2141 netdev->cache_valid |= VALID_FEATURES;
2142 netdev->get_features_error = error;
51f87458
PS
2143}
2144
887ed8b2
BP
2145/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2146 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2147 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2148static int
2149netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2150 enum netdev_features *current,
2151 enum netdev_features *advertised,
2152 enum netdev_features *supported,
2153 enum netdev_features *peer)
51f87458 2154{
b5d57fc8 2155 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2156 int error;
51f87458 2157
86383816 2158 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2159 if (netdev_linux_netnsid_is_remote(netdev)) {
2160 error = EOPNOTSUPP;
2161 goto exit;
2162 }
2163
b5d57fc8 2164 netdev_linux_read_features(netdev);
b5d57fc8
BP
2165 if (!netdev->get_features_error) {
2166 *current = netdev->current;
2167 *advertised = netdev->advertised;
2168 *supported = netdev->supported;
887ed8b2 2169 *peer = 0; /* XXX */
51f87458 2170 }
86383816 2171 error = netdev->get_features_error;
86383816 2172
e0e2410d
FL
2173exit:
2174 ovs_mutex_unlock(&netdev->mutex);
86383816 2175 return error;
8b61709d
BP
2176}
2177
2178/* Set the features advertised by 'netdev' to 'advertise'. */
2179static int
86383816 2180netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2181 enum netdev_features advertise)
8b61709d 2182{
86383816 2183 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2184 struct ethtool_cmd ecmd;
2185 int error;
2186
86383816
BP
2187 ovs_mutex_lock(&netdev->mutex);
2188
ab985a77 2189 COVERAGE_INC(netdev_get_ethtool);
e0e2410d
FL
2190
2191 if (netdev_linux_netnsid_is_remote(netdev)) {
2192 error = EOPNOTSUPP;
2193 goto exit;
2194 }
2195
8b61709d 2196 memset(&ecmd, 0, sizeof ecmd);
86383816 2197 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2198 ETHTOOL_GSET, "ETHTOOL_GSET");
2199 if (error) {
86383816 2200 goto exit;
8b61709d
BP
2201 }
2202
2203 ecmd.advertising = 0;
6c038611 2204 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2205 ecmd.advertising |= ADVERTISED_10baseT_Half;
2206 }
6c038611 2207 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2208 ecmd.advertising |= ADVERTISED_10baseT_Full;
2209 }
6c038611 2210 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2211 ecmd.advertising |= ADVERTISED_100baseT_Half;
2212 }
6c038611 2213 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2214 ecmd.advertising |= ADVERTISED_100baseT_Full;
2215 }
6c038611 2216 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2217 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2218 }
6c038611 2219 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2220 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2221 }
6c038611 2222 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2223 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2224 }
6c038611 2225 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2226 ecmd.advertising |= ADVERTISED_TP;
2227 }
6c038611 2228 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2229 ecmd.advertising |= ADVERTISED_FIBRE;
2230 }
6c038611 2231 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2232 ecmd.advertising |= ADVERTISED_Autoneg;
2233 }
6c038611 2234 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2235 ecmd.advertising |= ADVERTISED_Pause;
2236 }
6c038611 2237 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2238 ecmd.advertising |= ADVERTISED_Asym_Pause;
2239 }
ab985a77 2240 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2241 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2242 ETHTOOL_SSET, "ETHTOOL_SSET");
2243
2244exit:
2245 ovs_mutex_unlock(&netdev->mutex);
2246 return error;
8b61709d
BP
2247}
2248
f8500004
JP
2249/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2250 * successful, otherwise a positive errno value. */
8b61709d 2251static int
b5d57fc8 2252netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2253 uint32_t kbits_rate, uint32_t kbits_burst)
2254{
b5d57fc8
BP
2255 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2256 const char *netdev_name = netdev_get_name(netdev_);
7874bdff 2257 int ifindex;
f8500004 2258 int error;
8b61709d 2259
d5ae4a60
PB
2260 if (netdev_is_flow_api_enabled()) {
2261 if (kbits_rate) {
2262 VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
2263 netdev_name);
2264 }
2265 return EOPNOTSUPP;
2266 }
2267
80a86fbe 2268 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2269 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2270 : kbits_burst); /* Stick with user-specified value. */
2271
86383816 2272 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2273 if (netdev_linux_netnsid_is_remote(netdev)) {
2274 error = EOPNOTSUPP;
2275 goto out;
2276 }
2277
b5d57fc8 2278 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2279 error = netdev->netdev_policing_error;
2280 if (error || (netdev->kbits_rate == kbits_rate &&
2281 netdev->kbits_burst == kbits_burst)) {
c9f71668 2282 /* Assume that settings haven't changed since we last set them. */
86383816 2283 goto out;
c9f71668 2284 }
b5d57fc8 2285 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2286 }
2287
7874bdff
RD
2288 error = get_ifindex(netdev_, &ifindex);
2289 if (error) {
2290 goto out;
2291 }
2292
ac8c3412 2293 COVERAGE_INC(netdev_set_policing);
f8500004 2294 /* Remove any existing ingress qdisc. */
093c9458 2295 error = tc_add_del_ingress_qdisc(ifindex, false, 0);
f8500004
JP
2296 if (error) {
2297 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2298 netdev_name, ovs_strerror(error));
c9f71668 2299 goto out;
f8500004
JP
2300 }
2301
8b61709d 2302 if (kbits_rate) {
093c9458 2303 error = tc_add_del_ingress_qdisc(ifindex, true, 0);
f8500004
JP
2304 if (error) {
2305 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2306 netdev_name, ovs_strerror(error));
c9f71668 2307 goto out;
8b61709d
BP
2308 }
2309
b5d57fc8 2310 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2311 if (error){
2312 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2313 netdev_name, ovs_strerror(error));
c9f71668 2314 goto out;
8b61709d 2315 }
8b61709d
BP
2316 }
2317
b5d57fc8
BP
2318 netdev->kbits_rate = kbits_rate;
2319 netdev->kbits_burst = kbits_burst;
f8500004 2320
c9f71668
PS
2321out:
2322 if (!error || error == ENODEV) {
b5d57fc8
BP
2323 netdev->netdev_policing_error = error;
2324 netdev->cache_valid |= VALID_POLICING;
c9f71668 2325 }
86383816 2326 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2327 return error;
8b61709d
BP
2328}
2329
c1c9c9c4
BP
2330static int
2331netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2332 struct sset *types)
c1c9c9c4 2333{
559eb230 2334 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2335 for (opsp = tcs; *opsp != NULL; opsp++) {
2336 const struct tc_ops *ops = *opsp;
2337 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2338 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2339 }
2340 }
2341 return 0;
2342}
2343
2344static const struct tc_ops *
2345tc_lookup_ovs_name(const char *name)
2346{
559eb230 2347 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2348
2349 for (opsp = tcs; *opsp != NULL; opsp++) {
2350 const struct tc_ops *ops = *opsp;
2351 if (!strcmp(name, ops->ovs_name)) {
2352 return ops;
2353 }
2354 }
2355 return NULL;
2356}
2357
2358static const struct tc_ops *
2359tc_lookup_linux_name(const char *name)
2360{
559eb230 2361 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2362
2363 for (opsp = tcs; *opsp != NULL; opsp++) {
2364 const struct tc_ops *ops = *opsp;
2365 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2366 return ops;
2367 }
2368 }
2369 return NULL;
2370}
2371
93b13be8 2372static struct tc_queue *
b5d57fc8 2373tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2374 size_t hash)
2375{
b5d57fc8 2376 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2377 struct tc_queue *queue;
2378
b5d57fc8 2379 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2380 if (queue->queue_id == queue_id) {
2381 return queue;
2382 }
2383 }
2384 return NULL;
2385}
2386
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The lookup hash is hash_int() of the queue ID with basis 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2392
c1c9c9c4
BP
2393static int
2394netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2395 const char *type,
2396 struct netdev_qos_capabilities *caps)
2397{
2398 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2399 if (!ops) {
2400 return EOPNOTSUPP;
2401 }
2402 caps->n_queues = ops->n_queues;
2403 return 0;
2404}
2405
2406static int
b5d57fc8 2407netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2408 const char **typep, struct smap *details)
c1c9c9c4 2409{
b5d57fc8 2410 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2411 int error;
2412
86383816 2413 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2414 if (netdev_linux_netnsid_is_remote(netdev)) {
2415 error = EOPNOTSUPP;
2416 goto exit;
2417 }
2418
b5d57fc8 2419 error = tc_query_qdisc(netdev_);
86383816
BP
2420 if (!error) {
2421 *typep = netdev->tc->ops->ovs_name;
2422 error = (netdev->tc->ops->qdisc_get
2423 ? netdev->tc->ops->qdisc_get(netdev_, details)
2424 : 0);
c1c9c9c4
BP
2425 }
2426
e0e2410d
FL
2427exit:
2428 ovs_mutex_unlock(&netdev->mutex);
86383816 2429 return error;
c1c9c9c4
BP
2430}
2431
2432static int
b5d57fc8 2433netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2434 const char *type, const struct smap *details)
c1c9c9c4 2435{
b5d57fc8 2436 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2437 const struct tc_ops *new_ops;
2438 int error;
2439
2440 new_ops = tc_lookup_ovs_name(type);
2441 if (!new_ops || !new_ops->tc_install) {
2442 return EOPNOTSUPP;
2443 }
2444
6cf888b8
BS
2445 if (new_ops == &tc_ops_noop) {
2446 return new_ops->tc_install(netdev_, details);
2447 }
2448
86383816 2449 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2450 if (netdev_linux_netnsid_is_remote(netdev)) {
2451 error = EOPNOTSUPP;
2452 goto exit;
2453 }
2454
b5d57fc8 2455 error = tc_query_qdisc(netdev_);
c1c9c9c4 2456 if (error) {
86383816 2457 goto exit;
c1c9c9c4
BP
2458 }
2459
b5d57fc8 2460 if (new_ops == netdev->tc->ops) {
86383816 2461 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2462 } else {
2463 /* Delete existing qdisc. */
b5d57fc8 2464 error = tc_del_qdisc(netdev_);
c1c9c9c4 2465 if (error) {
86383816 2466 goto exit;
c1c9c9c4 2467 }
b5d57fc8 2468 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2469
2470 /* Install new qdisc. */
b5d57fc8
BP
2471 error = new_ops->tc_install(netdev_, details);
2472 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2473 }
86383816
BP
2474
2475exit:
2476 ovs_mutex_unlock(&netdev->mutex);
2477 return error;
c1c9c9c4
BP
2478}
2479
2480static int
b5d57fc8 2481netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2482 unsigned int queue_id, struct smap *details)
c1c9c9c4 2483{
b5d57fc8 2484 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2485 int error;
2486
86383816 2487 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2488 if (netdev_linux_netnsid_is_remote(netdev)) {
2489 error = EOPNOTSUPP;
2490 goto exit;
2491 }
2492
b5d57fc8 2493 error = tc_query_qdisc(netdev_);
86383816 2494 if (!error) {
b5d57fc8 2495 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2496 error = (queue
b5d57fc8 2497 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2498 : ENOENT);
c1c9c9c4 2499 }
86383816 2500
e0e2410d
FL
2501exit:
2502 ovs_mutex_unlock(&netdev->mutex);
86383816 2503 return error;
c1c9c9c4
BP
2504}
2505
2506static int
b5d57fc8 2507netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2508 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2509{
b5d57fc8 2510 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2511 int error;
2512
86383816 2513 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2514 if (netdev_linux_netnsid_is_remote(netdev)) {
2515 error = EOPNOTSUPP;
2516 goto exit;
2517 }
2518
b5d57fc8 2519 error = tc_query_qdisc(netdev_);
86383816
BP
2520 if (!error) {
2521 error = (queue_id < netdev->tc->ops->n_queues
2522 && netdev->tc->ops->class_set
2523 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2524 : EINVAL);
c1c9c9c4
BP
2525 }
2526
e0e2410d
FL
2527exit:
2528 ovs_mutex_unlock(&netdev->mutex);
86383816 2529 return error;
c1c9c9c4
BP
2530}
2531
2532static int
b5d57fc8 2533netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2534{
b5d57fc8 2535 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2536 int error;
2537
86383816 2538 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2539 if (netdev_linux_netnsid_is_remote(netdev)) {
2540 error = EOPNOTSUPP;
2541 goto exit;
2542 }
2543
b5d57fc8 2544 error = tc_query_qdisc(netdev_);
86383816
BP
2545 if (!error) {
2546 if (netdev->tc->ops->class_delete) {
2547 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2548 error = (queue
2549 ? netdev->tc->ops->class_delete(netdev_, queue)
2550 : ENOENT);
2551 } else {
2552 error = EINVAL;
2553 }
c1c9c9c4 2554 }
86383816 2555
e0e2410d
FL
2556exit:
2557 ovs_mutex_unlock(&netdev->mutex);
86383816 2558 return error;
c1c9c9c4
BP
2559}
2560
2561static int
b5d57fc8 2562netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2563 unsigned int queue_id,
2564 struct netdev_queue_stats *stats)
2565{
b5d57fc8 2566 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2567 int error;
2568
86383816 2569 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2570 if (netdev_linux_netnsid_is_remote(netdev)) {
2571 error = EOPNOTSUPP;
2572 goto exit;
2573 }
2574
b5d57fc8 2575 error = tc_query_qdisc(netdev_);
86383816
BP
2576 if (!error) {
2577 if (netdev->tc->ops->class_get_stats) {
2578 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2579 if (queue) {
2580 stats->created = queue->created;
2581 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2582 stats);
2583 } else {
2584 error = ENOENT;
2585 }
2586 } else {
2587 error = EOPNOTSUPP;
6dc34a0d 2588 }
c1c9c9c4 2589 }
86383816 2590
e0e2410d
FL
2591exit:
2592 ovs_mutex_unlock(&netdev->mutex);
86383816 2593 return error;
c1c9c9c4
BP
2594}
2595
d57695d7
JS
2596struct queue_dump_state {
2597 struct nl_dump dump;
2598 struct ofpbuf buf;
2599};
2600
23a98ffe 2601static bool
d57695d7 2602start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2603{
2604 struct ofpbuf request;
2605 struct tcmsg *tcmsg;
2606
7874bdff 2607 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2608 if (!tcmsg) {
2609 return false;
2610 }
3c4de644 2611 tcmsg->tcm_parent = 0;
d57695d7 2612 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2613 ofpbuf_uninit(&request);
d57695d7
JS
2614
2615 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2616 return true;
c1c9c9c4
BP
2617}
2618
d57695d7
JS
2619static int
2620finish_queue_dump(struct queue_dump_state *state)
2621{
2622 ofpbuf_uninit(&state->buf);
2623 return nl_dump_done(&state->dump);
2624}
2625
89454bf4
BP
/* Iteration state for the netdev_linux_queue_dump_*() functions: a snapshot
 * of queue IDs taken when the dump starts. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of 'n_queues' queue IDs. */
    size_t cur_queue;           /* Index in 'queues' of the next ID to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2631
c1c9c9c4 2632static int
89454bf4 2633netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2634{
e0e2410d 2635 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2636 int error;
2637
86383816 2638 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2639 if (netdev_linux_netnsid_is_remote(netdev)) {
2640 error = EOPNOTSUPP;
2641 goto exit;
2642 }
2643
b5d57fc8 2644 error = tc_query_qdisc(netdev_);
86383816
BP
2645 if (!error) {
2646 if (netdev->tc->ops->class_get) {
89454bf4
BP
2647 struct netdev_linux_queue_state *state;
2648 struct tc_queue *queue;
2649 size_t i;
2650
2651 *statep = state = xmalloc(sizeof *state);
2652 state->n_queues = hmap_count(&netdev->tc->queues);
2653 state->cur_queue = 0;
2654 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2655
2656 i = 0;
2657 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2658 state->queues[i++] = queue->queue_id;
86383816 2659 }
c1c9c9c4 2660 } else {
86383816 2661 error = EOPNOTSUPP;
c1c9c9c4
BP
2662 }
2663 }
c1c9c9c4 2664
e0e2410d
FL
2665exit:
2666 ovs_mutex_unlock(&netdev->mutex);
86383816 2667 return error;
c1c9c9c4
BP
2668}
2669
89454bf4
BP
2670static int
2671netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2672 unsigned int *queue_idp, struct smap *details)
2673{
e0e2410d 2674 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
89454bf4
BP
2675 struct netdev_linux_queue_state *state = state_;
2676 int error = EOF;
2677
2678 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2679 if (netdev_linux_netnsid_is_remote(netdev)) {
2680 error = EOPNOTSUPP;
2681 goto exit;
2682 }
2683
89454bf4
BP
2684 while (state->cur_queue < state->n_queues) {
2685 unsigned int queue_id = state->queues[state->cur_queue++];
2686 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2687
2688 if (queue) {
2689 *queue_idp = queue_id;
2690 error = netdev->tc->ops->class_get(netdev_, queue, details);
2691 break;
2692 }
2693 }
89454bf4 2694
e0e2410d
FL
2695exit:
2696 ovs_mutex_unlock(&netdev->mutex);
89454bf4
BP
2697 return error;
2698}
2699
2700static int
2701netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2702 void *state_)
2703{
2704 struct netdev_linux_queue_state *state = state_;
2705
2706 free(state->queues);
2707 free(state);
2708 return 0;
2709}
2710
c1c9c9c4 2711static int
b5d57fc8 2712netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2713 netdev_dump_queue_stats_cb *cb, void *aux)
2714{
b5d57fc8 2715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2716 int error;
2717
86383816 2718 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2719 if (netdev_linux_netnsid_is_remote(netdev)) {
2720 error = EOPNOTSUPP;
2721 goto exit;
2722 }
2723
b5d57fc8 2724 error = tc_query_qdisc(netdev_);
86383816 2725 if (!error) {
d57695d7 2726 struct queue_dump_state state;
c1c9c9c4 2727
86383816
BP
2728 if (!netdev->tc->ops->class_dump_stats) {
2729 error = EOPNOTSUPP;
d57695d7 2730 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2731 error = ENODEV;
2732 } else {
2733 struct ofpbuf msg;
2734 int retval;
2735
d57695d7 2736 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2737 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2738 cb, aux);
2739 if (retval) {
2740 error = retval;
2741 }
2742 }
2743
d57695d7 2744 retval = finish_queue_dump(&state);
86383816
BP
2745 if (retval) {
2746 error = retval;
2747 }
c1c9c9c4
BP
2748 }
2749 }
2750
e0e2410d
FL
2751exit:
2752 ovs_mutex_unlock(&netdev->mutex);
86383816 2753 return error;
c1c9c9c4
BP
2754}
2755
8b61709d 2756static int
f1acd62b
BP
2757netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2758 struct in_addr netmask)
8b61709d 2759{
b5d57fc8 2760 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2761 int error;
2762
86383816 2763 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2764 if (netdev_linux_netnsid_is_remote(netdev)) {
2765 error = EOPNOTSUPP;
2766 goto exit;
2767 }
2768
f1acd62b 2769 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2770 if (!error) {
f1acd62b 2771 if (address.s_addr != INADDR_ANY) {
8b61709d 2772 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2773 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2774 }
2775 }
49af9a3d 2776
e0e2410d 2777exit:
86383816 2778 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
2779 return error;
2780}
2781
7df6932e
AW
2782/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2783 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2784 * error. */
8b61709d 2785static int
a8704b50
PS
2786netdev_linux_get_addr_list(const struct netdev *netdev_,
2787 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2788{
b5d57fc8 2789 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2790 int error;
86383816
BP
2791
2792 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2793 if (netdev_linux_netnsid_is_remote(netdev)) {
2794 error = EOPNOTSUPP;
2795 goto exit;
2796 }
2797
a8704b50 2798 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816 2799
e0e2410d
FL
2800exit:
2801 ovs_mutex_unlock(&netdev->mutex);
7df6932e 2802 return error;
8b61709d
BP
2803}
2804
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  '*sa'
 * is zeroed first so that no stale bytes remain in it. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 address in a correctly typed temporary... */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    /* ...then copy its bytes into the generic sockaddr. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2817
2818static int
2819do_set_addr(struct netdev *netdev,
2820 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2821{
2822 struct ifreq ifr;
149f577a 2823
259e0b1a
BP
2824 make_in4_sockaddr(&ifr.ifr_addr, addr);
2825 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2826 ioctl_name);
8b61709d
BP
2827}
2828
2829/* Adds 'router' as a default IP gateway. */
2830static int
67a4917b 2831netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2832{
2833 struct in_addr any = { INADDR_ANY };
2834 struct rtentry rt;
2835 int error;
2836
2837 memset(&rt, 0, sizeof rt);
2838 make_in4_sockaddr(&rt.rt_dst, any);
2839 make_in4_sockaddr(&rt.rt_gateway, router);
2840 make_in4_sockaddr(&rt.rt_genmask, any);
2841 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2842 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2843 if (error) {
10a89ef0 2844 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2845 }
2846 return error;
2847}
2848
f1acd62b
BP
2849static int
2850netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2851 char **netdev_name)
2852{
2853 static const char fn[] = "/proc/net/route";
2854 FILE *stream;
2855 char line[256];
2856 int ln;
2857
2858 *netdev_name = NULL;
2859 stream = fopen(fn, "r");
2860 if (stream == NULL) {
10a89ef0 2861 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2862 return errno;
2863 }
2864
2865 ln = 0;
2866 while (fgets(line, sizeof line, stream)) {
2867 if (++ln >= 2) {
2868 char iface[17];
dbba996b 2869 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2870 int refcnt, metric, mtu;
2871 unsigned int flags, use, window, irtt;
2872
c2c28dfd
BP
2873 if (!ovs_scan(line,
2874 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2875 " %d %u %u\n",
2876 iface, &dest, &gateway, &flags, &refcnt,
2877 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2878 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2879 fn, ln, line);
2880 continue;
2881 }
2882 if (!(flags & RTF_UP)) {
2883 /* Skip routes that aren't up. */
2884 continue;
2885 }
2886
2887 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2888 * network byte order, so we don't need need any endian
f1acd62b
BP
2889 * conversions here. */
2890 if ((dest & mask) == (host->s_addr & mask)) {
2891 if (!gateway) {
2892 /* The host is directly reachable. */
2893 next_hop->s_addr = 0;
2894 } else {
2895 /* To reach the host, we must go through a gateway. */
2896 next_hop->s_addr = gateway;
2897 }
2898 *netdev_name = xstrdup(iface);
2899 fclose(stream);
2900 return 0;
2901 }
2902 }
2903 }
2904
2905 fclose(stream);
2906 return ENXIO;
2907}
2908
e210037e 2909static int
b5d57fc8 2910netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2911{
b5d57fc8 2912 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2913 int error = 0;
2914
86383816 2915 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2916 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2917 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2918
2919 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2920 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2921 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2922 cmd,
2923 ETHTOOL_GDRVINFO,
2924 "ETHTOOL_GDRVINFO");
2925 if (!error) {
b5d57fc8 2926 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2927 }
2928 }
e210037e 2929
e210037e 2930 if (!error) {
b5d57fc8
BP
2931 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2932 smap_add(smap, "driver_version", netdev->drvinfo.version);
2933 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2934 }
86383816
BP
2935 ovs_mutex_unlock(&netdev->mutex);
2936
e210037e
AE
2937 return error;
2938}
2939
4f925bd3 2940static int
275707c3
EJ
2941netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2942 struct smap *smap)
4f925bd3 2943{
79f1cbe9 2944 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2945 return 0;
2946}
2947
8b61709d
BP
2948/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2949 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2950 * returns 0. Otherwise, it returns a positive errno value; in particular,
2951 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2952static int
2953netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2954 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2955{
2956 struct arpreq r;
c100e025 2957 struct sockaddr_in sin;
8b61709d
BP
2958 int retval;
2959
2960 memset(&r, 0, sizeof r);
f2cc621b 2961 memset(&sin, 0, sizeof sin);
c100e025
BP
2962 sin.sin_family = AF_INET;
2963 sin.sin_addr.s_addr = ip;
2964 sin.sin_port = 0;
2965 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2966 r.arp_ha.sa_family = ARPHRD_ETHER;
2967 r.arp_flags = 0;
71d7c22f 2968 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2969 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2970 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2971 if (!retval) {
2972 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2973 } else if (retval != ENXIO) {
2974 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2975 netdev_get_name(netdev), IP_ARGS(ip),
2976 ovs_strerror(retval));
8b61709d
BP
2977 }
2978 return retval;
2979}
2980
2981static int
2982nd_to_iff_flags(enum netdev_flags nd)
2983{
2984 int iff = 0;
2985 if (nd & NETDEV_UP) {
2986 iff |= IFF_UP;
2987 }
2988 if (nd & NETDEV_PROMISC) {
2989 iff |= IFF_PROMISC;
2990 }
7ba19d41
AC
2991 if (nd & NETDEV_LOOPBACK) {
2992 iff |= IFF_LOOPBACK;
2993 }
8b61709d
BP
2994 return iff;
2995}
2996
2997static int
2998iff_to_nd_flags(int iff)
2999{
3000 enum netdev_flags nd = 0;
3001 if (iff & IFF_UP) {
3002 nd |= NETDEV_UP;
3003 }
3004 if (iff & IFF_PROMISC) {
3005 nd |= NETDEV_PROMISC;
3006 }
7ba19d41
AC
3007 if (iff & IFF_LOOPBACK) {
3008 nd |= NETDEV_LOOPBACK;
3009 }
8b61709d
BP
3010 return nd;
3011}
3012
3013static int
4f9f3f21
BP
3014update_flags(struct netdev_linux *netdev, enum netdev_flags off,
3015 enum netdev_flags on, enum netdev_flags *old_flagsp)
3016 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
3017{
3018 int old_flags, new_flags;
c37d4da4
EJ
3019 int error = 0;
3020
b5d57fc8 3021 old_flags = netdev->ifi_flags;
c37d4da4
EJ
3022 *old_flagsp = iff_to_nd_flags(old_flags);
3023 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3024 if (new_flags != old_flags) {
4f9f3f21
BP
3025 error = set_flags(netdev_get_name(&netdev->up), new_flags);
3026 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 3027 }
4f9f3f21
BP
3028
3029 return error;
3030}
3031
3032static int
3033netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
3034 enum netdev_flags on, enum netdev_flags *old_flagsp)
3035{
3036 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756819dd 3037 int error = 0;
4f9f3f21
BP
3038
3039 ovs_mutex_lock(&netdev->mutex);
756819dd
FL
3040 if (on || off) {
3041 /* Changing flags over netlink isn't support yet. */
e0e2410d
FL
3042 if (netdev_linux_netnsid_is_remote(netdev)) {
3043 error = EOPNOTSUPP;
3044 goto exit;
3045 }
756819dd
FL
3046 error = update_flags(netdev, off, on, old_flagsp);
3047 } else {
3048 /* Try reading flags over netlink, or fall back to ioctl. */
3049 if (!netdev_linux_update_via_netlink(netdev)) {
3050 *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
3051 } else {
3052 error = update_flags(netdev, off, on, old_flagsp);
3053 }
3054 }
e0e2410d
FL
3055
3056exit:
86383816 3057 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
3058 return error;
3059}
3060
2f9dd77f 3061#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
18ebd48c
PB
3062 GET_FEATURES, GET_STATUS, \
3063 FLOW_OFFLOAD_API) \
c3827f61
BP
3064{ \
3065 NAME, \
118c77b1 3066 false, /* is_pmd */ \
c3827f61 3067 \
259e0b1a 3068 NULL, \
c3827f61
BP
3069 netdev_linux_run, \
3070 netdev_linux_wait, \
3071 \
9dc63482
BP
3072 netdev_linux_alloc, \
3073 CONSTRUCT, \
3074 netdev_linux_destruct, \
3075 netdev_linux_dealloc, \
de5cdb90 3076 NULL, /* get_config */ \
6d9e6eb4 3077 NULL, /* set_config */ \
f431bf7d 3078 NULL, /* get_tunnel_config */ \
a36de779
PS
3079 NULL, /* build header */ \
3080 NULL, /* push header */ \
3081 NULL, /* pop header */ \
7dec44fe 3082 NULL, /* get_numa_id */ \
050c60bf 3083 NULL, /* set_tx_multiq */ \
c3827f61 3084 \
c3827f61
BP
3085 netdev_linux_send, \
3086 netdev_linux_send_wait, \
3087 \
3088 netdev_linux_set_etheraddr, \
3089 netdev_linux_get_etheraddr, \
3090 netdev_linux_get_mtu, \
9b020780 3091 netdev_linux_set_mtu, \
c3827f61
BP
3092 netdev_linux_get_ifindex, \
3093 netdev_linux_get_carrier, \
65c3058c 3094 netdev_linux_get_carrier_resets, \
1670c579 3095 netdev_linux_set_miimon_interval, \
f613a0d7 3096 GET_STATS, \
971f4b39 3097 NULL, \
c3827f61 3098 \
51f87458 3099 GET_FEATURES, \
c3827f61 3100 netdev_linux_set_advertisements, \
875ab130 3101 NULL, /* get_pt_mode */ \
c3827f61
BP
3102 \
3103 netdev_linux_set_policing, \
3104 netdev_linux_get_qos_types, \
3105 netdev_linux_get_qos_capabilities, \
3106 netdev_linux_get_qos, \
3107 netdev_linux_set_qos, \
3108 netdev_linux_get_queue, \
3109 netdev_linux_set_queue, \
3110 netdev_linux_delete_queue, \
3111 netdev_linux_get_queue_stats, \
89454bf4
BP
3112 netdev_linux_queue_dump_start, \
3113 netdev_linux_queue_dump_next, \
3114 netdev_linux_queue_dump_done, \
c3827f61
BP
3115 netdev_linux_dump_queue_stats, \
3116 \
c3827f61 3117 netdev_linux_set_in4, \
a8704b50 3118 netdev_linux_get_addr_list, \
c3827f61
BP
3119 netdev_linux_add_router, \
3120 netdev_linux_get_next_hop, \
4f925bd3 3121 GET_STATUS, \
c3827f61
BP
3122 netdev_linux_arp_lookup, \
3123 \
3124 netdev_linux_update_flags, \
790fb3b7 3125 NULL, /* reconfigure */ \
c3827f61 3126 \
f7791740
PS
3127 netdev_linux_rxq_alloc, \
3128 netdev_linux_rxq_construct, \
3129 netdev_linux_rxq_destruct, \
3130 netdev_linux_rxq_dealloc, \
3131 netdev_linux_rxq_recv, \
3132 netdev_linux_rxq_wait, \
3133 netdev_linux_rxq_drain, \
18ebd48c 3134 \
88dcf2aa
JH
3135 FLOW_OFFLOAD_API, \
3136 NULL /* get_block_id */ \
c3827f61
BP
3137}
3138
3139const struct netdev_class netdev_linux_class =
3140 NETDEV_LINUX_CLASS(
3141 "system",
9dc63482 3142 netdev_linux_construct,
f613a0d7 3143 netdev_linux_get_stats,
51f87458 3144 netdev_linux_get_features,
18ebd48c
PB
3145 netdev_linux_get_status,
3146 LINUX_FLOW_OFFLOAD_API);
c3827f61
BP
3147
3148const struct netdev_class netdev_tap_class =
3149 NETDEV_LINUX_CLASS(
3150 "tap",
9dc63482 3151 netdev_linux_construct_tap,
bba1e6f3 3152 netdev_tap_get_stats,
51f87458 3153 netdev_linux_get_features,
18ebd48c
PB
3154 netdev_linux_get_status,
3155 NO_OFFLOAD_API);
c3827f61
BP
3156
3157const struct netdev_class netdev_internal_class =
3158 NETDEV_LINUX_CLASS(
3159 "internal",
9dc63482 3160 netdev_linux_construct,
bba1e6f3 3161 netdev_internal_get_stats,
51f87458 3162 NULL, /* get_features */
18ebd48c
PB
3163 netdev_internal_get_status,
3164 NO_OFFLOAD_API);
8b61709d 3165\f
677d9158
JV
3166
3167#define CODEL_N_QUEUES 0x0000
3168
2f4298ce
BP
3169/* In sufficiently new kernel headers these are defined as enums in
3170 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3171 * kernels. (This overrides any enum definition in the header file but that's
3172 * harmless.) */
3173#define TCA_CODEL_TARGET 1
3174#define TCA_CODEL_LIMIT 2
3175#define TCA_CODEL_INTERVAL 3
3176
677d9158
JV
3177struct codel {
3178 struct tc tc;
3179 uint32_t target;
3180 uint32_t limit;
3181 uint32_t interval;
3182};
3183
3184static struct codel *
3185codel_get__(const struct netdev *netdev_)
3186{
3187 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3188 return CONTAINER_OF(netdev->tc, struct codel, tc);
3189}
3190
3191static void
3192codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3193 uint32_t interval)
3194{
3195 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3196 struct codel *codel;
3197
3198 codel = xmalloc(sizeof *codel);
3199 tc_init(&codel->tc, &tc_ops_codel);
3200 codel->target = target;
3201 codel->limit = limit;
3202 codel->interval = interval;
3203
3204 netdev->tc = &codel->tc;
3205}
3206
3207static int
3208codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3209 uint32_t interval)
3210{
3211 size_t opt_offset;
3212 struct ofpbuf request;
3213 struct tcmsg *tcmsg;
3214 uint32_t otarget, olimit, ointerval;
3215 int error;
3216
3217 tc_del_qdisc(netdev);
3218
7874bdff
RD
3219 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3220 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3221 if (!tcmsg) {
3222 return ENODEV;
3223 }
3224 tcmsg->tcm_handle = tc_make_handle(1, 0);
3225 tcmsg->tcm_parent = TC_H_ROOT;
3226
3227 otarget = target ? target : 5000;
3228 olimit = limit ? limit : 10240;
3229 ointerval = interval ? interval : 100000;
3230
3231 nl_msg_put_string(&request, TCA_KIND, "codel");
3232 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3233 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3234 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3235 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3236 nl_msg_end_nested(&request, opt_offset);
3237
3238 error = tc_transact(&request, NULL);
3239 if (error) {
3240 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3241 "target %u, limit %u, interval %u error %d(%s)",
3242 netdev_get_name(netdev),
3243 otarget, olimit, ointerval,
3244 error, ovs_strerror(error));
3245 }
3246 return error;
3247}
3248
3249static void
3250codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3251 const struct smap *details, struct codel *codel)
3252{
13c1637f
BP
3253 codel->target = smap_get_ullong(details, "target", 0);
3254 codel->limit = smap_get_ullong(details, "limit", 0);
3255 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3256
3257 if (!codel->target) {
3258 codel->target = 5000;
3259 }
3260 if (!codel->limit) {
3261 codel->limit = 10240;
3262 }
3263 if (!codel->interval) {
3264 codel->interval = 100000;
3265 }
3266}
3267
3268static int
3269codel_tc_install(struct netdev *netdev, const struct smap *details)
3270{
3271 int error;
3272 struct codel codel;
3273
3274 codel_parse_qdisc_details__(netdev, details, &codel);
3275 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3276 codel.interval);
3277 if (!error) {
3278 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3279 }
3280 return error;
3281}
3282
3283static int
3284codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3285{
3286 static const struct nl_policy tca_codel_policy[] = {
3287 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3288 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3289 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3290 };
3291
3292 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3293
3294 if (!nl_parse_nested(nl_options, tca_codel_policy,
3295 attrs, ARRAY_SIZE(tca_codel_policy))) {
3296 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3297 return EPROTO;
3298 }
3299
3300 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3301 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3302 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3303 return 0;
3304}
3305
3306static int
3307codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3308{
3309 struct nlattr *nlattr;
3310 const char * kind;
3311 int error;
3312 struct codel codel;
3313
3314 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3315 if (error != 0) {
3316 return error;
3317 }
3318
3319 error = codel_parse_tca_options__(nlattr, &codel);
3320 if (error != 0) {
3321 return error;
3322 }
3323
3324 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3325 return 0;
3326}
3327
3328
3329static void
3330codel_tc_destroy(struct tc *tc)
3331{
3332 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3333 tc_destroy(tc);
3334 free(codel);
3335}
3336
3337static int
3338codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3339{
3340 const struct codel *codel = codel_get__(netdev);
3341 smap_add_format(details, "target", "%u", codel->target);
3342 smap_add_format(details, "limit", "%u", codel->limit);
3343 smap_add_format(details, "interval", "%u", codel->interval);
3344 return 0;
3345}
3346
3347static int
3348codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3349{
3350 struct codel codel;
3351
3352 codel_parse_qdisc_details__(netdev, details, &codel);
3353 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3354 codel_get__(netdev)->target = codel.target;
3355 codel_get__(netdev)->limit = codel.limit;
3356 codel_get__(netdev)->interval = codel.interval;
3357 return 0;
3358}
3359
3360static const struct tc_ops tc_ops_codel = {
3361 "codel", /* linux_name */
3362 "linux-codel", /* ovs_name */
3363 CODEL_N_QUEUES, /* n_queues */
3364 codel_tc_install,
3365 codel_tc_load,
3366 codel_tc_destroy,
3367 codel_qdisc_get,
3368 codel_qdisc_set,
3369 NULL,
3370 NULL,
3371 NULL,
3372 NULL,
3373 NULL
3374};
3375\f
3376/* FQ-CoDel traffic control class. */
3377
3378#define FQCODEL_N_QUEUES 0x0000
3379
2f4298ce
BP
3380/* In sufficiently new kernel headers these are defined as enums in
3381 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3382 * kernels. (This overrides any enum definition in the header file but that's
3383 * harmless.) */
3384#define TCA_FQ_CODEL_TARGET 1
3385#define TCA_FQ_CODEL_LIMIT 2
3386#define TCA_FQ_CODEL_INTERVAL 3
3387#define TCA_FQ_CODEL_ECN 4
3388#define TCA_FQ_CODEL_FLOWS 5
3389#define TCA_FQ_CODEL_QUANTUM 6
3390
677d9158
JV
3391struct fqcodel {
3392 struct tc tc;
3393 uint32_t target;
3394 uint32_t limit;
3395 uint32_t interval;
3396 uint32_t flows;
3397 uint32_t quantum;
3398};
3399
3400static struct fqcodel *
3401fqcodel_get__(const struct netdev *netdev_)
3402{
3403 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3404 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3405}
3406
3407static void
3408fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3409 uint32_t interval, uint32_t flows, uint32_t quantum)
3410{
3411 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3412 struct fqcodel *fqcodel;
3413
3414 fqcodel = xmalloc(sizeof *fqcodel);
3415 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3416 fqcodel->target = target;
3417 fqcodel->limit = limit;
3418 fqcodel->interval = interval;
3419 fqcodel->flows = flows;
3420 fqcodel->quantum = quantum;
3421
3422 netdev->tc = &fqcodel->tc;
3423}
3424
3425static int
3426fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3427 uint32_t interval, uint32_t flows, uint32_t quantum)
3428{
3429 size_t opt_offset;
3430 struct ofpbuf request;
3431 struct tcmsg *tcmsg;
3432 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3433 int error;
3434
3435 tc_del_qdisc(netdev);
3436
7874bdff
RD
3437 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3438 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3439 if (!tcmsg) {
3440 return ENODEV;
3441 }
3442 tcmsg->tcm_handle = tc_make_handle(1, 0);
3443 tcmsg->tcm_parent = TC_H_ROOT;
3444
3445 otarget = target ? target : 5000;
3446 olimit = limit ? limit : 10240;
3447 ointerval = interval ? interval : 100000;
3448 oflows = flows ? flows : 1024;
3449 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3450 not mtu */
3451
3452 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3453 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3454 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3455 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3456 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3457 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3458 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3459 nl_msg_end_nested(&request, opt_offset);
3460
3461 error = tc_transact(&request, NULL);
3462 if (error) {
3463 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3464 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3465 netdev_get_name(netdev),
3466 otarget, olimit, ointerval, oflows, oquantum,
3467 error, ovs_strerror(error));
3468 }
3469 return error;
3470}
3471
3472static void
3473fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3474 const struct smap *details, struct fqcodel *fqcodel)
3475{
13c1637f
BP
3476 fqcodel->target = smap_get_ullong(details, "target", 0);
3477 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3478 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3479 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3480 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3481
677d9158
JV
3482 if (!fqcodel->target) {
3483 fqcodel->target = 5000;
3484 }
3485 if (!fqcodel->limit) {
3486 fqcodel->limit = 10240;
3487 }
3488 if (!fqcodel->interval) {
3489 fqcodel->interval = 1000000;
3490 }
3491 if (!fqcodel->flows) {
3492 fqcodel->flows = 1024;
3493 }
3494 if (!fqcodel->quantum) {
3495 fqcodel->quantum = 1514;
3496 }
3497}
3498
3499static int
3500fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3501{
3502 int error;
3503 struct fqcodel fqcodel;
3504
3505 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3506 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3507 fqcodel.interval, fqcodel.flows,
3508 fqcodel.quantum);
3509 if (!error) {
3510 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3511 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3512 }
3513 return error;
3514}
3515
3516static int
3517fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3518{
3519 static const struct nl_policy tca_fqcodel_policy[] = {
3520 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3521 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3522 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3523 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3524 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3525 };
3526
3527 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3528
3529 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3530 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3531 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3532 return EPROTO;
3533 }
3534
3535 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3536 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3537 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3538 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3539 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3540 return 0;
3541}
3542
3543static int
3544fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3545{
3546 struct nlattr *nlattr;
3547 const char * kind;
3548 int error;
3549 struct fqcodel fqcodel;
3550
3551 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3552 if (error != 0) {
3553 return error;
3554 }
3555
3556 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3557 if (error != 0) {
3558 return error;
3559 }
3560
3561 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3562 fqcodel.flows, fqcodel.quantum);
3563 return 0;
3564}
3565
3566static void
3567fqcodel_tc_destroy(struct tc *tc)
3568{
3569 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3570 tc_destroy(tc);
3571 free(fqcodel);
3572}
3573
3574static int
3575fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3576{
3577 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3578 smap_add_format(details, "target", "%u", fqcodel->target);
3579 smap_add_format(details, "limit", "%u", fqcodel->limit);
3580 smap_add_format(details, "interval", "%u", fqcodel->interval);
3581 smap_add_format(details, "flows", "%u", fqcodel->flows);
3582 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3583 return 0;
3584}
3585
3586static int
3587fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3588{
3589 struct fqcodel fqcodel;
3590
3591 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3592 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3593 fqcodel.flows, fqcodel.quantum);
3594 fqcodel_get__(netdev)->target = fqcodel.target;
3595 fqcodel_get__(netdev)->limit = fqcodel.limit;
3596 fqcodel_get__(netdev)->interval = fqcodel.interval;
3597 fqcodel_get__(netdev)->flows = fqcodel.flows;
3598 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3599 return 0;
3600}
3601
3602static const struct tc_ops tc_ops_fqcodel = {
3603 "fq_codel", /* linux_name */
3604 "linux-fq_codel", /* ovs_name */
3605 FQCODEL_N_QUEUES, /* n_queues */
3606 fqcodel_tc_install,
3607 fqcodel_tc_load,
3608 fqcodel_tc_destroy,
3609 fqcodel_qdisc_get,
3610 fqcodel_qdisc_set,
3611 NULL,
3612 NULL,
3613 NULL,
3614 NULL,
3615 NULL
3616};
3617\f
3618/* SFQ traffic control class. */
3619
3620#define SFQ_N_QUEUES 0x0000
3621
3622struct sfq {
3623 struct tc tc;
3624 uint32_t quantum;
3625 uint32_t perturb;
3626};
3627
3628static struct sfq *
3629sfq_get__(const struct netdev *netdev_)
3630{
3631 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3632 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3633}
3634
3635static void
3636sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3637{
3638 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3639 struct sfq *sfq;
3640
3641 sfq = xmalloc(sizeof *sfq);
3642 tc_init(&sfq->tc, &tc_ops_sfq);
3643 sfq->perturb = perturb;
3644 sfq->quantum = quantum;
3645
3646 netdev->tc = &sfq->tc;
3647}
3648
3649static int
3650sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3651{
3652 struct tc_sfq_qopt opt;
3653 struct ofpbuf request;
3654 struct tcmsg *tcmsg;
3655 int mtu;
3656 int mtu_error, error;
3657 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3658
3659 tc_del_qdisc(netdev);
3660
7874bdff
RD
3661 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3662 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3663 if (!tcmsg) {
3664 return ENODEV;
3665 }
3666 tcmsg->tcm_handle = tc_make_handle(1, 0);
3667 tcmsg->tcm_parent = TC_H_ROOT;
3668
3669 memset(&opt, 0, sizeof opt);
3670 if (!quantum) {
3671 if (!mtu_error) {
3672 opt.quantum = mtu; /* if we cannot find mtu, use default */
3673 }
3674 } else {
3675 opt.quantum = quantum;
3676 }
3677
3678 if (!perturb) {
3679 opt.perturb_period = 10;
3680 } else {
3681 opt.perturb_period = perturb;
3682 }
3683
3684 nl_msg_put_string(&request, TCA_KIND, "sfq");
3685 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3686
3687 error = tc_transact(&request, NULL);
3688 if (error) {
3689 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3690 "quantum %u, perturb %u error %d(%s)",
3691 netdev_get_name(netdev),
3692 opt.quantum, opt.perturb_period,
3693 error, ovs_strerror(error));
3694 }
3695 return error;
3696}
3697
3698static void
3699sfq_parse_qdisc_details__(struct netdev *netdev,
3700 const struct smap *details, struct sfq *sfq)
3701{
13c1637f
BP
3702 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3703 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3704
677d9158
JV
3705 if (!sfq->perturb) {
3706 sfq->perturb = 10;
3707 }
3708
3709 if (!sfq->quantum) {
13c1637f
BP
3710 int mtu;
3711 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3712 sfq->quantum = mtu;
3713 } else {
3714 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3715 "device without mtu");
677d9158
JV
3716 }
3717 }
3718}
3719
3720static int
3721sfq_tc_install(struct netdev *netdev, const struct smap *details)
3722{
3723 int error;
3724 struct sfq sfq;
3725
3726 sfq_parse_qdisc_details__(netdev, details, &sfq);
3727 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3728 if (!error) {
3729 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3730 }
3731 return error;
3732}
3733
3734static int
3735sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3736{
3737 const struct tc_sfq_qopt *sfq;
3738 struct nlattr *nlattr;
3739 const char * kind;
3740 int error;
3741
3742 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3743 if (error == 0) {
3744 sfq = nl_attr_get(nlattr);
3745 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3746 return 0;
3747 }
3748
3749 return error;
3750}
3751
3752static void
3753sfq_tc_destroy(struct tc *tc)
3754{
3755 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3756 tc_destroy(tc);
3757 free(sfq);
3758}
3759
3760static int
3761sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3762{
3763 const struct sfq *sfq = sfq_get__(netdev);
3764 smap_add_format(details, "quantum", "%u", sfq->quantum);
3765 smap_add_format(details, "perturb", "%u", sfq->perturb);
3766 return 0;
3767}
3768
3769static int
3770sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3771{
3772 struct sfq sfq;
3773
3774 sfq_parse_qdisc_details__(netdev, details, &sfq);
3775 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3776 sfq_get__(netdev)->quantum = sfq.quantum;
3777 sfq_get__(netdev)->perturb = sfq.perturb;
3778 return 0;
3779}
3780
3781static const struct tc_ops tc_ops_sfq = {
3782 "sfq", /* linux_name */
3783 "linux-sfq", /* ovs_name */
3784 SFQ_N_QUEUES, /* n_queues */
3785 sfq_tc_install,
3786 sfq_tc_load,
3787 sfq_tc_destroy,
3788 sfq_qdisc_get,
3789 sfq_qdisc_set,
3790 NULL,
3791 NULL,
3792 NULL,
3793 NULL,
3794 NULL
3795};
3796\f
c1c9c9c4 3797/* HTB traffic control class. */
559843ed 3798
c1c9c9c4 3799#define HTB_N_QUEUES 0xf000
4f631ccd 3800#define HTB_RATE2QUANTUM 10
8b61709d 3801
c1c9c9c4
BP
3802struct htb {
3803 struct tc tc;
3804 unsigned int max_rate; /* In bytes/s. */
3805};
8b61709d 3806
c1c9c9c4 3807struct htb_class {
93b13be8 3808 struct tc_queue tc_queue;
c1c9c9c4
BP
3809 unsigned int min_rate; /* In bytes/s. */
3810 unsigned int max_rate; /* In bytes/s. */
3811 unsigned int burst; /* In bytes. */
3812 unsigned int priority; /* Lower values are higher priorities. */
3813};
8b61709d 3814
c1c9c9c4 3815static struct htb *
b5d57fc8 3816htb_get__(const struct netdev *netdev_)
c1c9c9c4 3817{
b5d57fc8
BP
3818 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3819 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3820}
3821
24045e35 3822static void
b5d57fc8 3823htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3824{
b5d57fc8 3825 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3826 struct htb *htb;
3827
3828 htb = xmalloc(sizeof *htb);
3829 tc_init(&htb->tc, &tc_ops_htb);
3830 htb->max_rate = max_rate;
3831
b5d57fc8 3832 netdev->tc = &htb->tc;
c1c9c9c4
BP
3833}
3834
3835/* Create an HTB qdisc.
3836 *
a339aa81 3837 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3838static int
3839htb_setup_qdisc__(struct netdev *netdev)
3840{
3841 size_t opt_offset;
3842 struct tc_htb_glob opt;
3843 struct ofpbuf request;
3844 struct tcmsg *tcmsg;
3845
3846 tc_del_qdisc(netdev);
3847
7874bdff
RD
3848 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3849 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3850 if (!tcmsg) {
3851 return ENODEV;
3852 }
c1c9c9c4
BP
3853 tcmsg->tcm_handle = tc_make_handle(1, 0);
3854 tcmsg->tcm_parent = TC_H_ROOT;
3855
3856 nl_msg_put_string(&request, TCA_KIND, "htb");
3857
3858 memset(&opt, 0, sizeof opt);
4f631ccd 3859 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3860 opt.version = 3;
4ecf12d5 3861 opt.defcls = 1;
c1c9c9c4
BP
3862
3863 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3864 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3865 nl_msg_end_nested(&request, opt_offset);
3866
3867 return tc_transact(&request, NULL);
3868}
3869
3870/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3871 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3872static int
3873htb_setup_class__(struct netdev *netdev, unsigned int handle,
3874 unsigned int parent, struct htb_class *class)
3875{
3876 size_t opt_offset;
3877 struct tc_htb_opt opt;
3878 struct ofpbuf request;
3879 struct tcmsg *tcmsg;
3880 int error;
3881 int mtu;
3882
73371c09 3883 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3884 if (error) {
f915f1a8
BP
3885 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3886 netdev_get_name(netdev));
9b020780 3887 return error;
f915f1a8 3888 }
c1c9c9c4
BP
3889
3890 memset(&opt, 0, sizeof opt);
3891 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3892 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3893 /* Makes sure the quantum is at least MTU. Setting quantum will
3894 * make htb ignore the r2q for this class. */
3895 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3896 opt.quantum = mtu;
3897 }
c1c9c9c4
BP
3898 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3899 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3900 opt.prio = class->priority;
3901
7874bdff
RD
3902 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
3903 &request);
23a98ffe
BP
3904 if (!tcmsg) {
3905 return ENODEV;
3906 }
c1c9c9c4
BP
3907 tcmsg->tcm_handle = handle;
3908 tcmsg->tcm_parent = parent;
3909
3910 nl_msg_put_string(&request, TCA_KIND, "htb");
3911 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3912 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3913 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3914 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3915 nl_msg_end_nested(&request, opt_offset);
3916
3917 error = tc_transact(&request, NULL);
3918 if (error) {
3919 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3920 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3921 netdev_get_name(netdev),
3922 tc_get_major(handle), tc_get_minor(handle),
3923 tc_get_major(parent), tc_get_minor(parent),
3924 class->min_rate, class->max_rate,
10a89ef0 3925 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3926 }
3927 return error;
3928}
3929
3930/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3931 * description of them into 'details'. The description complies with the
3932 * specification given in the vswitch database documentation for linux-htb
3933 * queue details. */
3934static int
3935htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3936{
3937 static const struct nl_policy tca_htb_policy[] = {
3938 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3939 .min_len = sizeof(struct tc_htb_opt) },
3940 };
3941
3942 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3943 const struct tc_htb_opt *htb;
3944
3945 if (!nl_parse_nested(nl_options, tca_htb_policy,
3946 attrs, ARRAY_SIZE(tca_htb_policy))) {
3947 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3948 return EPROTO;
3949 }
3950
3951 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3952 class->min_rate = htb->rate.rate;
3953 class->max_rate = htb->ceil.rate;
3954 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3955 class->priority = htb->prio;
3956 return 0;
3957}
3958
3959static int
3960htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3961 struct htb_class *options,
3962 struct netdev_queue_stats *stats)
3963{
3964 struct nlattr *nl_options;
3965 unsigned int handle;
3966 int error;
3967
3968 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3969 if (!error && queue_id) {
17ee3c1f
BP
3970 unsigned int major = tc_get_major(handle);
3971 unsigned int minor = tc_get_minor(handle);
3972 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3973 *queue_id = minor - 1;
c1c9c9c4
BP
3974 } else {
3975 error = EPROTO;
3976 }
3977 }
3978 if (!error && options) {
3979 error = htb_parse_tca_options__(nl_options, options);
3980 }
3981 return error;
3982}
3983
3984static void
73371c09 3985htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3986 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3987{
73371c09 3988 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 3989
13c1637f 3990 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 3991 if (!hc->max_rate) {
a00ca915 3992 enum netdev_features current;
c1c9c9c4 3993
73371c09
BP
3994 netdev_linux_read_features(netdev);
3995 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3996 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3997 }
3998 hc->min_rate = hc->max_rate;
3999 hc->burst = 0;
4000 hc->priority = 0;
4001}
4002
4003static int
4004htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 4005 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
4006{
4007 const struct htb *htb = htb_get__(netdev);
9b020780 4008 int mtu, error;
214117fd 4009 unsigned long long int max_rate_bit;
c1c9c9c4 4010
73371c09 4011 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 4012 if (error) {
f915f1a8
BP
4013 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
4014 netdev_get_name(netdev));
9b020780 4015 return error;
f915f1a8
BP
4016 }
4017
4f104611
EJ
4018 /* HTB requires at least an mtu sized min-rate to send any traffic even
4019 * on uncongested links. */
13c1637f 4020 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 4021 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
4022 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
4023
4024 /* max-rate */
214117fd
KF
4025 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
4026 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
c1c9c9c4
BP
4027 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
4028 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
4029
4030 /* burst
4031 *
4032 * According to hints in the documentation that I've read, it is important
4033 * that 'burst' be at least as big as the largest frame that might be
4034 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4035 * but having it a bit too small is a problem. Since netdev_get_mtu()
4036 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4037 * the MTU. We actually add 64, instead of 14, as a guard against
4038 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 4039 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
4040 hc->burst = MAX(hc->burst, mtu + 64);
4041
4042 /* priority */
13c1637f 4043 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
4044
4045 return 0;
4046}
4047
4048static int
4049htb_query_class__(const struct netdev *netdev, unsigned int handle,
4050 unsigned int parent, struct htb_class *options,
4051 struct netdev_queue_stats *stats)
4052{
4053 struct ofpbuf *reply;
4054 int error;
4055
4056 error = tc_query_class(netdev, handle, parent, &reply);
4057 if (!error) {
4058 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4059 ofpbuf_delete(reply);
4060 }
4061 return error;
4062}
4063
4064static int
79f1cbe9 4065htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4066{
4067 int error;
4068
4069 error = htb_setup_qdisc__(netdev);
4070 if (!error) {
4071 struct htb_class hc;
4072
4073 htb_parse_qdisc_details__(netdev, details, &hc);
4074 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4075 tc_make_handle(1, 0), &hc);
4076 if (!error) {
4077 htb_install__(netdev, hc.max_rate);
4078 }
4079 }
4080 return error;
4081}
4082
93b13be8
BP
4083static struct htb_class *
4084htb_class_cast__(const struct tc_queue *queue)
4085{
4086 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4087}
4088
c1c9c9c4
BP
4089static void
4090htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
4091 const struct htb_class *hc)
4092{
4093 struct htb *htb = htb_get__(netdev);
93b13be8
BP
4094 size_t hash = hash_int(queue_id, 0);
4095 struct tc_queue *queue;
c1c9c9c4
BP
4096 struct htb_class *hcp;
4097
93b13be8
BP
4098 queue = tc_find_queue__(netdev, queue_id, hash);
4099 if (queue) {
4100 hcp = htb_class_cast__(queue);
4101 } else {
c1c9c9c4 4102 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
4103 queue = &hcp->tc_queue;
4104 queue->queue_id = queue_id;
6dc34a0d 4105 queue->created = time_msec();
93b13be8 4106 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 4107 }
93b13be8
BP
4108
4109 hcp->min_rate = hc->min_rate;
4110 hcp->max_rate = hc->max_rate;
4111 hcp->burst = hc->burst;
4112 hcp->priority = hc->priority;
c1c9c9c4
BP
4113}
4114
4115static int
4116htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4117{
c1c9c9c4 4118 struct ofpbuf msg;
d57695d7 4119 struct queue_dump_state state;
c1c9c9c4 4120 struct htb_class hc;
c1c9c9c4
BP
4121
4122 /* Get qdisc options. */
4123 hc.max_rate = 0;
4124 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4125 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
4126
4127 /* Get queues. */
d57695d7 4128 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
4129 return ENODEV;
4130 }
d57695d7 4131 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
4132 unsigned int queue_id;
4133
4134 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4135 htb_update_queue__(netdev, queue_id, &hc);
4136 }
4137 }
d57695d7 4138 finish_queue_dump(&state);
c1c9c9c4
BP
4139
4140 return 0;
4141}
4142
4143static void
4144htb_tc_destroy(struct tc *tc)
4145{
4146 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 4147 struct htb_class *hc;
c1c9c9c4 4148
4ec3d7c7 4149 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
4150 free(hc);
4151 }
4152 tc_destroy(tc);
4153 free(htb);
4154}
4155
4156static int
79f1cbe9 4157htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
4158{
4159 const struct htb *htb = htb_get__(netdev);
79f1cbe9 4160 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
4161 return 0;
4162}
4163
4164static int
79f1cbe9 4165htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4166{
4167 struct htb_class hc;
4168 int error;
4169
4170 htb_parse_qdisc_details__(netdev, details, &hc);
4171 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4172 tc_make_handle(1, 0), &hc);
4173 if (!error) {
4174 htb_get__(netdev)->max_rate = hc.max_rate;
4175 }
4176 return error;
4177}
4178
4179static int
93b13be8 4180htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4181 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 4182{
93b13be8 4183 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4184
79f1cbe9 4185 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 4186 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4187 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 4188 }
79f1cbe9 4189 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 4190 if (hc->priority) {
79f1cbe9 4191 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
4192 }
4193 return 0;
4194}
4195
4196static int
4197htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4198 const struct smap *details)
c1c9c9c4
BP
4199{
4200 struct htb_class hc;
4201 int error;
4202
4203 error = htb_parse_class_details__(netdev, details, &hc);
4204 if (error) {
4205 return error;
4206 }
4207
17ee3c1f 4208 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
4209 tc_make_handle(1, 0xfffe), &hc);
4210 if (error) {
4211 return error;
4212 }
4213
4214 htb_update_queue__(netdev, queue_id, &hc);
4215 return 0;
4216}
4217
4218static int
93b13be8 4219htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 4220{
93b13be8 4221 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4222 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4223 int error;
4224
93b13be8 4225 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4226 if (!error) {
93b13be8 4227 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4228 free(hc);
c1c9c9c4
BP
4229 }
4230 return error;
4231}
4232
4233static int
93b13be8 4234htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4235 struct netdev_queue_stats *stats)
4236{
93b13be8 4237 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4238 tc_make_handle(1, 0xfffe), NULL, stats);
4239}
4240
4241static int
4242htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4243 const struct ofpbuf *nlmsg,
4244 netdev_dump_queue_stats_cb *cb, void *aux)
4245{
4246 struct netdev_queue_stats stats;
17ee3c1f 4247 unsigned int handle, major, minor;
c1c9c9c4
BP
4248 int error;
4249
4250 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4251 if (error) {
4252 return error;
4253 }
4254
17ee3c1f
BP
4255 major = tc_get_major(handle);
4256 minor = tc_get_minor(handle);
4257 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4258 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4259 }
4260 return 0;
4261}
4262
4263static const struct tc_ops tc_ops_htb = {
4264 "htb", /* linux_name */
4265 "linux-htb", /* ovs_name */
4266 HTB_N_QUEUES, /* n_queues */
4267 htb_tc_install,
4268 htb_tc_load,
4269 htb_tc_destroy,
4270 htb_qdisc_get,
4271 htb_qdisc_set,
4272 htb_class_get,
4273 htb_class_set,
4274 htb_class_delete,
4275 htb_class_get_stats,
4276 htb_class_dump_stats
4277};
4278\f
a339aa81
EJ
4279/* "linux-hfsc" traffic control class. */
4280
4281#define HFSC_N_QUEUES 0xf000
4282
4283struct hfsc {
4284 struct tc tc;
4285 uint32_t max_rate;
4286};
4287
4288struct hfsc_class {
4289 struct tc_queue tc_queue;
4290 uint32_t min_rate;
4291 uint32_t max_rate;
4292};
4293
4294static struct hfsc *
b5d57fc8 4295hfsc_get__(const struct netdev *netdev_)
a339aa81 4296{
b5d57fc8
BP
4297 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4298 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4299}
4300
4301static struct hfsc_class *
4302hfsc_class_cast__(const struct tc_queue *queue)
4303{
4304 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4305}
4306
24045e35 4307static void
b5d57fc8 4308hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4309{
b5d57fc8 4310 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4311 struct hfsc *hfsc;
4312
a339aa81
EJ
4313 hfsc = xmalloc(sizeof *hfsc);
4314 tc_init(&hfsc->tc, &tc_ops_hfsc);
4315 hfsc->max_rate = max_rate;
b5d57fc8 4316 netdev->tc = &hfsc->tc;
a339aa81
EJ
4317}
4318
4319static void
4320hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4321 const struct hfsc_class *hc)
4322{
4323 size_t hash;
4324 struct hfsc *hfsc;
4325 struct hfsc_class *hcp;
4326 struct tc_queue *queue;
4327
4328 hfsc = hfsc_get__(netdev);
4329 hash = hash_int(queue_id, 0);
4330
4331 queue = tc_find_queue__(netdev, queue_id, hash);
4332 if (queue) {
4333 hcp = hfsc_class_cast__(queue);
4334 } else {
4335 hcp = xmalloc(sizeof *hcp);
4336 queue = &hcp->tc_queue;
4337 queue->queue_id = queue_id;
6dc34a0d 4338 queue->created = time_msec();
a339aa81
EJ
4339 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4340 }
4341
4342 hcp->min_rate = hc->min_rate;
4343 hcp->max_rate = hc->max_rate;
4344}
4345
4346static int
4347hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4348{
4349 const struct tc_service_curve *rsc, *fsc, *usc;
4350 static const struct nl_policy tca_hfsc_policy[] = {
4351 [TCA_HFSC_RSC] = {
4352 .type = NL_A_UNSPEC,
4353 .optional = false,
4354 .min_len = sizeof(struct tc_service_curve),
4355 },
4356 [TCA_HFSC_FSC] = {
4357 .type = NL_A_UNSPEC,
4358 .optional = false,
4359 .min_len = sizeof(struct tc_service_curve),
4360 },
4361 [TCA_HFSC_USC] = {
4362 .type = NL_A_UNSPEC,
4363 .optional = false,
4364 .min_len = sizeof(struct tc_service_curve),
4365 },
4366 };
4367 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4368
4369 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4370 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4371 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4372 return EPROTO;
4373 }
4374
4375 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4376 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4377 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4378
4379 if (rsc->m1 != 0 || rsc->d != 0 ||
4380 fsc->m1 != 0 || fsc->d != 0 ||
4381 usc->m1 != 0 || usc->d != 0) {
4382 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4383 "Non-linear service curves are not supported.");
4384 return EPROTO;
4385 }
4386
4387 if (rsc->m2 != fsc->m2) {
4388 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4389 "Real-time service curves are not supported ");
4390 return EPROTO;
4391 }
4392
4393 if (rsc->m2 > usc->m2) {
4394 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4395 "Min-rate service curve is greater than "
4396 "the max-rate service curve.");
4397 return EPROTO;
4398 }
4399
4400 class->min_rate = fsc->m2;
4401 class->max_rate = usc->m2;
4402 return 0;
4403}
4404
4405static int
4406hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4407 struct hfsc_class *options,
4408 struct netdev_queue_stats *stats)
4409{
4410 int error;
4411 unsigned int handle;
4412 struct nlattr *nl_options;
4413
4414 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4415 if (error) {
4416 return error;
4417 }
4418
4419 if (queue_id) {
4420 unsigned int major, minor;
4421
4422 major = tc_get_major(handle);
4423 minor = tc_get_minor(handle);
4424 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4425 *queue_id = minor - 1;
4426 } else {
4427 return EPROTO;
4428 }
4429 }
4430
4431 if (options) {
4432 error = hfsc_parse_tca_options__(nl_options, options);
4433 }
4434
4435 return error;
4436}
4437
4438static int
4439hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4440 unsigned int parent, struct hfsc_class *options,
4441 struct netdev_queue_stats *stats)
4442{
4443 int error;
4444 struct ofpbuf *reply;
4445
4446 error = tc_query_class(netdev, handle, parent, &reply);
4447 if (error) {
4448 return error;
4449 }
4450
4451 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4452 ofpbuf_delete(reply);
4453 return error;
4454}
4455
4456static void
73371c09 4457hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4458 struct hfsc_class *class)
4459{
73371c09 4460 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4461
13c1637f 4462 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4463 if (!max_rate) {
a00ca915 4464 enum netdev_features current;
a339aa81 4465
73371c09
BP
4466 netdev_linux_read_features(netdev);
4467 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4468 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4469 }
4470
4471 class->min_rate = max_rate;
4472 class->max_rate = max_rate;
4473}
4474
4475static int
4476hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4477 const struct smap *details,
a339aa81
EJ
4478 struct hfsc_class * class)
4479{
4480 const struct hfsc *hfsc;
4481 uint32_t min_rate, max_rate;
a339aa81
EJ
4482
4483 hfsc = hfsc_get__(netdev);
a339aa81 4484
13c1637f 4485 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4486 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4487 min_rate = MIN(min_rate, hfsc->max_rate);
4488
13c1637f 4489 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4490 max_rate = MAX(max_rate, min_rate);
4491 max_rate = MIN(max_rate, hfsc->max_rate);
4492
4493 class->min_rate = min_rate;
4494 class->max_rate = max_rate;
4495
4496 return 0;
4497}
4498
4499/* Create an HFSC qdisc.
4500 *
4501 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4502static int
4503hfsc_setup_qdisc__(struct netdev * netdev)
4504{
4505 struct tcmsg *tcmsg;
4506 struct ofpbuf request;
4507 struct tc_hfsc_qopt opt;
4508
4509 tc_del_qdisc(netdev);
4510
7874bdff
RD
4511 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4512 NLM_F_EXCL | NLM_F_CREATE, &request);
a339aa81
EJ
4513
4514 if (!tcmsg) {
4515 return ENODEV;
4516 }
4517
4518 tcmsg->tcm_handle = tc_make_handle(1, 0);
4519 tcmsg->tcm_parent = TC_H_ROOT;
4520
4521 memset(&opt, 0, sizeof opt);
4522 opt.defcls = 1;
4523
4524 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4525 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4526
4527 return tc_transact(&request, NULL);
4528}
4529
4530/* Create an HFSC class.
4531 *
4532 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4533 * sc rate <min_rate> ul rate <max_rate>" */
4534static int
4535hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4536 unsigned int parent, struct hfsc_class *class)
4537{
4538 int error;
4539 size_t opt_offset;
4540 struct tcmsg *tcmsg;
4541 struct ofpbuf request;
4542 struct tc_service_curve min, max;
4543
7874bdff
RD
4544 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4545 &request);
a339aa81
EJ
4546
4547 if (!tcmsg) {
4548 return ENODEV;
4549 }
4550
4551 tcmsg->tcm_handle = handle;
4552 tcmsg->tcm_parent = parent;
4553
4554 min.m1 = 0;
4555 min.d = 0;
4556 min.m2 = class->min_rate;
4557
4558 max.m1 = 0;
4559 max.d = 0;
4560 max.m2 = class->max_rate;
4561
4562 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4563 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4564 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4565 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4566 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4567 nl_msg_end_nested(&request, opt_offset);
4568
4569 error = tc_transact(&request, NULL);
4570 if (error) {
4571 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4572 "min-rate %ubps, max-rate %ubps (%s)",
4573 netdev_get_name(netdev),
4574 tc_get_major(handle), tc_get_minor(handle),
4575 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4576 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4577 }
4578
4579 return error;
4580}
4581
4582static int
79f1cbe9 4583hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4584{
4585 int error;
4586 struct hfsc_class class;
4587
4588 error = hfsc_setup_qdisc__(netdev);
4589
4590 if (error) {
4591 return error;
4592 }
4593
4594 hfsc_parse_qdisc_details__(netdev, details, &class);
4595 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4596 tc_make_handle(1, 0), &class);
4597
4598 if (error) {
4599 return error;
4600 }
4601
4602 hfsc_install__(netdev, class.max_rate);
4603 return 0;
4604}
4605
4606static int
4607hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4608{
4609 struct ofpbuf msg;
d57695d7 4610 struct queue_dump_state state;
a339aa81
EJ
4611 struct hfsc_class hc;
4612
4613 hc.max_rate = 0;
4614 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4615 hfsc_install__(netdev, hc.max_rate);
a339aa81 4616
d57695d7 4617 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4618 return ENODEV;
4619 }
4620
d57695d7 4621 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4622 unsigned int queue_id;
4623
4624 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4625 hfsc_update_queue__(netdev, queue_id, &hc);
4626 }
4627 }
4628
d57695d7 4629 finish_queue_dump(&state);
a339aa81
EJ
4630 return 0;
4631}
4632
4633static void
4634hfsc_tc_destroy(struct tc *tc)
4635{
4636 struct hfsc *hfsc;
4637 struct hfsc_class *hc, *next;
4638
4639 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4640
4641 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4642 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4643 free(hc);
4644 }
4645
4646 tc_destroy(tc);
4647 free(hfsc);
4648}
4649
4650static int
79f1cbe9 4651hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4652{
4653 const struct hfsc *hfsc;
4654 hfsc = hfsc_get__(netdev);
79f1cbe9 4655 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4656 return 0;
4657}
4658
4659static int
79f1cbe9 4660hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4661{
4662 int error;
4663 struct hfsc_class class;
4664
4665 hfsc_parse_qdisc_details__(netdev, details, &class);
4666 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4667 tc_make_handle(1, 0), &class);
4668
4669 if (!error) {
4670 hfsc_get__(netdev)->max_rate = class.max_rate;
4671 }
4672
4673 return error;
4674}
4675
4676static int
4677hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4678 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4679{
4680 const struct hfsc_class *hc;
4681
4682 hc = hfsc_class_cast__(queue);
79f1cbe9 4683 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4684 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4685 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4686 }
4687 return 0;
4688}
4689
4690static int
4691hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4692 const struct smap *details)
a339aa81
EJ
4693{
4694 int error;
4695 struct hfsc_class class;
4696
4697 error = hfsc_parse_class_details__(netdev, details, &class);
4698 if (error) {
4699 return error;
4700 }
4701
4702 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4703 tc_make_handle(1, 0xfffe), &class);
4704 if (error) {
4705 return error;
4706 }
4707
4708 hfsc_update_queue__(netdev, queue_id, &class);
4709 return 0;
4710}
4711
4712static int
4713hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4714{
4715 int error;
4716 struct hfsc *hfsc;
4717 struct hfsc_class *hc;
4718
4719 hc = hfsc_class_cast__(queue);
4720 hfsc = hfsc_get__(netdev);
4721
4722 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4723 if (!error) {
4724 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4725 free(hc);
4726 }
4727 return error;
4728}
4729
4730static int
4731hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4732 struct netdev_queue_stats *stats)
4733{
4734 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4735 tc_make_handle(1, 0xfffe), NULL, stats);
4736}
4737
4738static int
4739hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4740 const struct ofpbuf *nlmsg,
4741 netdev_dump_queue_stats_cb *cb, void *aux)
4742{
4743 struct netdev_queue_stats stats;
4744 unsigned int handle, major, minor;
4745 int error;
4746
4747 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4748 if (error) {
4749 return error;
4750 }
4751
4752 major = tc_get_major(handle);
4753 minor = tc_get_minor(handle);
4754 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4755 (*cb)(minor - 1, &stats, aux);
4756 }
4757 return 0;
4758}
4759
4760static const struct tc_ops tc_ops_hfsc = {
4761 "hfsc", /* linux_name */
4762 "linux-hfsc", /* ovs_name */
4763 HFSC_N_QUEUES, /* n_queues */
4764 hfsc_tc_install, /* tc_install */
4765 hfsc_tc_load, /* tc_load */
4766 hfsc_tc_destroy, /* tc_destroy */
4767 hfsc_qdisc_get, /* qdisc_get */
4768 hfsc_qdisc_set, /* qdisc_set */
4769 hfsc_class_get, /* class_get */
4770 hfsc_class_set, /* class_set */
4771 hfsc_class_delete, /* class_delete */
4772 hfsc_class_get_stats, /* class_get_stats */
4773 hfsc_class_dump_stats /* class_dump_stats */
4774};
4775\f
6cf888b8
BS
4776/* "linux-noop" traffic control class. */
4777
4778static void
4779noop_install__(struct netdev *netdev_)
4780{
4781 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4782 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4783
4784 netdev->tc = CONST_CAST(struct tc *, &tc);
4785}
4786
4787static int
4788noop_tc_install(struct netdev *netdev,
4789 const struct smap *details OVS_UNUSED)
4790{
4791 noop_install__(netdev);
4792 return 0;
4793}
4794
4795static int
4796noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4797{
4798 noop_install__(netdev);
4799 return 0;
4800}
4801
4802static const struct tc_ops tc_ops_noop = {
4803 NULL, /* linux_name */
4804 "linux-noop", /* ovs_name */
4805 0, /* n_queues */
4806 noop_tc_install,
4807 noop_tc_load,
4808 NULL, /* tc_destroy */
4809 NULL, /* qdisc_get */
4810 NULL, /* qdisc_set */
4811 NULL, /* class_get */
4812 NULL, /* class_set */
4813 NULL, /* class_delete */
4814 NULL, /* class_get_stats */
4815 NULL /* class_dump_stats */
4816};
4817\f
c1c9c9c4
BP
4818/* "linux-default" traffic control class.
4819 *
4820 * This class represents the default, unnamed Linux qdisc. It corresponds to
4821 * the "" (empty string) QoS type in the OVS database. */
4822
4823static void
b5d57fc8 4824default_install__(struct netdev *netdev_)
c1c9c9c4 4825{
b5d57fc8 4826 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4827 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4828
559eb230
BP
4829 /* Nothing but a tc class implementation is allowed to write to a tc. This
4830 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4831 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4832}
4833
4834static int
4835default_tc_install(struct netdev *netdev,
79f1cbe9 4836 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4837{
4838 default_install__(netdev);
4839 return 0;
4840}
4841
4842static int
4843default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4844{
4845 default_install__(netdev);
4846 return 0;
4847}
4848
/* Operations table for the "linux-default" class, whose OVS name is the empty
 * string.  Only the install and load hooks are provided; every other hook is
 * NULL. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4864\f
4865/* "linux-other" traffic control class.
4866 *
4867 * */
4868
4869static int
b5d57fc8 4870other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4871{
b5d57fc8 4872 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4873 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4874
559eb230
BP
4875 /* Nothing but a tc class implementation is allowed to write to a tc. This
4876 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4877 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4878 return 0;
4879}
4880
/* Operations table for the "linux-other" class.  Only the load hook is
 * provided; there is no install hook and every other hook is NULL. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4896\f
4897/* Traffic control. */
4898
4899/* Number of kernel "tc" ticks per second. */
4900static double ticks_per_s;
4901
4902/* Number of kernel "jiffies" per second. This is used for the purpose of
4903 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4904 * one jiffy's worth of data.
4905 *
4906 * There are two possibilities here:
4907 *
4908 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4909 * approximate range of 100 to 1024. That means that we really need to
4910 * make sure that the qdisc can buffer that much data.
4911 *
4912 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4913 * has finely granular timers and there's no need to fudge additional room
4914 * for buffers. (There's no extra effort needed to implement that: the
4915 * large 'buffer_hz' is used as a divisor, so practically any number will
4916 * come out as 0 in the division. Small integer results in the case of
4917 * really high dividends won't have any real effect anyhow.)
4918 */
4919static unsigned int buffer_hz;
4920
7874bdff
RD
4921static struct tcmsg *
4922netdev_linux_tc_make_request(const struct netdev *netdev, int type,
4923 unsigned int flags, struct ofpbuf *request)
4924{
4925 int ifindex;
4926 int error;
4927
4928 error = get_ifindex(netdev, &ifindex);
4929 if (error) {
4930 return NULL;
4931 }
4932
4933 return tc_make_request(ifindex, type, flags, request);
4934}
4935
f8500004
JP
4936/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4937 * of 'kbits_burst'.
4938 *
4939 * This function is equivalent to running:
4940 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4941 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4942 * mtu 65535 drop
4943 *
4944 * The configuration and stats may be seen with the following command:
c7952afb 4945 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4946 *
4947 * Returns 0 if successful, otherwise a positive errno value.
4948 */
4949static int
c7952afb
BP
4950tc_add_policer(struct netdev *netdev,
4951 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4952{
4953 struct tc_police tc_police;
4954 struct ofpbuf request;
4955 struct tcmsg *tcmsg;
4956 size_t basic_offset;
4957 size_t police_offset;
4958 int error;
4959 int mtu = 65535;
4960
4961 memset(&tc_police, 0, sizeof tc_police);
4962 tc_police.action = TC_POLICE_SHOT;
4963 tc_police.mtu = mtu;
1aca400c 4964 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4965
79abacc8
MAA
4966 /* The following appears wrong in one way: In networking a kilobit is
4967 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4968 *
4969 * However if you "fix" those problems then "tc filter show ..." shows
4970 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4971 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4972 * tc's point of view. Whatever. */
4973 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4974 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 4975
7874bdff
RD
4976 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
4977 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
4978 if (!tcmsg) {
4979 return ENODEV;
4980 }
4981 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4982 tcmsg->tcm_info = tc_make_handle(49,
4983 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4984
4985 nl_msg_put_string(&request, TCA_KIND, "basic");
4986 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4987 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4988 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4989 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4990 nl_msg_end_nested(&request, police_offset);
4991 nl_msg_end_nested(&request, basic_offset);
4992
4993 error = tc_transact(&request, NULL);
4994 if (error) {
4995 return error;
4996 }
4997
4998 return 0;
4999}
5000
c1c9c9c4
BP
5001static void
5002read_psched(void)
5003{
5004 /* The values in psched are not individually very meaningful, but they are
5005 * important. The tables below show some values seen in the wild.
5006 *
5007 * Some notes:
5008 *
5009 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5010 * (Before that, there are hints that it was 1000000000.)
5011 *
5012 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5013 * above.
5014 *
5015 * /proc/net/psched
5016 * -----------------------------------
5017 * [1] 000c8000 000f4240 000f4240 00000064
5018 * [2] 000003e8 00000400 000f4240 3b9aca00
5019 * [3] 000003e8 00000400 000f4240 3b9aca00
5020 * [4] 000003e8 00000400 000f4240 00000064
5021 * [5] 000003e8 00000040 000f4240 3b9aca00
5022 * [6] 000003e8 00000040 000f4240 000000f9
5023 *
5024 * a b c d ticks_per_s buffer_hz
5025 * ------- --------- ---------- ------------- ----------- -------------
5026 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5027 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5028 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5029 * [4] 1,000 1,024 1,000,000 100 976,562 100
5030 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5031 * [6] 1,000 64 1,000,000 249 15,625,000 249
5032 *
5033 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5034 * [2] 2.6.26-1-686-bigmem from Debian lenny
5035 * [3] 2.6.26-2-sparc64 from Debian lenny
5036 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5037 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5038 * [6] 2.6.34 from kernel.org on KVM
5039 */
23882115 5040 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
5041 static const char fn[] = "/proc/net/psched";
5042 unsigned int a, b, c, d;
5043 FILE *stream;
5044
23882115
BP
5045 if (!ovsthread_once_start(&once)) {
5046 return;
5047 }
5048
c1c9c9c4
BP
5049 ticks_per_s = 1.0;
5050 buffer_hz = 100;
5051
5052 stream = fopen(fn, "r");
5053 if (!stream) {
10a89ef0 5054 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 5055 goto exit;
c1c9c9c4
BP
5056 }
5057
5058 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
5059 VLOG_WARN("%s: read failed", fn);
5060 fclose(stream);
23882115 5061 goto exit;
c1c9c9c4
BP
5062 }
5063 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
5064 fclose(stream);
5065
5066 if (!a || !c) {
5067 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 5068 goto exit;
c1c9c9c4
BP
5069 }
5070
5071 ticks_per_s = (double) a * c / b;
5072 if (c == 1000000) {
5073 buffer_hz = d;
5074 } else {
5075 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5076 fn, a, b, c, d);
5077 }
5078 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
5079
5080exit:
5081 ovsthread_once_done(&once);
c1c9c9c4
BP
5082}
5083
5084/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5085 * rate of 'rate' bytes per second. */
5086static unsigned int
5087tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5088{
23882115 5089 read_psched();
c1c9c9c4
BP
5090 return (rate * ticks) / ticks_per_s;
5091}
5092
5093/* Returns the number of ticks that it would take to transmit 'size' bytes at a
5094 * rate of 'rate' bytes per second. */
5095static unsigned int
5096tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5097{
23882115 5098 read_psched();
015c93a4 5099 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
5100}
5101
5102/* Returns the number of bytes that need to be reserved for qdisc buffering at
5103 * a transmission rate of 'rate' bytes per second. */
5104static unsigned int
5105tc_buffer_per_jiffy(unsigned int rate)
5106{
23882115 5107 read_psched();
c1c9c9c4
BP
5108 return rate / buffer_hz;
5109}
5110
5111/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5112 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5113 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5114 * stores NULL into it if it is absent.
5115 *
5116 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5117 * 'msg'.
5118 *
5119 * Returns 0 if successful, otherwise a positive errno value. */
5120static int
5121tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5122 struct nlattr **options)
5123{
5124 static const struct nl_policy tca_policy[] = {
5125 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5126 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5127 };
5128 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5129
5130 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5131 tca_policy, ta, ARRAY_SIZE(ta))) {
5132 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5133 goto error;
5134 }
5135
5136 if (kind) {
5137 *kind = nl_attr_get_string(ta[TCA_KIND]);
5138 }
5139
5140 if (options) {
5141 *options = ta[TCA_OPTIONS];
5142 }
5143
5144 return 0;
5145
5146error:
5147 if (kind) {
5148 *kind = NULL;
5149 }
5150 if (options) {
5151 *options = NULL;
5152 }
5153 return EPROTO;
5154}
5155
5156/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5157 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5158 * into '*options', and its queue statistics into '*stats'. Any of the output
5159 * arguments may be null.
5160 *
5161 * Returns 0 if successful, otherwise a positive errno value. */
5162static int
5163tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5164 struct nlattr **options, struct netdev_queue_stats *stats)
5165{
5166 static const struct nl_policy tca_policy[] = {
5167 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5168 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5169 };
5170 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5171
5172 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5173 tca_policy, ta, ARRAY_SIZE(ta))) {
5174 VLOG_WARN_RL(&rl, "failed to parse class message");
5175 goto error;
5176 }
5177
5178 if (handlep) {
5179 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5180 *handlep = tc->tcm_handle;
5181 }
5182
5183 if (options) {
5184 *options = ta[TCA_OPTIONS];
5185 }
5186
5187 if (stats) {
5188 const struct gnet_stats_queue *gsq;
5189 struct gnet_stats_basic gsb;
5190
5191 static const struct nl_policy stats_policy[] = {
5192 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5193 .min_len = sizeof gsb },
5194 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5195 .min_len = sizeof *gsq },
5196 };
5197 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5198
5199 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5200 sa, ARRAY_SIZE(sa))) {
5201 VLOG_WARN_RL(&rl, "failed to parse class stats");
5202 goto error;
5203 }
5204
5205 /* Alignment issues screw up the length of struct gnet_stats_basic on
5206 * some arch/bitsize combinations. Newer versions of Linux have a
5207 * struct gnet_stats_basic_packed, but we can't depend on that. The
5208 * easiest thing to do is just to make a copy. */
5209 memset(&gsb, 0, sizeof gsb);
5210 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5211 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5212 stats->tx_bytes = gsb.bytes;
5213 stats->tx_packets = gsb.packets;
5214
5215 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5216 stats->tx_errors = gsq->drops;
5217 }
5218
5219 return 0;
5220
5221error:
5222 if (options) {
5223 *options = NULL;
5224 }
5225 if (stats) {
5226 memset(stats, 0, sizeof *stats);
5227 }
5228 return EPROTO;
5229}
5230
5231/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5232 * on 'netdev'. */
5233static int
5234tc_query_class(const struct netdev *netdev,
5235 unsigned int handle, unsigned int parent,
5236 struct ofpbuf **replyp)
5237{
5238 struct ofpbuf request;
5239 struct tcmsg *tcmsg;
5240 int error;
5241
7874bdff
RD
5242 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5243 &request);
23a98ffe
BP
5244 if (!tcmsg) {
5245 return ENODEV;
5246 }
c1c9c9c4
BP
5247 tcmsg->tcm_handle = handle;
5248 tcmsg->tcm_parent = parent;
5249
5250 error = tc_transact(&request, replyp);
5251 if (error) {
5252 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5253 netdev_get_name(netdev),
5254 tc_get_major(handle), tc_get_minor(handle),
5255 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5256 ovs_strerror(error));
c1c9c9c4
BP
5257 }
5258 return error;
5259}
5260
5261/* Equivalent to "tc class del dev <name> handle <handle>". */
5262static int
5263tc_delete_class(const struct netdev *netdev, unsigned int handle)
5264{
5265 struct ofpbuf request;
5266 struct tcmsg *tcmsg;
5267 int error;
5268
7874bdff 5269 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5270 if (!tcmsg) {
5271 return ENODEV;
5272 }
c1c9c9c4
BP
5273 tcmsg->tcm_handle = handle;
5274 tcmsg->tcm_parent = 0;
5275
5276 error = tc_transact(&request, NULL);
5277 if (error) {
5278 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5279 netdev_get_name(netdev),
5280 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5281 ovs_strerror(error));
c1c9c9c4
BP
5282 }
5283 return error;
5284}
5285
5286/* Equivalent to "tc qdisc del dev <name> root". */
5287static int
b5d57fc8 5288tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5289{
b5d57fc8 5290 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5291 struct ofpbuf request;
5292 struct tcmsg *tcmsg;
5293 int error;
5294
7874bdff 5295 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5296 if (!tcmsg) {
5297 return ENODEV;
5298 }
c1c9c9c4
BP
5299 tcmsg->tcm_handle = tc_make_handle(1, 0);
5300 tcmsg->tcm_parent = TC_H_ROOT;
5301
5302 error = tc_transact(&request, NULL);
5303 if (error == EINVAL) {
5304 /* EINVAL probably means that the default qdisc was in use, in which
5305 * case we've accomplished our purpose. */
5306 error = 0;
5307 }
b5d57fc8
BP
5308 if (!error && netdev->tc) {
5309 if (netdev->tc->ops->tc_destroy) {
5310 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5311 }
b5d57fc8 5312 netdev->tc = NULL;
c1c9c9c4
BP
5313 }
5314 return error;
5315}
5316
ac3e3aaa
BP
5317static bool
5318getqdisc_is_safe(void)
5319{
5320 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5321 static bool safe = false;
5322
5323 if (ovsthread_once_start(&once)) {
5324 struct utsname utsname;
5325 int major, minor;
5326
5327 if (uname(&utsname) == -1) {
5328 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5329 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5330 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5331 } else if (major < 2 || (major == 2 && minor < 35)) {
5332 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5333 utsname.release);
5334 } else {
5335 safe = true;
5336 }
5337 ovsthread_once_done(&once);
5338 }
5339 return safe;
5340}
5341
c1c9c9c4
BP
5342/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5343 * kernel to determine what they are. Returns 0 if successful, otherwise a
5344 * positive errno value. */
5345static int
b5d57fc8 5346tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5347{
b5d57fc8 5348 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5349 struct ofpbuf request, *qdisc;
5350 const struct tc_ops *ops;
5351 struct tcmsg *tcmsg;
5352 int load_error;
5353 int error;
5354
b5d57fc8 5355 if (netdev->tc) {
c1c9c9c4
BP
5356 return 0;
5357 }
5358
5359 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5360 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5361 * 2.6.35 without that fix backported to it.
5362 *
5363 * To avoid the OOPS, we must not make a request that would attempt to dump
5364 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5365 * few others. There are a few ways that I can see to do this, but most of
5366 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5367 * technique chosen here is to assume that any non-default qdisc that we
5368 * create will have a class with handle 1:0. The built-in qdiscs only have
5369 * a class with handle 0:0.
5370 *
ac3e3aaa
BP
5371 * On Linux 2.6.35+ we use the straightforward method because it allows us
5372 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5373 * in such a case we get no response at all from the kernel (!) if a
5374 * builtin qdisc is in use (which is later caught by "!error &&
5375 * !qdisc->size"). */
7874bdff
RD
5376 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5377 &request);
23a98ffe
BP
5378 if (!tcmsg) {
5379 return ENODEV;
5380 }
ac3e3aaa
BP
5381 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5382 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5383
5384 /* Figure out what tc class to instantiate. */
5385 error = tc_transact(&request, &qdisc);
ac3e3aaa 5386 if (!error && qdisc->size) {
c1c9c9c4
BP
5387 const char *kind;
5388
5389 error = tc_parse_qdisc(qdisc, &kind, NULL);
5390 if (error) {
5391 ops = &tc_ops_other;
5392 } else {
5393 ops = tc_lookup_linux_name(kind);
5394 if (!ops) {
5395 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5396 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5397
5398 ops = &tc_ops_other;
5399 }
5400 }
ac3e3aaa
BP
5401 } else if ((!error && !qdisc->size) || error == ENOENT) {
5402 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5403 * set up by some other entity that doesn't have a handle 1:0. We will
5404 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5405 ops = &tc_ops_default;
5406 error = 0;
5407 } else {
5408 /* Who knows? Maybe the device got deleted. */
5409 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5410 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5411 ops = &tc_ops_other;
5412 }
5413
5414 /* Instantiate it. */
b5d57fc8
BP
5415 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5416 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5417 ofpbuf_delete(qdisc);
5418
5419 return error ? error : load_error;
5420}
5421
5422/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5423 approximate the time to transmit packets of various lengths. For an MTU of
5424 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5425 represents two possible packet lengths; for a MTU of 513 through 1024, four
5426 possible lengths; and so on.
5427
5428 Returns, for the specified 'mtu', the number of bits that packet lengths
5429 need to be shifted right to fit within such a 256-entry table. */
5430static int
5431tc_calc_cell_log(unsigned int mtu)
5432{
5433 int cell_log;
5434
5435 if (!mtu) {
5436 mtu = ETH_PAYLOAD_MAX;
5437 }
5438 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5439
5440 for (cell_log = 0; mtu >= 256; cell_log++) {
5441 mtu >>= 1;
5442 }
5443
5444 return cell_log;
5445}
5446
5447/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5448 * of 'mtu'. */
5449static void
5450tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5451{
5452 memset(rate, 0, sizeof *rate);
5453 rate->cell_log = tc_calc_cell_log(mtu);
5454 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5455 /* rate->cell_align = 0; */ /* distro headers. */
5456 rate->mpu = ETH_TOTAL_MIN;
5457 rate->rate = Bps;
5458}
5459
5460/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5461 * attribute of the specified "type".
5462 *
5463 * See tc_calc_cell_log() above for a description of "rtab"s. */
5464static void
5465tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5466{
5467 uint32_t *rtab;
5468 unsigned int i;
5469
5470 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5471 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5472 unsigned packet_size = (i + 1) << rate->cell_log;
5473 if (packet_size < rate->mpu) {
5474 packet_size = rate->mpu;
5475 }
5476 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5477 }
5478}
5479
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is at least one jiffy's worth of data plus 'mtu',
 * and at most UINT32_MAX bytes. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);

    /* tc_bytes_to_ticks() takes an 'unsigned int' size, so clamp here rather
     * than let a burst above UINT32_MAX be truncated to its low bits. */
    return tc_bytes_to_ticks(Bps, MIN(burst, UINT32_MAX));
}
d3980822 5490\f
aaf2fb1a
BP
5491/* Linux-only functions declared in netdev-linux.h */
5492
5493/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5494 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5495int
5496netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5497 const char *flag_name, bool enable)
5498{
5499 const char *netdev_name = netdev_get_name(netdev);
5500 struct ethtool_value evalue;
5501 uint32_t new_flags;
5502 int error;
5503
ab985a77 5504 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5505 memset(&evalue, 0, sizeof evalue);
5506 error = netdev_linux_do_ethtool(netdev_name,
5507 (struct ethtool_cmd *)&evalue,
5508 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5509 if (error) {
5510 return error;
5511 }
5512
ab985a77 5513 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5514 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5515 if (new_flags == evalue.data) {
5516 return 0;
5517 }
5518 evalue.data = new_flags;
aaf2fb1a
BP
5519 error = netdev_linux_do_ethtool(netdev_name,
5520 (struct ethtool_cmd *)&evalue,
5521 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5522 if (error) {
5523 return error;
5524 }
5525
ab985a77 5526 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5527 memset(&evalue, 0, sizeof evalue);
5528 error = netdev_linux_do_ethtool(netdev_name,
5529 (struct ethtool_cmd *)&evalue,
5530 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5531 if (error) {
5532 return error;
5533 }
5534
5535 if (new_flags != evalue.data) {
5536 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5537 "device %s failed", enable ? "enable" : "disable",
5538 flag_name, netdev_name);
5539 return EOPNOTSUPP;
5540 }
5541
5542 return 0;
5543}
5544\f
5545/* Utility functions. */
5546
d3980822 5547/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5548static void
d3980822
BP
5549netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5550 const struct rtnl_link_stats *src)
5551{
f613a0d7
PS
5552 dst->rx_packets = src->rx_packets;
5553 dst->tx_packets = src->tx_packets;
5554 dst->rx_bytes = src->rx_bytes;
5555 dst->tx_bytes = src->tx_bytes;
5556 dst->rx_errors = src->rx_errors;
5557 dst->tx_errors = src->tx_errors;
5558 dst->rx_dropped = src->rx_dropped;
5559 dst->tx_dropped = src->tx_dropped;
5560 dst->multicast = src->multicast;
5561 dst->collisions = src->collisions;
5562 dst->rx_length_errors = src->rx_length_errors;
5563 dst->rx_over_errors = src->rx_over_errors;
5564 dst->rx_crc_errors = src->rx_crc_errors;
5565 dst->rx_frame_errors = src->rx_frame_errors;
5566 dst->rx_fifo_errors = src->rx_fifo_errors;
5567 dst->rx_missed_errors = src->rx_missed_errors;
5568 dst->tx_aborted_errors = src->tx_aborted_errors;
5569 dst->tx_carrier_errors = src->tx_carrier_errors;
5570 dst->tx_fifo_errors = src->tx_fifo_errors;
5571 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5572 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5573}
5574
337c9b99
BP
5575/* Copies 'src' into 'dst', performing format conversion in the process. */
5576static void
5577netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5578 const struct rtnl_link_stats64 *src)
5579{
5580 dst->rx_packets = src->rx_packets;
5581 dst->tx_packets = src->tx_packets;
5582 dst->rx_bytes = src->rx_bytes;
5583 dst->tx_bytes = src->tx_bytes;
5584 dst->rx_errors = src->rx_errors;
5585 dst->tx_errors = src->tx_errors;
5586 dst->rx_dropped = src->rx_dropped;
5587 dst->tx_dropped = src->tx_dropped;
5588 dst->multicast = src->multicast;
5589 dst->collisions = src->collisions;
5590 dst->rx_length_errors = src->rx_length_errors;
5591 dst->rx_over_errors = src->rx_over_errors;
5592 dst->rx_crc_errors = src->rx_crc_errors;
5593 dst->rx_frame_errors = src->rx_frame_errors;
5594 dst->rx_fifo_errors = src->rx_fifo_errors;
5595 dst->rx_missed_errors = src->rx_missed_errors;
5596 dst->tx_aborted_errors = src->tx_aborted_errors;
5597 dst->tx_carrier_errors = src->tx_carrier_errors;
5598 dst->tx_fifo_errors = src->tx_fifo_errors;
5599 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5600 dst->tx_window_errors = src->tx_window_errors;
5601}
5602
c1c9c9c4 5603static int
35eef899 5604get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5605{
c1c9c9c4
BP
5606 struct ofpbuf request;
5607 struct ofpbuf *reply;
c1c9c9c4
BP
5608 int error;
5609
d6e3feb5 5610 /* Filtering all counters by default */
5611 memset(stats, 0xFF, sizeof(struct netdev_stats));
5612
c1c9c9c4 5613 ofpbuf_init(&request, 0);
13a24df8
BP
5614 nl_msg_put_nlmsghdr(&request,
5615 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5616 RTM_GETLINK, NLM_F_REQUEST);
5617 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5618 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5619 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5620 ofpbuf_uninit(&request);
5621 if (error) {
5622 return error;
5623 }
5624
13a24df8 5625 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5626 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5627 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5628 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5629 error = 0;
5630 } else {
71f21279 5631 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
5632 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5633 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5634 error = 0;
5635 } else {
5636 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5637 error = EPROTO;
5638 }
13a24df8
BP
5639 }
5640 } else {
5641 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5642 error = EPROTO;
c1c9c9c4 5643 }
8b61709d 5644
8b61709d 5645
576e26d7 5646 ofpbuf_delete(reply);
35eef899 5647 return error;
8b61709d 5648}
c1c9c9c4 5649
3a183124 5650static int
b5d57fc8 5651get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5652{
5653 struct ifreq ifr;
5654 int error;
5655
755be9ea 5656 *flags = 0;
259e0b1a 5657 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5658 if (!error) {
5659 *flags = ifr.ifr_flags;
5660 }
8b61709d
BP
5661 return error;
5662}
5663
5664static int
4b609110 5665set_flags(const char *name, unsigned int flags)
8b61709d
BP
5666{
5667 struct ifreq ifr;
5668
5669 ifr.ifr_flags = flags;
259e0b1a 5670 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5671}
5672
01b25786
PB
5673int
5674linux_get_ifindex(const char *netdev_name)
8b61709d
BP
5675{
5676 struct ifreq ifr;
259e0b1a 5677 int error;
8b61709d 5678
71d7c22f 5679 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5680 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5681
5682 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5683 if (error) {
580e1152
RD
5684 /* ENODEV probably means that a vif disappeared asynchronously and
5685 * hasn't been removed from the database yet, so reduce the log level
5686 * to INFO for that case. */
5687 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5688 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5689 netdev_name, ovs_strerror(error));
259e0b1a 5690 return -error;
8b61709d
BP
5691 }
5692 return ifr.ifr_ifindex;
5693}
5694
5695static int
5696get_ifindex(const struct netdev *netdev_, int *ifindexp)
5697{
b5d57fc8 5698 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5699
b5d57fc8 5700 if (!(netdev->cache_valid & VALID_IFINDEX)) {
756819dd
FL
5701 netdev_linux_update_via_netlink(netdev);
5702 }
5703
5704 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5705 /* Fall back to ioctl if netlink fails */
01b25786 5706 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5707
8b61709d 5708 if (ifindex < 0) {
b5d57fc8
BP
5709 netdev->get_ifindex_error = -ifindex;
5710 netdev->ifindex = 0;
c7b1b0a5 5711 } else {
b5d57fc8
BP
5712 netdev->get_ifindex_error = 0;
5713 netdev->ifindex = ifindex;
8b61709d 5714 }
b5d57fc8 5715 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5716 }
c7b1b0a5 5717
b5d57fc8
BP
5718 *ifindexp = netdev->ifindex;
5719 return netdev->get_ifindex_error;
8b61709d
BP
5720}
5721
5722static int
756819dd
FL
5723netdev_linux_update_via_netlink(struct netdev_linux *netdev)
5724{
5725 struct ofpbuf request;
5726 struct ofpbuf *reply;
5727 struct rtnetlink_change chg;
5728 struct rtnetlink_change *change = &chg;
5729 int error;
5730
5731 ofpbuf_init(&request, 0);
5732 nl_msg_put_nlmsghdr(&request,
5733 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5734 RTM_GETLINK, NLM_F_REQUEST);
5735 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5736
5737 /* The correct identifiers for a Linux device are netnsid and ifindex,
5738 * but ifindex changes as the port is moved to another network namespace
5739 * and the interface name statically stored in ovsdb. */
5740 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
5741 if (netdev_linux_netnsid_is_remote(netdev)) {
5742 nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
5743 }
5744 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5745 ofpbuf_uninit(&request);
5746 if (error) {
5747 ofpbuf_delete(reply);
5748 return error;
5749 }
5750
5751 if (rtnetlink_parse(reply, change)
5752 && change->nlmsg_type == RTM_NEWLINK) {
5753 bool changed = false;
5754 error = 0;
5755
5756 /* Update netdev from rtnl msg and increment its seq if needed. */
5757 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
5758 netdev->carrier_resets++;
5759 changed = true;
5760 }
5761 if (change->ifi_flags != netdev->ifi_flags) {
5762 netdev->ifi_flags = change->ifi_flags;
5763 changed = true;
5764 }
5765 if (change->mtu && change->mtu != netdev->mtu) {
5766 netdev->mtu = change->mtu;
5767 netdev->cache_valid |= VALID_MTU;
5768 netdev->netdev_mtu_error = 0;
5769 changed = true;
5770 }
5771 if (!eth_addr_is_zero(change->mac)
5772 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
5773 netdev->etheraddr = change->mac;
5774 netdev->cache_valid |= VALID_ETHERADDR;
5775 netdev->ether_addr_error = 0;
5776 changed = true;
5777 }
5778 if (change->if_index != netdev->ifindex) {
5779 netdev->ifindex = change->if_index;
5780 netdev->cache_valid |= VALID_IFINDEX;
5781 netdev->get_ifindex_error = 0;
5782 changed = true;
5783 }
3d9c99ab
JH
5784 if (change->master && netdev_linux_kind_is_lag(change->master)) {
5785 netdev->is_lag_master = true;
5786 }
756819dd
FL
5787 if (changed) {
5788 netdev_change_seq_changed(&netdev->up);
5789 }
5790 } else {
5791 error = EINVAL;
5792 }
5793
5794 ofpbuf_delete(reply);
5795 return error;
5796}
5797
5798static int
74ff3298 5799get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5800{
5801 struct ifreq ifr;
5802 int hwaddr_family;
259e0b1a 5803 int error;
8b61709d
BP
5804
5805 memset(&ifr, 0, sizeof ifr);
71d7c22f 5806 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5807 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5808 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5809 if (error) {
78857dfb
BP
5810 /* ENODEV probably means that a vif disappeared asynchronously and
5811 * hasn't been removed from the database yet, so reduce the log level
5812 * to INFO for that case. */
259e0b1a 5813 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5814 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5815 netdev_name, ovs_strerror(error));
5816 return error;
8b61709d
BP
5817 }
5818 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
5819 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5820 hwaddr_family != ARPHRD_NONE) {
c9697f35 5821 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5822 netdev_name, hwaddr_family);
c9697f35 5823 return EINVAL;
8b61709d
BP
5824 }
5825 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5826 return 0;
5827}
5828
5829static int
74ff3298 5830set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5831{
5832 struct ifreq ifr;
259e0b1a 5833 int error;
8b61709d
BP
5834
5835 memset(&ifr, 0, sizeof ifr);
71d7c22f 5836 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5837 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5838 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5839 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5840 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5841 if (error) {
8b61709d 5842 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5843 netdev_name, ovs_strerror(error));
8b61709d 5844 }
259e0b1a 5845 return error;
8b61709d
BP
5846}
5847
5848static int
0b0544d7 5849netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5850 int cmd, const char *cmd_name)
5851{
5852 struct ifreq ifr;
259e0b1a 5853 int error;
8b61709d
BP
5854
5855 memset(&ifr, 0, sizeof ifr);
71d7c22f 5856 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5857 ifr.ifr_data = (caddr_t) ecmd;
5858
5859 ecmd->cmd = cmd;
259e0b1a
BP
5860 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5861 if (error) {
5862 if (error != EOPNOTSUPP) {
8b61709d 5863 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5864 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5865 } else {
5866 /* The device doesn't support this operation. That's pretty
5867 * common, so there's no point in logging anything. */
5868 }
8b61709d 5869 }
259e0b1a 5870 return error;
8b61709d 5871}
f1acd62b 5872
488d734d
BP
5873/* Returns an AF_PACKET raw socket or a negative errno value. */
5874static int
5875af_packet_sock(void)
5876{
23882115
BP
5877 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5878 static int sock;
488d734d 5879
23882115 5880 if (ovsthread_once_start(&once)) {
488d734d
BP
5881 sock = socket(AF_PACKET, SOCK_RAW, 0);
5882 if (sock >= 0) {
8450059e
BP
5883 int error = set_nonblocking(sock);
5884 if (error) {
5885 close(sock);
5886 sock = -error;
5887 }
488d734d
BP
5888 } else {
5889 sock = -errno;
10a89ef0
BP
5890 VLOG_ERR("failed to create packet socket: %s",
5891 ovs_strerror(errno));
488d734d 5892 }
23882115 5893 ovsthread_once_done(&once);
488d734d
BP
5894 }
5895
5896 return sock;
5897}