]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
Add change tracking documentation
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
c7952afb 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
55bc98d6 43#include <net/if_packet.h>
8b61709d
BP
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
e14deea0 52#include "dp-packet.h"
93451a0a 53#include "dpif-netlink.h"
df1e5a3b 54#include "dpif-netdev.h"
3e8a2ad1 55#include "openvswitch/dynamic-string.h"
8b61709d 56#include "fatal-signal.h"
93b13be8
BP
57#include "hash.h"
58#include "hmap.h"
8b61709d 59#include "netdev-provider.h"
7fbef77a 60#include "netdev-vport.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
c060c4cf 63#include "netlink.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d
BP
67#include "packets.h"
68#include "poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
8b61709d 70#include "shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
5136ce49 76
d98e6007 77VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 78
d76f09ea
BP
79COVERAGE_DEFINE(netdev_set_policing);
80COVERAGE_DEFINE(netdev_arp_lookup);
81COVERAGE_DEFINE(netdev_get_ifindex);
82COVERAGE_DEFINE(netdev_get_hwaddr);
83COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
84COVERAGE_DEFINE(netdev_get_ethtool);
85COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 86
8b61709d
BP
87\f
88/* These were introduced in Linux 2.6.14, so they might be missing if we have
89 * old headers. */
90#ifndef ADVERTISED_Pause
91#define ADVERTISED_Pause (1 << 13)
92#endif
93#ifndef ADVERTISED_Asym_Pause
94#define ADVERTISED_Asym_Pause (1 << 14)
95#endif
96
e47bd51a
JP
97/* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99#ifndef ETHTOOL_GFLAGS
100#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101#endif
102#ifndef ETHTOOL_SFLAGS
103#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104#endif
105
c1c9c9c4
BP
106/* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 * headers. */
108#ifndef TC_RTAB_SIZE
109#define TC_RTAB_SIZE 1024
110#endif
111
b73c8518
SH
112/* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
117 *
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
120 */
55bc98d6
BP
121#ifndef PACKET_AUXDATA
122#define PACKET_AUXDATA 8
123#endif
b73c8518
SH
124#ifndef TP_STATUS_VLAN_VALID
125#define TP_STATUS_VLAN_VALID (1 << 4)
126#endif
127#ifndef TP_STATUS_VLAN_TPID_VALID
128#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129#endif
130#undef tpacket_auxdata
131#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;     /* TP_STATUS_* bits. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;   /* Member added in Linux 2.6.27. */
    uint16_t tp_vlan_tpid;  /* Padding before Linux 3.13. */
};
141
0c615356
SH
142/* Linux 2.6.27 introduced ethtool_cmd_speed
143 *
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
148#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed recorded in 'ep', combining the low 16 bits in
 * 'ep->speed' with the high 16 bits in 'ep->speed_hi'.
 *
 * Both halves are widened to uint32_t before being combined.  Otherwise
 * 'speed_hi' would be promoted to a signed int, and shifting it left by 16
 * would be undefined whenever its top bit is set. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return (uint32_t) ep->speed | ((uint32_t) ep->speed_hi << 16);
}
153
67bed84c
SH
154/* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156#ifndef SUPPORTED_1000baseKX_Full
157#define SUPPORTED_1000baseKX_Full (1 << 17)
158#define SUPPORTED_10000baseKX4_Full (1 << 18)
159#define SUPPORTED_10000baseKR_Full (1 << 19)
160#define SUPPORTED_10000baseR_FEC (1 << 20)
161#define ADVERTISED_1000baseKX_Full (1 << 17)
162#define ADVERTISED_10000baseKX4_Full (1 << 18)
163#define ADVERTISED_10000baseKR_Full (1 << 19)
164#define ADVERTISED_10000baseR_FEC (1 << 20)
165#endif
166
167/* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169#ifndef SUPPORTED_40000baseKR4_Full
170#define SUPPORTED_40000baseKR4_Full (1 << 23)
171#define SUPPORTED_40000baseCR4_Full (1 << 24)
172#define SUPPORTED_40000baseSR4_Full (1 << 25)
173#define SUPPORTED_40000baseLR4_Full (1 << 26)
174#define ADVERTISED_40000baseKR4_Full (1 << 23)
175#define ADVERTISED_40000baseCR4_Full (1 << 24)
176#define ADVERTISED_40000baseSR4_Full (1 << 25)
177#define ADVERTISED_40000baseLR4_Full (1 << 26)
178#endif
179
fa373af4
BP
180/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
181 *
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
187#ifndef IFLA_STATS64
337c9b99 188#define IFLA_STATS64 23
fa373af4
BP
189#endif
190#define rtnl_link_stats64 rpl_rtnl_link_stats64
337c9b99
BP
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 219
/* Bits for the 'cache_valid' member of struct netdev_linux.  Each bit marks
 * one group of cached, lazily computed values as up to date. */
enum {
    VALID_IFINDEX          = 0x01,
    VALID_ETHERADDR        = 0x02,
    VALID_IN               = 0x04,
    VALID_MTU              = 0x08,
    VALID_POLICING         = 0x10,
    VALID_VPORT_STAT_ERROR = 0x20,
    VALID_DRVINFO          = 0x40,
    VALID_FEATURES         = 0x80,
};
c1c9c9c4
BP
230\f
231/* Traffic control. */
232
233/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
234 * network device.
235 *
236 * Each TC implementation subclasses this with whatever additional data it
237 * needs. */
c1c9c9c4
BP
238struct tc {
239 const struct tc_ops *ops;
93b13be8
BP
240 struct hmap queues; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
243};
c1c9c9c4 244
559eb230
BP
245#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
246
93b13be8
BP
247/* One traffic control queue.
248 *
249 * Each TC implementation subclasses this with whatever additional data it
250 * needs. */
251struct tc_queue {
252 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 254 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
255};
256
257/* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
259 *
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
263struct tc_ops {
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name;
268
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name;
271
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues;
275
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
281 *
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
285 *
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
288 *
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
79f1cbe9 291 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
292
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
296 *
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
302 * 'netdev'.
303 *
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
307
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
310 * tc_destroy(tc).)
311 *
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
315 *
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy)(struct tc *tc);
318
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
320 *
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
324 *
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
328 *
329 * This function may be null if 'tc' is not configurable.
330 */
79f1cbe9 331 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
332
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
335 *
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
339 *
340 * This function may be null if 'tc' is not configurable.
341 */
79f1cbe9 342 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 343
93b13be8
BP
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
346 *
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
350 *
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
354 *
355 * This function may be null if 'tc' does not have queues ('n_queues' is
356 * 0). */
93b13be8 357 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 358 struct smap *details);
c1c9c9c4
BP
359
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', performing any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
363 * 'n_queues'.
364 *
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
368 *
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 372 const struct smap *details);
c1c9c9c4 373
93b13be8
BP
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
376 *
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
93b13be8 379 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 380
93b13be8
BP
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
383 *
384 * On success, initializes '*stats'.
385 *
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
93b13be8
BP
388 int (*class_get_stats)(const struct netdev *netdev,
389 const struct tc_queue *queue,
c1c9c9c4
BP
390 struct netdev_queue_stats *stats);
391
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
394 *
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats)(const struct netdev *netdev,
398 const struct ofpbuf *nlmsg,
399 netdev_dump_queue_stats_cb *cb, void *aux);
400};
401
402static void
403tc_init(struct tc *tc, const struct tc_ops *ops)
404{
405 tc->ops = ops;
93b13be8 406 hmap_init(&tc->queues);
c1c9c9c4
BP
407}
408
409static void
410tc_destroy(struct tc *tc)
411{
93b13be8 412 hmap_destroy(&tc->queues);
c1c9c9c4
BP
413}
414
415static const struct tc_ops tc_ops_htb;
a339aa81 416static const struct tc_ops tc_ops_hfsc;
677d9158
JV
417static const struct tc_ops tc_ops_codel;
418static const struct tc_ops tc_ops_fqcodel;
419static const struct tc_ops tc_ops_sfq;
c1c9c9c4
BP
420static const struct tc_ops tc_ops_default;
421static const struct tc_ops tc_ops_other;
422
559eb230 423static const struct tc_ops *const tcs[] = {
c1c9c9c4 424 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 425 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
426 &tc_ops_codel, /* Controlled delay */
427 &tc_ops_fqcodel, /* Fair queue controlled delay */
428 &tc_ops_sfq, /* Stochastic fair queueing */
c1c9c9c4
BP
429 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
430 &tc_ops_other, /* Some other qdisc. */
431 NULL
432};
149f577a 433
c1c9c9c4
BP
434static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
435static unsigned int tc_get_major(unsigned int handle);
436static unsigned int tc_get_minor(unsigned int handle);
437
438static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
439static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
440static unsigned int tc_buffer_per_jiffy(unsigned int rate);
441
442static struct tcmsg *tc_make_request(const struct netdev *, int type,
443 unsigned int flags, struct ofpbuf *);
444static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004 445static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
c7952afb
BP
446static int tc_add_policer(struct netdev *,
447 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
448
449static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
450 struct nlattr **options);
451static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
452 struct nlattr **options,
453 struct netdev_queue_stats *);
454static int tc_query_class(const struct netdev *,
455 unsigned int handle, unsigned int parent,
456 struct ofpbuf **replyp);
457static int tc_delete_class(const struct netdev *, unsigned int handle);
458
459static int tc_del_qdisc(struct netdev *netdev);
460static int tc_query_qdisc(const struct netdev *netdev);
461
462static int tc_calc_cell_log(unsigned int mtu);
463static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
464static void tc_put_rtab(struct ofpbuf *, uint16_t type,
465 const struct tc_ratespec *rate);
466static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
467\f
b5d57fc8
BP
468struct netdev_linux {
469 struct netdev up;
149f577a 470
86383816
BP
471 /* Protects all members below. */
472 struct ovs_mutex mutex;
473
149f577a 474 unsigned int cache_valid;
8b61709d 475
1670c579
EJ
476 bool miimon; /* Link status of last poll. */
477 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
478 struct timer miimon_timer;
479
8722022c
BP
480 /* The following are figured out "on demand" only. They are only valid
481 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 482 int ifindex;
74ff3298 483 struct eth_addr etheraddr;
8b61709d 484 int mtu;
059e5f4f 485 unsigned int ifi_flags;
65c3058c 486 long long int carrier_resets;
80a86fbe
BP
487 uint32_t kbits_rate; /* Policing data. */
488 uint32_t kbits_burst;
bba1e6f3
PS
489 int vport_stats_error; /* Cached error code from vport_get_stats().
490 0 or an errno value. */
90a6637d 491 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 492 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 493 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 494 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 495 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 496
a00ca915
EJ
497 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
498 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
499 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 500
4f925bd3 501 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 502 struct tc *tc;
149f577a 503
d0d08f8a
BP
504 /* For devices of class netdev_tap_class only. */
505 int tap_fd;
8b61709d
BP
506};
507
f7791740
PS
508struct netdev_rxq_linux {
509 struct netdev_rxq up;
796223f5 510 bool is_tap;
5b7448ed 511 int fd;
149f577a 512};
8b61709d 513
8b61709d
BP
514/* This is set pretty low because we probably won't learn anything from the
515 * additional log messages. */
516static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
517
19c8e9c1
JS
518/* Polling miimon status for all ports causes performance degradation when
519 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
520 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
521 *
522 * Readers do not depend on this variable synchronizing with the related
523 * changes in the device miimon status, so we can use atomic_count. */
524static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 525
259e0b1a 526static void netdev_linux_run(void);
6f643e49 527
0b0544d7 528static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 529 int cmd, const char *cmd_name);
b5d57fc8 530static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 531static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
532static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
533 enum netdev_flags on, enum netdev_flags *old_flagsp)
534 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
535static int do_get_ifindex(const char *netdev_name);
536static int get_ifindex(const struct netdev *, int *ifindexp);
537static int do_set_addr(struct netdev *netdev,
538 int ioctl_nr, const char *ioctl_name,
539 struct in_addr addr);
74ff3298
JR
540static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
541static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 542static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 543static int af_packet_sock(void);
19c8e9c1 544static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
545static void netdev_linux_miimon_run(void);
546static void netdev_linux_miimon_wait(void);
df1e5a3b 547static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 548
15b3596a
JG
549static bool
550is_netdev_linux_class(const struct netdev_class *netdev_class)
551{
259e0b1a 552 return netdev_class->run == netdev_linux_run;
15b3596a
JG
553}
554
796223f5
BP
555static bool
556is_tap_netdev(const struct netdev *netdev)
557{
b5d57fc8 558 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
559}
560
8b61709d
BP
561static struct netdev_linux *
562netdev_linux_cast(const struct netdev *netdev)
563{
b5d57fc8 564 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 565
180c6d0b 566 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 567}
796223f5 568
f7791740
PS
569static struct netdev_rxq_linux *
570netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 571{
9dc63482 572 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 573 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 574}
ff4ed3c9 575\f
cee87338 576static void netdev_linux_update(struct netdev_linux *netdev,
7e9dcc0f 577 const struct rtnetlink_change *)
86383816 578 OVS_REQUIRES(netdev->mutex);
cee87338 579static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
580 unsigned int ifi_flags, unsigned int mask)
581 OVS_REQUIRES(netdev->mutex);
cee87338 582
d6384a3a
AW
583/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
584 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
585 * if no such socket could be created. */
586static struct nl_sock *
587netdev_linux_notify_sock(void)
588{
589 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
590 static struct nl_sock *sock;
989d7135
PS
591 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
592 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
593
594 if (ovsthread_once_start(&once)) {
595 int error;
596
597 error = nl_sock_create(NETLINK_ROUTE, &sock);
598 if (!error) {
d6384a3a
AW
599 size_t i;
600
601 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
602 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
603 if (error) {
604 nl_sock_destroy(sock);
605 sock = NULL;
606 break;
607 }
cee87338
BP
608 }
609 }
610 ovsthread_once_done(&once);
611 }
612
613 return sock;
614}
615
19c8e9c1
JS
616static bool
617netdev_linux_miimon_enabled(void)
618{
812c272c 619 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
620}
621
8b61709d
BP
622static void
623netdev_linux_run(void)
624{
cee87338
BP
625 struct nl_sock *sock;
626 int error;
627
19c8e9c1
JS
628 if (netdev_linux_miimon_enabled()) {
629 netdev_linux_miimon_run();
630 }
cee87338
BP
631
632 sock = netdev_linux_notify_sock();
633 if (!sock) {
634 return;
635 }
636
637 do {
638 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
639 uint64_t buf_stub[4096 / 8];
640 struct ofpbuf buf;
641
642 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
643 error = nl_sock_recv(sock, &buf, false);
644 if (!error) {
7e9dcc0f 645 struct rtnetlink_change change;
cee87338 646
7e9dcc0f 647 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
648 struct netdev *netdev_ = NULL;
649 char dev_name[IFNAMSIZ];
650
651 if (!change.ifname) {
652 change.ifname = if_indextoname(change.if_index, dev_name);
653 }
654
655 if (change.ifname) {
656 netdev_ = netdev_from_name(change.ifname);
657 }
cee87338
BP
658 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
660
661 ovs_mutex_lock(&netdev->mutex);
cee87338 662 netdev_linux_update(netdev, &change);
86383816 663 ovs_mutex_unlock(&netdev->mutex);
cee87338 664 }
38e0065b 665 netdev_close(netdev_);
cee87338
BP
666 }
667 } else if (error == ENOBUFS) {
668 struct shash device_shash;
669 struct shash_node *node;
670
671 nl_sock_drain(sock);
672
673 shash_init(&device_shash);
674 netdev_get_devices(&netdev_linux_class, &device_shash);
675 SHASH_FOR_EACH (node, &device_shash) {
676 struct netdev *netdev_ = node->data;
677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
678 unsigned int flags;
679
86383816 680 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
681 get_flags(netdev_, &flags);
682 netdev_linux_changed(netdev, flags, 0);
86383816
BP
683 ovs_mutex_unlock(&netdev->mutex);
684
cee87338
BP
685 netdev_close(netdev_);
686 }
687 shash_destroy(&device_shash);
688 } else if (error != EAGAIN) {
689 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
690 ovs_strerror(error));
691 }
692 ofpbuf_uninit(&buf);
693 } while (!error);
8b61709d
BP
694}
695
696static void
697netdev_linux_wait(void)
698{
cee87338
BP
699 struct nl_sock *sock;
700
19c8e9c1
JS
701 if (netdev_linux_miimon_enabled()) {
702 netdev_linux_miimon_wait();
703 }
cee87338
BP
704 sock = netdev_linux_notify_sock();
705 if (sock) {
706 nl_sock_wait(sock, POLLIN);
707 }
8b61709d
BP
708}
709
ac4d3bcb 710static void
b5d57fc8
BP
711netdev_linux_changed(struct netdev_linux *dev,
712 unsigned int ifi_flags, unsigned int mask)
86383816 713 OVS_REQUIRES(dev->mutex)
ac4d3bcb 714{
3e912ffc 715 netdev_change_seq_changed(&dev->up);
8aa77183
BP
716
717 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
718 dev->carrier_resets++;
719 }
720 dev->ifi_flags = ifi_flags;
721
4f925bd3 722 dev->cache_valid &= mask;
6b6e1329 723 if (!(mask & VALID_IN)) {
a8704b50
PS
724 netdev_get_addrs_list_flush();
725 }
4f925bd3
PS
726}
727
728static void
b5d57fc8 729netdev_linux_update(struct netdev_linux *dev,
7e9dcc0f 730 const struct rtnetlink_change *change)
86383816 731 OVS_REQUIRES(dev->mutex)
4f925bd3 732{
d6384a3a
AW
733 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
734 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 735 /* Keep drv-info, and ip addresses. */
d6384a3a 736 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 737 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
738
739 /* Update netdev from rtnl-change msg. */
740 if (change->mtu) {
741 dev->mtu = change->mtu;
742 dev->cache_valid |= VALID_MTU;
743 dev->netdev_mtu_error = 0;
744 }
90a6637d 745
74ff3298
JR
746 if (!eth_addr_is_zero(change->mac)) {
747 dev->etheraddr = change->mac;
d6384a3a
AW
748 dev->cache_valid |= VALID_ETHERADDR;
749 dev->ether_addr_error = 0;
750 }
44445cac 751
d6384a3a
AW
752 dev->ifindex = change->if_index;
753 dev->cache_valid |= VALID_IFINDEX;
754 dev->get_ifindex_error = 0;
755 } else {
756 netdev_linux_changed(dev, change->ifi_flags, 0);
757 }
758 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
759 /* Invalidates in4, in6. */
6b6e1329 760 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 761 } else {
d6384a3a 762 OVS_NOT_REACHED();
4f925bd3 763 }
ac4d3bcb
EJ
764}
765
9dc63482
BP
766static struct netdev *
767netdev_linux_alloc(void)
768{
769 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
770 return &netdev->up;
771}
772
cee87338 773static void
9dc63482
BP
774netdev_linux_common_construct(struct netdev_linux *netdev)
775{
834d6caf 776 ovs_mutex_init(&netdev->mutex);
9dc63482
BP
777}
778
1f6e0fbd
BP
779/* Creates system and internal devices. */
780static int
9dc63482 781netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 782{
9dc63482 783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1f6e0fbd
BP
784 int error;
785
cee87338 786 netdev_linux_common_construct(netdev);
1f6e0fbd 787
b5d57fc8
BP
788 error = get_flags(&netdev->up, &netdev->ifi_flags);
789 if (error == ENODEV) {
9dc63482 790 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 791 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
792 return ENODEV;
793 } else {
794 /* "Internal" netdevs have to be created as netdev objects before
795 * they exist in the kernel, because creating them in the kernel
796 * happens by passing a netdev object to dpif_port_add().
797 * Therefore, ignore the error. */
798 }
799 }
46415c90 800
a740f0de
JG
801 return 0;
802}
803
5b7448ed
JG
804/* For most types of netdevs we open the device for each call of
805 * netdev_open(). However, this is not the case with tap devices,
806 * since it is only possible to open the device once. In this
807 * situation we share a single file descriptor, and consequently
808 * buffers, across all readers. Therefore once data is read it will
809 * be unavailable to other reads for tap devices. */
a740f0de 810static int
9dc63482 811netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 812{
9dc63482 813 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 814 static const char tap_dev[] = "/dev/net/tun";
9dc63482 815 const char *name = netdev_->name;
a740f0de
JG
816 struct ifreq ifr;
817 int error;
818
cee87338 819 netdev_linux_common_construct(netdev);
1f6e0fbd 820
6c88d577 821 /* Open tap device. */
d0d08f8a
BP
822 netdev->tap_fd = open(tap_dev, O_RDWR);
823 if (netdev->tap_fd < 0) {
6c88d577 824 error = errno;
10a89ef0 825 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 826 return error;
6c88d577
JP
827 }
828
829 /* Create tap device. */
830 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 831 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 832 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 833 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 834 ovs_strerror(errno));
6c88d577 835 error = errno;
f61d8d29 836 goto error_close;
6c88d577
JP
837 }
838
839 /* Make non-blocking. */
d0d08f8a 840 error = set_nonblocking(netdev->tap_fd);
a740f0de 841 if (error) {
f61d8d29 842 goto error_close;
a740f0de
JG
843 }
844
845 return 0;
846
f61d8d29 847error_close:
d0d08f8a 848 close(netdev->tap_fd);
a740f0de
JG
849 return error;
850}
851
6c88d577 852static void
9dc63482 853netdev_linux_destruct(struct netdev *netdev_)
6c88d577 854{
b5d57fc8 855 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 856
b5d57fc8
BP
857 if (netdev->tc && netdev->tc->ops->tc_destroy) {
858 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
859 }
860
d0d08f8a
BP
861 if (netdev_get_class(netdev_) == &netdev_tap_class
862 && netdev->tap_fd >= 0)
863 {
864 close(netdev->tap_fd);
6c88d577 865 }
86383816 866
19c8e9c1 867 if (netdev->miimon_interval > 0) {
812c272c 868 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
869 }
870
86383816 871 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
872}
873
9dc63482
BP
/* Frees the 'struct netdev_linux' that embeds 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
880
f7791740
PS
881static struct netdev_rxq *
882netdev_linux_rxq_alloc(void)
9dc63482 883{
f7791740 884 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
885 return &rx->up;
886}
887
7b6b0ef4 888static int
f7791740 889netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 890{
f7791740 891 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 892 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 893 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 894 int error;
7b6b0ef4 895
86383816 896 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
897 rx->is_tap = is_tap_netdev(netdev_);
898 if (rx->is_tap) {
899 rx->fd = netdev->tap_fd;
796223f5
BP
900 } else {
901 struct sockaddr_ll sll;
b73c8518 902 int ifindex, val;
32383c3b 903 /* Result of tcpdump -dd inbound */
259e0b1a 904 static const struct sock_filter filt[] = {
32383c3b
MM
905 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
906 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
907 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
908 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
909 };
259e0b1a
BP
910 static const struct sock_fprog fprog = {
911 ARRAY_SIZE(filt), (struct sock_filter *) filt
912 };
7b6b0ef4 913
796223f5 914 /* Create file descriptor. */
9dc63482
BP
915 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
916 if (rx->fd < 0) {
796223f5 917 error = errno;
10a89ef0 918 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
919 goto error;
920 }
33d82a56 921
b73c8518
SH
922 val = 1;
923 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
924 error = errno;
925 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
926 netdev_get_name(netdev_), ovs_strerror(error));
927 goto error;
928 }
929
796223f5 930 /* Set non-blocking mode. */
9dc63482 931 error = set_nonblocking(rx->fd);
796223f5
BP
932 if (error) {
933 goto error;
934 }
7b6b0ef4 935
796223f5 936 /* Get ethernet device index. */
180c6d0b 937 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
938 if (error) {
939 goto error;
940 }
7b6b0ef4 941
796223f5
BP
942 /* Bind to specific ethernet device. */
943 memset(&sll, 0, sizeof sll);
944 sll.sll_family = AF_PACKET;
945 sll.sll_ifindex = ifindex;
b73c8518 946 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 947 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
948 error = errno;
949 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 950 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
951 goto error;
952 }
32383c3b
MM
953
954 /* Filter for only inbound packets. */
9dc63482 955 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
956 sizeof fprog);
957 if (error) {
958 error = errno;
259e0b1a 959 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 960 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
961 goto error;
962 }
7b6b0ef4 963 }
86383816 964 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 965
7b6b0ef4
BP
966 return 0;
967
968error:
9dc63482
BP
969 if (rx->fd >= 0) {
970 close(rx->fd);
7b6b0ef4 971 }
86383816 972 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
973 return error;
974}
975
796223f5 976static void
f7791740 977netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 978{
f7791740 979 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 980
796223f5
BP
981 if (!rx->is_tap) {
982 close(rx->fd);
8b61709d 983 }
9dc63482
BP
984}
985
/* Frees the 'struct netdev_rxq_linux' that embeds 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 993
b73c8518
SH
994static ovs_be16
995auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
996{
997 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
998 return htons(aux->tp_vlan_tpid);
999 } else {
1000 return htons(ETH_TYPE_VLAN);
1001 }
1002}
1003
/* Returns true if 'aux' carries VLAN information, that is, if its
 * 'tp_vlan_tci' is nonzero or TP_STATUS_VLAN_VALID is set in 'tp_status'. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci != 0) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1009
796223f5 1010static int
cf62fa4c 1011netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1012{
b73c8518 1013 size_t size;
796223f5 1014 ssize_t retval;
b73c8518
SH
1015 struct iovec iov;
1016 struct cmsghdr *cmsg;
1017 union {
1018 struct cmsghdr cmsg;
1019 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1020 } cmsg_buffer;
1021 struct msghdr msgh;
1022
1023 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1024 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1025 size = dp_packet_tailroom(buffer);
b73c8518 1026
cf62fa4c 1027 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1028 iov.iov_len = size;
1029 msgh.msg_name = NULL;
1030 msgh.msg_namelen = 0;
1031 msgh.msg_iov = &iov;
1032 msgh.msg_iovlen = 1;
1033 msgh.msg_control = &cmsg_buffer;
1034 msgh.msg_controllen = sizeof cmsg_buffer;
1035 msgh.msg_flags = 0;
8e8cddf7 1036
796223f5 1037 do {
b73c8518 1038 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1039 } while (retval < 0 && errno == EINTR);
1040
bfd3367b 1041 if (retval < 0) {
b73c8518
SH
1042 return errno;
1043 } else if (retval > size) {
1044 return EMSGSIZE;
1045 }
1046
cf62fa4c 1047 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1048
1049 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1050 const struct tpacket_auxdata *aux;
1051
1052 if (cmsg->cmsg_level != SOL_PACKET
1053 || cmsg->cmsg_type != PACKET_AUXDATA
1054 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1055 continue;
8b61709d 1056 }
b73c8518
SH
1057
1058 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1059 if (auxdata_has_vlan_tci(aux)) {
1060 if (retval < ETH_HEADER_LEN) {
1061 return EINVAL;
1062 }
1063
1064 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1065 htons(aux->tp_vlan_tci));
1066 break;
1067 }
1068 }
1069
1070 return 0;
1071}
1072
/* Reads one packet from tap descriptor 'fd' into the tailroom of 'buffer',
 * retrying while read() is interrupted (EINTR).  On success grows the
 * packet's size by the number of bytes read and returns 0; otherwise returns
 * the errno left by read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    const size_t room = dp_packet_tailroom(buffer);
    ssize_t n_bytes;

    for (;;) {
        n_bytes = read(fd, dp_packet_data(buffer), room);
        if (n_bytes >= 0) {
            break;
        }
        if (errno != EINTR) {
            return errno;
        }
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_bytes);
    return 0;
}
1090
1091static int
e14deea0 1092netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
91088554 1093 int *c)
b73c8518 1094{
f7791740 1095 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1096 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1097 struct dp_packet *buffer;
df1e5a3b
PS
1098 ssize_t retval;
1099 int mtu;
1100
1101 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1102 mtu = ETH_PAYLOAD_MAX;
1103 }
1104
cf62fa4c 1105 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1106 DP_NETDEV_HEADROOM);
b73c8518 1107 retval = (rx->is_tap
f7791740
PS
1108 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1109 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1110
1111 if (retval) {
1112 if (retval != EAGAIN && retval != EMSGSIZE) {
1113 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1114 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1115 }
cf62fa4c 1116 dp_packet_delete(buffer);
df1e5a3b
PS
1117 } else {
1118 dp_packet_pad(buffer);
cf62fa4c 1119 packets[0] = buffer;
df1e5a3b 1120 *c = 1;
b73c8518
SH
1121 }
1122
1123 return retval;
8b61709d
BP
1124}
1125
8b61709d 1126static void
f7791740 1127netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1128{
f7791740 1129 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1130 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1131}
1132
8b61709d 1133static int
f7791740 1134netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1135{
f7791740 1136 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1137 if (rx->is_tap) {
8b61709d 1138 struct ifreq ifr;
f7791740 1139 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1140 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1141 if (error) {
1142 return error;
1143 }
796223f5 1144 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1145 return 0;
1146 } else {
796223f5 1147 return drain_rcvbuf(rx->fd);
8b61709d
BP
1148 }
1149}
1150
1151/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1152 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1153 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1154 * the packet is too big or too small to transmit on the device.
1155 *
1156 * The caller retains ownership of 'buffer' in all cases.
1157 *
1158 * The kernel maintains a packet transmission queue, so the caller is not
1159 * expected to do additional queuing of packets. */
1160static int
f00fa8cb 1161netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
e14deea0 1162 struct dp_packet **pkts, int cnt, bool may_steal)
8b61709d 1163{
f4fd623c
DDP
1164 int i;
1165 int error = 0;
40d26f04 1166
f4fd623c
DDP
1167 /* 'i' is incremented only if there's no error */
1168 for (i = 0; i < cnt;) {
cf62fa4c
PS
1169 const void *data = dp_packet_data(pkts[i]);
1170 size_t size = dp_packet_size(pkts[i]);
f23347ea 1171 ssize_t retval;
8b61709d 1172
796223f5 1173 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
1174 /* Use our AF_PACKET socket to send to this device. */
1175 struct sockaddr_ll sll;
1176 struct msghdr msg;
1177 struct iovec iov;
1178 int ifindex;
488d734d
BP
1179 int sock;
1180
1181 sock = af_packet_sock();
1182 if (sock < 0) {
c4c7a3d7 1183 return -sock;
488d734d 1184 }
f23347ea 1185
86383816
BP
1186 ifindex = netdev_get_ifindex(netdev_);
1187 if (ifindex < 0) {
1188 return -ifindex;
f23347ea 1189 }
8b61709d 1190
f23347ea
BP
1191 /* We don't bother setting most fields in sockaddr_ll because the
1192 * kernel ignores them for SOCK_RAW. */
1193 memset(&sll, 0, sizeof sll);
1194 sll.sll_family = AF_PACKET;
1195 sll.sll_ifindex = ifindex;
76c308b5 1196
ebc56baa 1197 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1198 iov.iov_len = size;
76c308b5 1199
f23347ea
BP
1200 msg.msg_name = &sll;
1201 msg.msg_namelen = sizeof sll;
1202 msg.msg_iov = &iov;
1203 msg.msg_iovlen = 1;
1204 msg.msg_control = NULL;
1205 msg.msg_controllen = 0;
1206 msg.msg_flags = 0;
1207
488d734d 1208 retval = sendmsg(sock, &msg, 0);
f23347ea 1209 } else {
796223f5
BP
1210 /* Use the tap fd to send to this device. This is essential for
1211 * tap devices, because packets sent to a tap device with an
1212 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1213 * tap device. This doesn't occur on other interface types
1214 * because we attach a socket filter to the rx socket. */
b5d57fc8 1215 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1216
d0d08f8a 1217 retval = write(netdev->tap_fd, data, size);
f23347ea 1218 }
76c308b5 1219
8b61709d
BP
1220 if (retval < 0) {
1221 /* The Linux AF_PACKET implementation never blocks waiting for room
1222 * for packets, instead returning ENOBUFS. Translate this into
1223 * EAGAIN for the caller. */
f4fd623c
DDP
1224 error = errno == ENOBUFS ? EAGAIN : errno;
1225 if (error == EINTR) {
1226 /* continue without incrementing 'i', i.e. retry this packet */
8b61709d 1227 continue;
8b61709d 1228 }
f4fd623c 1229 break;
8b61709d 1230 } else if (retval != size) {
f4fd623c
DDP
1231 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1232 " of %"PRIuSIZE") on %s", retval, size,
1233 netdev_get_name(netdev_));
1234 error = EMSGSIZE;
1235 break;
1236 }
1237
1238 /* Process the next packet in the batch */
1239 i++;
1240 }
1241
1242 if (may_steal) {
1243 for (i = 0; i < cnt; i++) {
e14deea0 1244 dp_packet_delete(pkts[i]);
8b61709d
BP
1245 }
1246 }
f4fd623c
DDP
1247
1248 if (error && error != EAGAIN) {
1249 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1250 netdev_get_name(netdev_), ovs_strerror(error));
1251 }
1252
1253 return error;
1254
8b61709d
BP
1255}
1256
1257/* Registers with the poll loop to wake up from the next call to poll_block()
1258 * when the packet transmission queue has sufficient room to transmit a packet
1259 * with netdev_send().
1260 *
1261 * The kernel maintains a packet transmission queue, so the client is not
1262 * expected to do additional queuing of packets. Thus, this function is
1263 * unlikely to ever be used. It is included for completeness. */
1264static void
f00fa8cb 1265netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1266{
796223f5 1267 if (is_tap_netdev(netdev)) {
8b61709d
BP
1268 /* TAP device always accepts packets.*/
1269 poll_immediate_wake();
1270 }
1271}
1272
1273/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1274 * otherwise a positive errno value. */
1275static int
74ff3298 1276netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1277{
b5d57fc8 1278 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1279 enum netdev_flags old_flags = 0;
eb395f2e
BP
1280 int error;
1281
86383816
BP
1282 ovs_mutex_lock(&netdev->mutex);
1283
b5d57fc8 1284 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1285 error = netdev->ether_addr_error;
1286 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1287 goto exit;
44445cac 1288 }
b5d57fc8 1289 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1290 }
1291
7eb1bd81 1292 /* Tap devices must be brought down before setting the address. */
796223f5 1293 if (is_tap_netdev(netdev_)) {
4f9f3f21 1294 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1295 }
44445cac
PS
1296 error = set_etheraddr(netdev_get_name(netdev_), mac);
1297 if (!error || error == ENODEV) {
b5d57fc8
BP
1298 netdev->ether_addr_error = error;
1299 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1300 if (!error) {
74ff3298 1301 netdev->etheraddr = mac;
eb395f2e 1302 }
8b61709d 1303 }
44445cac 1304
4f9f3f21
BP
1305 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1306 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1307 }
7eb1bd81 1308
86383816
BP
1309exit:
1310 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1311 return error;
1312}
1313
44445cac 1314/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1315static int
74ff3298 1316netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1317{
b5d57fc8 1318 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1319 int error;
44445cac 1320
86383816 1321 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1322 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816 1323 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1324 &netdev->etheraddr);
b5d57fc8 1325 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1326 }
44445cac 1327
86383816
BP
1328 error = netdev->ether_addr_error;
1329 if (!error) {
74ff3298 1330 *mac = netdev->etheraddr;
44445cac 1331 }
86383816 1332 ovs_mutex_unlock(&netdev->mutex);
44445cac 1333
86383816 1334 return error;
8b61709d
BP
1335}
1336
8b61709d 1337static int
73371c09 1338netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1339{
86383816
BP
1340 int error;
1341
b5d57fc8 1342 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1343 struct ifreq ifr;
90a6637d 1344
86383816 1345 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1346 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1347 netdev->mtu = ifr.ifr_mtu;
1348 netdev->cache_valid |= VALID_MTU;
8b61709d 1349 }
90a6637d 1350
86383816
BP
1351 error = netdev->netdev_mtu_error;
1352 if (!error) {
b5d57fc8 1353 *mtup = netdev->mtu;
90a6637d 1354 }
73371c09
BP
1355
1356 return error;
1357}
1358
1359/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1360 * in bytes, not including the hardware header; thus, this is typically 1500
1361 * bytes for Ethernet devices. */
1362static int
1363netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1364{
1365 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1366 int error;
1367
1368 ovs_mutex_lock(&netdev->mutex);
1369 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1370 ovs_mutex_unlock(&netdev->mutex);
1371
1372 return error;
8b61709d
BP
1373}
1374
9b020780
PS
1375/* Sets the maximum size of transmitted (MTU) for given device using linux
1376 * networking ioctl interface.
1377 */
1378static int
1379netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1380{
b5d57fc8 1381 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1382 struct ifreq ifr;
1383 int error;
1384
86383816 1385 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1386 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1387 error = netdev->netdev_mtu_error;
1388 if (error || netdev->mtu == mtu) {
1389 goto exit;
90a6637d 1390 }
b5d57fc8 1391 netdev->cache_valid &= ~VALID_MTU;
153e5481 1392 }
9b020780 1393 ifr.ifr_mtu = mtu;
259e0b1a
BP
1394 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1395 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1396 if (!error || error == ENODEV) {
b5d57fc8
BP
1397 netdev->netdev_mtu_error = error;
1398 netdev->mtu = ifr.ifr_mtu;
1399 netdev->cache_valid |= VALID_MTU;
9b020780 1400 }
86383816
BP
1401exit:
1402 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1403 return error;
9b020780
PS
1404}
1405
9ab3d9a3
BP
1406/* Returns the ifindex of 'netdev', if successful, as a positive number.
1407 * On failure, returns a negative errno value. */
1408static int
86383816 1409netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1410{
86383816 1411 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1412 int ifindex, error;
1413
86383816
BP
1414 ovs_mutex_lock(&netdev->mutex);
1415 error = get_ifindex(netdev_, &ifindex);
1416 ovs_mutex_unlock(&netdev->mutex);
1417
9ab3d9a3
BP
1418 return error ? -error : ifindex;
1419}
1420
8b61709d
BP
1421static int
1422netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1423{
b5d57fc8 1424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1425
86383816 1426 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1427 if (netdev->miimon_interval > 0) {
1428 *carrier = netdev->miimon;
3a183124 1429 } else {
b5d57fc8 1430 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1431 }
86383816 1432 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1433
3a183124 1434 return 0;
8b61709d
BP
1435}
1436
65c3058c 1437static long long int
86383816 1438netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1439{
86383816
BP
1440 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1441 long long int carrier_resets;
1442
1443 ovs_mutex_lock(&netdev->mutex);
1444 carrier_resets = netdev->carrier_resets;
1445 ovs_mutex_unlock(&netdev->mutex);
1446
1447 return carrier_resets;
65c3058c
EJ
1448}
1449
63331829 1450static int
1670c579
EJ
1451netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1452 struct mii_ioctl_data *data)
63331829 1453{
63331829 1454 struct ifreq ifr;
782e6111 1455 int error;
63331829 1456
63331829 1457 memset(&ifr, 0, sizeof ifr);
782e6111 1458 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1459 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1460 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1461
782e6111
EJ
1462 return error;
1463}
1464
1465static int
1670c579 1466netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1467{
782e6111
EJ
1468 struct mii_ioctl_data data;
1469 int error;
63331829 1470
782e6111
EJ
1471 *miimon = false;
1472
1473 memset(&data, 0, sizeof data);
1670c579 1474 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1475 if (!error) {
1476 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1477 data.reg_num = MII_BMSR;
1670c579 1478 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1479 &data);
63331829
EJ
1480
1481 if (!error) {
782e6111 1482 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1483 } else {
1484 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1485 }
1486 } else {
1487 struct ethtool_cmd ecmd;
63331829
EJ
1488
1489 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1490 name);
1491
ab985a77 1492 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1493 memset(&ecmd, 0, sizeof ecmd);
1494 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1495 "ETHTOOL_GLINK");
1496 if (!error) {
782e6111
EJ
1497 struct ethtool_value eval;
1498
1499 memcpy(&eval, &ecmd, sizeof eval);
1500 *miimon = !!eval.data;
63331829
EJ
1501 } else {
1502 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1503 }
1504 }
1505
1506 return error;
1507}
1508
1670c579
EJ
1509static int
1510netdev_linux_set_miimon_interval(struct netdev *netdev_,
1511 long long int interval)
1512{
b5d57fc8 1513 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1514
86383816 1515 ovs_mutex_lock(&netdev->mutex);
1670c579 1516 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1517 if (netdev->miimon_interval != interval) {
19c8e9c1 1518 if (interval && !netdev->miimon_interval) {
812c272c 1519 atomic_count_inc(&miimon_cnt);
19c8e9c1 1520 } else if (!interval && netdev->miimon_interval) {
812c272c 1521 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1522 }
1523
b5d57fc8
BP
1524 netdev->miimon_interval = interval;
1525 timer_set_expired(&netdev->miimon_timer);
1670c579 1526 }
86383816 1527 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1528
1529 return 0;
1530}
1531
1532static void
1533netdev_linux_miimon_run(void)
1534{
1535 struct shash device_shash;
1536 struct shash_node *node;
1537
1538 shash_init(&device_shash);
b5d57fc8 1539 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1540 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1541 struct netdev *netdev = node->data;
1542 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1543 bool miimon;
1544
86383816
BP
1545 ovs_mutex_lock(&dev->mutex);
1546 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1547 netdev_linux_get_miimon(dev->up.name, &miimon);
1548 if (miimon != dev->miimon) {
1549 dev->miimon = miimon;
1550 netdev_linux_changed(dev, dev->ifi_flags, 0);
1551 }
1670c579 1552
86383816 1553 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1554 }
86383816 1555 ovs_mutex_unlock(&dev->mutex);
2f980d74 1556 netdev_close(netdev);
1670c579
EJ
1557 }
1558
1559 shash_destroy(&device_shash);
1560}
1561
1562static void
1563netdev_linux_miimon_wait(void)
1564{
1565 struct shash device_shash;
1566 struct shash_node *node;
1567
1568 shash_init(&device_shash);
b5d57fc8 1569 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1570 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1571 struct netdev *netdev = node->data;
1572 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1573
86383816 1574 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1575 if (dev->miimon_interval > 0) {
1576 timer_wait(&dev->miimon_timer);
1577 }
86383816 1578 ovs_mutex_unlock(&dev->mutex);
2f980d74 1579 netdev_close(netdev);
1670c579
EJ
1580 }
1581 shash_destroy(&device_shash);
1582}
1583
92df599c
JG
/* Exchanges the 64-bit values stored at 'a' and 'b'.  Passing the same
 * pointer for both leaves the value unchanged. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1591
c060c4cf
EJ
1592/* Copies 'src' into 'dst', performing format conversion in the process.
1593 *
1594 * 'src' is allowed to be misaligned. */
1595static void
1596netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1597 const struct ovs_vport_stats *src)
1598{
6a54dedc
BP
1599 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1600 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1601 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1602 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1603 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1604 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1605 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1606 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1607 dst->multicast = 0;
1608 dst->collisions = 0;
1609 dst->rx_length_errors = 0;
1610 dst->rx_over_errors = 0;
1611 dst->rx_crc_errors = 0;
1612 dst->rx_frame_errors = 0;
1613 dst->rx_fifo_errors = 0;
1614 dst->rx_missed_errors = 0;
1615 dst->tx_aborted_errors = 0;
1616 dst->tx_carrier_errors = 0;
1617 dst->tx_fifo_errors = 0;
1618 dst->tx_heartbeat_errors = 0;
1619 dst->tx_window_errors = 0;
1620}
1621
1622static int
1623get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1624{
93451a0a 1625 struct dpif_netlink_vport reply;
c060c4cf
EJ
1626 struct ofpbuf *buf;
1627 int error;
1628
93451a0a 1629 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1630 if (error) {
1631 return error;
1632 } else if (!reply.stats) {
1633 ofpbuf_delete(buf);
1634 return EOPNOTSUPP;
1635 }
1636
1637 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1638
1639 ofpbuf_delete(buf);
1640
1641 return 0;
1642}
1643
f613a0d7
PS
1644static void
1645get_stats_via_vport(const struct netdev *netdev_,
1646 struct netdev_stats *stats)
8b61709d 1647{
b5d57fc8 1648 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1649
b5d57fc8
BP
1650 if (!netdev->vport_stats_error ||
1651 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1652 int error;
7fbef77a 1653
c060c4cf 1654 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1655 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1656 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1657 "(%s)",
1658 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1659 }
b5d57fc8
BP
1660 netdev->vport_stats_error = error;
1661 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1662 }
f613a0d7 1663}
8b61709d 1664
f613a0d7
PS
1665/* Retrieves current device stats for 'netdev-linux'. */
1666static int
1667netdev_linux_get_stats(const struct netdev *netdev_,
1668 struct netdev_stats *stats)
1669{
b5d57fc8 1670 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1671 struct netdev_stats dev_stats;
1672 int error;
1673
86383816 1674 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1675 get_stats_via_vport(netdev_, stats);
35eef899 1676 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1677 if (error) {
86383816
BP
1678 if (!netdev->vport_stats_error) {
1679 error = 0;
f613a0d7 1680 }
86383816 1681 } else if (netdev->vport_stats_error) {
04c881eb 1682 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1683 *stats = dev_stats;
1684 } else {
04c881eb
AZ
1685 /* Use kernel netdev's packet and byte counts since vport's counters
1686 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1687 * enabled. */
1688 stats->rx_packets = dev_stats.rx_packets;
1689 stats->rx_bytes = dev_stats.rx_bytes;
1690 stats->tx_packets = dev_stats.tx_packets;
1691 stats->tx_bytes = dev_stats.tx_bytes;
1692
f613a0d7
PS
1693 stats->rx_errors += dev_stats.rx_errors;
1694 stats->tx_errors += dev_stats.tx_errors;
1695 stats->rx_dropped += dev_stats.rx_dropped;
1696 stats->tx_dropped += dev_stats.tx_dropped;
1697 stats->multicast += dev_stats.multicast;
1698 stats->collisions += dev_stats.collisions;
1699 stats->rx_length_errors += dev_stats.rx_length_errors;
1700 stats->rx_over_errors += dev_stats.rx_over_errors;
1701 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1702 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1703 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1704 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1705 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1706 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1707 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1708 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1709 stats->tx_window_errors += dev_stats.tx_window_errors;
1710 }
86383816
BP
1711 ovs_mutex_unlock(&netdev->mutex);
1712
1713 return error;
f613a0d7
PS
1714}
1715
1716/* Retrieves current device stats for 'netdev-tap' netdev or
1717 * netdev-internal. */
1718static int
15aee116 1719netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1720{
b5d57fc8 1721 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1722 struct netdev_stats dev_stats;
1723 int error;
1724
86383816 1725 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1726 get_stats_via_vport(netdev_, stats);
35eef899 1727 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1728 if (error) {
86383816
BP
1729 if (!netdev->vport_stats_error) {
1730 error = 0;
8b61709d 1731 }
86383816
BP
1732 } else if (netdev->vport_stats_error) {
1733 /* Transmit and receive stats will appear to be swapped relative to the
1734 * other ports since we are the one sending the data, not a remote
1735 * computer. For consistency, we swap them back here. This does not
1736 * apply if we are getting stats from the vport layer because it always
1737 * tracks stats from the perspective of the switch. */
fe6b0e03 1738
f613a0d7 1739 *stats = dev_stats;
92df599c
JG
1740 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1741 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1742 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1743 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1744 stats->rx_length_errors = 0;
1745 stats->rx_over_errors = 0;
1746 stats->rx_crc_errors = 0;
1747 stats->rx_frame_errors = 0;
1748 stats->rx_fifo_errors = 0;
1749 stats->rx_missed_errors = 0;
1750 stats->tx_aborted_errors = 0;
1751 stats->tx_carrier_errors = 0;
1752 stats->tx_fifo_errors = 0;
1753 stats->tx_heartbeat_errors = 0;
1754 stats->tx_window_errors = 0;
f613a0d7 1755 } else {
04c881eb
AZ
1756 /* Use kernel netdev's packet and byte counts since vport counters
1757 * do not reflect packet counts on the wire when GSO, TSO or GRO
1758 * are enabled. */
1759 stats->rx_packets = dev_stats.tx_packets;
1760 stats->rx_bytes = dev_stats.tx_bytes;
1761 stats->tx_packets = dev_stats.rx_packets;
1762 stats->tx_bytes = dev_stats.rx_bytes;
1763
f613a0d7
PS
1764 stats->rx_dropped += dev_stats.tx_dropped;
1765 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1766
f613a0d7
PS
1767 stats->rx_errors += dev_stats.tx_errors;
1768 stats->tx_errors += dev_stats.rx_errors;
1769
1770 stats->multicast += dev_stats.multicast;
1771 stats->collisions += dev_stats.collisions;
1772 }
86383816
BP
1773 ovs_mutex_unlock(&netdev->mutex);
1774
1775 return error;
8b61709d
BP
1776}
1777
bba1e6f3
PS
1778static int
1779netdev_internal_get_stats(const struct netdev *netdev_,
1780 struct netdev_stats *stats)
1781{
b5d57fc8 1782 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1783 int error;
bba1e6f3 1784
86383816 1785 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1786 get_stats_via_vport(netdev_, stats);
86383816
BP
1787 error = netdev->vport_stats_error;
1788 ovs_mutex_unlock(&netdev->mutex);
1789
1790 return error;
bba1e6f3
PS
1791}
1792
51f87458 1793static void
b5d57fc8 1794netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1795{
1796 struct ethtool_cmd ecmd;
6c038611 1797 uint32_t speed;
8b61709d
BP
1798 int error;
1799
b5d57fc8 1800 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1801 return;
1802 }
1803
ab985a77 1804 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1805 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1806 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1807 ETHTOOL_GSET, "ETHTOOL_GSET");
1808 if (error) {
51f87458 1809 goto out;
8b61709d
BP
1810 }
1811
1812 /* Supported features. */
b5d57fc8 1813 netdev->supported = 0;
8b61709d 1814 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1815 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1816 }
1817 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1818 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1819 }
1820 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1821 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1822 }
1823 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1824 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1825 }
1826 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1827 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 1828 }
67bed84c
SH
1829 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1830 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 1831 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 1832 }
67bed84c
SH
1833 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1834 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1835 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1836 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 1837 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 1838 }
67bed84c
SH
1839 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1840 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1841 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1842 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1843 netdev->supported |= NETDEV_F_40GB_FD;
1844 }
8b61709d 1845 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1846 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1847 }
1848 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1849 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1850 }
1851 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1852 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1853 }
1854 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1855 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1856 }
1857 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1858 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1859 }
1860
1861 /* Advertised features. */
b5d57fc8 1862 netdev->advertised = 0;
8b61709d 1863 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1864 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1865 }
1866 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1867 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1868 }
1869 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1870 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1871 }
1872 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1873 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1874 }
1875 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1876 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 1877 }
67bed84c
SH
1878 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1879 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 1880 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 1881 }
67bed84c
SH
1882 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1883 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1884 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1885 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 1886 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 1887 }
67bed84c
SH
1888 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1889 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1890 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1891 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1892 netdev->advertised |= NETDEV_F_40GB_FD;
1893 }
8b61709d 1894 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1895 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1896 }
1897 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1898 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1899 }
1900 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1901 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1902 }
1903 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1904 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1905 }
1906 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1907 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1908 }
1909
1910 /* Current settings. */
0c615356 1911 speed = ethtool_cmd_speed(&ecmd);
6c038611 1912 if (speed == SPEED_10) {
b5d57fc8 1913 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1914 } else if (speed == SPEED_100) {
b5d57fc8 1915 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1916 } else if (speed == SPEED_1000) {
b5d57fc8 1917 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1918 } else if (speed == SPEED_10000) {
b5d57fc8 1919 netdev->current = NETDEV_F_10GB_FD;
6c038611 1920 } else if (speed == 40000) {
b5d57fc8 1921 netdev->current = NETDEV_F_40GB_FD;
6c038611 1922 } else if (speed == 100000) {
b5d57fc8 1923 netdev->current = NETDEV_F_100GB_FD;
6c038611 1924 } else if (speed == 1000000) {
b5d57fc8 1925 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1926 } else {
b5d57fc8 1927 netdev->current = 0;
8b61709d
BP
1928 }
1929
1930 if (ecmd.port == PORT_TP) {
b5d57fc8 1931 netdev->current |= NETDEV_F_COPPER;
8b61709d 1932 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1933 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1934 }
1935
1936 if (ecmd.autoneg) {
b5d57fc8 1937 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1938 }
1939
51f87458 1940out:
b5d57fc8
BP
1941 netdev->cache_valid |= VALID_FEATURES;
1942 netdev->get_features_error = error;
51f87458
PS
1943}
1944
887ed8b2
BP
1945/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1946 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1947 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1948static int
1949netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1950 enum netdev_features *current,
1951 enum netdev_features *advertised,
1952 enum netdev_features *supported,
1953 enum netdev_features *peer)
51f87458 1954{
b5d57fc8 1955 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1956 int error;
51f87458 1957
86383816 1958 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1959 netdev_linux_read_features(netdev);
b5d57fc8
BP
1960 if (!netdev->get_features_error) {
1961 *current = netdev->current;
1962 *advertised = netdev->advertised;
1963 *supported = netdev->supported;
887ed8b2 1964 *peer = 0; /* XXX */
51f87458 1965 }
86383816
BP
1966 error = netdev->get_features_error;
1967 ovs_mutex_unlock(&netdev->mutex);
1968
1969 return error;
8b61709d
BP
1970}
1971
1972/* Set the features advertised by 'netdev' to 'advertise'. */
1973static int
86383816 1974netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 1975 enum netdev_features advertise)
8b61709d 1976{
86383816 1977 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
1978 struct ethtool_cmd ecmd;
1979 int error;
1980
86383816
BP
1981 ovs_mutex_lock(&netdev->mutex);
1982
ab985a77 1983 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1984 memset(&ecmd, 0, sizeof ecmd);
86383816 1985 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
1986 ETHTOOL_GSET, "ETHTOOL_GSET");
1987 if (error) {
86383816 1988 goto exit;
8b61709d
BP
1989 }
1990
1991 ecmd.advertising = 0;
6c038611 1992 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1993 ecmd.advertising |= ADVERTISED_10baseT_Half;
1994 }
6c038611 1995 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1996 ecmd.advertising |= ADVERTISED_10baseT_Full;
1997 }
6c038611 1998 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1999 ecmd.advertising |= ADVERTISED_100baseT_Half;
2000 }
6c038611 2001 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2002 ecmd.advertising |= ADVERTISED_100baseT_Full;
2003 }
6c038611 2004 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2005 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2006 }
6c038611 2007 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2008 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2009 }
6c038611 2010 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2011 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2012 }
6c038611 2013 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2014 ecmd.advertising |= ADVERTISED_TP;
2015 }
6c038611 2016 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2017 ecmd.advertising |= ADVERTISED_FIBRE;
2018 }
6c038611 2019 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2020 ecmd.advertising |= ADVERTISED_Autoneg;
2021 }
6c038611 2022 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2023 ecmd.advertising |= ADVERTISED_Pause;
2024 }
6c038611 2025 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2026 ecmd.advertising |= ADVERTISED_Asym_Pause;
2027 }
ab985a77 2028 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2029 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2030 ETHTOOL_SSET, "ETHTOOL_SSET");
2031
2032exit:
2033 ovs_mutex_unlock(&netdev->mutex);
2034 return error;
8b61709d
BP
2035}
2036
f8500004
JP
2037/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2038 * successful, otherwise a positive errno value. */
8b61709d 2039static int
b5d57fc8 2040netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2041 uint32_t kbits_rate, uint32_t kbits_burst)
2042{
b5d57fc8
BP
2043 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2044 const char *netdev_name = netdev_get_name(netdev_);
f8500004 2045 int error;
8b61709d 2046
80a86fbe 2047 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2048 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2049 : kbits_burst); /* Stick with user-specified value. */
2050
86383816 2051 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2052 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2053 error = netdev->netdev_policing_error;
2054 if (error || (netdev->kbits_rate == kbits_rate &&
2055 netdev->kbits_burst == kbits_burst)) {
c9f71668 2056 /* Assume that settings haven't changed since we last set them. */
86383816 2057 goto out;
c9f71668 2058 }
b5d57fc8 2059 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2060 }
2061
ac8c3412 2062 COVERAGE_INC(netdev_set_policing);
f8500004 2063 /* Remove any existing ingress qdisc. */
b5d57fc8 2064 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
2065 if (error) {
2066 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2067 netdev_name, ovs_strerror(error));
c9f71668 2068 goto out;
f8500004
JP
2069 }
2070
8b61709d 2071 if (kbits_rate) {
b5d57fc8 2072 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
2073 if (error) {
2074 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2075 netdev_name, ovs_strerror(error));
c9f71668 2076 goto out;
8b61709d
BP
2077 }
2078
b5d57fc8 2079 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2080 if (error){
2081 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2082 netdev_name, ovs_strerror(error));
c9f71668 2083 goto out;
8b61709d 2084 }
8b61709d
BP
2085 }
2086
b5d57fc8
BP
2087 netdev->kbits_rate = kbits_rate;
2088 netdev->kbits_burst = kbits_burst;
f8500004 2089
c9f71668
PS
2090out:
2091 if (!error || error == ENODEV) {
b5d57fc8
BP
2092 netdev->netdev_policing_error = error;
2093 netdev->cache_valid |= VALID_POLICING;
c9f71668 2094 }
86383816 2095 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2096 return error;
8b61709d
BP
2097}
2098
c1c9c9c4
BP
2099static int
2100netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2101 struct sset *types)
c1c9c9c4 2102{
559eb230 2103 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2104
2105 for (opsp = tcs; *opsp != NULL; opsp++) {
2106 const struct tc_ops *ops = *opsp;
2107 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2108 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2109 }
2110 }
2111 return 0;
2112}
2113
2114static const struct tc_ops *
2115tc_lookup_ovs_name(const char *name)
2116{
559eb230 2117 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2118
2119 for (opsp = tcs; *opsp != NULL; opsp++) {
2120 const struct tc_ops *ops = *opsp;
2121 if (!strcmp(name, ops->ovs_name)) {
2122 return ops;
2123 }
2124 }
2125 return NULL;
2126}
2127
2128static const struct tc_ops *
2129tc_lookup_linux_name(const char *name)
2130{
559eb230 2131 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2132
2133 for (opsp = tcs; *opsp != NULL; opsp++) {
2134 const struct tc_ops *ops = *opsp;
2135 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2136 return ops;
2137 }
2138 }
2139 return NULL;
2140}
2141
93b13be8 2142static struct tc_queue *
b5d57fc8 2143tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2144 size_t hash)
2145{
b5d57fc8 2146 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2147 struct tc_queue *queue;
2148
b5d57fc8 2149 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2150 if (queue->queue_id == queue_id) {
2151 return queue;
2152 }
2153 }
2154 return NULL;
2155}
2156
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The bucket is selected with hash_int('queue_id', 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t bucket_hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, bucket_hash);
}
2162
c1c9c9c4
BP
2163static int
2164netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2165 const char *type,
2166 struct netdev_qos_capabilities *caps)
2167{
2168 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2169 if (!ops) {
2170 return EOPNOTSUPP;
2171 }
2172 caps->n_queues = ops->n_queues;
2173 return 0;
2174}
2175
2176static int
b5d57fc8 2177netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2178 const char **typep, struct smap *details)
c1c9c9c4 2179{
b5d57fc8 2180 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2181 int error;
2182
86383816 2183 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2184 error = tc_query_qdisc(netdev_);
86383816
BP
2185 if (!error) {
2186 *typep = netdev->tc->ops->ovs_name;
2187 error = (netdev->tc->ops->qdisc_get
2188 ? netdev->tc->ops->qdisc_get(netdev_, details)
2189 : 0);
c1c9c9c4 2190 }
86383816 2191 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2192
86383816 2193 return error;
c1c9c9c4
BP
2194}
2195
2196static int
b5d57fc8 2197netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2198 const char *type, const struct smap *details)
c1c9c9c4 2199{
b5d57fc8 2200 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2201 const struct tc_ops *new_ops;
2202 int error;
2203
2204 new_ops = tc_lookup_ovs_name(type);
2205 if (!new_ops || !new_ops->tc_install) {
2206 return EOPNOTSUPP;
2207 }
2208
86383816 2209 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2210 error = tc_query_qdisc(netdev_);
c1c9c9c4 2211 if (error) {
86383816 2212 goto exit;
c1c9c9c4
BP
2213 }
2214
b5d57fc8 2215 if (new_ops == netdev->tc->ops) {
86383816 2216 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2217 } else {
2218 /* Delete existing qdisc. */
b5d57fc8 2219 error = tc_del_qdisc(netdev_);
c1c9c9c4 2220 if (error) {
86383816 2221 goto exit;
c1c9c9c4 2222 }
b5d57fc8 2223 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2224
2225 /* Install new qdisc. */
b5d57fc8
BP
2226 error = new_ops->tc_install(netdev_, details);
2227 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2228 }
86383816
BP
2229
2230exit:
2231 ovs_mutex_unlock(&netdev->mutex);
2232 return error;
c1c9c9c4
BP
2233}
2234
2235static int
b5d57fc8 2236netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2237 unsigned int queue_id, struct smap *details)
c1c9c9c4 2238{
b5d57fc8 2239 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2240 int error;
2241
86383816 2242 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2243 error = tc_query_qdisc(netdev_);
86383816 2244 if (!error) {
b5d57fc8 2245 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2246 error = (queue
b5d57fc8 2247 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2248 : ENOENT);
c1c9c9c4 2249 }
86383816
BP
2250 ovs_mutex_unlock(&netdev->mutex);
2251
2252 return error;
c1c9c9c4
BP
2253}
2254
2255static int
b5d57fc8 2256netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2257 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2258{
b5d57fc8 2259 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2260 int error;
2261
86383816 2262 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2263 error = tc_query_qdisc(netdev_);
86383816
BP
2264 if (!error) {
2265 error = (queue_id < netdev->tc->ops->n_queues
2266 && netdev->tc->ops->class_set
2267 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2268 : EINVAL);
c1c9c9c4 2269 }
86383816 2270 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2271
86383816 2272 return error;
c1c9c9c4
BP
2273}
2274
2275static int
b5d57fc8 2276netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2277{
b5d57fc8 2278 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2279 int error;
2280
86383816 2281 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2282 error = tc_query_qdisc(netdev_);
86383816
BP
2283 if (!error) {
2284 if (netdev->tc->ops->class_delete) {
2285 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2286 error = (queue
2287 ? netdev->tc->ops->class_delete(netdev_, queue)
2288 : ENOENT);
2289 } else {
2290 error = EINVAL;
2291 }
c1c9c9c4 2292 }
86383816
BP
2293 ovs_mutex_unlock(&netdev->mutex);
2294
2295 return error;
c1c9c9c4
BP
2296}
2297
2298static int
b5d57fc8 2299netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2300 unsigned int queue_id,
2301 struct netdev_queue_stats *stats)
2302{
b5d57fc8 2303 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2304 int error;
2305
86383816 2306 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2307 error = tc_query_qdisc(netdev_);
86383816
BP
2308 if (!error) {
2309 if (netdev->tc->ops->class_get_stats) {
2310 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2311 if (queue) {
2312 stats->created = queue->created;
2313 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2314 stats);
2315 } else {
2316 error = ENOENT;
2317 }
2318 } else {
2319 error = EOPNOTSUPP;
6dc34a0d 2320 }
c1c9c9c4 2321 }
86383816
BP
2322 ovs_mutex_unlock(&netdev->mutex);
2323
2324 return error;
c1c9c9c4
BP
2325}
2326
d57695d7
JS
2327struct queue_dump_state {
2328 struct nl_dump dump;
2329 struct ofpbuf buf;
2330};
2331
23a98ffe 2332static bool
d57695d7 2333start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2334{
2335 struct ofpbuf request;
2336 struct tcmsg *tcmsg;
2337
2338 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2339 if (!tcmsg) {
2340 return false;
2341 }
3c4de644 2342 tcmsg->tcm_parent = 0;
d57695d7 2343 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2344 ofpbuf_uninit(&request);
d57695d7
JS
2345
2346 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2347 return true;
c1c9c9c4
BP
2348}
2349
d57695d7
JS
2350static int
2351finish_queue_dump(struct queue_dump_state *state)
2352{
2353 ofpbuf_uninit(&state->buf);
2354 return nl_dump_done(&state->dump);
2355}
2356
89454bf4
BP
/* Iteration state for the netdev_linux_queue_dump_*() functions: a snapshot
 * of queue IDs taken when the dump starts, plus a cursor into it. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of 'n_queues' queue IDs. */
    size_t cur_queue;           /* Index of the next ID to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2362
c1c9c9c4 2363static int
89454bf4 2364netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2365{
89454bf4 2366 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2367 int error;
2368
86383816 2369 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2370 error = tc_query_qdisc(netdev_);
86383816
BP
2371 if (!error) {
2372 if (netdev->tc->ops->class_get) {
89454bf4
BP
2373 struct netdev_linux_queue_state *state;
2374 struct tc_queue *queue;
2375 size_t i;
2376
2377 *statep = state = xmalloc(sizeof *state);
2378 state->n_queues = hmap_count(&netdev->tc->queues);
2379 state->cur_queue = 0;
2380 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2381
2382 i = 0;
2383 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2384 state->queues[i++] = queue->queue_id;
86383816 2385 }
c1c9c9c4 2386 } else {
86383816 2387 error = EOPNOTSUPP;
c1c9c9c4
BP
2388 }
2389 }
86383816 2390 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2391
86383816 2392 return error;
c1c9c9c4
BP
2393}
2394
89454bf4
BP
2395static int
2396netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2397 unsigned int *queue_idp, struct smap *details)
2398{
2399 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2400 struct netdev_linux_queue_state *state = state_;
2401 int error = EOF;
2402
2403 ovs_mutex_lock(&netdev->mutex);
2404 while (state->cur_queue < state->n_queues) {
2405 unsigned int queue_id = state->queues[state->cur_queue++];
2406 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2407
2408 if (queue) {
2409 *queue_idp = queue_id;
2410 error = netdev->tc->ops->class_get(netdev_, queue, details);
2411 break;
2412 }
2413 }
2414 ovs_mutex_unlock(&netdev->mutex);
2415
2416 return error;
2417}
2418
2419static int
2420netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2421 void *state_)
2422{
2423 struct netdev_linux_queue_state *state = state_;
2424
2425 free(state->queues);
2426 free(state);
2427 return 0;
2428}
2429
c1c9c9c4 2430static int
b5d57fc8 2431netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2432 netdev_dump_queue_stats_cb *cb, void *aux)
2433{
b5d57fc8 2434 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2435 int error;
2436
86383816 2437 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2438 error = tc_query_qdisc(netdev_);
86383816 2439 if (!error) {
d57695d7 2440 struct queue_dump_state state;
c1c9c9c4 2441
86383816
BP
2442 if (!netdev->tc->ops->class_dump_stats) {
2443 error = EOPNOTSUPP;
d57695d7 2444 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2445 error = ENODEV;
2446 } else {
2447 struct ofpbuf msg;
2448 int retval;
2449
d57695d7 2450 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2451 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2452 cb, aux);
2453 if (retval) {
2454 error = retval;
2455 }
2456 }
2457
d57695d7 2458 retval = finish_queue_dump(&state);
86383816
BP
2459 if (retval) {
2460 error = retval;
2461 }
c1c9c9c4
BP
2462 }
2463 }
86383816 2464 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2465
86383816 2466 return error;
c1c9c9c4
BP
2467}
2468
8b61709d 2469static int
f1acd62b
BP
2470netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2471 struct in_addr netmask)
8b61709d 2472{
b5d57fc8 2473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2474 int error;
2475
86383816 2476 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2477 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2478 if (!error) {
f1acd62b 2479 if (address.s_addr != INADDR_ANY) {
8b61709d 2480 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2481 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2482 }
2483 }
49af9a3d 2484
86383816
BP
2485 ovs_mutex_unlock(&netdev->mutex);
2486
8b61709d
BP
2487 return error;
2488}
2489
7df6932e
AW
2490/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2491 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2492 * error. */
8b61709d 2493static int
a8704b50
PS
2494netdev_linux_get_addr_list(const struct netdev *netdev_,
2495 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2496{
b5d57fc8 2497 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2498 int error;
86383816
BP
2499
2500 ovs_mutex_lock(&netdev->mutex);
a8704b50 2501 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816
BP
2502 ovs_mutex_unlock(&netdev->mutex);
2503
7df6932e 2504 return error;
8b61709d
BP
2505}
2506
/* Initializes '*sa' as an AF_INET socket address holding 'addr' with port 0.
 * '*sa' is zeroed first, so no stale bytes remain in it. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
2519
2520static int
2521do_set_addr(struct netdev *netdev,
2522 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2523{
2524 struct ifreq ifr;
149f577a 2525
259e0b1a
BP
2526 make_in4_sockaddr(&ifr.ifr_addr, addr);
2527 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2528 ioctl_name);
8b61709d
BP
2529}
2530
2531/* Adds 'router' as a default IP gateway. */
2532static int
67a4917b 2533netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2534{
2535 struct in_addr any = { INADDR_ANY };
2536 struct rtentry rt;
2537 int error;
2538
2539 memset(&rt, 0, sizeof rt);
2540 make_in4_sockaddr(&rt.rt_dst, any);
2541 make_in4_sockaddr(&rt.rt_gateway, router);
2542 make_in4_sockaddr(&rt.rt_genmask, any);
2543 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2544 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2545 if (error) {
10a89ef0 2546 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2547 }
2548 return error;
2549}
2550
f1acd62b
BP
2551static int
2552netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2553 char **netdev_name)
2554{
2555 static const char fn[] = "/proc/net/route";
2556 FILE *stream;
2557 char line[256];
2558 int ln;
2559
2560 *netdev_name = NULL;
2561 stream = fopen(fn, "r");
2562 if (stream == NULL) {
10a89ef0 2563 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2564 return errno;
2565 }
2566
2567 ln = 0;
2568 while (fgets(line, sizeof line, stream)) {
2569 if (++ln >= 2) {
2570 char iface[17];
dbba996b 2571 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2572 int refcnt, metric, mtu;
2573 unsigned int flags, use, window, irtt;
2574
c2c28dfd
BP
2575 if (!ovs_scan(line,
2576 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2577 " %d %u %u\n",
2578 iface, &dest, &gateway, &flags, &refcnt,
2579 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2580 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2581 fn, ln, line);
2582 continue;
2583 }
2584 if (!(flags & RTF_UP)) {
2585 /* Skip routes that aren't up. */
2586 continue;
2587 }
2588
2589 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2590 * network byte order, so we don't need need any endian
f1acd62b
BP
2591 * conversions here. */
2592 if ((dest & mask) == (host->s_addr & mask)) {
2593 if (!gateway) {
2594 /* The host is directly reachable. */
2595 next_hop->s_addr = 0;
2596 } else {
2597 /* To reach the host, we must go through a gateway. */
2598 next_hop->s_addr = gateway;
2599 }
2600 *netdev_name = xstrdup(iface);
2601 fclose(stream);
2602 return 0;
2603 }
2604 }
2605 }
2606
2607 fclose(stream);
2608 return ENXIO;
2609}
2610
e210037e 2611static int
b5d57fc8 2612netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2613{
b5d57fc8 2614 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2615 int error = 0;
2616
86383816 2617 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2618 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2619 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2620
2621 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2622 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2623 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2624 cmd,
2625 ETHTOOL_GDRVINFO,
2626 "ETHTOOL_GDRVINFO");
2627 if (!error) {
b5d57fc8 2628 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2629 }
2630 }
e210037e 2631
e210037e 2632 if (!error) {
b5d57fc8
BP
2633 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2634 smap_add(smap, "driver_version", netdev->drvinfo.version);
2635 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2636 }
86383816
BP
2637 ovs_mutex_unlock(&netdev->mutex);
2638
e210037e
AE
2639 return error;
2640}
2641
4f925bd3 2642static int
275707c3
EJ
2643netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2644 struct smap *smap)
4f925bd3 2645{
79f1cbe9 2646 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2647 return 0;
2648}
2649
8b61709d
BP
2650/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2651 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2652 * returns 0. Otherwise, it returns a positive errno value; in particular,
2653 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2654static int
2655netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2656 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2657{
2658 struct arpreq r;
c100e025 2659 struct sockaddr_in sin;
8b61709d
BP
2660 int retval;
2661
2662 memset(&r, 0, sizeof r);
f2cc621b 2663 memset(&sin, 0, sizeof sin);
c100e025
BP
2664 sin.sin_family = AF_INET;
2665 sin.sin_addr.s_addr = ip;
2666 sin.sin_port = 0;
2667 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2668 r.arp_ha.sa_family = ARPHRD_ETHER;
2669 r.arp_flags = 0;
71d7c22f 2670 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2671 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2672 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2673 if (!retval) {
2674 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2675 } else if (retval != ENXIO) {
2676 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2677 netdev_get_name(netdev), IP_ARGS(ip),
2678 ovs_strerror(retval));
8b61709d
BP
2679 }
2680 return retval;
2681}
2682
2683static int
2684nd_to_iff_flags(enum netdev_flags nd)
2685{
2686 int iff = 0;
2687 if (nd & NETDEV_UP) {
2688 iff |= IFF_UP;
2689 }
2690 if (nd & NETDEV_PROMISC) {
2691 iff |= IFF_PROMISC;
2692 }
7ba19d41
AC
2693 if (nd & NETDEV_LOOPBACK) {
2694 iff |= IFF_LOOPBACK;
2695 }
8b61709d
BP
2696 return iff;
2697}
2698
2699static int
2700iff_to_nd_flags(int iff)
2701{
2702 enum netdev_flags nd = 0;
2703 if (iff & IFF_UP) {
2704 nd |= NETDEV_UP;
2705 }
2706 if (iff & IFF_PROMISC) {
2707 nd |= NETDEV_PROMISC;
2708 }
7ba19d41
AC
2709 if (iff & IFF_LOOPBACK) {
2710 nd |= NETDEV_LOOPBACK;
2711 }
8b61709d
BP
2712 return nd;
2713}
2714
2715static int
4f9f3f21
BP
2716update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2717 enum netdev_flags on, enum netdev_flags *old_flagsp)
2718 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2719{
2720 int old_flags, new_flags;
c37d4da4
EJ
2721 int error = 0;
2722
b5d57fc8 2723 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2724 *old_flagsp = iff_to_nd_flags(old_flags);
2725 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2726 if (new_flags != old_flags) {
4f9f3f21
BP
2727 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2728 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2729 }
4f9f3f21
BP
2730
2731 return error;
2732}
2733
2734static int
2735netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2736 enum netdev_flags on, enum netdev_flags *old_flagsp)
2737{
2738 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2739 int error;
2740
2741 ovs_mutex_lock(&netdev->mutex);
2742 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2743 ovs_mutex_unlock(&netdev->mutex);
2744
8b61709d
BP
2745 return error;
2746}
2747
2f9dd77f 2748#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
51f87458 2749 GET_FEATURES, GET_STATUS) \
c3827f61
BP
2750{ \
2751 NAME, \
118c77b1 2752 false, /* is_pmd */ \
c3827f61 2753 \
259e0b1a 2754 NULL, \
c3827f61
BP
2755 netdev_linux_run, \
2756 netdev_linux_wait, \
2757 \
9dc63482
BP
2758 netdev_linux_alloc, \
2759 CONSTRUCT, \
2760 netdev_linux_destruct, \
2761 netdev_linux_dealloc, \
de5cdb90 2762 NULL, /* get_config */ \
6d9e6eb4 2763 NULL, /* set_config */ \
f431bf7d 2764 NULL, /* get_tunnel_config */ \
a36de779
PS
2765 NULL, /* build header */ \
2766 NULL, /* push header */ \
2767 NULL, /* pop header */ \
7dec44fe 2768 NULL, /* get_numa_id */ \
5496878c 2769 NULL, /* set_multiq */ \
c3827f61 2770 \
c3827f61
BP
2771 netdev_linux_send, \
2772 netdev_linux_send_wait, \
2773 \
2774 netdev_linux_set_etheraddr, \
2775 netdev_linux_get_etheraddr, \
2776 netdev_linux_get_mtu, \
9b020780 2777 netdev_linux_set_mtu, \
c3827f61
BP
2778 netdev_linux_get_ifindex, \
2779 netdev_linux_get_carrier, \
65c3058c 2780 netdev_linux_get_carrier_resets, \
1670c579 2781 netdev_linux_set_miimon_interval, \
f613a0d7 2782 GET_STATS, \
c3827f61 2783 \
51f87458 2784 GET_FEATURES, \
c3827f61 2785 netdev_linux_set_advertisements, \
c3827f61
BP
2786 \
2787 netdev_linux_set_policing, \
2788 netdev_linux_get_qos_types, \
2789 netdev_linux_get_qos_capabilities, \
2790 netdev_linux_get_qos, \
2791 netdev_linux_set_qos, \
2792 netdev_linux_get_queue, \
2793 netdev_linux_set_queue, \
2794 netdev_linux_delete_queue, \
2795 netdev_linux_get_queue_stats, \
89454bf4
BP
2796 netdev_linux_queue_dump_start, \
2797 netdev_linux_queue_dump_next, \
2798 netdev_linux_queue_dump_done, \
c3827f61
BP
2799 netdev_linux_dump_queue_stats, \
2800 \
c3827f61 2801 netdev_linux_set_in4, \
a8704b50 2802 netdev_linux_get_addr_list, \
c3827f61
BP
2803 netdev_linux_add_router, \
2804 netdev_linux_get_next_hop, \
4f925bd3 2805 GET_STATUS, \
c3827f61
BP
2806 netdev_linux_arp_lookup, \
2807 \
2808 netdev_linux_update_flags, \
2809 \
f7791740
PS
2810 netdev_linux_rxq_alloc, \
2811 netdev_linux_rxq_construct, \
2812 netdev_linux_rxq_destruct, \
2813 netdev_linux_rxq_dealloc, \
2814 netdev_linux_rxq_recv, \
2815 netdev_linux_rxq_wait, \
2816 netdev_linux_rxq_drain, \
c3827f61
BP
2817}
2818
2819const struct netdev_class netdev_linux_class =
2820 NETDEV_LINUX_CLASS(
2821 "system",
9dc63482 2822 netdev_linux_construct,
f613a0d7 2823 netdev_linux_get_stats,
51f87458 2824 netdev_linux_get_features,
275707c3 2825 netdev_linux_get_status);
c3827f61
BP
2826
2827const struct netdev_class netdev_tap_class =
2828 NETDEV_LINUX_CLASS(
2829 "tap",
9dc63482 2830 netdev_linux_construct_tap,
bba1e6f3 2831 netdev_tap_get_stats,
51f87458 2832 netdev_linux_get_features,
275707c3 2833 netdev_linux_get_status);
c3827f61
BP
2834
2835const struct netdev_class netdev_internal_class =
2836 NETDEV_LINUX_CLASS(
2837 "internal",
9dc63482 2838 netdev_linux_construct,
bba1e6f3 2839 netdev_internal_get_stats,
51f87458 2840 NULL, /* get_features */
275707c3 2841 netdev_internal_get_status);
8b61709d 2842\f
677d9158
JV
2843
2844#define CODEL_N_QUEUES 0x0000
2845
2f4298ce
BP
2846/* In sufficiently new kernel headers these are defined as enums in
2847 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2848 * kernels. (This overrides any enum definition in the header file but that's
2849 * harmless.) */
2850#define TCA_CODEL_TARGET 1
2851#define TCA_CODEL_LIMIT 2
2852#define TCA_CODEL_INTERVAL 3
2853
677d9158
JV
2854struct codel {
2855 struct tc tc;
2856 uint32_t target;
2857 uint32_t limit;
2858 uint32_t interval;
2859};
2860
2861static struct codel *
2862codel_get__(const struct netdev *netdev_)
2863{
2864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2865 return CONTAINER_OF(netdev->tc, struct codel, tc);
2866}
2867
2868static void
2869codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2870 uint32_t interval)
2871{
2872 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2873 struct codel *codel;
2874
2875 codel = xmalloc(sizeof *codel);
2876 tc_init(&codel->tc, &tc_ops_codel);
2877 codel->target = target;
2878 codel->limit = limit;
2879 codel->interval = interval;
2880
2881 netdev->tc = &codel->tc;
2882}
2883
2884static int
2885codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2886 uint32_t interval)
2887{
2888 size_t opt_offset;
2889 struct ofpbuf request;
2890 struct tcmsg *tcmsg;
2891 uint32_t otarget, olimit, ointerval;
2892 int error;
2893
2894 tc_del_qdisc(netdev);
2895
2896 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2897 NLM_F_EXCL | NLM_F_CREATE, &request);
2898 if (!tcmsg) {
2899 return ENODEV;
2900 }
2901 tcmsg->tcm_handle = tc_make_handle(1, 0);
2902 tcmsg->tcm_parent = TC_H_ROOT;
2903
2904 otarget = target ? target : 5000;
2905 olimit = limit ? limit : 10240;
2906 ointerval = interval ? interval : 100000;
2907
2908 nl_msg_put_string(&request, TCA_KIND, "codel");
2909 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2910 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2911 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2912 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2913 nl_msg_end_nested(&request, opt_offset);
2914
2915 error = tc_transact(&request, NULL);
2916 if (error) {
2917 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2918 "target %u, limit %u, interval %u error %d(%s)",
2919 netdev_get_name(netdev),
2920 otarget, olimit, ointerval,
2921 error, ovs_strerror(error));
2922 }
2923 return error;
2924}
2925
2926static void
2927codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2928 const struct smap *details, struct codel *codel)
2929{
2930 const char *target_s;
2931 const char *limit_s;
2932 const char *interval_s;
2933
2934 target_s = smap_get(details, "target");
2935 limit_s = smap_get(details, "limit");
2936 interval_s = smap_get(details, "interval");
2937
2938 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2939 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2940 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2941
2942 if (!codel->target) {
2943 codel->target = 5000;
2944 }
2945 if (!codel->limit) {
2946 codel->limit = 10240;
2947 }
2948 if (!codel->interval) {
2949 codel->interval = 100000;
2950 }
2951}
2952
2953static int
2954codel_tc_install(struct netdev *netdev, const struct smap *details)
2955{
2956 int error;
2957 struct codel codel;
2958
2959 codel_parse_qdisc_details__(netdev, details, &codel);
2960 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2961 codel.interval);
2962 if (!error) {
2963 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2964 }
2965 return error;
2966}
2967
2968static int
2969codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2970{
2971 static const struct nl_policy tca_codel_policy[] = {
2972 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2973 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2974 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2975 };
2976
2977 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2978
2979 if (!nl_parse_nested(nl_options, tca_codel_policy,
2980 attrs, ARRAY_SIZE(tca_codel_policy))) {
2981 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2982 return EPROTO;
2983 }
2984
2985 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2986 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2987 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
2988 return 0;
2989}
2990
2991static int
2992codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2993{
2994 struct nlattr *nlattr;
2995 const char * kind;
2996 int error;
2997 struct codel codel;
2998
2999 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3000 if (error != 0) {
3001 return error;
3002 }
3003
3004 error = codel_parse_tca_options__(nlattr, &codel);
3005 if (error != 0) {
3006 return error;
3007 }
3008
3009 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3010 return 0;
3011}
3012
3013
3014static void
3015codel_tc_destroy(struct tc *tc)
3016{
3017 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3018 tc_destroy(tc);
3019 free(codel);
3020}
3021
3022static int
3023codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3024{
3025 const struct codel *codel = codel_get__(netdev);
3026 smap_add_format(details, "target", "%u", codel->target);
3027 smap_add_format(details, "limit", "%u", codel->limit);
3028 smap_add_format(details, "interval", "%u", codel->interval);
3029 return 0;
3030}
3031
3032static int
3033codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3034{
3035 struct codel codel;
3036
3037 codel_parse_qdisc_details__(netdev, details, &codel);
3038 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3039 codel_get__(netdev)->target = codel.target;
3040 codel_get__(netdev)->limit = codel.limit;
3041 codel_get__(netdev)->interval = codel.interval;
3042 return 0;
3043}
3044
3045static const struct tc_ops tc_ops_codel = {
3046 "codel", /* linux_name */
3047 "linux-codel", /* ovs_name */
3048 CODEL_N_QUEUES, /* n_queues */
3049 codel_tc_install,
3050 codel_tc_load,
3051 codel_tc_destroy,
3052 codel_qdisc_get,
3053 codel_qdisc_set,
3054 NULL,
3055 NULL,
3056 NULL,
3057 NULL,
3058 NULL
3059};
3060\f
3061/* FQ-CoDel traffic control class. */
3062
3063#define FQCODEL_N_QUEUES 0x0000
3064
2f4298ce
BP
3065/* In sufficiently new kernel headers these are defined as enums in
3066 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3067 * kernels. (This overrides any enum definition in the header file but that's
3068 * harmless.) */
3069#define TCA_FQ_CODEL_TARGET 1
3070#define TCA_FQ_CODEL_LIMIT 2
3071#define TCA_FQ_CODEL_INTERVAL 3
3072#define TCA_FQ_CODEL_ECN 4
3073#define TCA_FQ_CODEL_FLOWS 5
3074#define TCA_FQ_CODEL_QUANTUM 6
3075
677d9158
JV
3076struct fqcodel {
3077 struct tc tc;
3078 uint32_t target;
3079 uint32_t limit;
3080 uint32_t interval;
3081 uint32_t flows;
3082 uint32_t quantum;
3083};
3084
3085static struct fqcodel *
3086fqcodel_get__(const struct netdev *netdev_)
3087{
3088 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3089 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3090}
3091
3092static void
3093fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3094 uint32_t interval, uint32_t flows, uint32_t quantum)
3095{
3096 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3097 struct fqcodel *fqcodel;
3098
3099 fqcodel = xmalloc(sizeof *fqcodel);
3100 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3101 fqcodel->target = target;
3102 fqcodel->limit = limit;
3103 fqcodel->interval = interval;
3104 fqcodel->flows = flows;
3105 fqcodel->quantum = quantum;
3106
3107 netdev->tc = &fqcodel->tc;
3108}
3109
3110static int
3111fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3112 uint32_t interval, uint32_t flows, uint32_t quantum)
3113{
3114 size_t opt_offset;
3115 struct ofpbuf request;
3116 struct tcmsg *tcmsg;
3117 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3118 int error;
3119
3120 tc_del_qdisc(netdev);
3121
3122 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3123 NLM_F_EXCL | NLM_F_CREATE, &request);
3124 if (!tcmsg) {
3125 return ENODEV;
3126 }
3127 tcmsg->tcm_handle = tc_make_handle(1, 0);
3128 tcmsg->tcm_parent = TC_H_ROOT;
3129
3130 otarget = target ? target : 5000;
3131 olimit = limit ? limit : 10240;
3132 ointerval = interval ? interval : 100000;
3133 oflows = flows ? flows : 1024;
3134 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3135 not mtu */
3136
3137 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3138 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3139 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3140 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3141 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3142 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3143 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3144 nl_msg_end_nested(&request, opt_offset);
3145
3146 error = tc_transact(&request, NULL);
3147 if (error) {
3148 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3149 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3150 netdev_get_name(netdev),
3151 otarget, olimit, ointerval, oflows, oquantum,
3152 error, ovs_strerror(error));
3153 }
3154 return error;
3155}
3156
3157static void
3158fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3159 const struct smap *details, struct fqcodel *fqcodel)
3160{
3161 const char *target_s;
3162 const char *limit_s;
3163 const char *interval_s;
3164 const char *flows_s;
3165 const char *quantum_s;
3166
3167 target_s = smap_get(details, "target");
3168 limit_s = smap_get(details, "limit");
3169 interval_s = smap_get(details, "interval");
3170 flows_s = smap_get(details, "flows");
3171 quantum_s = smap_get(details, "quantum");
3172 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3173 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3174 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3175 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3176 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3177 if (!fqcodel->target) {
3178 fqcodel->target = 5000;
3179 }
3180 if (!fqcodel->limit) {
3181 fqcodel->limit = 10240;
3182 }
3183 if (!fqcodel->interval) {
3184 fqcodel->interval = 1000000;
3185 }
3186 if (!fqcodel->flows) {
3187 fqcodel->flows = 1024;
3188 }
3189 if (!fqcodel->quantum) {
3190 fqcodel->quantum = 1514;
3191 }
3192}
3193
3194static int
3195fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3196{
3197 int error;
3198 struct fqcodel fqcodel;
3199
3200 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3201 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3202 fqcodel.interval, fqcodel.flows,
3203 fqcodel.quantum);
3204 if (!error) {
3205 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3206 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3207 }
3208 return error;
3209}
3210
3211static int
3212fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3213{
3214 static const struct nl_policy tca_fqcodel_policy[] = {
3215 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3216 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3217 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3218 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3219 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3220 };
3221
3222 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3223
3224 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3225 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3226 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3227 return EPROTO;
3228 }
3229
3230 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3231 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3232 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3233 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3234 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3235 return 0;
3236}
3237
3238static int
3239fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3240{
3241 struct nlattr *nlattr;
3242 const char * kind;
3243 int error;
3244 struct fqcodel fqcodel;
3245
3246 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3247 if (error != 0) {
3248 return error;
3249 }
3250
3251 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3252 if (error != 0) {
3253 return error;
3254 }
3255
3256 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3257 fqcodel.flows, fqcodel.quantum);
3258 return 0;
3259}
3260
3261static void
3262fqcodel_tc_destroy(struct tc *tc)
3263{
3264 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3265 tc_destroy(tc);
3266 free(fqcodel);
3267}
3268
3269static int
3270fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3271{
3272 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3273 smap_add_format(details, "target", "%u", fqcodel->target);
3274 smap_add_format(details, "limit", "%u", fqcodel->limit);
3275 smap_add_format(details, "interval", "%u", fqcodel->interval);
3276 smap_add_format(details, "flows", "%u", fqcodel->flows);
3277 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3278 return 0;
3279}
3280
3281static int
3282fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3283{
3284 struct fqcodel fqcodel;
3285
3286 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3287 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3288 fqcodel.flows, fqcodel.quantum);
3289 fqcodel_get__(netdev)->target = fqcodel.target;
3290 fqcodel_get__(netdev)->limit = fqcodel.limit;
3291 fqcodel_get__(netdev)->interval = fqcodel.interval;
3292 fqcodel_get__(netdev)->flows = fqcodel.flows;
3293 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3294 return 0;
3295}
3296
3297static const struct tc_ops tc_ops_fqcodel = {
3298 "fq_codel", /* linux_name */
3299 "linux-fq_codel", /* ovs_name */
3300 FQCODEL_N_QUEUES, /* n_queues */
3301 fqcodel_tc_install,
3302 fqcodel_tc_load,
3303 fqcodel_tc_destroy,
3304 fqcodel_qdisc_get,
3305 fqcodel_qdisc_set,
3306 NULL,
3307 NULL,
3308 NULL,
3309 NULL,
3310 NULL
3311};
3312\f
3313/* SFQ traffic control class. */
3314
3315#define SFQ_N_QUEUES 0x0000
3316
3317struct sfq {
3318 struct tc tc;
3319 uint32_t quantum;
3320 uint32_t perturb;
3321};
3322
3323static struct sfq *
3324sfq_get__(const struct netdev *netdev_)
3325{
3326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3327 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3328}
3329
3330static void
3331sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3332{
3333 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3334 struct sfq *sfq;
3335
3336 sfq = xmalloc(sizeof *sfq);
3337 tc_init(&sfq->tc, &tc_ops_sfq);
3338 sfq->perturb = perturb;
3339 sfq->quantum = quantum;
3340
3341 netdev->tc = &sfq->tc;
3342}
3343
3344static int
3345sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3346{
3347 struct tc_sfq_qopt opt;
3348 struct ofpbuf request;
3349 struct tcmsg *tcmsg;
3350 int mtu;
3351 int mtu_error, error;
3352 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3353
3354 tc_del_qdisc(netdev);
3355
3356 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3357 NLM_F_EXCL | NLM_F_CREATE, &request);
3358 if (!tcmsg) {
3359 return ENODEV;
3360 }
3361 tcmsg->tcm_handle = tc_make_handle(1, 0);
3362 tcmsg->tcm_parent = TC_H_ROOT;
3363
3364 memset(&opt, 0, sizeof opt);
3365 if (!quantum) {
3366 if (!mtu_error) {
3367 opt.quantum = mtu; /* if we cannot find mtu, use default */
3368 }
3369 } else {
3370 opt.quantum = quantum;
3371 }
3372
3373 if (!perturb) {
3374 opt.perturb_period = 10;
3375 } else {
3376 opt.perturb_period = perturb;
3377 }
3378
3379 nl_msg_put_string(&request, TCA_KIND, "sfq");
3380 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3381
3382 error = tc_transact(&request, NULL);
3383 if (error) {
3384 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3385 "quantum %u, perturb %u error %d(%s)",
3386 netdev_get_name(netdev),
3387 opt.quantum, opt.perturb_period,
3388 error, ovs_strerror(error));
3389 }
3390 return error;
3391}
3392
3393static void
3394sfq_parse_qdisc_details__(struct netdev *netdev,
3395 const struct smap *details, struct sfq *sfq)
3396{
3397 const char *perturb_s;
3398 const char *quantum_s;
3399 int mtu;
3400 int mtu_error;
3401
3402 perturb_s = smap_get(details, "perturb");
3403 quantum_s = smap_get(details, "quantum");
3404 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3405 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3406 if (!sfq->perturb) {
3407 sfq->perturb = 10;
3408 }
3409
3410 if (!sfq->quantum) {
3411 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3412 if (!mtu_error) {
3413 sfq->quantum = mtu;
3414 } else {
3415 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3416 "device without mtu");
3417 return;
3418 }
3419 }
3420}
3421
3422static int
3423sfq_tc_install(struct netdev *netdev, const struct smap *details)
3424{
3425 int error;
3426 struct sfq sfq;
3427
3428 sfq_parse_qdisc_details__(netdev, details, &sfq);
3429 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3430 if (!error) {
3431 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3432 }
3433 return error;
3434}
3435
/* Parses the SFQ qdisc description in 'nlmsg' and, if that succeeds, installs
 * matching traffic-control state on 'netdev'.
 *
 * Returns 0 on success, otherwise the error from tc_parse_qdisc(); nothing is
 * installed on failure. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error) {
        return error;
    }

    sfq = nl_attr_get(nlattr);
    /* sfq_install__() takes (quantum, perturb), in that order.  Passing them
     * the other way round stores each value in the wrong field. */
    sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
    return 0;
}
3453
3454static void
3455sfq_tc_destroy(struct tc *tc)
3456{
3457 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3458 tc_destroy(tc);
3459 free(sfq);
3460}
3461
3462static int
3463sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3464{
3465 const struct sfq *sfq = sfq_get__(netdev);
3466 smap_add_format(details, "quantum", "%u", sfq->quantum);
3467 smap_add_format(details, "perturb", "%u", sfq->perturb);
3468 return 0;
3469}
3470
3471static int
3472sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3473{
3474 struct sfq sfq;
3475
3476 sfq_parse_qdisc_details__(netdev, details, &sfq);
3477 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3478 sfq_get__(netdev)->quantum = sfq.quantum;
3479 sfq_get__(netdev)->perturb = sfq.perturb;
3480 return 0;
3481}
3482
3483static const struct tc_ops tc_ops_sfq = {
3484 "sfq", /* linux_name */
3485 "linux-sfq", /* ovs_name */
3486 SFQ_N_QUEUES, /* n_queues */
3487 sfq_tc_install,
3488 sfq_tc_load,
3489 sfq_tc_destroy,
3490 sfq_qdisc_get,
3491 sfq_qdisc_set,
3492 NULL,
3493 NULL,
3494 NULL,
3495 NULL,
3496 NULL
3497};
3498\f
c1c9c9c4 3499/* HTB traffic control class. */
559843ed 3500
c1c9c9c4 3501#define HTB_N_QUEUES 0xf000
4f631ccd 3502#define HTB_RATE2QUANTUM 10
8b61709d 3503
c1c9c9c4
BP
3504struct htb {
3505 struct tc tc;
3506 unsigned int max_rate; /* In bytes/s. */
3507};
8b61709d 3508
c1c9c9c4 3509struct htb_class {
93b13be8 3510 struct tc_queue tc_queue;
c1c9c9c4
BP
3511 unsigned int min_rate; /* In bytes/s. */
3512 unsigned int max_rate; /* In bytes/s. */
3513 unsigned int burst; /* In bytes. */
3514 unsigned int priority; /* Lower values are higher priorities. */
3515};
8b61709d 3516
c1c9c9c4 3517static struct htb *
b5d57fc8 3518htb_get__(const struct netdev *netdev_)
c1c9c9c4 3519{
b5d57fc8
BP
3520 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3521 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3522}
3523
24045e35 3524static void
b5d57fc8 3525htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3526{
b5d57fc8 3527 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3528 struct htb *htb;
3529
3530 htb = xmalloc(sizeof *htb);
3531 tc_init(&htb->tc, &tc_ops_htb);
3532 htb->max_rate = max_rate;
3533
b5d57fc8 3534 netdev->tc = &htb->tc;
c1c9c9c4
BP
3535}
3536
3537/* Create an HTB qdisc.
3538 *
a339aa81 3539 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3540static int
3541htb_setup_qdisc__(struct netdev *netdev)
3542{
3543 size_t opt_offset;
3544 struct tc_htb_glob opt;
3545 struct ofpbuf request;
3546 struct tcmsg *tcmsg;
3547
3548 tc_del_qdisc(netdev);
3549
3550 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3551 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3552 if (!tcmsg) {
3553 return ENODEV;
3554 }
c1c9c9c4
BP
3555 tcmsg->tcm_handle = tc_make_handle(1, 0);
3556 tcmsg->tcm_parent = TC_H_ROOT;
3557
3558 nl_msg_put_string(&request, TCA_KIND, "htb");
3559
3560 memset(&opt, 0, sizeof opt);
4f631ccd 3561 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3562 opt.version = 3;
4ecf12d5 3563 opt.defcls = 1;
c1c9c9c4
BP
3564
3565 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3566 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3567 nl_msg_end_nested(&request, opt_offset);
3568
3569 return tc_transact(&request, NULL);
3570}
3571
3572/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3573 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3574static int
3575htb_setup_class__(struct netdev *netdev, unsigned int handle,
3576 unsigned int parent, struct htb_class *class)
3577{
3578 size_t opt_offset;
3579 struct tc_htb_opt opt;
3580 struct ofpbuf request;
3581 struct tcmsg *tcmsg;
3582 int error;
3583 int mtu;
3584
73371c09 3585 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3586 if (error) {
f915f1a8
BP
3587 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3588 netdev_get_name(netdev));
9b020780 3589 return error;
f915f1a8 3590 }
c1c9c9c4
BP
3591
3592 memset(&opt, 0, sizeof opt);
3593 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3594 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3595 /* Makes sure the quantum is at least MTU. Setting quantum will
3596 * make htb ignore the r2q for this class. */
3597 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3598 opt.quantum = mtu;
3599 }
c1c9c9c4
BP
3600 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3601 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3602 opt.prio = class->priority;
3603
3604 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
3605 if (!tcmsg) {
3606 return ENODEV;
3607 }
c1c9c9c4
BP
3608 tcmsg->tcm_handle = handle;
3609 tcmsg->tcm_parent = parent;
3610
3611 nl_msg_put_string(&request, TCA_KIND, "htb");
3612 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3613 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3614 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3615 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3616 nl_msg_end_nested(&request, opt_offset);
3617
3618 error = tc_transact(&request, NULL);
3619 if (error) {
3620 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3621 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3622 netdev_get_name(netdev),
3623 tc_get_major(handle), tc_get_minor(handle),
3624 tc_get_major(parent), tc_get_minor(parent),
3625 class->min_rate, class->max_rate,
10a89ef0 3626 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3627 }
3628 return error;
3629}
3630
3631/* Parses Netlink attributes in 'nl_options' for HTB parameters and stores a
3632 * description of them into 'class'. The description complies with the
3633 * specification given in the vswitch database documentation for linux-htb
3634 * queue details. */
3635static int
3636htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3637{
3638 static const struct nl_policy tca_htb_policy[] = {
3639 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3640 .min_len = sizeof(struct tc_htb_opt) },
3641 };
3642
3643 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3644 const struct tc_htb_opt *htb;
3645
3646 if (!nl_parse_nested(nl_options, tca_htb_policy,
3647 attrs, ARRAY_SIZE(tca_htb_policy))) {
3648 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3649 return EPROTO;
3650 }
3651
3652 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3653 class->min_rate = htb->rate.rate;
3654 class->max_rate = htb->ceil.rate;
3655 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3656 class->priority = htb->prio;
3657 return 0;
3658}
3659
3660static int
3661htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3662 struct htb_class *options,
3663 struct netdev_queue_stats *stats)
3664{
3665 struct nlattr *nl_options;
3666 unsigned int handle;
3667 int error;
3668
3669 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3670 if (!error && queue_id) {
17ee3c1f
BP
3671 unsigned int major = tc_get_major(handle);
3672 unsigned int minor = tc_get_minor(handle);
3673 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3674 *queue_id = minor - 1;
c1c9c9c4
BP
3675 } else {
3676 error = EPROTO;
3677 }
3678 }
3679 if (!error && options) {
3680 error = htb_parse_tca_options__(nl_options, options);
3681 }
3682 return error;
3683}
3684
3685static void
73371c09 3686htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3687 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3688{
73371c09 3689 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3690 const char *max_rate_s;
3691
79f1cbe9 3692 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
3693 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3694 if (!hc->max_rate) {
a00ca915 3695 enum netdev_features current;
c1c9c9c4 3696
73371c09
BP
3697 netdev_linux_read_features(netdev);
3698 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3699 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3700 }
3701 hc->min_rate = hc->max_rate;
3702 hc->burst = 0;
3703 hc->priority = 0;
3704}
3705
3706static int
3707htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3708 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3709{
3710 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
3711 const char *min_rate_s = smap_get(details, "min-rate");
3712 const char *max_rate_s = smap_get(details, "max-rate");
3713 const char *burst_s = smap_get(details, "burst");
3714 const char *priority_s = smap_get(details, "priority");
9b020780 3715 int mtu, error;
c1c9c9c4 3716
73371c09 3717 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3718 if (error) {
f915f1a8
BP
3719 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3720 netdev_get_name(netdev));
9b020780 3721 return error;
f915f1a8
BP
3722 }
3723
4f104611
EJ
3724 /* HTB requires at least an mtu sized min-rate to send any traffic even
3725 * on uncongested links. */
c45ab5e9 3726 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 3727 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3728 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3729
3730 /* max-rate */
3731 hc->max_rate = (max_rate_s
3732 ? strtoull(max_rate_s, NULL, 10) / 8
3733 : htb->max_rate);
3734 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3735 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3736
3737 /* burst
3738 *
3739 * According to hints in the documentation that I've read, it is important
3740 * that 'burst' be at least as big as the largest frame that might be
3741 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3742 * but having it a bit too small is a problem. Since netdev_get_mtu()
3743 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3744 * the MTU. We actually add 64, instead of 14, as a guard against
3745 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
3746 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3747 hc->burst = MAX(hc->burst, mtu + 64);
3748
3749 /* priority */
3750 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3751
3752 return 0;
3753}
3754
/* Calls tc_query_class() for class 'handle' with parent 'parent' on 'netdev'
 * and, if that succeeds, hands the reply to htb_parse_tcmsg__() to fill in
 * 'options' and 'stats'.  The reply buffer is deleted before returning.
 *
 * Returns 0 if successful, otherwise the error from whichever step failed. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3770
3771static int
79f1cbe9 3772htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3773{
3774 int error;
3775
3776 error = htb_setup_qdisc__(netdev);
3777 if (!error) {
3778 struct htb_class hc;
3779
3780 htb_parse_qdisc_details__(netdev, details, &hc);
3781 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3782 tc_make_handle(1, 0), &hc);
3783 if (!error) {
3784 htb_install__(netdev, hc.max_rate);
3785 }
3786 }
3787 return error;
3788}
3789
93b13be8
BP
3790static struct htb_class *
3791htb_class_cast__(const struct tc_queue *queue)
3792{
3793 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3794}
3795
c1c9c9c4
BP
3796static void
3797htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3798 const struct htb_class *hc)
3799{
3800 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3801 size_t hash = hash_int(queue_id, 0);
3802 struct tc_queue *queue;
c1c9c9c4
BP
3803 struct htb_class *hcp;
3804
93b13be8
BP
3805 queue = tc_find_queue__(netdev, queue_id, hash);
3806 if (queue) {
3807 hcp = htb_class_cast__(queue);
3808 } else {
c1c9c9c4 3809 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3810 queue = &hcp->tc_queue;
3811 queue->queue_id = queue_id;
6dc34a0d 3812 queue->created = time_msec();
93b13be8 3813 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3814 }
93b13be8
BP
3815
3816 hcp->min_rate = hc->min_rate;
3817 hcp->max_rate = hc->max_rate;
3818 hcp->burst = hc->burst;
3819 hcp->priority = hc->priority;
c1c9c9c4
BP
3820}
3821
3822static int
3823htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3824{
c1c9c9c4 3825 struct ofpbuf msg;
d57695d7 3826 struct queue_dump_state state;
c1c9c9c4 3827 struct htb_class hc;
c1c9c9c4
BP
3828
3829 /* Get qdisc options. */
3830 hc.max_rate = 0;
3831 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3832 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3833
3834 /* Get queues. */
d57695d7 3835 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3836 return ENODEV;
3837 }
d57695d7 3838 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3839 unsigned int queue_id;
3840
3841 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3842 htb_update_queue__(netdev, queue_id, &hc);
3843 }
3844 }
d57695d7 3845 finish_queue_dump(&state);
c1c9c9c4
BP
3846
3847 return 0;
3848}
3849
3850static void
3851htb_tc_destroy(struct tc *tc)
3852{
3853 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 3854 struct htb_class *hc;
c1c9c9c4 3855
4ec3d7c7 3856 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
3857 free(hc);
3858 }
3859 tc_destroy(tc);
3860 free(htb);
3861}
3862
3863static int
79f1cbe9 3864htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3865{
3866 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3867 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3868 return 0;
3869}
3870
3871static int
79f1cbe9 3872htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3873{
3874 struct htb_class hc;
3875 int error;
3876
3877 htb_parse_qdisc_details__(netdev, details, &hc);
3878 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3879 tc_make_handle(1, 0), &hc);
3880 if (!error) {
3881 htb_get__(netdev)->max_rate = hc.max_rate;
3882 }
3883 return error;
3884}
3885
3886static int
93b13be8 3887htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3888 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3889{
93b13be8 3890 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3891
79f1cbe9 3892 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3893 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3894 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3895 }
79f1cbe9 3896 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3897 if (hc->priority) {
79f1cbe9 3898 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3899 }
3900 return 0;
3901}
3902
3903static int
3904htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3905 const struct smap *details)
c1c9c9c4
BP
3906{
3907 struct htb_class hc;
3908 int error;
3909
3910 error = htb_parse_class_details__(netdev, details, &hc);
3911 if (error) {
3912 return error;
3913 }
3914
17ee3c1f 3915 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3916 tc_make_handle(1, 0xfffe), &hc);
3917 if (error) {
3918 return error;
3919 }
3920
3921 htb_update_queue__(netdev, queue_id, &hc);
3922 return 0;
3923}
3924
3925static int
93b13be8 3926htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3927{
93b13be8 3928 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3929 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3930 int error;
3931
93b13be8 3932 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3933 if (!error) {
93b13be8 3934 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3935 free(hc);
c1c9c9c4
BP
3936 }
3937 return error;
3938}
3939
3940static int
93b13be8 3941htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3942 struct netdev_queue_stats *stats)
3943{
93b13be8 3944 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3945 tc_make_handle(1, 0xfffe), NULL, stats);
3946}
3947
3948static int
3949htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3950 const struct ofpbuf *nlmsg,
3951 netdev_dump_queue_stats_cb *cb, void *aux)
3952{
3953 struct netdev_queue_stats stats;
17ee3c1f 3954 unsigned int handle, major, minor;
c1c9c9c4
BP
3955 int error;
3956
3957 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3958 if (error) {
3959 return error;
3960 }
3961
17ee3c1f
BP
3962 major = tc_get_major(handle);
3963 minor = tc_get_minor(handle);
3964 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3965 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3966 }
3967 return 0;
3968}
3969
3970static const struct tc_ops tc_ops_htb = {
3971 "htb", /* linux_name */
3972 "linux-htb", /* ovs_name */
3973 HTB_N_QUEUES, /* n_queues */
3974 htb_tc_install,
3975 htb_tc_load,
3976 htb_tc_destroy,
3977 htb_qdisc_get,
3978 htb_qdisc_set,
3979 htb_class_get,
3980 htb_class_set,
3981 htb_class_delete,
3982 htb_class_get_stats,
3983 htb_class_dump_stats
3984};
3985\f
a339aa81
EJ
3986/* "linux-hfsc" traffic control class. */
3987
3988#define HFSC_N_QUEUES 0xf000
3989
3990struct hfsc {
3991 struct tc tc;
3992 uint32_t max_rate;
3993};
3994
3995struct hfsc_class {
3996 struct tc_queue tc_queue;
3997 uint32_t min_rate;
3998 uint32_t max_rate;
3999};
4000
4001static struct hfsc *
b5d57fc8 4002hfsc_get__(const struct netdev *netdev_)
a339aa81 4003{
b5d57fc8
BP
4004 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4005 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4006}
4007
4008static struct hfsc_class *
4009hfsc_class_cast__(const struct tc_queue *queue)
4010{
4011 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4012}
4013
24045e35 4014static void
b5d57fc8 4015hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4016{
b5d57fc8 4017 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4018 struct hfsc *hfsc;
4019
a339aa81
EJ
4020 hfsc = xmalloc(sizeof *hfsc);
4021 tc_init(&hfsc->tc, &tc_ops_hfsc);
4022 hfsc->max_rate = max_rate;
b5d57fc8 4023 netdev->tc = &hfsc->tc;
a339aa81
EJ
4024}
4025
4026static void
4027hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4028 const struct hfsc_class *hc)
4029{
4030 size_t hash;
4031 struct hfsc *hfsc;
4032 struct hfsc_class *hcp;
4033 struct tc_queue *queue;
4034
4035 hfsc = hfsc_get__(netdev);
4036 hash = hash_int(queue_id, 0);
4037
4038 queue = tc_find_queue__(netdev, queue_id, hash);
4039 if (queue) {
4040 hcp = hfsc_class_cast__(queue);
4041 } else {
4042 hcp = xmalloc(sizeof *hcp);
4043 queue = &hcp->tc_queue;
4044 queue->queue_id = queue_id;
6dc34a0d 4045 queue->created = time_msec();
a339aa81
EJ
4046 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4047 }
4048
4049 hcp->min_rate = hc->min_rate;
4050 hcp->max_rate = hc->max_rate;
4051}
4052
4053static int
4054hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4055{
4056 const struct tc_service_curve *rsc, *fsc, *usc;
4057 static const struct nl_policy tca_hfsc_policy[] = {
4058 [TCA_HFSC_RSC] = {
4059 .type = NL_A_UNSPEC,
4060 .optional = false,
4061 .min_len = sizeof(struct tc_service_curve),
4062 },
4063 [TCA_HFSC_FSC] = {
4064 .type = NL_A_UNSPEC,
4065 .optional = false,
4066 .min_len = sizeof(struct tc_service_curve),
4067 },
4068 [TCA_HFSC_USC] = {
4069 .type = NL_A_UNSPEC,
4070 .optional = false,
4071 .min_len = sizeof(struct tc_service_curve),
4072 },
4073 };
4074 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4075
4076 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4077 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4078 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4079 return EPROTO;
4080 }
4081
4082 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4083 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4084 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4085
4086 if (rsc->m1 != 0 || rsc->d != 0 ||
4087 fsc->m1 != 0 || fsc->d != 0 ||
4088 usc->m1 != 0 || usc->d != 0) {
4089 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4090 "Non-linear service curves are not supported.");
4091 return EPROTO;
4092 }
4093
4094 if (rsc->m2 != fsc->m2) {
4095 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4096 "Real-time service curves are not supported ");
4097 return EPROTO;
4098 }
4099
4100 if (rsc->m2 > usc->m2) {
4101 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4102 "Min-rate service curve is greater than "
4103 "the max-rate service curve.");
4104 return EPROTO;
4105 }
4106
4107 class->min_rate = fsc->m2;
4108 class->max_rate = usc->m2;
4109 return 0;
4110}
4111
4112static int
4113hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4114 struct hfsc_class *options,
4115 struct netdev_queue_stats *stats)
4116{
4117 int error;
4118 unsigned int handle;
4119 struct nlattr *nl_options;
4120
4121 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4122 if (error) {
4123 return error;
4124 }
4125
4126 if (queue_id) {
4127 unsigned int major, minor;
4128
4129 major = tc_get_major(handle);
4130 minor = tc_get_minor(handle);
4131 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4132 *queue_id = minor - 1;
4133 } else {
4134 return EPROTO;
4135 }
4136 }
4137
4138 if (options) {
4139 error = hfsc_parse_tca_options__(nl_options, options);
4140 }
4141
4142 return error;
4143}
4144
/* Calls tc_query_class() for class 'handle' with parent 'parent' on 'netdev'
 * and, if that succeeds, hands the reply to hfsc_parse_tcmsg__() to fill in
 * 'options' and 'stats'.  The reply buffer is deleted before returning.
 *
 * Returns 0 if successful, otherwise the error from whichever step failed. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
4162
4163static void
73371c09 4164hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4165 struct hfsc_class *class)
4166{
73371c09 4167 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4168 uint32_t max_rate;
4169 const char *max_rate_s;
4170
79f1cbe9 4171 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
4172 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4173
4174 if (!max_rate) {
a00ca915 4175 enum netdev_features current;
a339aa81 4176
73371c09
BP
4177 netdev_linux_read_features(netdev);
4178 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4179 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4180 }
4181
4182 class->min_rate = max_rate;
4183 class->max_rate = max_rate;
4184}
4185
4186static int
4187hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4188 const struct smap *details,
a339aa81
EJ
4189 struct hfsc_class * class)
4190{
4191 const struct hfsc *hfsc;
4192 uint32_t min_rate, max_rate;
4193 const char *min_rate_s, *max_rate_s;
4194
4195 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
4196 min_rate_s = smap_get(details, "min-rate");
4197 max_rate_s = smap_get(details, "max-rate");
a339aa81 4198
c45ab5e9 4199 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 4200 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4201 min_rate = MIN(min_rate, hfsc->max_rate);
4202
4203 max_rate = (max_rate_s
4204 ? strtoull(max_rate_s, NULL, 10) / 8
4205 : hfsc->max_rate);
4206 max_rate = MAX(max_rate, min_rate);
4207 max_rate = MIN(max_rate, hfsc->max_rate);
4208
4209 class->min_rate = min_rate;
4210 class->max_rate = max_rate;
4211
4212 return 0;
4213}
4214
4215/* Create an HFSC qdisc.
4216 *
4217 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4218static int
4219hfsc_setup_qdisc__(struct netdev * netdev)
4220{
4221 struct tcmsg *tcmsg;
4222 struct ofpbuf request;
4223 struct tc_hfsc_qopt opt;
4224
4225 tc_del_qdisc(netdev);
4226
4227 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4228 NLM_F_EXCL | NLM_F_CREATE, &request);
4229
4230 if (!tcmsg) {
4231 return ENODEV;
4232 }
4233
4234 tcmsg->tcm_handle = tc_make_handle(1, 0);
4235 tcmsg->tcm_parent = TC_H_ROOT;
4236
4237 memset(&opt, 0, sizeof opt);
4238 opt.defcls = 1;
4239
4240 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4241 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4242
4243 return tc_transact(&request, NULL);
4244}
4245
4246/* Create an HFSC class.
4247 *
4248 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4249 * sc rate <min_rate> ul rate <max_rate>" */
4250static int
4251hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4252 unsigned int parent, struct hfsc_class *class)
4253{
4254 int error;
4255 size_t opt_offset;
4256 struct tcmsg *tcmsg;
4257 struct ofpbuf request;
4258 struct tc_service_curve min, max;
4259
4260 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4261
4262 if (!tcmsg) {
4263 return ENODEV;
4264 }
4265
4266 tcmsg->tcm_handle = handle;
4267 tcmsg->tcm_parent = parent;
4268
4269 min.m1 = 0;
4270 min.d = 0;
4271 min.m2 = class->min_rate;
4272
4273 max.m1 = 0;
4274 max.d = 0;
4275 max.m2 = class->max_rate;
4276
4277 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4278 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4279 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4280 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4281 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4282 nl_msg_end_nested(&request, opt_offset);
4283
4284 error = tc_transact(&request, NULL);
4285 if (error) {
4286 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4287 "min-rate %ubps, max-rate %ubps (%s)",
4288 netdev_get_name(netdev),
4289 tc_get_major(handle), tc_get_minor(handle),
4290 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4291 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4292 }
4293
4294 return error;
4295}
4296
4297static int
79f1cbe9 4298hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4299{
4300 int error;
4301 struct hfsc_class class;
4302
4303 error = hfsc_setup_qdisc__(netdev);
4304
4305 if (error) {
4306 return error;
4307 }
4308
4309 hfsc_parse_qdisc_details__(netdev, details, &class);
4310 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4311 tc_make_handle(1, 0), &class);
4312
4313 if (error) {
4314 return error;
4315 }
4316
4317 hfsc_install__(netdev, class.max_rate);
4318 return 0;
4319}
4320
4321static int
4322hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4323{
4324 struct ofpbuf msg;
d57695d7 4325 struct queue_dump_state state;
a339aa81
EJ
4326 struct hfsc_class hc;
4327
4328 hc.max_rate = 0;
4329 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4330 hfsc_install__(netdev, hc.max_rate);
a339aa81 4331
d57695d7 4332 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4333 return ENODEV;
4334 }
4335
d57695d7 4336 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4337 unsigned int queue_id;
4338
4339 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4340 hfsc_update_queue__(netdev, queue_id, &hc);
4341 }
4342 }
4343
d57695d7 4344 finish_queue_dump(&state);
a339aa81
EJ
4345 return 0;
4346}
4347
4348static void
4349hfsc_tc_destroy(struct tc *tc)
4350{
4351 struct hfsc *hfsc;
4352 struct hfsc_class *hc, *next;
4353
4354 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4355
4356 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4357 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4358 free(hc);
4359 }
4360
4361 tc_destroy(tc);
4362 free(hfsc);
4363}
4364
4365static int
79f1cbe9 4366hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4367{
4368 const struct hfsc *hfsc;
4369 hfsc = hfsc_get__(netdev);
79f1cbe9 4370 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4371 return 0;
4372}
4373
4374static int
79f1cbe9 4375hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4376{
4377 int error;
4378 struct hfsc_class class;
4379
4380 hfsc_parse_qdisc_details__(netdev, details, &class);
4381 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4382 tc_make_handle(1, 0), &class);
4383
4384 if (!error) {
4385 hfsc_get__(netdev)->max_rate = class.max_rate;
4386 }
4387
4388 return error;
4389}
4390
4391static int
4392hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4393 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4394{
4395 const struct hfsc_class *hc;
4396
4397 hc = hfsc_class_cast__(queue);
79f1cbe9 4398 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4399 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4400 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4401 }
4402 return 0;
4403}
4404
4405static int
4406hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4407 const struct smap *details)
a339aa81
EJ
4408{
4409 int error;
4410 struct hfsc_class class;
4411
4412 error = hfsc_parse_class_details__(netdev, details, &class);
4413 if (error) {
4414 return error;
4415 }
4416
4417 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4418 tc_make_handle(1, 0xfffe), &class);
4419 if (error) {
4420 return error;
4421 }
4422
4423 hfsc_update_queue__(netdev, queue_id, &class);
4424 return 0;
4425}
4426
4427static int
4428hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4429{
4430 int error;
4431 struct hfsc *hfsc;
4432 struct hfsc_class *hc;
4433
4434 hc = hfsc_class_cast__(queue);
4435 hfsc = hfsc_get__(netdev);
4436
4437 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4438 if (!error) {
4439 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4440 free(hc);
4441 }
4442 return error;
4443}
4444
4445static int
4446hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4447 struct netdev_queue_stats *stats)
4448{
4449 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4450 tc_make_handle(1, 0xfffe), NULL, stats);
4451}
4452
4453static int
4454hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4455 const struct ofpbuf *nlmsg,
4456 netdev_dump_queue_stats_cb *cb, void *aux)
4457{
4458 struct netdev_queue_stats stats;
4459 unsigned int handle, major, minor;
4460 int error;
4461
4462 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4463 if (error) {
4464 return error;
4465 }
4466
4467 major = tc_get_major(handle);
4468 minor = tc_get_minor(handle);
4469 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4470 (*cb)(minor - 1, &stats, aux);
4471 }
4472 return 0;
4473}
4474
4475static const struct tc_ops tc_ops_hfsc = {
4476 "hfsc", /* linux_name */
4477 "linux-hfsc", /* ovs_name */
4478 HFSC_N_QUEUES, /* n_queues */
4479 hfsc_tc_install, /* tc_install */
4480 hfsc_tc_load, /* tc_load */
4481 hfsc_tc_destroy, /* tc_destroy */
4482 hfsc_qdisc_get, /* qdisc_get */
4483 hfsc_qdisc_set, /* qdisc_set */
4484 hfsc_class_get, /* class_get */
4485 hfsc_class_set, /* class_set */
4486 hfsc_class_delete, /* class_delete */
4487 hfsc_class_get_stats, /* class_get_stats */
4488 hfsc_class_dump_stats /* class_dump_stats */
4489};
4490\f
c1c9c9c4
BP
4491/* "linux-default" traffic control class.
4492 *
4493 * This class represents the default, unnamed Linux qdisc. It corresponds to
4494 * the "" (empty string) QoS type in the OVS database. */
4495
4496static void
b5d57fc8 4497default_install__(struct netdev *netdev_)
c1c9c9c4 4498{
b5d57fc8 4499 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4500 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4501
559eb230
BP
4502 /* Nothing but a tc class implementation is allowed to write to a tc. This
4503 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4504 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4505}
4506
4507static int
4508default_tc_install(struct netdev *netdev,
79f1cbe9 4509 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4510{
4511 default_install__(netdev);
4512 return 0;
4513}
4514
4515static int
4516default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4517{
4518 default_install__(netdev);
4519 return 0;
4520}
4521
4522static const struct tc_ops tc_ops_default = {
4523 NULL, /* linux_name */
4524 "", /* ovs_name */
4525 0, /* n_queues */
4526 default_tc_install,
4527 default_tc_load,
4528 NULL, /* tc_destroy */
4529 NULL, /* qdisc_get */
4530 NULL, /* qdisc_set */
4531 NULL, /* class_get */
4532 NULL, /* class_set */
4533 NULL, /* class_delete */
4534 NULL, /* class_get_stats */
4535 NULL /* class_dump_stats */
4536};
4537\f
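/* "linux-other" traffic control class.
 *
 * This class has no kernel qdisc name and no install hook (see tc_ops_other
 * below), so it can only be attached to a netdev by other_tc_load(). */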
4541
4542static int
b5d57fc8 4543other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4544{
b5d57fc8 4545 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4546 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4547
559eb230
BP
4548 /* Nothing but a tc class implementation is allowed to write to a tc. This
4549 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4550 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4551 return 0;
4552}
4553
4554static const struct tc_ops tc_ops_other = {
4555 NULL, /* linux_name */
4556 "linux-other", /* ovs_name */
4557 0, /* n_queues */
4558 NULL, /* tc_install */
4559 other_tc_load,
4560 NULL, /* tc_destroy */
4561 NULL, /* qdisc_get */
4562 NULL, /* qdisc_set */
4563 NULL, /* class_get */
4564 NULL, /* class_set */
4565 NULL, /* class_delete */
4566 NULL, /* class_get_stats */
4567 NULL /* class_dump_stats */
4568};
4569\f
4570/* Traffic control. */
4571
/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, set by read_psched().  It is used
 * to compute buffer sizes: generally, a kernel qdisc needs to be able to
 * buffer one jiffy's worth of data.
 *
 * The value falls into one of two cases:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  Then we really do need to make sure
 *      that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  Then the kernel has finely
 *      granular timers and there is no need to fudge additional room for
 *      buffers.  (No extra effort is needed to implement that: the large
 *      'buffer_hz' is used as a divisor, so practically any number will come
 *      out as 0 in the division.  Small integer results in the case of really
 *      high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
4593
/* Returns tc handle 'major':'minor', that is, 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
4600
/* Returns the major number (the upper 16 bits) from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
4607
/* Returns the minor number (the lower 16 bits) from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4614
4615static struct tcmsg *
4616tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4617 struct ofpbuf *request)
4618{
4619 struct tcmsg *tcmsg;
4620 int ifindex;
4621 int error;
4622
4623 error = get_ifindex(netdev, &ifindex);
4624 if (error) {
4625 return NULL;
4626 }
4627
4628 ofpbuf_init(request, 512);
4629 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4630 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4631 tcmsg->tcm_family = AF_UNSPEC;
4632 tcmsg->tcm_ifindex = ifindex;
4633 /* Caller should fill in tcmsg->tcm_handle. */
4634 /* Caller should fill in tcmsg->tcm_parent. */
4635
4636 return tcmsg;
4637}
4638
4639static int
4640tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4641{
a88b4e04 4642 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
4643 ofpbuf_uninit(request);
4644 return error;
4645}
4646
f8500004
JP
4647/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4648 * policing configuration.
4649 *
4650 * This function is equivalent to running the following when 'add' is true:
4651 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4652 *
4653 * This function is equivalent to running the following when 'add' is false:
4654 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4655 *
4656 * The configuration and stats may be seen with the following command:
4657 * /sbin/tc -s qdisc show dev <devname>
4658 *
4659 * Returns 0 if successful, otherwise a positive errno value.
4660 */
4661static int
4662tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4663{
4664 struct ofpbuf request;
4665 struct tcmsg *tcmsg;
4666 int error;
4667 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4668 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4669
4670 tcmsg = tc_make_request(netdev, type, flags, &request);
4671 if (!tcmsg) {
4672 return ENODEV;
4673 }
4674 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4675 tcmsg->tcm_parent = TC_H_INGRESS;
4676 nl_msg_put_string(&request, TCA_KIND, "ingress");
4677 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4678
4679 error = tc_transact(&request, NULL);
4680 if (error) {
4681 /* If we're deleting the qdisc, don't worry about some of the
4682 * error conditions. */
4683 if (!add && (error == ENOENT || error == EINVAL)) {
4684 return 0;
4685 }
4686 return error;
4687 }
4688
4689 return 0;
4690}
4691
4692/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4693 * of 'kbits_burst'.
4694 *
4695 * This function is equivalent to running:
4696 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4697 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4698 * mtu 65535 drop
4699 *
4700 * The configuration and stats may be seen with the following command:
c7952afb 4701 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4702 *
4703 * Returns 0 if successful, otherwise a positive errno value.
4704 */
4705static int
c7952afb
BP
4706tc_add_policer(struct netdev *netdev,
4707 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4708{
4709 struct tc_police tc_police;
4710 struct ofpbuf request;
4711 struct tcmsg *tcmsg;
4712 size_t basic_offset;
4713 size_t police_offset;
4714 int error;
4715 int mtu = 65535;
4716
4717 memset(&tc_police, 0, sizeof tc_police);
4718 tc_police.action = TC_POLICE_SHOT;
4719 tc_police.mtu = mtu;
1aca400c 4720 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4721
79abacc8
MAA
4722 /* The following appears wrong in one way: In networking a kilobit is
4723 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4724 *
4725 * However if you "fix" those problems then "tc filter show ..." shows
4726 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4727 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4728 * tc's point of view. Whatever. */
4729 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4730 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004
JP
4731
4732 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4733 NLM_F_EXCL | NLM_F_CREATE, &request);
4734 if (!tcmsg) {
4735 return ENODEV;
4736 }
4737 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4738 tcmsg->tcm_info = tc_make_handle(49,
4739 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4740
4741 nl_msg_put_string(&request, TCA_KIND, "basic");
4742 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4743 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4744 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4745 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4746 nl_msg_end_nested(&request, police_offset);
4747 nl_msg_end_nested(&request, basic_offset);
4748
4749 error = tc_transact(&request, NULL);
4750 if (error) {
4751 return error;
4752 }
4753
4754 return 0;
4755}
4756
c1c9c9c4
BP
4757static void
4758read_psched(void)
4759{
4760 /* The values in psched are not individually very meaningful, but they are
4761 * important. The tables below show some values seen in the wild.
4762 *
4763 * Some notes:
4764 *
4765 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4766 * (Before that, there are hints that it was 1000000000.)
4767 *
4768 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4769 * above.
4770 *
4771 * /proc/net/psched
4772 * -----------------------------------
4773 * [1] 000c8000 000f4240 000f4240 00000064
4774 * [2] 000003e8 00000400 000f4240 3b9aca00
4775 * [3] 000003e8 00000400 000f4240 3b9aca00
4776 * [4] 000003e8 00000400 000f4240 00000064
4777 * [5] 000003e8 00000040 000f4240 3b9aca00
4778 * [6] 000003e8 00000040 000f4240 000000f9
4779 *
4780 * a b c d ticks_per_s buffer_hz
4781 * ------- --------- ---------- ------------- ----------- -------------
4782 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4783 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4784 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4785 * [4] 1,000 1,024 1,000,000 100 976,562 100
4786 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4787 * [6] 1,000 64 1,000,000 249 15,625,000 249
4788 *
4789 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4790 * [2] 2.6.26-1-686-bigmem from Debian lenny
4791 * [3] 2.6.26-2-sparc64 from Debian lenny
4792 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4793 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4794 * [6] 2.6.34 from kernel.org on KVM
4795 */
23882115 4796 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4797 static const char fn[] = "/proc/net/psched";
4798 unsigned int a, b, c, d;
4799 FILE *stream;
4800
23882115
BP
4801 if (!ovsthread_once_start(&once)) {
4802 return;
4803 }
4804
c1c9c9c4
BP
4805 ticks_per_s = 1.0;
4806 buffer_hz = 100;
4807
4808 stream = fopen(fn, "r");
4809 if (!stream) {
10a89ef0 4810 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4811 goto exit;
c1c9c9c4
BP
4812 }
4813
4814 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4815 VLOG_WARN("%s: read failed", fn);
4816 fclose(stream);
23882115 4817 goto exit;
c1c9c9c4
BP
4818 }
4819 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4820 fclose(stream);
4821
4822 if (!a || !c) {
4823 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4824 goto exit;
c1c9c9c4
BP
4825 }
4826
4827 ticks_per_s = (double) a * c / b;
4828 if (c == 1000000) {
4829 buffer_hz = d;
4830 } else {
4831 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4832 fn, a, b, c, d);
4833 }
4834 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4835
4836exit:
4837 ovsthread_once_done(&once);
c1c9c9c4
BP
4838}
4839
4840/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4841 * rate of 'rate' bytes per second. */
4842static unsigned int
4843tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4844{
23882115 4845 read_psched();
c1c9c9c4
BP
4846 return (rate * ticks) / ticks_per_s;
4847}
4848
4849/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4850 * rate of 'rate' bytes per second. */
4851static unsigned int
4852tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4853{
23882115 4854 read_psched();
015c93a4 4855 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4856}
4857
4858/* Returns the number of bytes that need to be reserved for qdisc buffering at
4859 * a transmission rate of 'rate' bytes per second. */
4860static unsigned int
4861tc_buffer_per_jiffy(unsigned int rate)
4862{
23882115 4863 read_psched();
c1c9c9c4
BP
4864 return rate / buffer_hz;
4865}
4866
4867/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4868 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4869 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4870 * stores NULL into it if it is absent.
4871 *
4872 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4873 * 'msg'.
4874 *
4875 * Returns 0 if successful, otherwise a positive errno value. */
4876static int
4877tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4878 struct nlattr **options)
4879{
4880 static const struct nl_policy tca_policy[] = {
4881 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4882 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4883 };
4884 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4885
4886 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4887 tca_policy, ta, ARRAY_SIZE(ta))) {
4888 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4889 goto error;
4890 }
4891
4892 if (kind) {
4893 *kind = nl_attr_get_string(ta[TCA_KIND]);
4894 }
4895
4896 if (options) {
4897 *options = ta[TCA_OPTIONS];
4898 }
4899
4900 return 0;
4901
4902error:
4903 if (kind) {
4904 *kind = NULL;
4905 }
4906 if (options) {
4907 *options = NULL;
4908 }
4909 return EPROTO;
4910}
4911
4912/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4913 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4914 * into '*options', and its queue statistics into '*stats'. Any of the output
4915 * arguments may be null.
4916 *
4917 * Returns 0 if successful, otherwise a positive errno value. */
4918static int
4919tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4920 struct nlattr **options, struct netdev_queue_stats *stats)
4921{
4922 static const struct nl_policy tca_policy[] = {
4923 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4924 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4925 };
4926 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4927
4928 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4929 tca_policy, ta, ARRAY_SIZE(ta))) {
4930 VLOG_WARN_RL(&rl, "failed to parse class message");
4931 goto error;
4932 }
4933
4934 if (handlep) {
4935 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4936 *handlep = tc->tcm_handle;
4937 }
4938
4939 if (options) {
4940 *options = ta[TCA_OPTIONS];
4941 }
4942
4943 if (stats) {
4944 const struct gnet_stats_queue *gsq;
4945 struct gnet_stats_basic gsb;
4946
4947 static const struct nl_policy stats_policy[] = {
4948 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4949 .min_len = sizeof gsb },
4950 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4951 .min_len = sizeof *gsq },
4952 };
4953 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4954
4955 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4956 sa, ARRAY_SIZE(sa))) {
4957 VLOG_WARN_RL(&rl, "failed to parse class stats");
4958 goto error;
4959 }
4960
4961 /* Alignment issues screw up the length of struct gnet_stats_basic on
4962 * some arch/bitsize combinations. Newer versions of Linux have a
4963 * struct gnet_stats_basic_packed, but we can't depend on that. The
4964 * easiest thing to do is just to make a copy. */
4965 memset(&gsb, 0, sizeof gsb);
4966 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4967 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4968 stats->tx_bytes = gsb.bytes;
4969 stats->tx_packets = gsb.packets;
4970
4971 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4972 stats->tx_errors = gsq->drops;
4973 }
4974
4975 return 0;
4976
4977error:
4978 if (options) {
4979 *options = NULL;
4980 }
4981 if (stats) {
4982 memset(stats, 0, sizeof *stats);
4983 }
4984 return EPROTO;
4985}
4986
4987/* Queries the kernel for class with identifier 'handle' and parent 'parent'
4988 * on 'netdev'. */
4989static int
4990tc_query_class(const struct netdev *netdev,
4991 unsigned int handle, unsigned int parent,
4992 struct ofpbuf **replyp)
4993{
4994 struct ofpbuf request;
4995 struct tcmsg *tcmsg;
4996 int error;
4997
4998 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
4999 if (!tcmsg) {
5000 return ENODEV;
5001 }
c1c9c9c4
BP
5002 tcmsg->tcm_handle = handle;
5003 tcmsg->tcm_parent = parent;
5004
5005 error = tc_transact(&request, replyp);
5006 if (error) {
5007 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5008 netdev_get_name(netdev),
5009 tc_get_major(handle), tc_get_minor(handle),
5010 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5011 ovs_strerror(error));
c1c9c9c4
BP
5012 }
5013 return error;
5014}
5015
5016/* Equivalent to "tc class del dev <name> handle <handle>". */
5017static int
5018tc_delete_class(const struct netdev *netdev, unsigned int handle)
5019{
5020 struct ofpbuf request;
5021 struct tcmsg *tcmsg;
5022 int error;
5023
5024 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5025 if (!tcmsg) {
5026 return ENODEV;
5027 }
c1c9c9c4
BP
5028 tcmsg->tcm_handle = handle;
5029 tcmsg->tcm_parent = 0;
5030
5031 error = tc_transact(&request, NULL);
5032 if (error) {
5033 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5034 netdev_get_name(netdev),
5035 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5036 ovs_strerror(error));
c1c9c9c4
BP
5037 }
5038 return error;
5039}
5040
5041/* Equivalent to "tc qdisc del dev <name> root". */
5042static int
b5d57fc8 5043tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5044{
b5d57fc8 5045 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5046 struct ofpbuf request;
5047 struct tcmsg *tcmsg;
5048 int error;
5049
b5d57fc8 5050 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5051 if (!tcmsg) {
5052 return ENODEV;
5053 }
c1c9c9c4
BP
5054 tcmsg->tcm_handle = tc_make_handle(1, 0);
5055 tcmsg->tcm_parent = TC_H_ROOT;
5056
5057 error = tc_transact(&request, NULL);
5058 if (error == EINVAL) {
5059 /* EINVAL probably means that the default qdisc was in use, in which
5060 * case we've accomplished our purpose. */
5061 error = 0;
5062 }
b5d57fc8
BP
5063 if (!error && netdev->tc) {
5064 if (netdev->tc->ops->tc_destroy) {
5065 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5066 }
b5d57fc8 5067 netdev->tc = NULL;
c1c9c9c4
BP
5068 }
5069 return error;
5070}
5071
ac3e3aaa
BP
5072static bool
5073getqdisc_is_safe(void)
5074{
5075 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5076 static bool safe = false;
5077
5078 if (ovsthread_once_start(&once)) {
5079 struct utsname utsname;
5080 int major, minor;
5081
5082 if (uname(&utsname) == -1) {
5083 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5084 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5085 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5086 } else if (major < 2 || (major == 2 && minor < 35)) {
5087 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5088 utsname.release);
5089 } else {
5090 safe = true;
5091 }
5092 ovsthread_once_done(&once);
5093 }
5094 return safe;
5095}
5096
c1c9c9c4
BP
5097/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5098 * kernel to determine what they are. Returns 0 if successful, otherwise a
5099 * positive errno value. */
5100static int
b5d57fc8 5101tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5102{
b5d57fc8 5103 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5104 struct ofpbuf request, *qdisc;
5105 const struct tc_ops *ops;
5106 struct tcmsg *tcmsg;
5107 int load_error;
5108 int error;
5109
b5d57fc8 5110 if (netdev->tc) {
c1c9c9c4
BP
5111 return 0;
5112 }
5113
5114 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5115 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5116 * 2.6.35 without that fix backported to it.
5117 *
5118 * To avoid the OOPS, we must not make a request that would attempt to dump
5119 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5120 * few others. There are a few ways that I can see to do this, but most of
5121 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5122 * technique chosen here is to assume that any non-default qdisc that we
5123 * create will have a class with handle 1:0. The built-in qdiscs only have
5124 * a class with handle 0:0.
5125 *
ac3e3aaa
BP
5126 * On Linux 2.6.35+ we use the straightforward method because it allows us
5127 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5128 * in such a case we get no response at all from the kernel (!) if a
5129 * builtin qdisc is in use (which is later caught by "!error &&
5130 * !qdisc->size"). */
b5d57fc8 5131 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
5132 if (!tcmsg) {
5133 return ENODEV;
5134 }
ac3e3aaa
BP
5135 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5136 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5137
5138 /* Figure out what tc class to instantiate. */
5139 error = tc_transact(&request, &qdisc);
ac3e3aaa 5140 if (!error && qdisc->size) {
c1c9c9c4
BP
5141 const char *kind;
5142
5143 error = tc_parse_qdisc(qdisc, &kind, NULL);
5144 if (error) {
5145 ops = &tc_ops_other;
5146 } else {
5147 ops = tc_lookup_linux_name(kind);
5148 if (!ops) {
5149 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5150 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5151
5152 ops = &tc_ops_other;
5153 }
5154 }
ac3e3aaa
BP
5155 } else if ((!error && !qdisc->size) || error == ENOENT) {
5156 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5157 * set up by some other entity that doesn't have a handle 1:0. We will
5158 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5159 ops = &tc_ops_default;
5160 error = 0;
5161 } else {
5162 /* Who knows? Maybe the device got deleted. */
5163 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5164 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5165 ops = &tc_ops_other;
5166 }
5167
5168 /* Instantiate it. */
b5d57fc8
BP
5169 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5170 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5171 ofpbuf_delete(qdisc);
5172
5173 return error ? error : load_error;
5174}
5175
5176/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5177 approximate the time to transmit packets of various lengths. For an MTU of
5178 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5179 represents two possible packet lengths; for a MTU of 513 through 1024, four
5180 possible lengths; and so on.
5181
5182 Returns, for the specified 'mtu', the number of bits that packet lengths
5183 need to be shifted right to fit within such a 256-entry table. */
5184static int
5185tc_calc_cell_log(unsigned int mtu)
5186{
5187 int cell_log;
5188
5189 if (!mtu) {
5190 mtu = ETH_PAYLOAD_MAX;
5191 }
5192 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5193
5194 for (cell_log = 0; mtu >= 256; cell_log++) {
5195 mtu >>= 1;
5196 }
5197
5198 return cell_log;
5199}
5200
5201/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5202 * of 'mtu'. */
5203static void
5204tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5205{
5206 memset(rate, 0, sizeof *rate);
5207 rate->cell_log = tc_calc_cell_log(mtu);
5208 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5209 /* rate->cell_align = 0; */ /* distro headers. */
5210 rate->mpu = ETH_TOTAL_MIN;
5211 rate->rate = Bps;
5212}
5213
5214/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5215 * attribute of the specified "type".
5216 *
5217 * See tc_calc_cell_log() above for a description of "rtab"s. */
5218static void
5219tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5220{
5221 uint32_t *rtab;
5222 unsigned int i;
5223
5224 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5225 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5226 unsigned packet_size = (i + 1) << rate->cell_log;
5227 if (packet_size < rate->mpu) {
5228 packet_size = rate->mpu;
5229 }
5230 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5231 }
5232}
5233
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given
 * a rate of 'Bps' bytes per second, the specified 'mtu', and a burst size of
 * 'burst_bytes' requested by the user.  (Pass 0 for 'burst_bytes' if no
 * value was requested.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data
 * at 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    /* 'burst' is narrowed to 'unsigned int' by this call. */
    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 5244\f
aaf2fb1a
BP
5245/* Linux-only functions declared in netdev-linux.h */
5246
5247/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5248 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5249int
5250netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5251 const char *flag_name, bool enable)
5252{
5253 const char *netdev_name = netdev_get_name(netdev);
5254 struct ethtool_value evalue;
5255 uint32_t new_flags;
5256 int error;
5257
ab985a77 5258 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5259 memset(&evalue, 0, sizeof evalue);
5260 error = netdev_linux_do_ethtool(netdev_name,
5261 (struct ethtool_cmd *)&evalue,
5262 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5263 if (error) {
5264 return error;
5265 }
5266
ab985a77 5267 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5268 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5269 if (new_flags == evalue.data) {
5270 return 0;
5271 }
5272 evalue.data = new_flags;
aaf2fb1a
BP
5273 error = netdev_linux_do_ethtool(netdev_name,
5274 (struct ethtool_cmd *)&evalue,
5275 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5276 if (error) {
5277 return error;
5278 }
5279
ab985a77 5280 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5281 memset(&evalue, 0, sizeof evalue);
5282 error = netdev_linux_do_ethtool(netdev_name,
5283 (struct ethtool_cmd *)&evalue,
5284 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5285 if (error) {
5286 return error;
5287 }
5288
5289 if (new_flags != evalue.data) {
5290 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5291 "device %s failed", enable ? "enable" : "disable",
5292 flag_name, netdev_name);
5293 return EOPNOTSUPP;
5294 }
5295
5296 return 0;
5297}
5298\f
5299/* Utility functions. */
5300
d3980822 5301/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5302static void
d3980822
BP
5303netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5304 const struct rtnl_link_stats *src)
5305{
f613a0d7
PS
5306 dst->rx_packets = src->rx_packets;
5307 dst->tx_packets = src->tx_packets;
5308 dst->rx_bytes = src->rx_bytes;
5309 dst->tx_bytes = src->tx_bytes;
5310 dst->rx_errors = src->rx_errors;
5311 dst->tx_errors = src->tx_errors;
5312 dst->rx_dropped = src->rx_dropped;
5313 dst->tx_dropped = src->tx_dropped;
5314 dst->multicast = src->multicast;
5315 dst->collisions = src->collisions;
5316 dst->rx_length_errors = src->rx_length_errors;
5317 dst->rx_over_errors = src->rx_over_errors;
5318 dst->rx_crc_errors = src->rx_crc_errors;
5319 dst->rx_frame_errors = src->rx_frame_errors;
5320 dst->rx_fifo_errors = src->rx_fifo_errors;
5321 dst->rx_missed_errors = src->rx_missed_errors;
5322 dst->tx_aborted_errors = src->tx_aborted_errors;
5323 dst->tx_carrier_errors = src->tx_carrier_errors;
5324 dst->tx_fifo_errors = src->tx_fifo_errors;
5325 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5326 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5327}
5328
337c9b99
BP
5329/* Copies 'src' into 'dst', performing format conversion in the process. */
5330static void
5331netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5332 const struct rtnl_link_stats64 *src)
5333{
5334 dst->rx_packets = src->rx_packets;
5335 dst->tx_packets = src->tx_packets;
5336 dst->rx_bytes = src->rx_bytes;
5337 dst->tx_bytes = src->tx_bytes;
5338 dst->rx_errors = src->rx_errors;
5339 dst->tx_errors = src->tx_errors;
5340 dst->rx_dropped = src->rx_dropped;
5341 dst->tx_dropped = src->tx_dropped;
5342 dst->multicast = src->multicast;
5343 dst->collisions = src->collisions;
5344 dst->rx_length_errors = src->rx_length_errors;
5345 dst->rx_over_errors = src->rx_over_errors;
5346 dst->rx_crc_errors = src->rx_crc_errors;
5347 dst->rx_frame_errors = src->rx_frame_errors;
5348 dst->rx_fifo_errors = src->rx_fifo_errors;
5349 dst->rx_missed_errors = src->rx_missed_errors;
5350 dst->tx_aborted_errors = src->tx_aborted_errors;
5351 dst->tx_carrier_errors = src->tx_carrier_errors;
5352 dst->tx_fifo_errors = src->tx_fifo_errors;
5353 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5354 dst->tx_window_errors = src->tx_window_errors;
5355}
5356
c1c9c9c4 5357static int
35eef899 5358get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5359{
c1c9c9c4
BP
5360 struct ofpbuf request;
5361 struct ofpbuf *reply;
c1c9c9c4
BP
5362 int error;
5363
5364 ofpbuf_init(&request, 0);
13a24df8
BP
5365 nl_msg_put_nlmsghdr(&request,
5366 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5367 RTM_GETLINK, NLM_F_REQUEST);
5368 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5369 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5370 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5371 ofpbuf_uninit(&request);
5372 if (error) {
5373 return error;
5374 }
5375
13a24df8 5376 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5377 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5378 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5379 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5380 error = 0;
5381 } else {
337c9b99
BP
5382 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5383 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5384 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5385 error = 0;
5386 } else {
5387 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5388 error = EPROTO;
5389 }
13a24df8
BP
5390 }
5391 } else {
5392 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5393 error = EPROTO;
c1c9c9c4 5394 }
8b61709d 5395
8b61709d 5396
576e26d7 5397 ofpbuf_delete(reply);
35eef899 5398 return error;
8b61709d 5399}
c1c9c9c4 5400
3a183124 5401static int
b5d57fc8 5402get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5403{
5404 struct ifreq ifr;
5405 int error;
5406
755be9ea 5407 *flags = 0;
259e0b1a 5408 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5409 if (!error) {
5410 *flags = ifr.ifr_flags;
5411 }
8b61709d
BP
5412 return error;
5413}
5414
5415static int
4b609110 5416set_flags(const char *name, unsigned int flags)
8b61709d
BP
5417{
5418 struct ifreq ifr;
5419
5420 ifr.ifr_flags = flags;
259e0b1a 5421 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5422}
5423
5424static int
5425do_get_ifindex(const char *netdev_name)
5426{
5427 struct ifreq ifr;
259e0b1a 5428 int error;
8b61709d 5429
71d7c22f 5430 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5431 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5432
5433 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5434 if (error) {
8b61709d 5435 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
5436 netdev_name, ovs_strerror(error));
5437 return -error;
8b61709d
BP
5438 }
5439 return ifr.ifr_ifindex;
5440}
5441
5442static int
5443get_ifindex(const struct netdev *netdev_, int *ifindexp)
5444{
b5d57fc8 5445 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5446
b5d57fc8 5447 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 5448 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5449
8b61709d 5450 if (ifindex < 0) {
b5d57fc8
BP
5451 netdev->get_ifindex_error = -ifindex;
5452 netdev->ifindex = 0;
c7b1b0a5 5453 } else {
b5d57fc8
BP
5454 netdev->get_ifindex_error = 0;
5455 netdev->ifindex = ifindex;
8b61709d 5456 }
b5d57fc8 5457 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5458 }
c7b1b0a5 5459
b5d57fc8
BP
5460 *ifindexp = netdev->ifindex;
5461 return netdev->get_ifindex_error;
8b61709d
BP
5462}
5463
5464static int
74ff3298 5465get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5466{
5467 struct ifreq ifr;
5468 int hwaddr_family;
259e0b1a 5469 int error;
8b61709d
BP
5470
5471 memset(&ifr, 0, sizeof ifr);
71d7c22f 5472 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5473 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5474 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5475 if (error) {
78857dfb
BP
5476 /* ENODEV probably means that a vif disappeared asynchronously and
5477 * hasn't been removed from the database yet, so reduce the log level
5478 * to INFO for that case. */
259e0b1a 5479 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5480 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5481 netdev_name, ovs_strerror(error));
5482 return error;
8b61709d
BP
5483 }
5484 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5485 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
c9697f35 5486 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5487 netdev_name, hwaddr_family);
c9697f35 5488 return EINVAL;
8b61709d
BP
5489 }
5490 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5491 return 0;
5492}
5493
5494static int
74ff3298 5495set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5496{
5497 struct ifreq ifr;
259e0b1a 5498 int error;
8b61709d
BP
5499
5500 memset(&ifr, 0, sizeof ifr);
71d7c22f 5501 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5502 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5503 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5504 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5505 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5506 if (error) {
8b61709d 5507 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5508 netdev_name, ovs_strerror(error));
8b61709d 5509 }
259e0b1a 5510 return error;
8b61709d
BP
5511}
5512
5513static int
0b0544d7 5514netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5515 int cmd, const char *cmd_name)
5516{
5517 struct ifreq ifr;
259e0b1a 5518 int error;
8b61709d
BP
5519
5520 memset(&ifr, 0, sizeof ifr);
71d7c22f 5521 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5522 ifr.ifr_data = (caddr_t) ecmd;
5523
5524 ecmd->cmd = cmd;
259e0b1a
BP
5525 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5526 if (error) {
5527 if (error != EOPNOTSUPP) {
8b61709d 5528 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5529 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5530 } else {
5531 /* The device doesn't support this operation. That's pretty
5532 * common, so there's no point in logging anything. */
5533 }
8b61709d 5534 }
259e0b1a 5535 return error;
8b61709d 5536}
f1acd62b 5537
488d734d
BP
5538/* Returns an AF_PACKET raw socket or a negative errno value. */
5539static int
5540af_packet_sock(void)
5541{
23882115
BP
5542 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5543 static int sock;
488d734d 5544
23882115 5545 if (ovsthread_once_start(&once)) {
488d734d
BP
5546 sock = socket(AF_PACKET, SOCK_RAW, 0);
5547 if (sock >= 0) {
8450059e
BP
5548 int error = set_nonblocking(sock);
5549 if (error) {
5550 close(sock);
5551 sock = -error;
5552 }
488d734d
BP
5553 } else {
5554 sock = -errno;
10a89ef0
BP
5555 VLOG_ERR("failed to create packet socket: %s",
5556 ovs_strerror(errno));
488d734d 5557 }
23882115 5558 ovsthread_once_done(&once);
488d734d
BP
5559 }
5560
5561 return sock;
5562}