]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
dpif-netdev: Store actions data and size contiguously.
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
c7952afb 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
55bc98d6 43#include <net/if_packet.h>
8b61709d
BP
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
e14deea0 52#include "dp-packet.h"
93451a0a 53#include "dpif-netlink.h"
df1e5a3b 54#include "dpif-netdev.h"
8b61709d
BP
55#include "dynamic-string.h"
56#include "fatal-signal.h"
93b13be8
BP
57#include "hash.h"
58#include "hmap.h"
8b61709d 59#include "netdev-provider.h"
7fbef77a 60#include "netdev-vport.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
c060c4cf 63#include "netlink.h"
e9e28be3 64#include "ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d
BP
67#include "packets.h"
68#include "poll-loop.h"
21d6e22e 69#include "rtnetlink-link.h"
8b61709d 70#include "shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
5136ce49 76
d98e6007 77VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 78
d76f09ea
BP
79COVERAGE_DEFINE(netdev_set_policing);
80COVERAGE_DEFINE(netdev_arp_lookup);
81COVERAGE_DEFINE(netdev_get_ifindex);
82COVERAGE_DEFINE(netdev_get_hwaddr);
83COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
84COVERAGE_DEFINE(netdev_get_ethtool);
85COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 86
8b61709d
BP
87\f
88/* These were introduced in Linux 2.6.14, so they might be missing if we have
89 * old headers. */
90#ifndef ADVERTISED_Pause
91#define ADVERTISED_Pause (1 << 13)
92#endif
93#ifndef ADVERTISED_Asym_Pause
94#define ADVERTISED_Asym_Pause (1 << 14)
95#endif
96
e47bd51a
JP
97/* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99#ifndef ETHTOOL_GFLAGS
100#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101#endif
102#ifndef ETHTOOL_SFLAGS
103#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104#endif
105
c1c9c9c4
BP
106/* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 * headers. */
108#ifndef TC_RTAB_SIZE
109#define TC_RTAB_SIZE 1024
110#endif
111
b73c8518
SH
112/* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
117 *
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
120 */
55bc98d6
BP
121#ifndef PACKET_AUXDATA
122#define PACKET_AUXDATA 8
123#endif
b73c8518
SH
124#ifndef TP_STATUS_VLAN_VALID
125#define TP_STATUS_VLAN_VALID (1 << 4)
126#endif
127#ifndef TP_STATUS_VLAN_TPID_VALID
128#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129#endif
130#undef tpacket_auxdata
131#define tpacket_auxdata rpl_tpacket_auxdata
/* Local replacement for the kernel's auxdata structure.  The preceding
 * "#define tpacket_auxdata rpl_tpacket_auxdata" means that this really
 * declares 'struct rpl_tpacket_auxdata', so it does not depend on which
 * members the installed kernel headers happen to provide. */
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* Member added in Linux 2.6.27. */
    uint16_t tp_vlan_tpid;      /* Was padding before Linux 3.13. */
};
141
fa373af4
BP
142/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
143 *
144 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
145 * 2.6.32-431.29.2.el6.x86_64 (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
147 * if_link.h is not self-contained on those kernels. It is easiest to
148 * unconditionally define a replacement. */
149#ifndef IFLA_STATS64
337c9b99 150#define IFLA_STATS64 23
fa373af4
BP
151#endif
152#define rtnl_link_stats64 rpl_rtnl_link_stats64
337c9b99
BP
/* Local replacement for the kernel's 64-bit link statistics structure.  The
 * preceding "#define rtnl_link_stats64 rpl_rtnl_link_stats64" means that this
 * really declares 'struct rpl_rtnl_link_stats64', so it is usable even where
 * the kernel headers do not provide the structure reliably. */
struct rtnl_link_stats64 {
    /* General counters. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error counters. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error counters. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compression counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 181
/* Bits for the 'cache_valid' member of struct netdev_linux.  A set bit means
 * that the corresponding cached data in the netdev is currently valid. */
enum {
    VALID_IFINDEX = 1 << 0,
    VALID_ETHERADDR = 1 << 1,
    VALID_IN4 = 1 << 2,
    VALID_IN6 = 1 << 3,
    VALID_MTU = 1 << 4,
    VALID_POLICING = 1 << 5,
    VALID_VPORT_STAT_ERROR = 1 << 6,
    VALID_DRVINFO = 1 << 7,
    VALID_FEATURES = 1 << 8,
};
c1c9c9c4
BP
193\f
194/* Traffic control. */
195
196/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
197 * network device.
198 *
199 * Each TC implementation subclasses this with whatever additional data it
200 * needs. */
c1c9c9c4
BP
struct tc {
    const struct tc_ops *ops;   /* Operations of the TC implementation that
                                 * this instance belongs to; set by
                                 * tc_init(). */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};
c1c9c9c4 207
559eb230
BP
208#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
209
93b13be8
BP
210/* One traffic control queue.
211 *
212 * Each TC implementation subclasses this with whatever additional data it
213 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID.  Callers of
                                 * tc_ops->class_set() keep this below the
                                 * implementation's 'n_queues'. */
    long long int created;      /* Time queue was created, in msecs. */
};
219
220/* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
222 *
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
364
365static void
366tc_init(struct tc *tc, const struct tc_ops *ops)
367{
368 tc->ops = ops;
93b13be8 369 hmap_init(&tc->queues);
c1c9c9c4
BP
370}
371
372static void
373tc_destroy(struct tc *tc)
374{
93b13be8 375 hmap_destroy(&tc->queues);
c1c9c9c4
BP
376}
377
378static const struct tc_ops tc_ops_htb;
a339aa81 379static const struct tc_ops tc_ops_hfsc;
677d9158
JV
380static const struct tc_ops tc_ops_codel;
381static const struct tc_ops tc_ops_fqcodel;
382static const struct tc_ops tc_ops_sfq;
c1c9c9c4
BP
383static const struct tc_ops tc_ops_default;
384static const struct tc_ops tc_ops_other;
385
/* Table of all of the traffic control implementations declared above,
 * terminated by a null pointer. */
static const struct tc_ops *const tcs[] = {
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
    &tc_ops_codel,              /* Controlled delay */
    &tc_ops_fqcodel,            /* Fair queue controlled delay */
    &tc_ops_sfq,                /* Stochastic fair queueing */
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
    &tc_ops_other,              /* Some other qdisc. */
    NULL
};
149f577a 396
c1c9c9c4
BP
397static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398static unsigned int tc_get_major(unsigned int handle);
399static unsigned int tc_get_minor(unsigned int handle);
400
401static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403static unsigned int tc_buffer_per_jiffy(unsigned int rate);
404
405static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004 408static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
c7952afb
BP
409static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
411
412static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420static int tc_delete_class(const struct netdev *, unsigned int handle);
421
422static int tc_del_qdisc(struct netdev *netdev);
423static int tc_query_qdisc(const struct netdev *netdev);
424
425static int tc_calc_cell_log(unsigned int mtu);
426static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
430\f
b5d57fc8
BP
/* A Linux network device.  'up' is the generic netdev embedded in it;
 * netdev_linux_cast() converts from a pointer to 'up' back to this
 * structure. */
struct netdev_linux {
    struct netdev up;

    /* Protects all members below. */
    struct ovs_mutex mutex;

    unsigned int cache_valid;   /* Bitmap of VALID_* bits. */

    bool miimon;                    /* Link status of last poll. */
    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    uint8_t etheraddr[ETH_ADDR_LEN];
    struct in_addr address, netmask;
    struct in6_addr in6;
    int mtu;
    unsigned int ifi_flags;     /* Most recently recorded interface flags. */
    long long int carrier_resets; /* Incremented whenever IFF_RUNNING in
                                   * 'ifi_flags' changes. */
    uint32_t kbits_rate;        /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;      /* Cached error code from vport_get_stats().
                                   0 or an errno value. */
    int netdev_mtu_error;       /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
    int ether_addr_error;       /* Cached error code from set/get etheraddr. */
    int netdev_policing_error;  /* Cached error code from set policing. */
    int get_features_error;     /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;      /* Cached error code from SIOCGIFINDEX. */

    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;              /* Traffic control state, or null if none has
                                 * been set up. */

    /* For devices of class netdev_tap_class only. */
    int tap_fd;
};
472
f7791740
PS
/* Receive queue for a Linux netdev.  'up' is the generic rxq embedded in
 * it. */
struct netdev_rxq_linux {
    struct netdev_rxq up;
    bool is_tap;                /* True if the netdev is a tap device, in
                                 * which case 'fd' is the netdev's 'tap_fd'. */
    int fd;                     /* Tap fd, or a raw packet socket for other
                                 * devices. */
};
8b61709d 478
8b61709d
BP
479/* This is set pretty low because we probably won't learn anything from the
480 * additional log messages. */
481static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
482
19c8e9c1
JS
483/* Polling miimon status for all ports causes performance degradation when
484 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
485 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
486 *
487 * Readers do not depend on this variable synchronizing with the related
488 * changes in the device miimon status, so we can use atomic_count. */
489static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 490
259e0b1a 491static void netdev_linux_run(void);
6f643e49 492
0b0544d7 493static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 494 int cmd, const char *cmd_name);
f1acd62b
BP
495static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
496 int cmd, const char *cmd_name);
b5d57fc8 497static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 498static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
499static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
500 enum netdev_flags on, enum netdev_flags *old_flagsp)
501 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
502static int do_get_ifindex(const char *netdev_name);
503static int get_ifindex(const struct netdev *, int *ifindexp);
504static int do_set_addr(struct netdev *netdev,
505 int ioctl_nr, const char *ioctl_name,
506 struct in_addr addr);
507static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 508static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
35eef899 509static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 510static int af_packet_sock(void);
19c8e9c1 511static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
512static void netdev_linux_miimon_run(void);
513static void netdev_linux_miimon_wait(void);
df1e5a3b 514static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 515
15b3596a
JG
516static bool
517is_netdev_linux_class(const struct netdev_class *netdev_class)
518{
259e0b1a 519 return netdev_class->run == netdev_linux_run;
15b3596a
JG
520}
521
796223f5
BP
522static bool
523is_tap_netdev(const struct netdev *netdev)
524{
b5d57fc8 525 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
526}
527
8b61709d
BP
528static struct netdev_linux *
529netdev_linux_cast(const struct netdev *netdev)
530{
b5d57fc8 531 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 532
180c6d0b 533 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 534}
796223f5 535
f7791740
PS
536static struct netdev_rxq_linux *
537netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 538{
9dc63482 539 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 540 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 541}
ff4ed3c9 542\f
cee87338 543static void netdev_linux_update(struct netdev_linux *netdev,
86383816
BP
544 const struct rtnetlink_link_change *)
545 OVS_REQUIRES(netdev->mutex);
cee87338 546static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
547 unsigned int ifi_flags, unsigned int mask)
548 OVS_REQUIRES(netdev->mutex);
cee87338
BP
549
550/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
551 * if no such socket could be created. */
552static struct nl_sock *
553netdev_linux_notify_sock(void)
554{
555 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
556 static struct nl_sock *sock;
557
558 if (ovsthread_once_start(&once)) {
559 int error;
560
561 error = nl_sock_create(NETLINK_ROUTE, &sock);
562 if (!error) {
563 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
564 if (error) {
565 nl_sock_destroy(sock);
566 sock = NULL;
567 }
568 }
569 ovsthread_once_done(&once);
570 }
571
572 return sock;
573}
574
19c8e9c1
JS
575static bool
576netdev_linux_miimon_enabled(void)
577{
812c272c 578 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
579}
580
8b61709d
BP
581static void
582netdev_linux_run(void)
583{
cee87338
BP
584 struct nl_sock *sock;
585 int error;
586
19c8e9c1
JS
587 if (netdev_linux_miimon_enabled()) {
588 netdev_linux_miimon_run();
589 }
cee87338
BP
590
591 sock = netdev_linux_notify_sock();
592 if (!sock) {
593 return;
594 }
595
596 do {
597 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
598 uint64_t buf_stub[4096 / 8];
599 struct ofpbuf buf;
600
601 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
602 error = nl_sock_recv(sock, &buf, false);
603 if (!error) {
604 struct rtnetlink_link_change change;
605
606 if (rtnetlink_link_parse(&buf, &change)) {
607 struct netdev *netdev_ = netdev_from_name(change.ifname);
608 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
610
611 ovs_mutex_lock(&netdev->mutex);
cee87338 612 netdev_linux_update(netdev, &change);
86383816 613 ovs_mutex_unlock(&netdev->mutex);
cee87338 614 }
38e0065b 615 netdev_close(netdev_);
cee87338
BP
616 }
617 } else if (error == ENOBUFS) {
618 struct shash device_shash;
619 struct shash_node *node;
620
621 nl_sock_drain(sock);
622
623 shash_init(&device_shash);
624 netdev_get_devices(&netdev_linux_class, &device_shash);
625 SHASH_FOR_EACH (node, &device_shash) {
626 struct netdev *netdev_ = node->data;
627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
628 unsigned int flags;
629
86383816 630 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
631 get_flags(netdev_, &flags);
632 netdev_linux_changed(netdev, flags, 0);
86383816
BP
633 ovs_mutex_unlock(&netdev->mutex);
634
cee87338
BP
635 netdev_close(netdev_);
636 }
637 shash_destroy(&device_shash);
638 } else if (error != EAGAIN) {
639 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
640 ovs_strerror(error));
641 }
642 ofpbuf_uninit(&buf);
643 } while (!error);
8b61709d
BP
644}
645
/* Arranges for the poll loop to wake up when netdev_linux_run() has work to
 * do: waits on miimon if it is enabled, and for the link notification socket
 * (if there is one) to become readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *notify_sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }

    notify_sock = netdev_linux_notify_sock();
    if (!notify_sock) {
        return;
    }
    nl_sock_wait(notify_sock, POLLIN);
}
659
ac4d3bcb 660static void
b5d57fc8
BP
661netdev_linux_changed(struct netdev_linux *dev,
662 unsigned int ifi_flags, unsigned int mask)
86383816 663 OVS_REQUIRES(dev->mutex)
ac4d3bcb 664{
3e912ffc 665 netdev_change_seq_changed(&dev->up);
8aa77183
BP
666
667 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
668 dev->carrier_resets++;
669 }
670 dev->ifi_flags = ifi_flags;
671
4f925bd3
PS
672 dev->cache_valid &= mask;
673}
674
675static void
b5d57fc8
BP
676netdev_linux_update(struct netdev_linux *dev,
677 const struct rtnetlink_link_change *change)
86383816 678 OVS_REQUIRES(dev->mutex)
4f925bd3
PS
679{
680 if (change->nlmsg_type == RTM_NEWLINK) {
681 /* Keep drv-info */
b5d57fc8 682 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 683
c7b1b0a5 684 /* Update netdev from rtnl-change msg. */
90a6637d
PS
685 if (change->mtu) {
686 dev->mtu = change->mtu;
687 dev->cache_valid |= VALID_MTU;
688 dev->netdev_mtu_error = 0;
689 }
690
44445cac
PS
691 if (!eth_addr_is_zero(change->addr)) {
692 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
693 dev->cache_valid |= VALID_ETHERADDR;
694 dev->ether_addr_error = 0;
695 }
696
c7b1b0a5
PS
697 dev->ifindex = change->ifi_index;
698 dev->cache_valid |= VALID_IFINDEX;
699 dev->get_ifindex_error = 0;
700
4f925bd3 701 } else {
b5d57fc8 702 netdev_linux_changed(dev, change->ifi_flags, 0);
4f925bd3 703 }
ac4d3bcb
EJ
704}
705
9dc63482
BP
706static struct netdev *
707netdev_linux_alloc(void)
708{
709 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
710 return &netdev->up;
711}
712
cee87338 713static void
9dc63482
BP
714netdev_linux_common_construct(struct netdev_linux *netdev)
715{
834d6caf 716 ovs_mutex_init(&netdev->mutex);
9dc63482
BP
717}
718
1f6e0fbd
BP
719/* Creates system and internal devices. */
720static int
9dc63482 721netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 722{
9dc63482 723 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1f6e0fbd
BP
724 int error;
725
cee87338 726 netdev_linux_common_construct(netdev);
1f6e0fbd 727
b5d57fc8
BP
728 error = get_flags(&netdev->up, &netdev->ifi_flags);
729 if (error == ENODEV) {
9dc63482 730 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 731 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
732 return ENODEV;
733 } else {
734 /* "Internal" netdevs have to be created as netdev objects before
735 * they exist in the kernel, because creating them in the kernel
736 * happens by passing a netdev object to dpif_port_add().
737 * Therefore, ignore the error. */
738 }
739 }
46415c90 740
a740f0de
JG
741 return 0;
742}
743
5b7448ed
JG
744/* For most types of netdevs we open the device for each call of
745 * netdev_open(). However, this is not the case with tap devices,
746 * since it is only possible to open the device once. In this
747 * situation we share a single file descriptor, and consequently
748 * buffers, across all readers. Therefore once data is read it will
749 * be unavailable to other reads for tap devices. */
a740f0de 750static int
9dc63482 751netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 752{
9dc63482 753 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 754 static const char tap_dev[] = "/dev/net/tun";
9dc63482 755 const char *name = netdev_->name;
a740f0de
JG
756 struct ifreq ifr;
757 int error;
758
cee87338 759 netdev_linux_common_construct(netdev);
1f6e0fbd 760
6c88d577 761 /* Open tap device. */
d0d08f8a
BP
762 netdev->tap_fd = open(tap_dev, O_RDWR);
763 if (netdev->tap_fd < 0) {
6c88d577 764 error = errno;
10a89ef0 765 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 766 return error;
6c88d577
JP
767 }
768
769 /* Create tap device. */
770 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 771 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 772 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 773 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 774 ovs_strerror(errno));
6c88d577 775 error = errno;
f61d8d29 776 goto error_close;
6c88d577
JP
777 }
778
779 /* Make non-blocking. */
d0d08f8a 780 error = set_nonblocking(netdev->tap_fd);
a740f0de 781 if (error) {
f61d8d29 782 goto error_close;
a740f0de
JG
783 }
784
785 return 0;
786
f61d8d29 787error_close:
d0d08f8a 788 close(netdev->tap_fd);
a740f0de
JG
789 return error;
790}
791
6c88d577 792static void
9dc63482 793netdev_linux_destruct(struct netdev *netdev_)
6c88d577 794{
b5d57fc8 795 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 796
b5d57fc8
BP
797 if (netdev->tc && netdev->tc->ops->tc_destroy) {
798 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
799 }
800
d0d08f8a
BP
801 if (netdev_get_class(netdev_) == &netdev_tap_class
802 && netdev->tap_fd >= 0)
803 {
804 close(netdev->tap_fd);
6c88d577 805 }
86383816 806
19c8e9c1 807 if (netdev->miimon_interval > 0) {
812c272c 808 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
809 }
810
86383816 811 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
812}
813
9dc63482
BP
/* Frees the 'struct netdev_linux' that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
820
f7791740
PS
821static struct netdev_rxq *
822netdev_linux_rxq_alloc(void)
9dc63482 823{
f7791740 824 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
825 return &rx->up;
826}
827
7b6b0ef4 828static int
f7791740 829netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 830{
f7791740 831 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 832 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 833 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 834 int error;
7b6b0ef4 835
86383816 836 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
837 rx->is_tap = is_tap_netdev(netdev_);
838 if (rx->is_tap) {
839 rx->fd = netdev->tap_fd;
796223f5
BP
840 } else {
841 struct sockaddr_ll sll;
b73c8518 842 int ifindex, val;
32383c3b 843 /* Result of tcpdump -dd inbound */
259e0b1a 844 static const struct sock_filter filt[] = {
32383c3b
MM
845 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
846 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
847 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
848 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
849 };
259e0b1a
BP
850 static const struct sock_fprog fprog = {
851 ARRAY_SIZE(filt), (struct sock_filter *) filt
852 };
7b6b0ef4 853
796223f5 854 /* Create file descriptor. */
9dc63482
BP
855 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
856 if (rx->fd < 0) {
796223f5 857 error = errno;
10a89ef0 858 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
859 goto error;
860 }
33d82a56 861
b73c8518
SH
862 val = 1;
863 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
864 error = errno;
865 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
866 netdev_get_name(netdev_), ovs_strerror(error));
867 goto error;
868 }
869
796223f5 870 /* Set non-blocking mode. */
9dc63482 871 error = set_nonblocking(rx->fd);
796223f5
BP
872 if (error) {
873 goto error;
874 }
7b6b0ef4 875
796223f5 876 /* Get ethernet device index. */
180c6d0b 877 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
878 if (error) {
879 goto error;
880 }
7b6b0ef4 881
796223f5
BP
882 /* Bind to specific ethernet device. */
883 memset(&sll, 0, sizeof sll);
884 sll.sll_family = AF_PACKET;
885 sll.sll_ifindex = ifindex;
b73c8518 886 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 887 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
888 error = errno;
889 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 890 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
891 goto error;
892 }
32383c3b
MM
893
894 /* Filter for only inbound packets. */
9dc63482 895 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
896 sizeof fprog);
897 if (error) {
898 error = errno;
259e0b1a 899 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 900 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
901 goto error;
902 }
7b6b0ef4 903 }
86383816 904 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 905
7b6b0ef4
BP
906 return 0;
907
908error:
9dc63482
BP
909 if (rx->fd >= 0) {
910 close(rx->fd);
7b6b0ef4 911 }
86383816 912 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
913 return error;
914}
915
796223f5 916static void
f7791740 917netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 918{
f7791740 919 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 920
796223f5
BP
921 if (!rx->is_tap) {
922 close(rx->fd);
8b61709d 923 }
9dc63482
BP
924}
925
/* Frees the 'struct netdev_rxq_linux' that embeds 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 933
b73c8518
SH
934static ovs_be16
935auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
936{
937 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
938 return htons(aux->tp_vlan_tpid);
939 } else {
940 return htons(ETH_TYPE_VLAN);
941 }
942}
943
944static bool
945auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
946{
947 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
948}
949
796223f5 950static int
cf62fa4c 951netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 952{
b73c8518 953 size_t size;
796223f5 954 ssize_t retval;
b73c8518
SH
955 struct iovec iov;
956 struct cmsghdr *cmsg;
957 union {
958 struct cmsghdr cmsg;
959 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
960 } cmsg_buffer;
961 struct msghdr msgh;
962
963 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
964 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
965 size = dp_packet_tailroom(buffer);
b73c8518 966
cf62fa4c 967 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
968 iov.iov_len = size;
969 msgh.msg_name = NULL;
970 msgh.msg_namelen = 0;
971 msgh.msg_iov = &iov;
972 msgh.msg_iovlen = 1;
973 msgh.msg_control = &cmsg_buffer;
974 msgh.msg_controllen = sizeof cmsg_buffer;
975 msgh.msg_flags = 0;
8e8cddf7 976
796223f5 977 do {
b73c8518 978 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
979 } while (retval < 0 && errno == EINTR);
980
bfd3367b 981 if (retval < 0) {
b73c8518
SH
982 return errno;
983 } else if (retval > size) {
984 return EMSGSIZE;
985 }
986
cf62fa4c 987 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
988
989 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
990 const struct tpacket_auxdata *aux;
991
992 if (cmsg->cmsg_level != SOL_PACKET
993 || cmsg->cmsg_type != PACKET_AUXDATA
994 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
995 continue;
8b61709d 996 }
b73c8518
SH
997
998 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
999 if (auxdata_has_vlan_tci(aux)) {
1000 if (retval < ETH_HEADER_LEN) {
1001 return EINVAL;
1002 }
1003
1004 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1005 htons(aux->tp_vlan_tci));
1006 break;
1007 }
1008 }
1009
1010 return 0;
1011}
1012
/* Reads one packet from the tap device 'fd' into the tailroom of 'buffer',
 * retrying if the read is interrupted.
 *
 * Returns 0 if successful, otherwise a positive errno value (the read()
 * errno, or EMSGSIZE if more bytes were reported than the tailroom holds). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n_read < 0) {
        return errno;
    }
    if (n_read > room) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1032
1033static int
e14deea0 1034netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
91088554 1035 int *c)
b73c8518 1036{
f7791740 1037 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1038 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1039 struct dp_packet *buffer;
df1e5a3b
PS
1040 ssize_t retval;
1041 int mtu;
1042
1043 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1044 mtu = ETH_PAYLOAD_MAX;
1045 }
1046
cf62fa4c 1047 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1048 DP_NETDEV_HEADROOM);
b73c8518 1049 retval = (rx->is_tap
f7791740
PS
1050 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1051 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1052
1053 if (retval) {
1054 if (retval != EAGAIN && retval != EMSGSIZE) {
1055 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
f7791740 1056 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
df1e5a3b 1057 }
cf62fa4c 1058 dp_packet_delete(buffer);
df1e5a3b
PS
1059 } else {
1060 dp_packet_pad(buffer);
cf62fa4c
PS
1061 dp_packet_set_dp_hash(buffer, 0);
1062 packets[0] = buffer;
df1e5a3b 1063 *c = 1;
b73c8518
SH
1064 }
1065
1066 return retval;
8b61709d
BP
1067}
1068
8b61709d 1069static void
f7791740 1070netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1071{
f7791740 1072 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1073 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1074}
1075
8b61709d 1076static int
f7791740 1077netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1078{
f7791740 1079 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1080 if (rx->is_tap) {
8b61709d 1081 struct ifreq ifr;
f7791740 1082 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1083 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1084 if (error) {
1085 return error;
1086 }
796223f5 1087 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1088 return 0;
1089 } else {
796223f5 1090 return drain_rcvbuf(rx->fd);
8b61709d
BP
1091 }
1092}
1093
1094/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1095 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1096 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1097 * the packet is too big or too small to transmit on the device.
1098 *
1099 * The caller retains ownership of 'buffer' in all cases.
1100 *
1101 * The kernel maintains a packet transmission queue, so the caller is not
1102 * expected to do additional queuing of packets. */
1103static int
f00fa8cb 1104netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
e14deea0 1105 struct dp_packet **pkts, int cnt, bool may_steal)
8b61709d 1106{
f4fd623c
DDP
1107 int i;
1108 int error = 0;
40d26f04 1109
f4fd623c
DDP
1110 /* 'i' is incremented only if there's no error */
1111 for (i = 0; i < cnt;) {
cf62fa4c
PS
1112 const void *data = dp_packet_data(pkts[i]);
1113 size_t size = dp_packet_size(pkts[i]);
f23347ea 1114 ssize_t retval;
8b61709d 1115
796223f5 1116 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
1117 /* Use our AF_PACKET socket to send to this device. */
1118 struct sockaddr_ll sll;
1119 struct msghdr msg;
1120 struct iovec iov;
1121 int ifindex;
488d734d
BP
1122 int sock;
1123
1124 sock = af_packet_sock();
1125 if (sock < 0) {
c4c7a3d7 1126 return -sock;
488d734d 1127 }
f23347ea 1128
86383816
BP
1129 ifindex = netdev_get_ifindex(netdev_);
1130 if (ifindex < 0) {
1131 return -ifindex;
f23347ea 1132 }
8b61709d 1133
f23347ea
BP
1134 /* We don't bother setting most fields in sockaddr_ll because the
1135 * kernel ignores them for SOCK_RAW. */
1136 memset(&sll, 0, sizeof sll);
1137 sll.sll_family = AF_PACKET;
1138 sll.sll_ifindex = ifindex;
76c308b5 1139
ebc56baa 1140 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1141 iov.iov_len = size;
76c308b5 1142
f23347ea
BP
1143 msg.msg_name = &sll;
1144 msg.msg_namelen = sizeof sll;
1145 msg.msg_iov = &iov;
1146 msg.msg_iovlen = 1;
1147 msg.msg_control = NULL;
1148 msg.msg_controllen = 0;
1149 msg.msg_flags = 0;
1150
488d734d 1151 retval = sendmsg(sock, &msg, 0);
f23347ea 1152 } else {
796223f5
BP
1153 /* Use the tap fd to send to this device. This is essential for
1154 * tap devices, because packets sent to a tap device with an
1155 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1156 * tap device. This doesn't occur on other interface types
1157 * because we attach a socket filter to the rx socket. */
b5d57fc8 1158 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1159
d0d08f8a 1160 retval = write(netdev->tap_fd, data, size);
f23347ea 1161 }
76c308b5 1162
8b61709d
BP
1163 if (retval < 0) {
1164 /* The Linux AF_PACKET implementation never blocks waiting for room
1165 * for packets, instead returning ENOBUFS. Translate this into
1166 * EAGAIN for the caller. */
f4fd623c
DDP
1167 error = errno == ENOBUFS ? EAGAIN : errno;
1168 if (error == EINTR) {
1169 /* continue without incrementing 'i', i.e. retry this packet */
8b61709d 1170 continue;
8b61709d 1171 }
f4fd623c 1172 break;
8b61709d 1173 } else if (retval != size) {
f4fd623c
DDP
1174 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1175 " of %"PRIuSIZE") on %s", retval, size,
1176 netdev_get_name(netdev_));
1177 error = EMSGSIZE;
1178 break;
1179 }
1180
1181 /* Process the next packet in the batch */
1182 i++;
1183 }
1184
1185 if (may_steal) {
1186 for (i = 0; i < cnt; i++) {
e14deea0 1187 dp_packet_delete(pkts[i]);
8b61709d
BP
1188 }
1189 }
f4fd623c
DDP
1190
1191 if (error && error != EAGAIN) {
1192 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1193 netdev_get_name(netdev_), ovs_strerror(error));
1194 }
1195
1196 return error;
1197
8b61709d
BP
1198}
1199
1200/* Registers with the poll loop to wake up from the next call to poll_block()
1201 * when the packet transmission queue has sufficient room to transmit a packet
1202 * with netdev_send().
1203 *
1204 * The kernel maintains a packet transmission queue, so the client is not
1205 * expected to do additional queuing of packets. Thus, this function is
1206 * unlikely to ever be used. It is included for completeness. */
1207static void
f00fa8cb 1208netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1209{
796223f5 1210 if (is_tap_netdev(netdev)) {
8b61709d
BP
1211 /* TAP device always accepts packets.*/
1212 poll_immediate_wake();
1213 }
1214}
1215
1216/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1217 * otherwise a positive errno value. */
1218static int
1219netdev_linux_set_etheraddr(struct netdev *netdev_,
1220 const uint8_t mac[ETH_ADDR_LEN])
1221{
b5d57fc8 1222 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1223 enum netdev_flags old_flags = 0;
eb395f2e
BP
1224 int error;
1225
86383816
BP
1226 ovs_mutex_lock(&netdev->mutex);
1227
b5d57fc8 1228 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1229 error = netdev->ether_addr_error;
1230 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1231 goto exit;
44445cac 1232 }
b5d57fc8 1233 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1234 }
1235
7eb1bd81 1236 /* Tap devices must be brought down before setting the address. */
796223f5 1237 if (is_tap_netdev(netdev_)) {
4f9f3f21 1238 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1239 }
44445cac
PS
1240 error = set_etheraddr(netdev_get_name(netdev_), mac);
1241 if (!error || error == ENODEV) {
b5d57fc8
BP
1242 netdev->ether_addr_error = error;
1243 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1244 if (!error) {
b5d57fc8 1245 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 1246 }
8b61709d 1247 }
44445cac 1248
4f9f3f21
BP
1249 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1250 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1251 }
7eb1bd81 1252
86383816
BP
1253exit:
1254 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1255 return error;
1256}
1257
44445cac 1258/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1259static int
1260netdev_linux_get_etheraddr(const struct netdev *netdev_,
1261 uint8_t mac[ETH_ADDR_LEN])
1262{
b5d57fc8 1263 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1264 int error;
44445cac 1265
86383816 1266 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1267 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816
BP
1268 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1269 netdev->etheraddr);
b5d57fc8 1270 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1271 }
44445cac 1272
86383816
BP
1273 error = netdev->ether_addr_error;
1274 if (!error) {
b5d57fc8 1275 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
44445cac 1276 }
86383816 1277 ovs_mutex_unlock(&netdev->mutex);
44445cac 1278
86383816 1279 return error;
8b61709d
BP
1280}
1281
8b61709d 1282static int
73371c09 1283netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1284{
86383816
BP
1285 int error;
1286
b5d57fc8 1287 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1288 struct ifreq ifr;
90a6637d 1289
86383816 1290 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1291 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1292 netdev->mtu = ifr.ifr_mtu;
1293 netdev->cache_valid |= VALID_MTU;
8b61709d 1294 }
90a6637d 1295
86383816
BP
1296 error = netdev->netdev_mtu_error;
1297 if (!error) {
b5d57fc8 1298 *mtup = netdev->mtu;
90a6637d 1299 }
73371c09
BP
1300
1301 return error;
1302}
1303
1304/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1305 * in bytes, not including the hardware header; thus, this is typically 1500
1306 * bytes for Ethernet devices. */
1307static int
1308netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1309{
1310 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1311 int error;
1312
1313 ovs_mutex_lock(&netdev->mutex);
1314 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1315 ovs_mutex_unlock(&netdev->mutex);
1316
1317 return error;
8b61709d
BP
1318}
1319
9b020780
PS
1320/* Sets the maximum size of transmitted (MTU) for given device using linux
1321 * networking ioctl interface.
1322 */
1323static int
1324netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1325{
b5d57fc8 1326 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1327 struct ifreq ifr;
1328 int error;
1329
86383816 1330 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1331 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1332 error = netdev->netdev_mtu_error;
1333 if (error || netdev->mtu == mtu) {
1334 goto exit;
90a6637d 1335 }
b5d57fc8 1336 netdev->cache_valid &= ~VALID_MTU;
153e5481 1337 }
9b020780 1338 ifr.ifr_mtu = mtu;
259e0b1a
BP
1339 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1340 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1341 if (!error || error == ENODEV) {
b5d57fc8
BP
1342 netdev->netdev_mtu_error = error;
1343 netdev->mtu = ifr.ifr_mtu;
1344 netdev->cache_valid |= VALID_MTU;
9b020780 1345 }
86383816
BP
1346exit:
1347 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1348 return error;
9b020780
PS
1349}
1350
9ab3d9a3
BP
1351/* Returns the ifindex of 'netdev', if successful, as a positive number.
1352 * On failure, returns a negative errno value. */
1353static int
86383816 1354netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1355{
86383816 1356 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1357 int ifindex, error;
1358
86383816
BP
1359 ovs_mutex_lock(&netdev->mutex);
1360 error = get_ifindex(netdev_, &ifindex);
1361 ovs_mutex_unlock(&netdev->mutex);
1362
9ab3d9a3
BP
1363 return error ? -error : ifindex;
1364}
1365
8b61709d
BP
1366static int
1367netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1368{
b5d57fc8 1369 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1370
86383816 1371 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1372 if (netdev->miimon_interval > 0) {
1373 *carrier = netdev->miimon;
3a183124 1374 } else {
b5d57fc8 1375 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1376 }
86383816 1377 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1378
3a183124 1379 return 0;
8b61709d
BP
1380}
1381
65c3058c 1382static long long int
86383816 1383netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1384{
86383816
BP
1385 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1386 long long int carrier_resets;
1387
1388 ovs_mutex_lock(&netdev->mutex);
1389 carrier_resets = netdev->carrier_resets;
1390 ovs_mutex_unlock(&netdev->mutex);
1391
1392 return carrier_resets;
65c3058c
EJ
1393}
1394
63331829 1395static int
1670c579
EJ
1396netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1397 struct mii_ioctl_data *data)
63331829 1398{
63331829 1399 struct ifreq ifr;
782e6111 1400 int error;
63331829 1401
63331829 1402 memset(&ifr, 0, sizeof ifr);
782e6111 1403 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1404 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1405 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1406
782e6111
EJ
1407 return error;
1408}
1409
1410static int
1670c579 1411netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1412{
782e6111
EJ
1413 struct mii_ioctl_data data;
1414 int error;
63331829 1415
782e6111
EJ
1416 *miimon = false;
1417
1418 memset(&data, 0, sizeof data);
1670c579 1419 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1420 if (!error) {
1421 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1422 data.reg_num = MII_BMSR;
1670c579 1423 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1424 &data);
63331829
EJ
1425
1426 if (!error) {
782e6111 1427 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1428 } else {
1429 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1430 }
1431 } else {
1432 struct ethtool_cmd ecmd;
63331829
EJ
1433
1434 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1435 name);
1436
ab985a77 1437 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1438 memset(&ecmd, 0, sizeof ecmd);
1439 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1440 "ETHTOOL_GLINK");
1441 if (!error) {
782e6111
EJ
1442 struct ethtool_value eval;
1443
1444 memcpy(&eval, &ecmd, sizeof eval);
1445 *miimon = !!eval.data;
63331829
EJ
1446 } else {
1447 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1448 }
1449 }
1450
1451 return error;
1452}
1453
1670c579
EJ
1454static int
1455netdev_linux_set_miimon_interval(struct netdev *netdev_,
1456 long long int interval)
1457{
b5d57fc8 1458 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1459
86383816 1460 ovs_mutex_lock(&netdev->mutex);
1670c579 1461 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1462 if (netdev->miimon_interval != interval) {
19c8e9c1 1463 if (interval && !netdev->miimon_interval) {
812c272c 1464 atomic_count_inc(&miimon_cnt);
19c8e9c1 1465 } else if (!interval && netdev->miimon_interval) {
812c272c 1466 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1467 }
1468
b5d57fc8
BP
1469 netdev->miimon_interval = interval;
1470 timer_set_expired(&netdev->miimon_timer);
1670c579 1471 }
86383816 1472 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1473
1474 return 0;
1475}
1476
1477static void
1478netdev_linux_miimon_run(void)
1479{
1480 struct shash device_shash;
1481 struct shash_node *node;
1482
1483 shash_init(&device_shash);
b5d57fc8 1484 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1485 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1486 struct netdev *netdev = node->data;
1487 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1488 bool miimon;
1489
86383816
BP
1490 ovs_mutex_lock(&dev->mutex);
1491 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1492 netdev_linux_get_miimon(dev->up.name, &miimon);
1493 if (miimon != dev->miimon) {
1494 dev->miimon = miimon;
1495 netdev_linux_changed(dev, dev->ifi_flags, 0);
1496 }
1670c579 1497
86383816 1498 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1499 }
86383816 1500 ovs_mutex_unlock(&dev->mutex);
2f980d74 1501 netdev_close(netdev);
1670c579
EJ
1502 }
1503
1504 shash_destroy(&device_shash);
1505}
1506
1507static void
1508netdev_linux_miimon_wait(void)
1509{
1510 struct shash device_shash;
1511 struct shash_node *node;
1512
1513 shash_init(&device_shash);
b5d57fc8 1514 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1515 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1516 struct netdev *netdev = node->data;
1517 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1518
86383816 1519 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1520 if (dev->miimon_interval > 0) {
1521 timer_wait(&dev->miimon_timer);
1522 }
86383816 1523 ovs_mutex_unlock(&dev->mutex);
2f980d74 1524 netdev_close(netdev);
1670c579
EJ
1525 }
1526 shash_destroy(&device_shash);
1527}
1528
92df599c
JG
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1536
c060c4cf
EJ
1537/* Copies 'src' into 'dst', performing format conversion in the process.
1538 *
1539 * 'src' is allowed to be misaligned. */
1540static void
1541netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1542 const struct ovs_vport_stats *src)
1543{
6a54dedc
BP
1544 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1545 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1546 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1547 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1548 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1549 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1550 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1551 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1552 dst->multicast = 0;
1553 dst->collisions = 0;
1554 dst->rx_length_errors = 0;
1555 dst->rx_over_errors = 0;
1556 dst->rx_crc_errors = 0;
1557 dst->rx_frame_errors = 0;
1558 dst->rx_fifo_errors = 0;
1559 dst->rx_missed_errors = 0;
1560 dst->tx_aborted_errors = 0;
1561 dst->tx_carrier_errors = 0;
1562 dst->tx_fifo_errors = 0;
1563 dst->tx_heartbeat_errors = 0;
1564 dst->tx_window_errors = 0;
1565}
1566
1567static int
1568get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1569{
93451a0a 1570 struct dpif_netlink_vport reply;
c060c4cf
EJ
1571 struct ofpbuf *buf;
1572 int error;
1573
93451a0a 1574 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1575 if (error) {
1576 return error;
1577 } else if (!reply.stats) {
1578 ofpbuf_delete(buf);
1579 return EOPNOTSUPP;
1580 }
1581
1582 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1583
1584 ofpbuf_delete(buf);
1585
1586 return 0;
1587}
1588
f613a0d7
PS
1589static void
1590get_stats_via_vport(const struct netdev *netdev_,
1591 struct netdev_stats *stats)
8b61709d 1592{
b5d57fc8 1593 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1594
b5d57fc8
BP
1595 if (!netdev->vport_stats_error ||
1596 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1597 int error;
7fbef77a 1598
c060c4cf 1599 error = get_stats_via_vport__(netdev_, stats);
bcb1f5a1 1600 if (error && error != ENOENT) {
a57a8488 1601 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1602 "(%s)",
1603 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1604 }
b5d57fc8
BP
1605 netdev->vport_stats_error = error;
1606 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1607 }
f613a0d7 1608}
8b61709d 1609
f613a0d7
PS
1610/* Retrieves current device stats for 'netdev-linux'. */
1611static int
1612netdev_linux_get_stats(const struct netdev *netdev_,
1613 struct netdev_stats *stats)
1614{
b5d57fc8 1615 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1616 struct netdev_stats dev_stats;
1617 int error;
1618
86383816 1619 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1620 get_stats_via_vport(netdev_, stats);
35eef899 1621 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1622 if (error) {
86383816
BP
1623 if (!netdev->vport_stats_error) {
1624 error = 0;
f613a0d7 1625 }
86383816 1626 } else if (netdev->vport_stats_error) {
04c881eb 1627 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1628 *stats = dev_stats;
1629 } else {
04c881eb
AZ
1630 /* Use kernel netdev's packet and byte counts since vport's counters
1631 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1632 * enabled. */
1633 stats->rx_packets = dev_stats.rx_packets;
1634 stats->rx_bytes = dev_stats.rx_bytes;
1635 stats->tx_packets = dev_stats.tx_packets;
1636 stats->tx_bytes = dev_stats.tx_bytes;
1637
f613a0d7
PS
1638 stats->rx_errors += dev_stats.rx_errors;
1639 stats->tx_errors += dev_stats.tx_errors;
1640 stats->rx_dropped += dev_stats.rx_dropped;
1641 stats->tx_dropped += dev_stats.tx_dropped;
1642 stats->multicast += dev_stats.multicast;
1643 stats->collisions += dev_stats.collisions;
1644 stats->rx_length_errors += dev_stats.rx_length_errors;
1645 stats->rx_over_errors += dev_stats.rx_over_errors;
1646 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1647 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1648 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1649 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1650 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1651 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1652 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1653 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1654 stats->tx_window_errors += dev_stats.tx_window_errors;
1655 }
86383816
BP
1656 ovs_mutex_unlock(&netdev->mutex);
1657
1658 return error;
f613a0d7
PS
1659}
1660
1661/* Retrieves current device stats for 'netdev-tap' netdev or
1662 * netdev-internal. */
1663static int
15aee116 1664netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1665{
b5d57fc8 1666 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1667 struct netdev_stats dev_stats;
1668 int error;
1669
86383816 1670 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1671 get_stats_via_vport(netdev_, stats);
35eef899 1672 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1673 if (error) {
86383816
BP
1674 if (!netdev->vport_stats_error) {
1675 error = 0;
8b61709d 1676 }
86383816
BP
1677 } else if (netdev->vport_stats_error) {
1678 /* Transmit and receive stats will appear to be swapped relative to the
1679 * other ports since we are the one sending the data, not a remote
1680 * computer. For consistency, we swap them back here. This does not
1681 * apply if we are getting stats from the vport layer because it always
1682 * tracks stats from the perspective of the switch. */
fe6b0e03 1683
f613a0d7 1684 *stats = dev_stats;
92df599c
JG
1685 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1686 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1687 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1688 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1689 stats->rx_length_errors = 0;
1690 stats->rx_over_errors = 0;
1691 stats->rx_crc_errors = 0;
1692 stats->rx_frame_errors = 0;
1693 stats->rx_fifo_errors = 0;
1694 stats->rx_missed_errors = 0;
1695 stats->tx_aborted_errors = 0;
1696 stats->tx_carrier_errors = 0;
1697 stats->tx_fifo_errors = 0;
1698 stats->tx_heartbeat_errors = 0;
1699 stats->tx_window_errors = 0;
f613a0d7 1700 } else {
04c881eb
AZ
1701 /* Use kernel netdev's packet and byte counts since vport counters
1702 * do not reflect packet counts on the wire when GSO, TSO or GRO
1703 * are enabled. */
1704 stats->rx_packets = dev_stats.tx_packets;
1705 stats->rx_bytes = dev_stats.tx_bytes;
1706 stats->tx_packets = dev_stats.rx_packets;
1707 stats->tx_bytes = dev_stats.rx_bytes;
1708
f613a0d7
PS
1709 stats->rx_dropped += dev_stats.tx_dropped;
1710 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1711
f613a0d7
PS
1712 stats->rx_errors += dev_stats.tx_errors;
1713 stats->tx_errors += dev_stats.rx_errors;
1714
1715 stats->multicast += dev_stats.multicast;
1716 stats->collisions += dev_stats.collisions;
1717 }
86383816
BP
1718 ovs_mutex_unlock(&netdev->mutex);
1719
1720 return error;
8b61709d
BP
1721}
1722
bba1e6f3
PS
1723static int
1724netdev_internal_get_stats(const struct netdev *netdev_,
1725 struct netdev_stats *stats)
1726{
b5d57fc8 1727 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1728 int error;
bba1e6f3 1729
86383816 1730 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1731 get_stats_via_vport(netdev_, stats);
86383816
BP
1732 error = netdev->vport_stats_error;
1733 ovs_mutex_unlock(&netdev->mutex);
1734
1735 return error;
bba1e6f3
PS
1736}
1737
51f87458 1738static void
b5d57fc8 1739netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1740{
1741 struct ethtool_cmd ecmd;
6c038611 1742 uint32_t speed;
8b61709d
BP
1743 int error;
1744
b5d57fc8 1745 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1746 return;
1747 }
1748
ab985a77 1749 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1750 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1751 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1752 ETHTOOL_GSET, "ETHTOOL_GSET");
1753 if (error) {
51f87458 1754 goto out;
8b61709d
BP
1755 }
1756
1757 /* Supported features. */
b5d57fc8 1758 netdev->supported = 0;
8b61709d 1759 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1760 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1761 }
1762 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1763 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1764 }
1765 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1766 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1767 }
1768 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1769 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1770 }
1771 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1772 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1773 }
1774 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
b5d57fc8 1775 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1776 }
1777 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
b5d57fc8 1778 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1779 }
1780 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1781 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1782 }
1783 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1784 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1785 }
1786 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1787 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1788 }
1789 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1790 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1791 }
1792 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1793 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1794 }
1795
1796 /* Advertised features. */
b5d57fc8 1797 netdev->advertised = 0;
8b61709d 1798 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1799 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1800 }
1801 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1802 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1803 }
1804 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1805 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1806 }
1807 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1808 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1809 }
1810 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1811 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1812 }
1813 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
b5d57fc8 1814 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1815 }
1816 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
b5d57fc8 1817 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1818 }
1819 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1820 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1821 }
1822 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1823 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1824 }
1825 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1826 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1827 }
1828 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1829 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1830 }
1831 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1832 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1833 }
1834
1835 /* Current settings. */
2a529ead 1836 speed = ecmd.speed;
6c038611 1837 if (speed == SPEED_10) {
b5d57fc8 1838 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1839 } else if (speed == SPEED_100) {
b5d57fc8 1840 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1841 } else if (speed == SPEED_1000) {
b5d57fc8 1842 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1843 } else if (speed == SPEED_10000) {
b5d57fc8 1844 netdev->current = NETDEV_F_10GB_FD;
6c038611 1845 } else if (speed == 40000) {
b5d57fc8 1846 netdev->current = NETDEV_F_40GB_FD;
6c038611 1847 } else if (speed == 100000) {
b5d57fc8 1848 netdev->current = NETDEV_F_100GB_FD;
6c038611 1849 } else if (speed == 1000000) {
b5d57fc8 1850 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1851 } else {
b5d57fc8 1852 netdev->current = 0;
8b61709d
BP
1853 }
1854
1855 if (ecmd.port == PORT_TP) {
b5d57fc8 1856 netdev->current |= NETDEV_F_COPPER;
8b61709d 1857 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1858 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1859 }
1860
1861 if (ecmd.autoneg) {
b5d57fc8 1862 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1863 }
1864
51f87458 1865out:
b5d57fc8
BP
1866 netdev->cache_valid |= VALID_FEATURES;
1867 netdev->get_features_error = error;
51f87458
PS
1868}
1869
887ed8b2
BP
1870/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1871 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1872 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1873static int
1874netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1875 enum netdev_features *current,
1876 enum netdev_features *advertised,
1877 enum netdev_features *supported,
1878 enum netdev_features *peer)
51f87458 1879{
b5d57fc8 1880 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1881 int error;
51f87458 1882
86383816 1883 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1884 netdev_linux_read_features(netdev);
b5d57fc8
BP
1885 if (!netdev->get_features_error) {
1886 *current = netdev->current;
1887 *advertised = netdev->advertised;
1888 *supported = netdev->supported;
887ed8b2 1889 *peer = 0; /* XXX */
51f87458 1890 }
86383816
BP
1891 error = netdev->get_features_error;
1892 ovs_mutex_unlock(&netdev->mutex);
1893
1894 return error;
8b61709d
BP
1895}
1896
1897/* Set the features advertised by 'netdev' to 'advertise'. */
1898static int
86383816 1899netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 1900 enum netdev_features advertise)
8b61709d 1901{
86383816 1902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
1903 struct ethtool_cmd ecmd;
1904 int error;
1905
86383816
BP
1906 ovs_mutex_lock(&netdev->mutex);
1907
ab985a77 1908 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1909 memset(&ecmd, 0, sizeof ecmd);
86383816 1910 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
1911 ETHTOOL_GSET, "ETHTOOL_GSET");
1912 if (error) {
86383816 1913 goto exit;
8b61709d
BP
1914 }
1915
1916 ecmd.advertising = 0;
6c038611 1917 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1918 ecmd.advertising |= ADVERTISED_10baseT_Half;
1919 }
6c038611 1920 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1921 ecmd.advertising |= ADVERTISED_10baseT_Full;
1922 }
6c038611 1923 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1924 ecmd.advertising |= ADVERTISED_100baseT_Half;
1925 }
6c038611 1926 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1927 ecmd.advertising |= ADVERTISED_100baseT_Full;
1928 }
6c038611 1929 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1930 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1931 }
6c038611 1932 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1933 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1934 }
6c038611 1935 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1936 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1937 }
6c038611 1938 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1939 ecmd.advertising |= ADVERTISED_TP;
1940 }
6c038611 1941 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1942 ecmd.advertising |= ADVERTISED_FIBRE;
1943 }
6c038611 1944 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1945 ecmd.advertising |= ADVERTISED_Autoneg;
1946 }
6c038611 1947 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1948 ecmd.advertising |= ADVERTISED_Pause;
1949 }
6c038611 1950 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1951 ecmd.advertising |= ADVERTISED_Asym_Pause;
1952 }
ab985a77 1953 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
1954 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1955 ETHTOOL_SSET, "ETHTOOL_SSET");
1956
1957exit:
1958 ovs_mutex_unlock(&netdev->mutex);
1959 return error;
8b61709d
BP
1960}
1961
f8500004
JP
1962/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1963 * successful, otherwise a positive errno value. */
8b61709d 1964static int
b5d57fc8 1965netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
1966 uint32_t kbits_rate, uint32_t kbits_burst)
1967{
b5d57fc8
BP
1968 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1969 const char *netdev_name = netdev_get_name(netdev_);
f8500004 1970 int error;
8b61709d 1971
80a86fbe
BP
1972 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1973 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1974 : kbits_burst); /* Stick with user-specified value. */
1975
86383816 1976 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1977 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
1978 error = netdev->netdev_policing_error;
1979 if (error || (netdev->kbits_rate == kbits_rate &&
1980 netdev->kbits_burst == kbits_burst)) {
c9f71668 1981 /* Assume that settings haven't changed since we last set them. */
86383816 1982 goto out;
c9f71668 1983 }
b5d57fc8 1984 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1985 }
1986
ac8c3412 1987 COVERAGE_INC(netdev_set_policing);
f8500004 1988 /* Remove any existing ingress qdisc. */
b5d57fc8 1989 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
1990 if (error) {
1991 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 1992 netdev_name, ovs_strerror(error));
c9f71668 1993 goto out;
f8500004
JP
1994 }
1995
8b61709d 1996 if (kbits_rate) {
b5d57fc8 1997 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
1998 if (error) {
1999 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2000 netdev_name, ovs_strerror(error));
c9f71668 2001 goto out;
8b61709d
BP
2002 }
2003
b5d57fc8 2004 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2005 if (error){
2006 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2007 netdev_name, ovs_strerror(error));
c9f71668 2008 goto out;
8b61709d 2009 }
8b61709d
BP
2010 }
2011
b5d57fc8
BP
2012 netdev->kbits_rate = kbits_rate;
2013 netdev->kbits_burst = kbits_burst;
f8500004 2014
c9f71668
PS
2015out:
2016 if (!error || error == ENODEV) {
b5d57fc8
BP
2017 netdev->netdev_policing_error = error;
2018 netdev->cache_valid |= VALID_POLICING;
c9f71668 2019 }
86383816 2020 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2021 return error;
8b61709d
BP
2022}
2023
c1c9c9c4
BP
2024static int
2025netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2026 struct sset *types)
c1c9c9c4 2027{
559eb230 2028 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2029
2030 for (opsp = tcs; *opsp != NULL; opsp++) {
2031 const struct tc_ops *ops = *opsp;
2032 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2033 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2034 }
2035 }
2036 return 0;
2037}
2038
2039static const struct tc_ops *
2040tc_lookup_ovs_name(const char *name)
2041{
559eb230 2042 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2043
2044 for (opsp = tcs; *opsp != NULL; opsp++) {
2045 const struct tc_ops *ops = *opsp;
2046 if (!strcmp(name, ops->ovs_name)) {
2047 return ops;
2048 }
2049 }
2050 return NULL;
2051}
2052
2053static const struct tc_ops *
2054tc_lookup_linux_name(const char *name)
2055{
559eb230 2056 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2057
2058 for (opsp = tcs; *opsp != NULL; opsp++) {
2059 const struct tc_ops *ops = *opsp;
2060 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2061 return ops;
2062 }
2063 }
2064 return NULL;
2065}
2066
93b13be8 2067static struct tc_queue *
b5d57fc8 2068tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2069 size_t hash)
2070{
b5d57fc8 2071 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2072 struct tc_queue *queue;
2073
b5d57fc8 2074 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2075 if (queue->queue_id == queue_id) {
2076 return queue;
2077 }
2078 }
2079 return NULL;
2080}
2081
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is
 * none.  The bucket is chosen with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t bucket_hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, bucket_hash);
}
2087
c1c9c9c4
BP
2088static int
2089netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2090 const char *type,
2091 struct netdev_qos_capabilities *caps)
2092{
2093 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2094 if (!ops) {
2095 return EOPNOTSUPP;
2096 }
2097 caps->n_queues = ops->n_queues;
2098 return 0;
2099}
2100
2101static int
b5d57fc8 2102netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2103 const char **typep, struct smap *details)
c1c9c9c4 2104{
b5d57fc8 2105 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2106 int error;
2107
86383816 2108 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2109 error = tc_query_qdisc(netdev_);
86383816
BP
2110 if (!error) {
2111 *typep = netdev->tc->ops->ovs_name;
2112 error = (netdev->tc->ops->qdisc_get
2113 ? netdev->tc->ops->qdisc_get(netdev_, details)
2114 : 0);
c1c9c9c4 2115 }
86383816 2116 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2117
86383816 2118 return error;
c1c9c9c4
BP
2119}
2120
2121static int
b5d57fc8 2122netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2123 const char *type, const struct smap *details)
c1c9c9c4 2124{
b5d57fc8 2125 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2126 const struct tc_ops *new_ops;
2127 int error;
2128
2129 new_ops = tc_lookup_ovs_name(type);
2130 if (!new_ops || !new_ops->tc_install) {
2131 return EOPNOTSUPP;
2132 }
2133
86383816 2134 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2135 error = tc_query_qdisc(netdev_);
c1c9c9c4 2136 if (error) {
86383816 2137 goto exit;
c1c9c9c4
BP
2138 }
2139
b5d57fc8 2140 if (new_ops == netdev->tc->ops) {
86383816 2141 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2142 } else {
2143 /* Delete existing qdisc. */
b5d57fc8 2144 error = tc_del_qdisc(netdev_);
c1c9c9c4 2145 if (error) {
86383816 2146 goto exit;
c1c9c9c4 2147 }
b5d57fc8 2148 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2149
2150 /* Install new qdisc. */
b5d57fc8
BP
2151 error = new_ops->tc_install(netdev_, details);
2152 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2153 }
86383816
BP
2154
2155exit:
2156 ovs_mutex_unlock(&netdev->mutex);
2157 return error;
c1c9c9c4
BP
2158}
2159
2160static int
b5d57fc8 2161netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2162 unsigned int queue_id, struct smap *details)
c1c9c9c4 2163{
b5d57fc8 2164 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2165 int error;
2166
86383816 2167 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2168 error = tc_query_qdisc(netdev_);
86383816 2169 if (!error) {
b5d57fc8 2170 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2171 error = (queue
b5d57fc8 2172 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2173 : ENOENT);
c1c9c9c4 2174 }
86383816
BP
2175 ovs_mutex_unlock(&netdev->mutex);
2176
2177 return error;
c1c9c9c4
BP
2178}
2179
2180static int
b5d57fc8 2181netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2182 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2183{
b5d57fc8 2184 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2185 int error;
2186
86383816 2187 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2188 error = tc_query_qdisc(netdev_);
86383816
BP
2189 if (!error) {
2190 error = (queue_id < netdev->tc->ops->n_queues
2191 && netdev->tc->ops->class_set
2192 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2193 : EINVAL);
c1c9c9c4 2194 }
86383816 2195 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2196
86383816 2197 return error;
c1c9c9c4
BP
2198}
2199
2200static int
b5d57fc8 2201netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2202{
b5d57fc8 2203 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2204 int error;
2205
86383816 2206 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2207 error = tc_query_qdisc(netdev_);
86383816
BP
2208 if (!error) {
2209 if (netdev->tc->ops->class_delete) {
2210 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2211 error = (queue
2212 ? netdev->tc->ops->class_delete(netdev_, queue)
2213 : ENOENT);
2214 } else {
2215 error = EINVAL;
2216 }
c1c9c9c4 2217 }
86383816
BP
2218 ovs_mutex_unlock(&netdev->mutex);
2219
2220 return error;
c1c9c9c4
BP
2221}
2222
2223static int
b5d57fc8 2224netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2225 unsigned int queue_id,
2226 struct netdev_queue_stats *stats)
2227{
b5d57fc8 2228 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2229 int error;
2230
86383816 2231 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2232 error = tc_query_qdisc(netdev_);
86383816
BP
2233 if (!error) {
2234 if (netdev->tc->ops->class_get_stats) {
2235 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2236 if (queue) {
2237 stats->created = queue->created;
2238 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2239 stats);
2240 } else {
2241 error = ENOENT;
2242 }
2243 } else {
2244 error = EOPNOTSUPP;
6dc34a0d 2245 }
c1c9c9c4 2246 }
86383816
BP
2247 ovs_mutex_unlock(&netdev->mutex);
2248
2249 return error;
c1c9c9c4
BP
2250}
2251
d57695d7
JS
2252struct queue_dump_state {
2253 struct nl_dump dump;
2254 struct ofpbuf buf;
2255};
2256
23a98ffe 2257static bool
d57695d7 2258start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2259{
2260 struct ofpbuf request;
2261 struct tcmsg *tcmsg;
2262
2263 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2264 if (!tcmsg) {
2265 return false;
2266 }
3c4de644 2267 tcmsg->tcm_parent = 0;
d57695d7 2268 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2269 ofpbuf_uninit(&request);
d57695d7
JS
2270
2271 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2272 return true;
c1c9c9c4
BP
2273}
2274
d57695d7
JS
2275static int
2276finish_queue_dump(struct queue_dump_state *state)
2277{
2278 ofpbuf_uninit(&state->buf);
2279 return nl_dump_done(&state->dump);
2280}
2281
89454bf4
BP
/* Iteration state for netdev_linux_queue_dump_start(), _next() and _done(). */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue IDs taken at dump start. */
    size_t cur_queue;           /* Index in 'queues' of next ID to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2287
c1c9c9c4 2288static int
89454bf4 2289netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2290{
89454bf4 2291 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2292 int error;
2293
86383816 2294 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2295 error = tc_query_qdisc(netdev_);
86383816
BP
2296 if (!error) {
2297 if (netdev->tc->ops->class_get) {
89454bf4
BP
2298 struct netdev_linux_queue_state *state;
2299 struct tc_queue *queue;
2300 size_t i;
2301
2302 *statep = state = xmalloc(sizeof *state);
2303 state->n_queues = hmap_count(&netdev->tc->queues);
2304 state->cur_queue = 0;
2305 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2306
2307 i = 0;
2308 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2309 state->queues[i++] = queue->queue_id;
86383816 2310 }
c1c9c9c4 2311 } else {
86383816 2312 error = EOPNOTSUPP;
c1c9c9c4
BP
2313 }
2314 }
86383816 2315 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2316
86383816 2317 return error;
c1c9c9c4
BP
2318}
2319
89454bf4
BP
2320static int
2321netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2322 unsigned int *queue_idp, struct smap *details)
2323{
2324 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2325 struct netdev_linux_queue_state *state = state_;
2326 int error = EOF;
2327
2328 ovs_mutex_lock(&netdev->mutex);
2329 while (state->cur_queue < state->n_queues) {
2330 unsigned int queue_id = state->queues[state->cur_queue++];
2331 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2332
2333 if (queue) {
2334 *queue_idp = queue_id;
2335 error = netdev->tc->ops->class_get(netdev_, queue, details);
2336 break;
2337 }
2338 }
2339 ovs_mutex_unlock(&netdev->mutex);
2340
2341 return error;
2342}
2343
2344static int
2345netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2346 void *state_)
2347{
2348 struct netdev_linux_queue_state *state = state_;
2349
2350 free(state->queues);
2351 free(state);
2352 return 0;
2353}
2354
c1c9c9c4 2355static int
b5d57fc8 2356netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2357 netdev_dump_queue_stats_cb *cb, void *aux)
2358{
b5d57fc8 2359 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2360 int error;
2361
86383816 2362 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2363 error = tc_query_qdisc(netdev_);
86383816 2364 if (!error) {
d57695d7 2365 struct queue_dump_state state;
c1c9c9c4 2366
86383816
BP
2367 if (!netdev->tc->ops->class_dump_stats) {
2368 error = EOPNOTSUPP;
d57695d7 2369 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2370 error = ENODEV;
2371 } else {
2372 struct ofpbuf msg;
2373 int retval;
2374
d57695d7 2375 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2376 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2377 cb, aux);
2378 if (retval) {
2379 error = retval;
2380 }
2381 }
2382
d57695d7 2383 retval = finish_queue_dump(&state);
86383816
BP
2384 if (retval) {
2385 error = retval;
2386 }
c1c9c9c4
BP
2387 }
2388 }
86383816 2389 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2390
86383816 2391 return error;
c1c9c9c4
BP
2392}
2393
8b61709d 2394static int
f1acd62b
BP
2395netdev_linux_get_in4(const struct netdev *netdev_,
2396 struct in_addr *address, struct in_addr *netmask)
8b61709d 2397{
b5d57fc8 2398 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2399 int error;
149f577a 2400
86383816 2401 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2402 if (!(netdev->cache_valid & VALID_IN4)) {
b5d57fc8 2403 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
8b61709d 2404 SIOCGIFADDR, "SIOCGIFADDR");
86383816
BP
2405 if (!error) {
2406 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2407 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2408 if (!error) {
2409 netdev->cache_valid |= VALID_IN4;
2410 }
8b61709d 2411 }
86383816
BP
2412 } else {
2413 error = 0;
2414 }
8b61709d 2415
86383816
BP
2416 if (!error) {
2417 if (netdev->address.s_addr != INADDR_ANY) {
2418 *address = netdev->address;
2419 *netmask = netdev->netmask;
2420 } else {
2421 error = EADDRNOTAVAIL;
f1acd62b 2422 }
8b61709d 2423 }
86383816
BP
2424 ovs_mutex_unlock(&netdev->mutex);
2425
2426 return error;
8b61709d
BP
2427}
2428
8b61709d 2429static int
f1acd62b
BP
2430netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2431 struct in_addr netmask)
8b61709d 2432{
b5d57fc8 2433 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2434 int error;
2435
86383816 2436 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2437 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2438 if (!error) {
b5d57fc8
BP
2439 netdev->cache_valid |= VALID_IN4;
2440 netdev->address = address;
2441 netdev->netmask = netmask;
f1acd62b 2442 if (address.s_addr != INADDR_ANY) {
8b61709d 2443 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2444 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2445 }
2446 }
86383816
BP
2447 ovs_mutex_unlock(&netdev->mutex);
2448
8b61709d
BP
2449 return error;
2450}
2451
/* Parses 'line' as scanned from /proc/net/if_inet6: 16 two-digit hex bytes
 * forming an IPv6 address, four hex fields that are ignored, and an interface
 * name of at most 16 characters.  Stores the address in '*in6' and the name in
 * 'ifname'.  Returns the result of ovs_scan(). */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octets = in6->s6_addr;
    bool parsed;

#define X8 "%2"SCNx8
    parsed = ovs_scan(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &octets[0], &octets[1], &octets[2], &octets[3],
                      &octets[4], &octets[5], &octets[6], &octets[7],
                      &octets[8], &octets[9], &octets[10], &octets[11],
                      &octets[12], &octets[13], &octets[14], &octets[15],
                      ifname);
    return parsed;
}
2467
2468/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2469 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2470static int
2471netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2472{
b5d57fc8 2473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
2474
2475 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2476 if (!(netdev->cache_valid & VALID_IN6)) {
8b61709d
BP
2477 FILE *file;
2478 char line[128];
2479
b5d57fc8 2480 netdev->in6 = in6addr_any;
8b61709d
BP
2481
2482 file = fopen("/proc/net/if_inet6", "r");
2483 if (file != NULL) {
2484 const char *name = netdev_get_name(netdev_);
2485 while (fgets(line, sizeof line, file)) {
2a022368 2486 struct in6_addr in6_tmp;
8b61709d 2487 char ifname[16 + 1];
2a022368 2488 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2489 && !strcmp(name, ifname))
2490 {
b5d57fc8 2491 netdev->in6 = in6_tmp;
8b61709d
BP
2492 break;
2493 }
2494 }
2495 fclose(file);
2496 }
b5d57fc8 2497 netdev->cache_valid |= VALID_IN6;
8b61709d 2498 }
b5d57fc8 2499 *in6 = netdev->in6;
86383816
BP
2500 ovs_mutex_unlock(&netdev->mutex);
2501
8b61709d
BP
2502 return 0;
2503}
2504
/* Initializes '*sa' as an AF_INET socket address holding 'addr' with port 0.
 * All other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memcpy(sa, &in4, sizeof in4);
}
2517
2518static int
2519do_set_addr(struct netdev *netdev,
2520 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2521{
2522 struct ifreq ifr;
149f577a 2523
259e0b1a
BP
2524 make_in4_sockaddr(&ifr.ifr_addr, addr);
2525 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2526 ioctl_name);
8b61709d
BP
2527}
2528
2529/* Adds 'router' as a default IP gateway. */
2530static int
67a4917b 2531netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2532{
2533 struct in_addr any = { INADDR_ANY };
2534 struct rtentry rt;
2535 int error;
2536
2537 memset(&rt, 0, sizeof rt);
2538 make_in4_sockaddr(&rt.rt_dst, any);
2539 make_in4_sockaddr(&rt.rt_gateway, router);
2540 make_in4_sockaddr(&rt.rt_genmask, any);
2541 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2542 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2543 if (error) {
10a89ef0 2544 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2545 }
2546 return error;
2547}
2548
f1acd62b
BP
2549static int
2550netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2551 char **netdev_name)
2552{
2553 static const char fn[] = "/proc/net/route";
2554 FILE *stream;
2555 char line[256];
2556 int ln;
2557
2558 *netdev_name = NULL;
2559 stream = fopen(fn, "r");
2560 if (stream == NULL) {
10a89ef0 2561 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2562 return errno;
2563 }
2564
2565 ln = 0;
2566 while (fgets(line, sizeof line, stream)) {
2567 if (++ln >= 2) {
2568 char iface[17];
dbba996b 2569 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2570 int refcnt, metric, mtu;
2571 unsigned int flags, use, window, irtt;
2572
c2c28dfd
BP
2573 if (!ovs_scan(line,
2574 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2575 " %d %u %u\n",
2576 iface, &dest, &gateway, &flags, &refcnt,
2577 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2578 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2579 fn, ln, line);
2580 continue;
2581 }
2582 if (!(flags & RTF_UP)) {
2583 /* Skip routes that aren't up. */
2584 continue;
2585 }
2586
2587 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2588 * network byte order, so we don't need need any endian
f1acd62b
BP
2589 * conversions here. */
2590 if ((dest & mask) == (host->s_addr & mask)) {
2591 if (!gateway) {
2592 /* The host is directly reachable. */
2593 next_hop->s_addr = 0;
2594 } else {
2595 /* To reach the host, we must go through a gateway. */
2596 next_hop->s_addr = gateway;
2597 }
2598 *netdev_name = xstrdup(iface);
2599 fclose(stream);
2600 return 0;
2601 }
2602 }
2603 }
2604
2605 fclose(stream);
2606 return ENXIO;
2607}
2608
e210037e 2609static int
b5d57fc8 2610netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2611{
b5d57fc8 2612 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2613 int error = 0;
2614
86383816 2615 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2616 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2617 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2618
2619 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2620 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2621 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2622 cmd,
2623 ETHTOOL_GDRVINFO,
2624 "ETHTOOL_GDRVINFO");
2625 if (!error) {
b5d57fc8 2626 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2627 }
2628 }
e210037e 2629
e210037e 2630 if (!error) {
b5d57fc8
BP
2631 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2632 smap_add(smap, "driver_version", netdev->drvinfo.version);
2633 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2634 }
86383816
BP
2635 ovs_mutex_unlock(&netdev->mutex);
2636
e210037e
AE
2637 return error;
2638}
2639
4f925bd3 2640static int
275707c3
EJ
2641netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2642 struct smap *smap)
4f925bd3 2643{
79f1cbe9 2644 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2645 return 0;
2646}
2647
8b61709d
BP
2648/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2649 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2650 * returns 0. Otherwise, it returns a positive errno value; in particular,
2651 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2652static int
2653netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2654 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2655{
2656 struct arpreq r;
c100e025 2657 struct sockaddr_in sin;
8b61709d
BP
2658 int retval;
2659
2660 memset(&r, 0, sizeof r);
f2cc621b 2661 memset(&sin, 0, sizeof sin);
c100e025
BP
2662 sin.sin_family = AF_INET;
2663 sin.sin_addr.s_addr = ip;
2664 sin.sin_port = 0;
2665 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2666 r.arp_ha.sa_family = ARPHRD_ETHER;
2667 r.arp_flags = 0;
71d7c22f 2668 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2669 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2670 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2671 if (!retval) {
2672 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2673 } else if (retval != ENXIO) {
2674 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2675 netdev_get_name(netdev), IP_ARGS(ip),
2676 ovs_strerror(retval));
8b61709d
BP
2677 }
2678 return retval;
2679}
2680
2681static int
2682nd_to_iff_flags(enum netdev_flags nd)
2683{
2684 int iff = 0;
2685 if (nd & NETDEV_UP) {
2686 iff |= IFF_UP;
2687 }
2688 if (nd & NETDEV_PROMISC) {
2689 iff |= IFF_PROMISC;
2690 }
7ba19d41
AC
2691 if (nd & NETDEV_LOOPBACK) {
2692 iff |= IFF_LOOPBACK;
2693 }
8b61709d
BP
2694 return iff;
2695}
2696
2697static int
2698iff_to_nd_flags(int iff)
2699{
2700 enum netdev_flags nd = 0;
2701 if (iff & IFF_UP) {
2702 nd |= NETDEV_UP;
2703 }
2704 if (iff & IFF_PROMISC) {
2705 nd |= NETDEV_PROMISC;
2706 }
7ba19d41
AC
2707 if (iff & IFF_LOOPBACK) {
2708 nd |= NETDEV_LOOPBACK;
2709 }
8b61709d
BP
2710 return nd;
2711}
2712
2713static int
4f9f3f21
BP
2714update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2715 enum netdev_flags on, enum netdev_flags *old_flagsp)
2716 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2717{
2718 int old_flags, new_flags;
c37d4da4
EJ
2719 int error = 0;
2720
b5d57fc8 2721 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2722 *old_flagsp = iff_to_nd_flags(old_flags);
2723 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2724 if (new_flags != old_flags) {
4f9f3f21
BP
2725 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2726 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2727 }
4f9f3f21
BP
2728
2729 return error;
2730}
2731
2732static int
2733netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2734 enum netdev_flags on, enum netdev_flags *old_flagsp)
2735{
2736 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2737 int error;
2738
2739 ovs_mutex_lock(&netdev->mutex);
2740 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2741 ovs_mutex_unlock(&netdev->mutex);
2742
8b61709d
BP
2743 return error;
2744}
2745
/* Expands to a brace-enclosed, positional initializer used below to define
 * the 'struct netdev_class' instances in this file.
 *
 * Only five slots vary between instances and are supplied as arguments: NAME,
 * CONSTRUCT, GET_STATS, GET_FEATURES and GET_STATUS.  Every other slot is
 * either NULL or one of the shared netdev_linux_* functions.
 *
 * The entries are positional, so their order must not be changed
 * independently of the structure they initialize. */
2f9dd77f 2746#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
51f87458 2747 GET_FEATURES, GET_STATUS) \
c3827f61
BP
2748{ \
2749 NAME, \
2750 \
259e0b1a 2751 NULL, \
c3827f61
BP
2752 netdev_linux_run, \
2753 netdev_linux_wait, \
2754 \
9dc63482
BP
2755 netdev_linux_alloc, \
2756 CONSTRUCT, \
2757 netdev_linux_destruct, \
2758 netdev_linux_dealloc, \
de5cdb90 2759 NULL, /* get_config */ \
6d9e6eb4 2760 NULL, /* set_config */ \
f431bf7d 2761 NULL, /* get_tunnel_config */ \
a36de779
PS
2762 NULL, /* build header */ \
2763 NULL, /* push header */ \
2764 NULL, /* pop header */ \
7dec44fe 2765 NULL, /* get_numa_id */ \
5496878c 2766 NULL, /* set_multiq */ \
c3827f61 2767 \
c3827f61
BP
2768 netdev_linux_send, \
2769 netdev_linux_send_wait, \
2770 \
2771 netdev_linux_set_etheraddr, \
2772 netdev_linux_get_etheraddr, \
2773 netdev_linux_get_mtu, \
9b020780 2774 netdev_linux_set_mtu, \
c3827f61
BP
2775 netdev_linux_get_ifindex, \
2776 netdev_linux_get_carrier, \
65c3058c 2777 netdev_linux_get_carrier_resets, \
1670c579 2778 netdev_linux_set_miimon_interval, \
f613a0d7 2779 GET_STATS, \
c3827f61 2780 \
51f87458 2781 GET_FEATURES, \
c3827f61 2782 netdev_linux_set_advertisements, \
c3827f61
BP
2783 \
2784 netdev_linux_set_policing, \
2785 netdev_linux_get_qos_types, \
2786 netdev_linux_get_qos_capabilities, \
2787 netdev_linux_get_qos, \
2788 netdev_linux_set_qos, \
2789 netdev_linux_get_queue, \
2790 netdev_linux_set_queue, \
2791 netdev_linux_delete_queue, \
2792 netdev_linux_get_queue_stats, \
89454bf4
BP
2793 netdev_linux_queue_dump_start, \
2794 netdev_linux_queue_dump_next, \
2795 netdev_linux_queue_dump_done, \
c3827f61
BP
2796 netdev_linux_dump_queue_stats, \
2797 \
2798 netdev_linux_get_in4, \
2799 netdev_linux_set_in4, \
2800 netdev_linux_get_in6, \
2801 netdev_linux_add_router, \
2802 netdev_linux_get_next_hop, \
4f925bd3 2803 GET_STATUS, \
c3827f61
BP
2804 netdev_linux_arp_lookup, \
2805 \
2806 netdev_linux_update_flags, \
2807 \
f7791740
PS
2808 netdev_linux_rxq_alloc, \
2809 netdev_linux_rxq_construct, \
2810 netdev_linux_rxq_destruct, \
2811 netdev_linux_rxq_dealloc, \
2812 netdev_linux_rxq_recv, \
2813 netdev_linux_rxq_wait, \
2814 netdev_linux_rxq_drain, \
c3827f61
BP
2815}
2816
2817const struct netdev_class netdev_linux_class =
2818 NETDEV_LINUX_CLASS(
2819 "system",
9dc63482 2820 netdev_linux_construct,
f613a0d7 2821 netdev_linux_get_stats,
51f87458 2822 netdev_linux_get_features,
275707c3 2823 netdev_linux_get_status);
c3827f61
BP
2824
2825const struct netdev_class netdev_tap_class =
2826 NETDEV_LINUX_CLASS(
2827 "tap",
9dc63482 2828 netdev_linux_construct_tap,
bba1e6f3 2829 netdev_tap_get_stats,
51f87458 2830 netdev_linux_get_features,
275707c3 2831 netdev_linux_get_status);
c3827f61
BP
2832
2833const struct netdev_class netdev_internal_class =
2834 NETDEV_LINUX_CLASS(
2835 "internal",
9dc63482 2836 netdev_linux_construct,
bba1e6f3 2837 netdev_internal_get_stats,
51f87458 2838 NULL, /* get_features */
275707c3 2839 netdev_internal_get_status);
8b61709d 2840\f
677d9158
JV
2841
2842#define CODEL_N_QUEUES 0x0000
2843
2f4298ce
BP
2844/* In sufficiently new kernel headers these are defined as enums in
2845 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2846 * kernels. (This overrides any enum definition in the header file but that's
2847 * harmless.) */
2848#define TCA_CODEL_TARGET 1
2849#define TCA_CODEL_LIMIT 2
2850#define TCA_CODEL_INTERVAL 3
2851
677d9158
JV
2852struct codel {
2853 struct tc tc;
2854 uint32_t target;
2855 uint32_t limit;
2856 uint32_t interval;
2857};
2858
2859static struct codel *
2860codel_get__(const struct netdev *netdev_)
2861{
2862 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2863 return CONTAINER_OF(netdev->tc, struct codel, tc);
2864}
2865
2866static void
2867codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2868 uint32_t interval)
2869{
2870 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2871 struct codel *codel;
2872
2873 codel = xmalloc(sizeof *codel);
2874 tc_init(&codel->tc, &tc_ops_codel);
2875 codel->target = target;
2876 codel->limit = limit;
2877 codel->interval = interval;
2878
2879 netdev->tc = &codel->tc;
2880}
2881
2882static int
2883codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2884 uint32_t interval)
2885{
2886 size_t opt_offset;
2887 struct ofpbuf request;
2888 struct tcmsg *tcmsg;
2889 uint32_t otarget, olimit, ointerval;
2890 int error;
2891
2892 tc_del_qdisc(netdev);
2893
2894 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2895 NLM_F_EXCL | NLM_F_CREATE, &request);
2896 if (!tcmsg) {
2897 return ENODEV;
2898 }
2899 tcmsg->tcm_handle = tc_make_handle(1, 0);
2900 tcmsg->tcm_parent = TC_H_ROOT;
2901
2902 otarget = target ? target : 5000;
2903 olimit = limit ? limit : 10240;
2904 ointerval = interval ? interval : 100000;
2905
2906 nl_msg_put_string(&request, TCA_KIND, "codel");
2907 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2908 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2909 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2910 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2911 nl_msg_end_nested(&request, opt_offset);
2912
2913 error = tc_transact(&request, NULL);
2914 if (error) {
2915 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2916 "target %u, limit %u, interval %u error %d(%s)",
2917 netdev_get_name(netdev),
2918 otarget, olimit, ointerval,
2919 error, ovs_strerror(error));
2920 }
2921 return error;
2922}
2923
2924static void
2925codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2926 const struct smap *details, struct codel *codel)
2927{
2928 const char *target_s;
2929 const char *limit_s;
2930 const char *interval_s;
2931
2932 target_s = smap_get(details, "target");
2933 limit_s = smap_get(details, "limit");
2934 interval_s = smap_get(details, "interval");
2935
2936 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2937 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2938 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2939
2940 if (!codel->target) {
2941 codel->target = 5000;
2942 }
2943 if (!codel->limit) {
2944 codel->limit = 10240;
2945 }
2946 if (!codel->interval) {
2947 codel->interval = 100000;
2948 }
2949}
2950
2951static int
2952codel_tc_install(struct netdev *netdev, const struct smap *details)
2953{
2954 int error;
2955 struct codel codel;
2956
2957 codel_parse_qdisc_details__(netdev, details, &codel);
2958 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2959 codel.interval);
2960 if (!error) {
2961 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2962 }
2963 return error;
2964}
2965
2966static int
2967codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2968{
2969 static const struct nl_policy tca_codel_policy[] = {
2970 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2971 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2972 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2973 };
2974
2975 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2976
2977 if (!nl_parse_nested(nl_options, tca_codel_policy,
2978 attrs, ARRAY_SIZE(tca_codel_policy))) {
2979 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2980 return EPROTO;
2981 }
2982
2983 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2984 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2985 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
2986 return 0;
2987}
2988
2989static int
2990codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2991{
2992 struct nlattr *nlattr;
2993 const char * kind;
2994 int error;
2995 struct codel codel;
2996
2997 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
2998 if (error != 0) {
2999 return error;
3000 }
3001
3002 error = codel_parse_tca_options__(nlattr, &codel);
3003 if (error != 0) {
3004 return error;
3005 }
3006
3007 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3008 return 0;
3009}
3010
3011
3012static void
3013codel_tc_destroy(struct tc *tc)
3014{
3015 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3016 tc_destroy(tc);
3017 free(codel);
3018}
3019
3020static int
3021codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3022{
3023 const struct codel *codel = codel_get__(netdev);
3024 smap_add_format(details, "target", "%u", codel->target);
3025 smap_add_format(details, "limit", "%u", codel->limit);
3026 smap_add_format(details, "interval", "%u", codel->interval);
3027 return 0;
3028}
3029
3030static int
3031codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3032{
3033 struct codel codel;
3034
3035 codel_parse_qdisc_details__(netdev, details, &codel);
3036 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3037 codel_get__(netdev)->target = codel.target;
3038 codel_get__(netdev)->limit = codel.limit;
3039 codel_get__(netdev)->interval = codel.interval;
3040 return 0;
3041}
3042
3043static const struct tc_ops tc_ops_codel = {
3044 "codel", /* linux_name */
3045 "linux-codel", /* ovs_name */
3046 CODEL_N_QUEUES, /* n_queues */
3047 codel_tc_install,
3048 codel_tc_load,
3049 codel_tc_destroy,
3050 codel_qdisc_get,
3051 codel_qdisc_set,
3052 NULL,
3053 NULL,
3054 NULL,
3055 NULL,
3056 NULL
3057};
3058\f
3059/* FQ-CoDel traffic control class. */
3060
3061#define FQCODEL_N_QUEUES 0x0000
3062
2f4298ce
BP
3063/* In sufficiently new kernel headers these are defined as enums in
3064 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3065 * kernels. (This overrides any enum definition in the header file but that's
3066 * harmless.) */
3067#define TCA_FQ_CODEL_TARGET 1
3068#define TCA_FQ_CODEL_LIMIT 2
3069#define TCA_FQ_CODEL_INTERVAL 3
3070#define TCA_FQ_CODEL_ECN 4
3071#define TCA_FQ_CODEL_FLOWS 5
3072#define TCA_FQ_CODEL_QUANTUM 6
3073
677d9158
JV
3074struct fqcodel {
3075 struct tc tc;
3076 uint32_t target;
3077 uint32_t limit;
3078 uint32_t interval;
3079 uint32_t flows;
3080 uint32_t quantum;
3081};
3082
3083static struct fqcodel *
3084fqcodel_get__(const struct netdev *netdev_)
3085{
3086 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3087 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3088}
3089
3090static void
3091fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3092 uint32_t interval, uint32_t flows, uint32_t quantum)
3093{
3094 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3095 struct fqcodel *fqcodel;
3096
3097 fqcodel = xmalloc(sizeof *fqcodel);
3098 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3099 fqcodel->target = target;
3100 fqcodel->limit = limit;
3101 fqcodel->interval = interval;
3102 fqcodel->flows = flows;
3103 fqcodel->quantum = quantum;
3104
3105 netdev->tc = &fqcodel->tc;
3106}
3107
3108static int
3109fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3110 uint32_t interval, uint32_t flows, uint32_t quantum)
3111{
3112 size_t opt_offset;
3113 struct ofpbuf request;
3114 struct tcmsg *tcmsg;
3115 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3116 int error;
3117
3118 tc_del_qdisc(netdev);
3119
3120 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3121 NLM_F_EXCL | NLM_F_CREATE, &request);
3122 if (!tcmsg) {
3123 return ENODEV;
3124 }
3125 tcmsg->tcm_handle = tc_make_handle(1, 0);
3126 tcmsg->tcm_parent = TC_H_ROOT;
3127
3128 otarget = target ? target : 5000;
3129 olimit = limit ? limit : 10240;
3130 ointerval = interval ? interval : 100000;
3131 oflows = flows ? flows : 1024;
3132 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3133 not mtu */
3134
3135 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3136 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3137 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3138 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3139 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3140 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3141 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3142 nl_msg_end_nested(&request, opt_offset);
3143
3144 error = tc_transact(&request, NULL);
3145 if (error) {
3146 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3147 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3148 netdev_get_name(netdev),
3149 otarget, olimit, ointerval, oflows, oquantum,
3150 error, ovs_strerror(error));
3151 }
3152 return error;
3153}
3154
3155static void
3156fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3157 const struct smap *details, struct fqcodel *fqcodel)
3158{
3159 const char *target_s;
3160 const char *limit_s;
3161 const char *interval_s;
3162 const char *flows_s;
3163 const char *quantum_s;
3164
3165 target_s = smap_get(details, "target");
3166 limit_s = smap_get(details, "limit");
3167 interval_s = smap_get(details, "interval");
3168 flows_s = smap_get(details, "flows");
3169 quantum_s = smap_get(details, "quantum");
3170 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3171 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3172 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3173 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3174 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3175 if (!fqcodel->target) {
3176 fqcodel->target = 5000;
3177 }
3178 if (!fqcodel->limit) {
3179 fqcodel->limit = 10240;
3180 }
3181 if (!fqcodel->interval) {
3182 fqcodel->interval = 1000000;
3183 }
3184 if (!fqcodel->flows) {
3185 fqcodel->flows = 1024;
3186 }
3187 if (!fqcodel->quantum) {
3188 fqcodel->quantum = 1514;
3189 }
3190}
3191
3192static int
3193fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3194{
3195 int error;
3196 struct fqcodel fqcodel;
3197
3198 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3199 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3200 fqcodel.interval, fqcodel.flows,
3201 fqcodel.quantum);
3202 if (!error) {
3203 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3204 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3205 }
3206 return error;
3207}
3208
3209static int
3210fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3211{
3212 static const struct nl_policy tca_fqcodel_policy[] = {
3213 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3214 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3215 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3216 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3217 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3218 };
3219
3220 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3221
3222 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3223 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3224 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3225 return EPROTO;
3226 }
3227
3228 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3229 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3230 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3231 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3232 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3233 return 0;
3234}
3235
3236static int
3237fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3238{
3239 struct nlattr *nlattr;
3240 const char * kind;
3241 int error;
3242 struct fqcodel fqcodel;
3243
3244 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3245 if (error != 0) {
3246 return error;
3247 }
3248
3249 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3250 if (error != 0) {
3251 return error;
3252 }
3253
3254 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3255 fqcodel.flows, fqcodel.quantum);
3256 return 0;
3257}
3258
3259static void
3260fqcodel_tc_destroy(struct tc *tc)
3261{
3262 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3263 tc_destroy(tc);
3264 free(fqcodel);
3265}
3266
3267static int
3268fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3269{
3270 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3271 smap_add_format(details, "target", "%u", fqcodel->target);
3272 smap_add_format(details, "limit", "%u", fqcodel->limit);
3273 smap_add_format(details, "interval", "%u", fqcodel->interval);
3274 smap_add_format(details, "flows", "%u", fqcodel->flows);
3275 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3276 return 0;
3277}
3278
3279static int
3280fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3281{
3282 struct fqcodel fqcodel;
3283
3284 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3285 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3286 fqcodel.flows, fqcodel.quantum);
3287 fqcodel_get__(netdev)->target = fqcodel.target;
3288 fqcodel_get__(netdev)->limit = fqcodel.limit;
3289 fqcodel_get__(netdev)->interval = fqcodel.interval;
3290 fqcodel_get__(netdev)->flows = fqcodel.flows;
3291 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3292 return 0;
3293}
3294
3295static const struct tc_ops tc_ops_fqcodel = {
3296 "fq_codel", /* linux_name */
3297 "linux-fq_codel", /* ovs_name */
3298 FQCODEL_N_QUEUES, /* n_queues */
3299 fqcodel_tc_install,
3300 fqcodel_tc_load,
3301 fqcodel_tc_destroy,
3302 fqcodel_qdisc_get,
3303 fqcodel_qdisc_set,
3304 NULL,
3305 NULL,
3306 NULL,
3307 NULL,
3308 NULL
3309};
3310\f
3311/* SFQ traffic control class. */
3312
3313#define SFQ_N_QUEUES 0x0000
3314
3315struct sfq {
3316 struct tc tc;
3317 uint32_t quantum;
3318 uint32_t perturb;
3319};
3320
3321static struct sfq *
3322sfq_get__(const struct netdev *netdev_)
3323{
3324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3325 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3326}
3327
3328static void
3329sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3330{
3331 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3332 struct sfq *sfq;
3333
3334 sfq = xmalloc(sizeof *sfq);
3335 tc_init(&sfq->tc, &tc_ops_sfq);
3336 sfq->perturb = perturb;
3337 sfq->quantum = quantum;
3338
3339 netdev->tc = &sfq->tc;
3340}
3341
3342static int
3343sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3344{
3345 struct tc_sfq_qopt opt;
3346 struct ofpbuf request;
3347 struct tcmsg *tcmsg;
3348 int mtu;
3349 int mtu_error, error;
3350 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3351
3352 tc_del_qdisc(netdev);
3353
3354 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3355 NLM_F_EXCL | NLM_F_CREATE, &request);
3356 if (!tcmsg) {
3357 return ENODEV;
3358 }
3359 tcmsg->tcm_handle = tc_make_handle(1, 0);
3360 tcmsg->tcm_parent = TC_H_ROOT;
3361
3362 memset(&opt, 0, sizeof opt);
3363 if (!quantum) {
3364 if (!mtu_error) {
3365 opt.quantum = mtu; /* if we cannot find mtu, use default */
3366 }
3367 } else {
3368 opt.quantum = quantum;
3369 }
3370
3371 if (!perturb) {
3372 opt.perturb_period = 10;
3373 } else {
3374 opt.perturb_period = perturb;
3375 }
3376
3377 nl_msg_put_string(&request, TCA_KIND, "sfq");
3378 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3379
3380 error = tc_transact(&request, NULL);
3381 if (error) {
3382 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3383 "quantum %u, perturb %u error %d(%s)",
3384 netdev_get_name(netdev),
3385 opt.quantum, opt.perturb_period,
3386 error, ovs_strerror(error));
3387 }
3388 return error;
3389}
3390
3391static void
3392sfq_parse_qdisc_details__(struct netdev *netdev,
3393 const struct smap *details, struct sfq *sfq)
3394{
3395 const char *perturb_s;
3396 const char *quantum_s;
3397 int mtu;
3398 int mtu_error;
3399
3400 perturb_s = smap_get(details, "perturb");
3401 quantum_s = smap_get(details, "quantum");
3402 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3403 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3404 if (!sfq->perturb) {
3405 sfq->perturb = 10;
3406 }
3407
3408 if (!sfq->quantum) {
3409 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3410 if (!mtu_error) {
3411 sfq->quantum = mtu;
3412 } else {
3413 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3414 "device without mtu");
3415 return;
3416 }
3417 }
3418}
3419
3420static int
3421sfq_tc_install(struct netdev *netdev, const struct smap *details)
3422{
3423 int error;
3424 struct sfq sfq;
3425
3426 sfq_parse_qdisc_details__(netdev, details, &sfq);
3427 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3428 if (!error) {
3429 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3430 }
3431 return error;
3432}
3433
3434static int
3435sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3436{
3437 const struct tc_sfq_qopt *sfq;
3438 struct nlattr *nlattr;
3439 const char * kind;
3440 int error;
3441
3442 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3443 if (error == 0) {
3444 sfq = nl_attr_get(nlattr);
3445 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3446 return 0;
3447 }
3448
3449 return error;
3450}
3451
3452static void
3453sfq_tc_destroy(struct tc *tc)
3454{
3455 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3456 tc_destroy(tc);
3457 free(sfq);
3458}
3459
3460static int
3461sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3462{
3463 const struct sfq *sfq = sfq_get__(netdev);
3464 smap_add_format(details, "quantum", "%u", sfq->quantum);
3465 smap_add_format(details, "perturb", "%u", sfq->perturb);
3466 return 0;
3467}
3468
3469static int
3470sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3471{
3472 struct sfq sfq;
3473
3474 sfq_parse_qdisc_details__(netdev, details, &sfq);
3475 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3476 sfq_get__(netdev)->quantum = sfq.quantum;
3477 sfq_get__(netdev)->perturb = sfq.perturb;
3478 return 0;
3479}
3480
3481static const struct tc_ops tc_ops_sfq = {
3482 "sfq", /* linux_name */
3483 "linux-sfq", /* ovs_name */
3484 SFQ_N_QUEUES, /* n_queues */
3485 sfq_tc_install,
3486 sfq_tc_load,
3487 sfq_tc_destroy,
3488 sfq_qdisc_get,
3489 sfq_qdisc_set,
3490 NULL,
3491 NULL,
3492 NULL,
3493 NULL,
3494 NULL
3495};
3496\f
c1c9c9c4 3497/* HTB traffic control class. */
559843ed 3498
c1c9c9c4 3499#define HTB_N_QUEUES 0xf000
4f631ccd 3500#define HTB_RATE2QUANTUM 10
8b61709d 3501
c1c9c9c4
BP
3502struct htb {
3503 struct tc tc;
3504 unsigned int max_rate; /* In bytes/s. */
3505};
8b61709d 3506
c1c9c9c4 3507struct htb_class {
93b13be8 3508 struct tc_queue tc_queue;
c1c9c9c4
BP
3509 unsigned int min_rate; /* In bytes/s. */
3510 unsigned int max_rate; /* In bytes/s. */
3511 unsigned int burst; /* In bytes. */
3512 unsigned int priority; /* Lower values are higher priorities. */
3513};
8b61709d 3514
c1c9c9c4 3515static struct htb *
b5d57fc8 3516htb_get__(const struct netdev *netdev_)
c1c9c9c4 3517{
b5d57fc8
BP
3518 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3519 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3520}
3521
24045e35 3522static void
b5d57fc8 3523htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3524{
b5d57fc8 3525 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3526 struct htb *htb;
3527
3528 htb = xmalloc(sizeof *htb);
3529 tc_init(&htb->tc, &tc_ops_htb);
3530 htb->max_rate = max_rate;
3531
b5d57fc8 3532 netdev->tc = &htb->tc;
c1c9c9c4
BP
3533}
3534
3535/* Create an HTB qdisc.
3536 *
a339aa81 3537 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3538static int
3539htb_setup_qdisc__(struct netdev *netdev)
3540{
3541 size_t opt_offset;
3542 struct tc_htb_glob opt;
3543 struct ofpbuf request;
3544 struct tcmsg *tcmsg;
3545
3546 tc_del_qdisc(netdev);
3547
3548 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3549 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3550 if (!tcmsg) {
3551 return ENODEV;
3552 }
c1c9c9c4
BP
3553 tcmsg->tcm_handle = tc_make_handle(1, 0);
3554 tcmsg->tcm_parent = TC_H_ROOT;
3555
3556 nl_msg_put_string(&request, TCA_KIND, "htb");
3557
3558 memset(&opt, 0, sizeof opt);
4f631ccd 3559 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3560 opt.version = 3;
4ecf12d5 3561 opt.defcls = 1;
c1c9c9c4
BP
3562
3563 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3564 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3565 nl_msg_end_nested(&request, opt_offset);
3566
3567 return tc_transact(&request, NULL);
3568}
3569
3570/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3571 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3572static int
3573htb_setup_class__(struct netdev *netdev, unsigned int handle,
3574 unsigned int parent, struct htb_class *class)
3575{
3576 size_t opt_offset;
3577 struct tc_htb_opt opt;
3578 struct ofpbuf request;
3579 struct tcmsg *tcmsg;
3580 int error;
3581 int mtu;
3582
73371c09 3583 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3584 if (error) {
f915f1a8
BP
3585 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3586 netdev_get_name(netdev));
9b020780 3587 return error;
f915f1a8 3588 }
c1c9c9c4
BP
3589
3590 memset(&opt, 0, sizeof opt);
3591 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3592 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3593 /* Makes sure the quantum is at least MTU. Setting quantum will
3594 * make htb ignore the r2q for this class. */
3595 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3596 opt.quantum = mtu;
3597 }
c1c9c9c4
BP
3598 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3599 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3600 opt.prio = class->priority;
3601
3602 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
3603 if (!tcmsg) {
3604 return ENODEV;
3605 }
c1c9c9c4
BP
3606 tcmsg->tcm_handle = handle;
3607 tcmsg->tcm_parent = parent;
3608
3609 nl_msg_put_string(&request, TCA_KIND, "htb");
3610 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3611 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3612 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3613 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3614 nl_msg_end_nested(&request, opt_offset);
3615
3616 error = tc_transact(&request, NULL);
3617 if (error) {
3618 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3619 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3620 netdev_get_name(netdev),
3621 tc_get_major(handle), tc_get_minor(handle),
3622 tc_get_major(parent), tc_get_minor(parent),
3623 class->min_rate, class->max_rate,
10a89ef0 3624 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3625 }
3626 return error;
3627}
3628
3629/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3630 * description of them into 'details'. The description complies with the
3631 * specification given in the vswitch database documentation for linux-htb
3632 * queue details. */
3633static int
3634htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3635{
3636 static const struct nl_policy tca_htb_policy[] = {
3637 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3638 .min_len = sizeof(struct tc_htb_opt) },
3639 };
3640
3641 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3642 const struct tc_htb_opt *htb;
3643
3644 if (!nl_parse_nested(nl_options, tca_htb_policy,
3645 attrs, ARRAY_SIZE(tca_htb_policy))) {
3646 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3647 return EPROTO;
3648 }
3649
3650 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3651 class->min_rate = htb->rate.rate;
3652 class->max_rate = htb->ceil.rate;
3653 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3654 class->priority = htb->prio;
3655 return 0;
3656}
3657
3658static int
3659htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3660 struct htb_class *options,
3661 struct netdev_queue_stats *stats)
3662{
3663 struct nlattr *nl_options;
3664 unsigned int handle;
3665 int error;
3666
3667 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3668 if (!error && queue_id) {
17ee3c1f
BP
3669 unsigned int major = tc_get_major(handle);
3670 unsigned int minor = tc_get_minor(handle);
3671 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3672 *queue_id = minor - 1;
c1c9c9c4
BP
3673 } else {
3674 error = EPROTO;
3675 }
3676 }
3677 if (!error && options) {
3678 error = htb_parse_tca_options__(nl_options, options);
3679 }
3680 return error;
3681}
3682
3683static void
73371c09 3684htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3685 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3686{
73371c09 3687 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3688 const char *max_rate_s;
3689
79f1cbe9 3690 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
3691 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3692 if (!hc->max_rate) {
a00ca915 3693 enum netdev_features current;
c1c9c9c4 3694
73371c09
BP
3695 netdev_linux_read_features(netdev);
3696 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3697 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3698 }
3699 hc->min_rate = hc->max_rate;
3700 hc->burst = 0;
3701 hc->priority = 0;
3702}
3703
3704static int
3705htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3706 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3707{
3708 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
3709 const char *min_rate_s = smap_get(details, "min-rate");
3710 const char *max_rate_s = smap_get(details, "max-rate");
3711 const char *burst_s = smap_get(details, "burst");
3712 const char *priority_s = smap_get(details, "priority");
9b020780 3713 int mtu, error;
c1c9c9c4 3714
73371c09 3715 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3716 if (error) {
f915f1a8
BP
3717 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3718 netdev_get_name(netdev));
9b020780 3719 return error;
f915f1a8
BP
3720 }
3721
4f104611
EJ
3722 /* HTB requires at least an mtu sized min-rate to send any traffic even
3723 * on uncongested links. */
c45ab5e9 3724 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 3725 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3726 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3727
3728 /* max-rate */
3729 hc->max_rate = (max_rate_s
3730 ? strtoull(max_rate_s, NULL, 10) / 8
3731 : htb->max_rate);
3732 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3733 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3734
3735 /* burst
3736 *
3737 * According to hints in the documentation that I've read, it is important
3738 * that 'burst' be at least as big as the largest frame that might be
3739 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3740 * but having it a bit too small is a problem. Since netdev_get_mtu()
3741 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3742 * the MTU. We actually add 64, instead of 14, as a guard against
3743 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
3744 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3745 hc->burst = MAX(hc->burst, mtu + 64);
3746
3747 /* priority */
3748 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3749
3750 return 0;
3751}
3752
3753static int
3754htb_query_class__(const struct netdev *netdev, unsigned int handle,
3755 unsigned int parent, struct htb_class *options,
3756 struct netdev_queue_stats *stats)
3757{
3758 struct ofpbuf *reply;
3759 int error;
3760
3761 error = tc_query_class(netdev, handle, parent, &reply);
3762 if (!error) {
3763 error = htb_parse_tcmsg__(reply, NULL, options, stats);
3764 ofpbuf_delete(reply);
3765 }
3766 return error;
3767}
3768
3769static int
79f1cbe9 3770htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3771{
3772 int error;
3773
3774 error = htb_setup_qdisc__(netdev);
3775 if (!error) {
3776 struct htb_class hc;
3777
3778 htb_parse_qdisc_details__(netdev, details, &hc);
3779 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3780 tc_make_handle(1, 0), &hc);
3781 if (!error) {
3782 htb_install__(netdev, hc.max_rate);
3783 }
3784 }
3785 return error;
3786}
3787
93b13be8
BP
3788static struct htb_class *
3789htb_class_cast__(const struct tc_queue *queue)
3790{
3791 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3792}
3793
c1c9c9c4
BP
3794static void
3795htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3796 const struct htb_class *hc)
3797{
3798 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3799 size_t hash = hash_int(queue_id, 0);
3800 struct tc_queue *queue;
c1c9c9c4
BP
3801 struct htb_class *hcp;
3802
93b13be8
BP
3803 queue = tc_find_queue__(netdev, queue_id, hash);
3804 if (queue) {
3805 hcp = htb_class_cast__(queue);
3806 } else {
c1c9c9c4 3807 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3808 queue = &hcp->tc_queue;
3809 queue->queue_id = queue_id;
6dc34a0d 3810 queue->created = time_msec();
93b13be8 3811 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3812 }
93b13be8
BP
3813
3814 hcp->min_rate = hc->min_rate;
3815 hcp->max_rate = hc->max_rate;
3816 hcp->burst = hc->burst;
3817 hcp->priority = hc->priority;
c1c9c9c4
BP
3818}
3819
3820static int
3821htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3822{
c1c9c9c4 3823 struct ofpbuf msg;
d57695d7 3824 struct queue_dump_state state;
c1c9c9c4 3825 struct htb_class hc;
c1c9c9c4
BP
3826
3827 /* Get qdisc options. */
3828 hc.max_rate = 0;
3829 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3830 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3831
3832 /* Get queues. */
d57695d7 3833 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3834 return ENODEV;
3835 }
d57695d7 3836 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3837 unsigned int queue_id;
3838
3839 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3840 htb_update_queue__(netdev, queue_id, &hc);
3841 }
3842 }
d57695d7 3843 finish_queue_dump(&state);
c1c9c9c4
BP
3844
3845 return 0;
3846}
3847
3848static void
3849htb_tc_destroy(struct tc *tc)
3850{
3851 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 3852 struct htb_class *hc, *next;
c1c9c9c4 3853
4e8e4213 3854 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 3855 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
3856 free(hc);
3857 }
3858 tc_destroy(tc);
3859 free(htb);
3860}
3861
3862static int
79f1cbe9 3863htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3864{
3865 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3866 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3867 return 0;
3868}
3869
3870static int
79f1cbe9 3871htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3872{
3873 struct htb_class hc;
3874 int error;
3875
3876 htb_parse_qdisc_details__(netdev, details, &hc);
3877 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3878 tc_make_handle(1, 0), &hc);
3879 if (!error) {
3880 htb_get__(netdev)->max_rate = hc.max_rate;
3881 }
3882 return error;
3883}
3884
3885static int
93b13be8 3886htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3887 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3888{
93b13be8 3889 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3890
79f1cbe9 3891 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3892 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3893 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3894 }
79f1cbe9 3895 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3896 if (hc->priority) {
79f1cbe9 3897 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3898 }
3899 return 0;
3900}
3901
3902static int
3903htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3904 const struct smap *details)
c1c9c9c4
BP
3905{
3906 struct htb_class hc;
3907 int error;
3908
3909 error = htb_parse_class_details__(netdev, details, &hc);
3910 if (error) {
3911 return error;
3912 }
3913
17ee3c1f 3914 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3915 tc_make_handle(1, 0xfffe), &hc);
3916 if (error) {
3917 return error;
3918 }
3919
3920 htb_update_queue__(netdev, queue_id, &hc);
3921 return 0;
3922}
3923
3924static int
93b13be8 3925htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3926{
93b13be8 3927 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3928 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3929 int error;
3930
93b13be8 3931 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3932 if (!error) {
93b13be8 3933 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3934 free(hc);
c1c9c9c4
BP
3935 }
3936 return error;
3937}
3938
3939static int
93b13be8 3940htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3941 struct netdev_queue_stats *stats)
3942{
93b13be8 3943 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3944 tc_make_handle(1, 0xfffe), NULL, stats);
3945}
3946
3947static int
3948htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3949 const struct ofpbuf *nlmsg,
3950 netdev_dump_queue_stats_cb *cb, void *aux)
3951{
3952 struct netdev_queue_stats stats;
17ee3c1f 3953 unsigned int handle, major, minor;
c1c9c9c4
BP
3954 int error;
3955
3956 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3957 if (error) {
3958 return error;
3959 }
3960
17ee3c1f
BP
3961 major = tc_get_major(handle);
3962 minor = tc_get_minor(handle);
3963 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3964 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3965 }
3966 return 0;
3967}
3968
/* Operations table for the HTB qdisc.  Slot labels follow the ones used by
 * the other tc_ops tables in this file. */
static const struct tc_ops tc_ops_htb = {
    "htb",                      /* linux_name */
    "linux-htb",                /* ovs_name */
    HTB_N_QUEUES,               /* n_queues */
    htb_tc_install,             /* tc_install */
    htb_tc_load,                /* tc_load */
    htb_tc_destroy,             /* tc_destroy */
    htb_qdisc_get,              /* qdisc_get */
    htb_qdisc_set,              /* qdisc_set */
    htb_class_get,              /* class_get */
    htb_class_set,              /* class_set */
    htb_class_delete,           /* class_delete */
    htb_class_get_stats,        /* class_get_stats */
    htb_class_dump_stats        /* class_dump_stats */
};
3984\f
a339aa81
EJ
3985/* "linux-hfsc" traffic control class. */
3986
/* Upper bound on HFSC class minor numbers accepted as queues, and the
 * 'n_queues' value advertised in tc_ops_hfsc. */
#define HFSC_N_QUEUES 0xf000

/* Qdisc-level HFSC state.  'tc' is embedded so that CONTAINER_OF() can map a
 * 'struct tc *' back to this structure. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* Reported as "max-rate" after scaling by 8;
                                 * presumably bytes/s -- confirm. */
};

/* Per-queue HFSC state.  'tc_queue' is embedded so that hfsc_class_cast__()
 * can map a 'struct tc_queue *' back to this structure. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* Same scale as 'max_rate' in struct hfsc. */
    uint32_t max_rate;
};
3999
4000static struct hfsc *
b5d57fc8 4001hfsc_get__(const struct netdev *netdev_)
a339aa81 4002{
b5d57fc8
BP
4003 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4004 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4005}
4006
4007static struct hfsc_class *
4008hfsc_class_cast__(const struct tc_queue *queue)
4009{
4010 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4011}
4012
24045e35 4013static void
b5d57fc8 4014hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4015{
b5d57fc8 4016 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4017 struct hfsc *hfsc;
4018
a339aa81
EJ
4019 hfsc = xmalloc(sizeof *hfsc);
4020 tc_init(&hfsc->tc, &tc_ops_hfsc);
4021 hfsc->max_rate = max_rate;
b5d57fc8 4022 netdev->tc = &hfsc->tc;
a339aa81
EJ
4023}
4024
4025static void
4026hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4027 const struct hfsc_class *hc)
4028{
4029 size_t hash;
4030 struct hfsc *hfsc;
4031 struct hfsc_class *hcp;
4032 struct tc_queue *queue;
4033
4034 hfsc = hfsc_get__(netdev);
4035 hash = hash_int(queue_id, 0);
4036
4037 queue = tc_find_queue__(netdev, queue_id, hash);
4038 if (queue) {
4039 hcp = hfsc_class_cast__(queue);
4040 } else {
4041 hcp = xmalloc(sizeof *hcp);
4042 queue = &hcp->tc_queue;
4043 queue->queue_id = queue_id;
6dc34a0d 4044 queue->created = time_msec();
a339aa81
EJ
4045 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4046 }
4047
4048 hcp->min_rate = hc->min_rate;
4049 hcp->max_rate = hc->max_rate;
4050}
4051
4052static int
4053hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4054{
4055 const struct tc_service_curve *rsc, *fsc, *usc;
4056 static const struct nl_policy tca_hfsc_policy[] = {
4057 [TCA_HFSC_RSC] = {
4058 .type = NL_A_UNSPEC,
4059 .optional = false,
4060 .min_len = sizeof(struct tc_service_curve),
4061 },
4062 [TCA_HFSC_FSC] = {
4063 .type = NL_A_UNSPEC,
4064 .optional = false,
4065 .min_len = sizeof(struct tc_service_curve),
4066 },
4067 [TCA_HFSC_USC] = {
4068 .type = NL_A_UNSPEC,
4069 .optional = false,
4070 .min_len = sizeof(struct tc_service_curve),
4071 },
4072 };
4073 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4074
4075 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4076 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4077 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4078 return EPROTO;
4079 }
4080
4081 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4082 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4083 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4084
4085 if (rsc->m1 != 0 || rsc->d != 0 ||
4086 fsc->m1 != 0 || fsc->d != 0 ||
4087 usc->m1 != 0 || usc->d != 0) {
4088 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4089 "Non-linear service curves are not supported.");
4090 return EPROTO;
4091 }
4092
4093 if (rsc->m2 != fsc->m2) {
4094 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4095 "Real-time service curves are not supported ");
4096 return EPROTO;
4097 }
4098
4099 if (rsc->m2 > usc->m2) {
4100 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4101 "Min-rate service curve is greater than "
4102 "the max-rate service curve.");
4103 return EPROTO;
4104 }
4105
4106 class->min_rate = fsc->m2;
4107 class->max_rate = usc->m2;
4108 return 0;
4109}
4110
4111static int
4112hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4113 struct hfsc_class *options,
4114 struct netdev_queue_stats *stats)
4115{
4116 int error;
4117 unsigned int handle;
4118 struct nlattr *nl_options;
4119
4120 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4121 if (error) {
4122 return error;
4123 }
4124
4125 if (queue_id) {
4126 unsigned int major, minor;
4127
4128 major = tc_get_major(handle);
4129 minor = tc_get_minor(handle);
4130 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4131 *queue_id = minor - 1;
4132 } else {
4133 return EPROTO;
4134 }
4135 }
4136
4137 if (options) {
4138 error = hfsc_parse_tca_options__(nl_options, options);
4139 }
4140
4141 return error;
4142}
4143
/* Asks the kernel about class 'handle' (child of 'parent') on 'netdev' via
 * tc_query_class() and hands the reply to hfsc_parse_tcmsg__(), which fills
 * in '*options' and '*stats' as requested by the caller.
 *
 * Returns 0 if successful, otherwise the error reported by the query or by
 * the parser.  The reply buffer is always released before returning. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
4161
4162static void
73371c09 4163hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4164 struct hfsc_class *class)
4165{
73371c09 4166 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4167 uint32_t max_rate;
4168 const char *max_rate_s;
4169
79f1cbe9 4170 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
4171 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4172
4173 if (!max_rate) {
a00ca915 4174 enum netdev_features current;
a339aa81 4175
73371c09
BP
4176 netdev_linux_read_features(netdev);
4177 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4178 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4179 }
4180
4181 class->min_rate = max_rate;
4182 class->max_rate = max_rate;
4183}
4184
4185static int
4186hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4187 const struct smap *details,
a339aa81
EJ
4188 struct hfsc_class * class)
4189{
4190 const struct hfsc *hfsc;
4191 uint32_t min_rate, max_rate;
4192 const char *min_rate_s, *max_rate_s;
4193
4194 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
4195 min_rate_s = smap_get(details, "min-rate");
4196 max_rate_s = smap_get(details, "max-rate");
a339aa81 4197
c45ab5e9 4198 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 4199 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4200 min_rate = MIN(min_rate, hfsc->max_rate);
4201
4202 max_rate = (max_rate_s
4203 ? strtoull(max_rate_s, NULL, 10) / 8
4204 : hfsc->max_rate);
4205 max_rate = MAX(max_rate, min_rate);
4206 max_rate = MIN(max_rate, hfsc->max_rate);
4207
4208 class->min_rate = min_rate;
4209 class->max_rate = max_rate;
4210
4211 return 0;
4212}
4213
4214/* Create an HFSC qdisc.
4215 *
4216 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4217static int
4218hfsc_setup_qdisc__(struct netdev * netdev)
4219{
4220 struct tcmsg *tcmsg;
4221 struct ofpbuf request;
4222 struct tc_hfsc_qopt opt;
4223
4224 tc_del_qdisc(netdev);
4225
4226 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4227 NLM_F_EXCL | NLM_F_CREATE, &request);
4228
4229 if (!tcmsg) {
4230 return ENODEV;
4231 }
4232
4233 tcmsg->tcm_handle = tc_make_handle(1, 0);
4234 tcmsg->tcm_parent = TC_H_ROOT;
4235
4236 memset(&opt, 0, sizeof opt);
4237 opt.defcls = 1;
4238
4239 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4240 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4241
4242 return tc_transact(&request, NULL);
4243}
4244
4245/* Create an HFSC class.
4246 *
4247 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4248 * sc rate <min_rate> ul rate <max_rate>" */
4249static int
4250hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4251 unsigned int parent, struct hfsc_class *class)
4252{
4253 int error;
4254 size_t opt_offset;
4255 struct tcmsg *tcmsg;
4256 struct ofpbuf request;
4257 struct tc_service_curve min, max;
4258
4259 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4260
4261 if (!tcmsg) {
4262 return ENODEV;
4263 }
4264
4265 tcmsg->tcm_handle = handle;
4266 tcmsg->tcm_parent = parent;
4267
4268 min.m1 = 0;
4269 min.d = 0;
4270 min.m2 = class->min_rate;
4271
4272 max.m1 = 0;
4273 max.d = 0;
4274 max.m2 = class->max_rate;
4275
4276 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4277 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4278 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4279 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4280 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4281 nl_msg_end_nested(&request, opt_offset);
4282
4283 error = tc_transact(&request, NULL);
4284 if (error) {
4285 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4286 "min-rate %ubps, max-rate %ubps (%s)",
4287 netdev_get_name(netdev),
4288 tc_get_major(handle), tc_get_minor(handle),
4289 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4290 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4291 }
4292
4293 return error;
4294}
4295
4296static int
79f1cbe9 4297hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4298{
4299 int error;
4300 struct hfsc_class class;
4301
4302 error = hfsc_setup_qdisc__(netdev);
4303
4304 if (error) {
4305 return error;
4306 }
4307
4308 hfsc_parse_qdisc_details__(netdev, details, &class);
4309 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4310 tc_make_handle(1, 0), &class);
4311
4312 if (error) {
4313 return error;
4314 }
4315
4316 hfsc_install__(netdev, class.max_rate);
4317 return 0;
4318}
4319
4320static int
4321hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4322{
4323 struct ofpbuf msg;
d57695d7 4324 struct queue_dump_state state;
a339aa81
EJ
4325 struct hfsc_class hc;
4326
4327 hc.max_rate = 0;
4328 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4329 hfsc_install__(netdev, hc.max_rate);
a339aa81 4330
d57695d7 4331 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4332 return ENODEV;
4333 }
4334
d57695d7 4335 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4336 unsigned int queue_id;
4337
4338 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4339 hfsc_update_queue__(netdev, queue_id, &hc);
4340 }
4341 }
4342
d57695d7 4343 finish_queue_dump(&state);
a339aa81
EJ
4344 return 0;
4345}
4346
4347static void
4348hfsc_tc_destroy(struct tc *tc)
4349{
4350 struct hfsc *hfsc;
4351 struct hfsc_class *hc, *next;
4352
4353 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4354
4355 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4356 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4357 free(hc);
4358 }
4359
4360 tc_destroy(tc);
4361 free(hfsc);
4362}
4363
4364static int
79f1cbe9 4365hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4366{
4367 const struct hfsc *hfsc;
4368 hfsc = hfsc_get__(netdev);
79f1cbe9 4369 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4370 return 0;
4371}
4372
4373static int
79f1cbe9 4374hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4375{
4376 int error;
4377 struct hfsc_class class;
4378
4379 hfsc_parse_qdisc_details__(netdev, details, &class);
4380 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4381 tc_make_handle(1, 0), &class);
4382
4383 if (!error) {
4384 hfsc_get__(netdev)->max_rate = class.max_rate;
4385 }
4386
4387 return error;
4388}
4389
4390static int
4391hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4392 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4393{
4394 const struct hfsc_class *hc;
4395
4396 hc = hfsc_class_cast__(queue);
79f1cbe9 4397 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4398 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4399 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4400 }
4401 return 0;
4402}
4403
4404static int
4405hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4406 const struct smap *details)
a339aa81
EJ
4407{
4408 int error;
4409 struct hfsc_class class;
4410
4411 error = hfsc_parse_class_details__(netdev, details, &class);
4412 if (error) {
4413 return error;
4414 }
4415
4416 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4417 tc_make_handle(1, 0xfffe), &class);
4418 if (error) {
4419 return error;
4420 }
4421
4422 hfsc_update_queue__(netdev, queue_id, &class);
4423 return 0;
4424}
4425
4426static int
4427hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4428{
4429 int error;
4430 struct hfsc *hfsc;
4431 struct hfsc_class *hc;
4432
4433 hc = hfsc_class_cast__(queue);
4434 hfsc = hfsc_get__(netdev);
4435
4436 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4437 if (!error) {
4438 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4439 free(hc);
4440 }
4441 return error;
4442}
4443
4444static int
4445hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4446 struct netdev_queue_stats *stats)
4447{
4448 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4449 tc_make_handle(1, 0xfffe), NULL, stats);
4450}
4451
4452static int
4453hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4454 const struct ofpbuf *nlmsg,
4455 netdev_dump_queue_stats_cb *cb, void *aux)
4456{
4457 struct netdev_queue_stats stats;
4458 unsigned int handle, major, minor;
4459 int error;
4460
4461 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4462 if (error) {
4463 return error;
4464 }
4465
4466 major = tc_get_major(handle);
4467 minor = tc_get_minor(handle);
4468 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4469 (*cb)(minor - 1, &stats, aux);
4470 }
4471 return 0;
4472}
4473
/* Operations table for the HFSC qdisc ("linux-hfsc" on the OVS side). */
static const struct tc_ops tc_ops_hfsc = {
    "hfsc",                     /* linux_name */
    "linux-hfsc",               /* ovs_name */
    HFSC_N_QUEUES,              /* n_queues */
    hfsc_tc_install,            /* tc_install */
    hfsc_tc_load,               /* tc_load */
    hfsc_tc_destroy,            /* tc_destroy */
    hfsc_qdisc_get,             /* qdisc_get */
    hfsc_qdisc_set,             /* qdisc_set */
    hfsc_class_get,             /* class_get */
    hfsc_class_set,             /* class_set */
    hfsc_class_delete,          /* class_delete */
    hfsc_class_get_stats,       /* class_get_stats */
    hfsc_class_dump_stats       /* class_dump_stats */
};
4489\f
c1c9c9c4
BP
4490/* "linux-default" traffic control class.
4491 *
4492 * This class represents the default, unnamed Linux qdisc. It corresponds to
4493 * the "" (empty string) QoS type in the OVS database. */
4494
4495static void
b5d57fc8 4496default_install__(struct netdev *netdev_)
c1c9c9c4 4497{
b5d57fc8 4498 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4499 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4500
559eb230
BP
4501 /* Nothing but a tc class implementation is allowed to write to a tc. This
4502 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4503 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4504}
4505
4506static int
4507default_tc_install(struct netdev *netdev,
79f1cbe9 4508 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4509{
4510 default_install__(netdev);
4511 return 0;
4512}
4513
4514static int
4515default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4516{
4517 default_install__(netdev);
4518 return 0;
4519}
4520
/* Operations table for the default qdisc.  It has no Linux name, the empty
 * string as its OVS name, no queues, and implements only install and load. */
static const struct tc_ops tc_ops_default = {
    NULL,                       /* linux_name */
    "",                         /* ovs_name */
    0,                          /* n_queues */
    default_tc_install,         /* tc_install */
    default_tc_load,            /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4536\f
4537/* "linux-other" traffic control class.
4538 *
4539 * */
4540
4541static int
b5d57fc8 4542other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4543{
b5d57fc8 4544 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4545 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4546
559eb230
BP
4547 /* Nothing but a tc class implementation is allowed to write to a tc. This
4548 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4549 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4550 return 0;
4551}
4552
/* Operations table for "linux-other".  It has no Linux name, no queues, and
 * implements only load; in particular it cannot be installed. */
static const struct tc_ops tc_ops_other = {
    NULL,                       /* linux_name */
    "linux-other",              /* ovs_name */
    0,                          /* n_queues */
    NULL,                       /* tc_install */
    other_tc_load,              /* tc_load */
    NULL,                       /* tc_destroy */
    NULL,                       /* qdisc_get */
    NULL,                       /* qdisc_set */
    NULL,                       /* class_get */
    NULL,                       /* class_set */
    NULL,                       /* class_delete */
    NULL,                       /* class_get_stats */
    NULL                        /* class_dump_stats */
};
4568\f
4569/* Traffic control. */
4570
/* Number of kernel "tc" ticks per second.
 *
 * Set by read_psched(), which falls back to 1.0 if /proc/net/psched cannot be
 * read or holds invalid values. */
static double ticks_per_s;
4573
/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 *
 * Set by read_psched(), which falls back to 100 if /proc/net/psched cannot be
 * read or holds unexpected values.
 */
static unsigned int buffer_hz;
4592
/* Builds the tc handle 'major':'minor', with 'major' in the upper 16 bits and
 * 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
4599
/* Extracts the major number (upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
4606
/* Extracts the minor number (lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4613
4614static struct tcmsg *
4615tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4616 struct ofpbuf *request)
4617{
4618 struct tcmsg *tcmsg;
4619 int ifindex;
4620 int error;
4621
4622 error = get_ifindex(netdev, &ifindex);
4623 if (error) {
4624 return NULL;
4625 }
4626
4627 ofpbuf_init(request, 512);
4628 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4629 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4630 tcmsg->tcm_family = AF_UNSPEC;
4631 tcmsg->tcm_ifindex = ifindex;
4632 /* Caller should fill in tcmsg->tcm_handle. */
4633 /* Caller should fill in tcmsg->tcm_parent. */
4634
4635 return tcmsg;
4636}
4637
4638static int
4639tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4640{
a88b4e04 4641 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
4642 ofpbuf_uninit(request);
4643 return error;
4644}
4645
f8500004
JP
4646/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4647 * policing configuration.
4648 *
4649 * This function is equivalent to running the following when 'add' is true:
4650 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4651 *
4652 * This function is equivalent to running the following when 'add' is false:
4653 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4654 *
4655 * The configuration and stats may be seen with the following command:
4656 * /sbin/tc -s qdisc show dev <devname>
4657 *
4658 * Returns 0 if successful, otherwise a positive errno value.
4659 */
4660static int
4661tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4662{
4663 struct ofpbuf request;
4664 struct tcmsg *tcmsg;
4665 int error;
4666 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4667 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4668
4669 tcmsg = tc_make_request(netdev, type, flags, &request);
4670 if (!tcmsg) {
4671 return ENODEV;
4672 }
4673 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4674 tcmsg->tcm_parent = TC_H_INGRESS;
4675 nl_msg_put_string(&request, TCA_KIND, "ingress");
4676 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4677
4678 error = tc_transact(&request, NULL);
4679 if (error) {
4680 /* If we're deleting the qdisc, don't worry about some of the
4681 * error conditions. */
4682 if (!add && (error == ENOENT || error == EINVAL)) {
4683 return 0;
4684 }
4685 return error;
4686 }
4687
4688 return 0;
4689}
4690
4691/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4692 * of 'kbits_burst'.
4693 *
4694 * This function is equivalent to running:
4695 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4696 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4697 * mtu 65535 drop
4698 *
4699 * The configuration and stats may be seen with the following command:
c7952afb 4700 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4701 *
4702 * Returns 0 if successful, otherwise a positive errno value.
4703 */
4704static int
c7952afb
BP
4705tc_add_policer(struct netdev *netdev,
4706 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4707{
4708 struct tc_police tc_police;
4709 struct ofpbuf request;
4710 struct tcmsg *tcmsg;
4711 size_t basic_offset;
4712 size_t police_offset;
4713 int error;
4714 int mtu = 65535;
4715
4716 memset(&tc_police, 0, sizeof tc_police);
4717 tc_police.action = TC_POLICE_SHOT;
4718 tc_police.mtu = mtu;
1aca400c 4719 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb
BP
4720
4721 /* The following appears wrong in two ways:
4722 *
4723 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4724 * arguments (or at least consistently "bytes" as both or "bits" as
4725 * both), but this supplies bytes for the first argument and bits for the
4726 * second.
4727 *
4728 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4729 *
4730 * However if you "fix" those problems then "tc filter show ..." shows
4731 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4732 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4733 * tc's point of view. Whatever. */
4734 tc_police.burst = tc_bytes_to_ticks(
4735 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
f8500004
JP
4736
4737 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4738 NLM_F_EXCL | NLM_F_CREATE, &request);
4739 if (!tcmsg) {
4740 return ENODEV;
4741 }
4742 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4743 tcmsg->tcm_info = tc_make_handle(49,
4744 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4745
4746 nl_msg_put_string(&request, TCA_KIND, "basic");
4747 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4748 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4749 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4750 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4751 nl_msg_end_nested(&request, police_offset);
4752 nl_msg_end_nested(&request, basic_offset);
4753
4754 error = tc_transact(&request, NULL);
4755 if (error) {
4756 return error;
4757 }
4758
4759 return 0;
4760}
4761
c1c9c9c4
BP
4762static void
4763read_psched(void)
4764{
4765 /* The values in psched are not individually very meaningful, but they are
4766 * important. The tables below show some values seen in the wild.
4767 *
4768 * Some notes:
4769 *
4770 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4771 * (Before that, there are hints that it was 1000000000.)
4772 *
4773 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4774 * above.
4775 *
4776 * /proc/net/psched
4777 * -----------------------------------
4778 * [1] 000c8000 000f4240 000f4240 00000064
4779 * [2] 000003e8 00000400 000f4240 3b9aca00
4780 * [3] 000003e8 00000400 000f4240 3b9aca00
4781 * [4] 000003e8 00000400 000f4240 00000064
4782 * [5] 000003e8 00000040 000f4240 3b9aca00
4783 * [6] 000003e8 00000040 000f4240 000000f9
4784 *
4785 * a b c d ticks_per_s buffer_hz
4786 * ------- --------- ---------- ------------- ----------- -------------
4787 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4788 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4789 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4790 * [4] 1,000 1,024 1,000,000 100 976,562 100
4791 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4792 * [6] 1,000 64 1,000,000 249 15,625,000 249
4793 *
4794 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4795 * [2] 2.6.26-1-686-bigmem from Debian lenny
4796 * [3] 2.6.26-2-sparc64 from Debian lenny
4797 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4798 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4799 * [6] 2.6.34 from kernel.org on KVM
4800 */
23882115 4801 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4802 static const char fn[] = "/proc/net/psched";
4803 unsigned int a, b, c, d;
4804 FILE *stream;
4805
23882115
BP
4806 if (!ovsthread_once_start(&once)) {
4807 return;
4808 }
4809
c1c9c9c4
BP
4810 ticks_per_s = 1.0;
4811 buffer_hz = 100;
4812
4813 stream = fopen(fn, "r");
4814 if (!stream) {
10a89ef0 4815 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4816 goto exit;
c1c9c9c4
BP
4817 }
4818
4819 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4820 VLOG_WARN("%s: read failed", fn);
4821 fclose(stream);
23882115 4822 goto exit;
c1c9c9c4
BP
4823 }
4824 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4825 fclose(stream);
4826
4827 if (!a || !c) {
4828 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4829 goto exit;
c1c9c9c4
BP
4830 }
4831
4832 ticks_per_s = (double) a * c / b;
4833 if (c == 1000000) {
4834 buffer_hz = d;
4835 } else {
4836 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4837 fn, a, b, c, d);
4838 }
4839 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4840
4841exit:
4842 ovsthread_once_done(&once);
c1c9c9c4
BP
4843}
4844
4845/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4846 * rate of 'rate' bytes per second. */
4847static unsigned int
4848tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4849{
23882115 4850 read_psched();
c1c9c9c4
BP
4851 return (rate * ticks) / ticks_per_s;
4852}
4853
4854/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4855 * rate of 'rate' bytes per second. */
4856static unsigned int
4857tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4858{
23882115 4859 read_psched();
015c93a4 4860 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4861}
4862
4863/* Returns the number of bytes that need to be reserved for qdisc buffering at
4864 * a transmission rate of 'rate' bytes per second. */
4865static unsigned int
4866tc_buffer_per_jiffy(unsigned int rate)
4867{
23882115 4868 read_psched();
c1c9c9c4
BP
4869 return rate / buffer_hz;
4870}
4871
4872/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4873 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4874 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4875 * stores NULL into it if it is absent.
4876 *
4877 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4878 * 'msg'.
4879 *
4880 * Returns 0 if successful, otherwise a positive errno value. */
4881static int
4882tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4883 struct nlattr **options)
4884{
4885 static const struct nl_policy tca_policy[] = {
4886 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4887 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4888 };
4889 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4890
4891 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4892 tca_policy, ta, ARRAY_SIZE(ta))) {
4893 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4894 goto error;
4895 }
4896
4897 if (kind) {
4898 *kind = nl_attr_get_string(ta[TCA_KIND]);
4899 }
4900
4901 if (options) {
4902 *options = ta[TCA_OPTIONS];
4903 }
4904
4905 return 0;
4906
4907error:
4908 if (kind) {
4909 *kind = NULL;
4910 }
4911 if (options) {
4912 *options = NULL;
4913 }
4914 return EPROTO;
4915}
4916
4917/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4918 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4919 * into '*options', and its queue statistics into '*stats'. Any of the output
4920 * arguments may be null.
4921 *
4922 * Returns 0 if successful, otherwise a positive errno value. */
4923static int
4924tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4925 struct nlattr **options, struct netdev_queue_stats *stats)
4926{
4927 static const struct nl_policy tca_policy[] = {
4928 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4929 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4930 };
4931 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4932
4933 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4934 tca_policy, ta, ARRAY_SIZE(ta))) {
4935 VLOG_WARN_RL(&rl, "failed to parse class message");
4936 goto error;
4937 }
4938
4939 if (handlep) {
4940 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4941 *handlep = tc->tcm_handle;
4942 }
4943
4944 if (options) {
4945 *options = ta[TCA_OPTIONS];
4946 }
4947
4948 if (stats) {
4949 const struct gnet_stats_queue *gsq;
4950 struct gnet_stats_basic gsb;
4951
4952 static const struct nl_policy stats_policy[] = {
4953 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4954 .min_len = sizeof gsb },
4955 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4956 .min_len = sizeof *gsq },
4957 };
4958 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4959
4960 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4961 sa, ARRAY_SIZE(sa))) {
4962 VLOG_WARN_RL(&rl, "failed to parse class stats");
4963 goto error;
4964 }
4965
4966 /* Alignment issues screw up the length of struct gnet_stats_basic on
4967 * some arch/bitsize combinations. Newer versions of Linux have a
4968 * struct gnet_stats_basic_packed, but we can't depend on that. The
4969 * easiest thing to do is just to make a copy. */
4970 memset(&gsb, 0, sizeof gsb);
4971 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4972 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4973 stats->tx_bytes = gsb.bytes;
4974 stats->tx_packets = gsb.packets;
4975
4976 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4977 stats->tx_errors = gsq->drops;
4978 }
4979
4980 return 0;
4981
4982error:
4983 if (options) {
4984 *options = NULL;
4985 }
4986 if (stats) {
4987 memset(stats, 0, sizeof *stats);
4988 }
4989 return EPROTO;
4990}
4991
4992/* Queries the kernel for class with identifier 'handle' and parent 'parent'
4993 * on 'netdev'. */
4994static int
4995tc_query_class(const struct netdev *netdev,
4996 unsigned int handle, unsigned int parent,
4997 struct ofpbuf **replyp)
4998{
4999 struct ofpbuf request;
5000 struct tcmsg *tcmsg;
5001 int error;
5002
5003 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
5004 if (!tcmsg) {
5005 return ENODEV;
5006 }
c1c9c9c4
BP
5007 tcmsg->tcm_handle = handle;
5008 tcmsg->tcm_parent = parent;
5009
5010 error = tc_transact(&request, replyp);
5011 if (error) {
5012 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5013 netdev_get_name(netdev),
5014 tc_get_major(handle), tc_get_minor(handle),
5015 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5016 ovs_strerror(error));
c1c9c9c4
BP
5017 }
5018 return error;
5019}
5020
5021/* Equivalent to "tc class del dev <name> handle <handle>". */
5022static int
5023tc_delete_class(const struct netdev *netdev, unsigned int handle)
5024{
5025 struct ofpbuf request;
5026 struct tcmsg *tcmsg;
5027 int error;
5028
5029 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5030 if (!tcmsg) {
5031 return ENODEV;
5032 }
c1c9c9c4
BP
5033 tcmsg->tcm_handle = handle;
5034 tcmsg->tcm_parent = 0;
5035
5036 error = tc_transact(&request, NULL);
5037 if (error) {
5038 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5039 netdev_get_name(netdev),
5040 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5041 ovs_strerror(error));
c1c9c9c4
BP
5042 }
5043 return error;
5044}
5045
5046/* Equivalent to "tc qdisc del dev <name> root". */
5047static int
b5d57fc8 5048tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5049{
b5d57fc8 5050 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5051 struct ofpbuf request;
5052 struct tcmsg *tcmsg;
5053 int error;
5054
b5d57fc8 5055 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5056 if (!tcmsg) {
5057 return ENODEV;
5058 }
c1c9c9c4
BP
5059 tcmsg->tcm_handle = tc_make_handle(1, 0);
5060 tcmsg->tcm_parent = TC_H_ROOT;
5061
5062 error = tc_transact(&request, NULL);
5063 if (error == EINVAL) {
5064 /* EINVAL probably means that the default qdisc was in use, in which
5065 * case we've accomplished our purpose. */
5066 error = 0;
5067 }
b5d57fc8
BP
5068 if (!error && netdev->tc) {
5069 if (netdev->tc->ops->tc_destroy) {
5070 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5071 }
b5d57fc8 5072 netdev->tc = NULL;
c1c9c9c4
BP
5073 }
5074 return error;
5075}
5076
ac3e3aaa
BP
5077static bool
5078getqdisc_is_safe(void)
5079{
5080 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5081 static bool safe = false;
5082
5083 if (ovsthread_once_start(&once)) {
5084 struct utsname utsname;
5085 int major, minor;
5086
5087 if (uname(&utsname) == -1) {
5088 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5089 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5090 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5091 } else if (major < 2 || (major == 2 && minor < 35)) {
5092 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5093 utsname.release);
5094 } else {
5095 safe = true;
5096 }
5097 ovsthread_once_done(&once);
5098 }
5099 return safe;
5100}
5101
c1c9c9c4
BP
5102/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5103 * kernel to determine what they are. Returns 0 if successful, otherwise a
5104 * positive errno value. */
5105static int
b5d57fc8 5106tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5107{
b5d57fc8 5108 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5109 struct ofpbuf request, *qdisc;
5110 const struct tc_ops *ops;
5111 struct tcmsg *tcmsg;
5112 int load_error;
5113 int error;
5114
b5d57fc8 5115 if (netdev->tc) {
c1c9c9c4
BP
5116 return 0;
5117 }
5118
5119 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5120 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5121 * 2.6.35 without that fix backported to it.
5122 *
5123 * To avoid the OOPS, we must not make a request that would attempt to dump
5124 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5125 * few others. There are a few ways that I can see to do this, but most of
5126 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5127 * technique chosen here is to assume that any non-default qdisc that we
5128 * create will have a class with handle 1:0. The built-in qdiscs only have
5129 * a class with handle 0:0.
5130 *
ac3e3aaa
BP
5131 * On Linux 2.6.35+ we use the straightforward method because it allows us
5132 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5133 * in such a case we get no response at all from the kernel (!) if a
5134 * builtin qdisc is in use (which is later caught by "!error &&
5135 * !qdisc->size"). */
b5d57fc8 5136 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
5137 if (!tcmsg) {
5138 return ENODEV;
5139 }
ac3e3aaa
BP
5140 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5141 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5142
5143 /* Figure out what tc class to instantiate. */
5144 error = tc_transact(&request, &qdisc);
ac3e3aaa 5145 if (!error && qdisc->size) {
c1c9c9c4
BP
5146 const char *kind;
5147
5148 error = tc_parse_qdisc(qdisc, &kind, NULL);
5149 if (error) {
5150 ops = &tc_ops_other;
5151 } else {
5152 ops = tc_lookup_linux_name(kind);
5153 if (!ops) {
5154 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5155 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5156
5157 ops = &tc_ops_other;
5158 }
5159 }
ac3e3aaa
BP
5160 } else if ((!error && !qdisc->size) || error == ENOENT) {
5161 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5162 * set up by some other entity that doesn't have a handle 1:0. We will
5163 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5164 ops = &tc_ops_default;
5165 error = 0;
5166 } else {
5167 /* Who knows? Maybe the device got deleted. */
5168 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5169 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5170 ops = &tc_ops_other;
5171 }
5172
5173 /* Instantiate it. */
b5d57fc8
BP
5174 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5175 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5176 ofpbuf_delete(qdisc);
5177
5178 return error ? error : load_error;
5179}
5180
5181/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5182 approximate the time to transmit packets of various lengths. For an MTU of
5183 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5184 represents two possible packet lengths; for a MTU of 513 through 1024, four
5185 possible lengths; and so on.
5186
5187 Returns, for the specified 'mtu', the number of bits that packet lengths
5188 need to be shifted right to fit within such a 256-entry table. */
5189static int
5190tc_calc_cell_log(unsigned int mtu)
5191{
5192 int cell_log;
5193
5194 if (!mtu) {
5195 mtu = ETH_PAYLOAD_MAX;
5196 }
5197 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5198
5199 for (cell_log = 0; mtu >= 256; cell_log++) {
5200 mtu >>= 1;
5201 }
5202
5203 return cell_log;
5204}
5205
5206/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5207 * of 'mtu'. */
5208static void
5209tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5210{
5211 memset(rate, 0, sizeof *rate);
5212 rate->cell_log = tc_calc_cell_log(mtu);
5213 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5214 /* rate->cell_align = 0; */ /* distro headers. */
5215 rate->mpu = ETH_TOTAL_MIN;
5216 rate->rate = Bps;
5217}
5218
5219/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5220 * attribute of the specified "type".
5221 *
5222 * See tc_calc_cell_log() above for a description of "rtab"s. */
5223static void
5224tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5225{
5226 uint32_t *rtab;
5227 unsigned int i;
5228
5229 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5230 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5231 unsigned packet_size = (i + 1) << rate->cell_log;
5232 if (packet_size < rate->mpu) {
5233 packet_size = rate->mpu;
5234 }
5235 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5236 }
5237}
5238
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a burst size of
 * 'burst_bytes' requested by the user.  (Pass a 'burst_bytes' of 0 if no
 * value was requested.)
 *
 * The burst is never allowed to drop below one jiffy's worth of buffering
 * plus one MTU. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 5249\f
aaf2fb1a
BP
5250/* Linux-only functions declared in netdev-linux.h */
5251
5252/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5253 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5254int
5255netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5256 const char *flag_name, bool enable)
5257{
5258 const char *netdev_name = netdev_get_name(netdev);
5259 struct ethtool_value evalue;
5260 uint32_t new_flags;
5261 int error;
5262
ab985a77 5263 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5264 memset(&evalue, 0, sizeof evalue);
5265 error = netdev_linux_do_ethtool(netdev_name,
5266 (struct ethtool_cmd *)&evalue,
5267 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5268 if (error) {
5269 return error;
5270 }
5271
ab985a77 5272 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
5273 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5274 error = netdev_linux_do_ethtool(netdev_name,
5275 (struct ethtool_cmd *)&evalue,
5276 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5277 if (error) {
5278 return error;
5279 }
5280
ab985a77 5281 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5282 memset(&evalue, 0, sizeof evalue);
5283 error = netdev_linux_do_ethtool(netdev_name,
5284 (struct ethtool_cmd *)&evalue,
5285 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5286 if (error) {
5287 return error;
5288 }
5289
5290 if (new_flags != evalue.data) {
5291 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5292 "device %s failed", enable ? "enable" : "disable",
5293 flag_name, netdev_name);
5294 return EOPNOTSUPP;
5295 }
5296
5297 return 0;
5298}
5299\f
5300/* Utility functions. */
5301
d3980822 5302/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5303static void
d3980822
BP
5304netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5305 const struct rtnl_link_stats *src)
5306{
f613a0d7
PS
5307 dst->rx_packets = src->rx_packets;
5308 dst->tx_packets = src->tx_packets;
5309 dst->rx_bytes = src->rx_bytes;
5310 dst->tx_bytes = src->tx_bytes;
5311 dst->rx_errors = src->rx_errors;
5312 dst->tx_errors = src->tx_errors;
5313 dst->rx_dropped = src->rx_dropped;
5314 dst->tx_dropped = src->tx_dropped;
5315 dst->multicast = src->multicast;
5316 dst->collisions = src->collisions;
5317 dst->rx_length_errors = src->rx_length_errors;
5318 dst->rx_over_errors = src->rx_over_errors;
5319 dst->rx_crc_errors = src->rx_crc_errors;
5320 dst->rx_frame_errors = src->rx_frame_errors;
5321 dst->rx_fifo_errors = src->rx_fifo_errors;
5322 dst->rx_missed_errors = src->rx_missed_errors;
5323 dst->tx_aborted_errors = src->tx_aborted_errors;
5324 dst->tx_carrier_errors = src->tx_carrier_errors;
5325 dst->tx_fifo_errors = src->tx_fifo_errors;
5326 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5327 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5328}
5329
337c9b99
BP
5330/* Copies 'src' into 'dst', performing format conversion in the process. */
5331static void
5332netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5333 const struct rtnl_link_stats64 *src)
5334{
5335 dst->rx_packets = src->rx_packets;
5336 dst->tx_packets = src->tx_packets;
5337 dst->rx_bytes = src->rx_bytes;
5338 dst->tx_bytes = src->tx_bytes;
5339 dst->rx_errors = src->rx_errors;
5340 dst->tx_errors = src->tx_errors;
5341 dst->rx_dropped = src->rx_dropped;
5342 dst->tx_dropped = src->tx_dropped;
5343 dst->multicast = src->multicast;
5344 dst->collisions = src->collisions;
5345 dst->rx_length_errors = src->rx_length_errors;
5346 dst->rx_over_errors = src->rx_over_errors;
5347 dst->rx_crc_errors = src->rx_crc_errors;
5348 dst->rx_frame_errors = src->rx_frame_errors;
5349 dst->rx_fifo_errors = src->rx_fifo_errors;
5350 dst->rx_missed_errors = src->rx_missed_errors;
5351 dst->tx_aborted_errors = src->tx_aborted_errors;
5352 dst->tx_carrier_errors = src->tx_carrier_errors;
5353 dst->tx_fifo_errors = src->tx_fifo_errors;
5354 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5355 dst->tx_window_errors = src->tx_window_errors;
5356}
5357
c1c9c9c4 5358static int
35eef899 5359get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5360{
c1c9c9c4
BP
5361 struct ofpbuf request;
5362 struct ofpbuf *reply;
c1c9c9c4
BP
5363 int error;
5364
5365 ofpbuf_init(&request, 0);
13a24df8
BP
5366 nl_msg_put_nlmsghdr(&request,
5367 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5368 RTM_GETLINK, NLM_F_REQUEST);
5369 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5370 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5371 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5372 ofpbuf_uninit(&request);
5373 if (error) {
5374 return error;
5375 }
5376
13a24df8 5377 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5378 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5379 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5380 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5381 error = 0;
5382 } else {
337c9b99
BP
5383 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5384 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5385 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5386 error = 0;
5387 } else {
5388 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5389 error = EPROTO;
5390 }
13a24df8
BP
5391 }
5392 } else {
5393 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5394 error = EPROTO;
c1c9c9c4 5395 }
8b61709d 5396
8b61709d 5397
576e26d7 5398 ofpbuf_delete(reply);
35eef899 5399 return error;
8b61709d 5400}
c1c9c9c4 5401
3a183124 5402static int
b5d57fc8 5403get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5404{
5405 struct ifreq ifr;
5406 int error;
5407
755be9ea 5408 *flags = 0;
259e0b1a 5409 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5410 if (!error) {
5411 *flags = ifr.ifr_flags;
5412 }
8b61709d
BP
5413 return error;
5414}
5415
5416static int
4b609110 5417set_flags(const char *name, unsigned int flags)
8b61709d
BP
5418{
5419 struct ifreq ifr;
5420
5421 ifr.ifr_flags = flags;
259e0b1a 5422 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5423}
5424
5425static int
5426do_get_ifindex(const char *netdev_name)
5427{
5428 struct ifreq ifr;
259e0b1a 5429 int error;
8b61709d 5430
71d7c22f 5431 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5432 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5433
5434 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5435 if (error) {
8b61709d 5436 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
5437 netdev_name, ovs_strerror(error));
5438 return -error;
8b61709d
BP
5439 }
5440 return ifr.ifr_ifindex;
5441}
5442
5443static int
5444get_ifindex(const struct netdev *netdev_, int *ifindexp)
5445{
b5d57fc8 5446 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5447
b5d57fc8 5448 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 5449 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5450
8b61709d 5451 if (ifindex < 0) {
b5d57fc8
BP
5452 netdev->get_ifindex_error = -ifindex;
5453 netdev->ifindex = 0;
c7b1b0a5 5454 } else {
b5d57fc8
BP
5455 netdev->get_ifindex_error = 0;
5456 netdev->ifindex = ifindex;
8b61709d 5457 }
b5d57fc8 5458 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5459 }
c7b1b0a5 5460
b5d57fc8
BP
5461 *ifindexp = netdev->ifindex;
5462 return netdev->get_ifindex_error;
8b61709d
BP
5463}
5464
5465static int
5466get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
5467{
5468 struct ifreq ifr;
5469 int hwaddr_family;
259e0b1a 5470 int error;
8b61709d
BP
5471
5472 memset(&ifr, 0, sizeof ifr);
71d7c22f 5473 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5474 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5475 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5476 if (error) {
78857dfb
BP
5477 /* ENODEV probably means that a vif disappeared asynchronously and
5478 * hasn't been removed from the database yet, so reduce the log level
5479 * to INFO for that case. */
259e0b1a 5480 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5481 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5482 netdev_name, ovs_strerror(error));
5483 return error;
8b61709d
BP
5484 }
5485 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5486 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5487 VLOG_WARN("%s device has unknown hardware address family %d",
5488 netdev_name, hwaddr_family);
5489 }
5490 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5491 return 0;
5492}
5493
5494static int
44445cac 5495set_etheraddr(const char *netdev_name,
8b61709d
BP
5496 const uint8_t mac[ETH_ADDR_LEN])
5497{
5498 struct ifreq ifr;
259e0b1a 5499 int error;
8b61709d
BP
5500
5501 memset(&ifr, 0, sizeof ifr);
71d7c22f 5502 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5503 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
5504 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
5505 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5506 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5507 if (error) {
8b61709d 5508 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5509 netdev_name, ovs_strerror(error));
8b61709d 5510 }
259e0b1a 5511 return error;
8b61709d
BP
5512}
5513
5514static int
0b0544d7 5515netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5516 int cmd, const char *cmd_name)
5517{
5518 struct ifreq ifr;
259e0b1a 5519 int error;
8b61709d
BP
5520
5521 memset(&ifr, 0, sizeof ifr);
71d7c22f 5522 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5523 ifr.ifr_data = (caddr_t) ecmd;
5524
5525 ecmd->cmd = cmd;
259e0b1a
BP
5526 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5527 if (error) {
5528 if (error != EOPNOTSUPP) {
8b61709d 5529 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5530 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5531 } else {
5532 /* The device doesn't support this operation. That's pretty
5533 * common, so there's no point in logging anything. */
5534 }
8b61709d 5535 }
259e0b1a 5536 return error;
8b61709d 5537}
f1acd62b
BP
5538
5539static int
5540netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5541 int cmd, const char *cmd_name)
5542{
5543 struct ifreq ifr;
5544 int error;
5545
5546 ifr.ifr_addr.sa_family = AF_INET;
259e0b1a 5547 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b 5548 if (!error) {
db5a1019
AW
5549 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5550 &ifr.ifr_addr);
f1acd62b
BP
5551 *ip = sin->sin_addr;
5552 }
5553 return error;
5554}
488d734d
BP
5555
5556/* Returns an AF_PACKET raw socket or a negative errno value. */
5557static int
5558af_packet_sock(void)
5559{
23882115
BP
5560 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5561 static int sock;
488d734d 5562
23882115 5563 if (ovsthread_once_start(&once)) {
488d734d
BP
5564 sock = socket(AF_PACKET, SOCK_RAW, 0);
5565 if (sock >= 0) {
8450059e
BP
5566 int error = set_nonblocking(sock);
5567 if (error) {
5568 close(sock);
5569 sock = -error;
5570 }
488d734d
BP
5571 } else {
5572 sock = -errno;
10a89ef0
BP
5573 VLOG_ERR("failed to create packet socket: %s",
5574 ovs_strerror(errno));
488d734d 5575 }
23882115 5576 ovsthread_once_done(&once);
488d734d
BP
5577 }
5578
5579 return sock;
5580}