]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
rtnetlink: Extend rtnetlink to support RTNLGRP_IPV4_IFADDR and
[ovs.git] / lib / netdev-linux.c
CommitLineData
/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
55bc98d6 43#include <net/if_packet.h>
8b61709d
BP
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
e14deea0 52#include "dp-packet.h"
93451a0a 53#include "dpif-netlink.h"
df1e5a3b 54#include "dpif-netdev.h"
8b61709d
BP
55#include "dynamic-string.h"
56#include "fatal-signal.h"
93b13be8
BP
57#include "hash.h"
58#include "hmap.h"
8b61709d 59#include "netdev-provider.h"
7fbef77a 60#include "netdev-vport.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
c060c4cf 63#include "netlink.h"
e9e28be3 64#include "ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d
BP
67#include "packets.h"
68#include "poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
8b61709d 70#include "shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
5136ce49 76
d98e6007 77VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 78
d76f09ea
BP
79COVERAGE_DEFINE(netdev_set_policing);
80COVERAGE_DEFINE(netdev_arp_lookup);
81COVERAGE_DEFINE(netdev_get_ifindex);
82COVERAGE_DEFINE(netdev_get_hwaddr);
83COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
84COVERAGE_DEFINE(netdev_get_ethtool);
85COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 86
8b61709d
BP
87\f
88/* These were introduced in Linux 2.6.14, so they might be missing if we have
89 * old headers. */
90#ifndef ADVERTISED_Pause
91#define ADVERTISED_Pause (1 << 13)
92#endif
93#ifndef ADVERTISED_Asym_Pause
94#define ADVERTISED_Asym_Pause (1 << 14)
95#endif
96
e47bd51a
JP
97/* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99#ifndef ETHTOOL_GFLAGS
100#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101#endif
102#ifndef ETHTOOL_SFLAGS
103#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
104#endif
105
c1c9c9c4
BP
106/* This was introduced in Linux 2.6.25, so it might be missing if we have old
107 * headers. */
108#ifndef TC_RTAB_SIZE
109#define TC_RTAB_SIZE 1024
110#endif
111
b73c8518
SH
112/* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
117 *
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
120 */
55bc98d6
BP
121#ifndef PACKET_AUXDATA
122#define PACKET_AUXDATA 8
123#endif
b73c8518
SH
124#ifndef TP_STATUS_VLAN_VALID
125#define TP_STATUS_VLAN_VALID (1 << 4)
126#endif
127#ifndef TP_STATUS_VLAN_TPID_VALID
128#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129#endif
/* Replacement for the kernel's struct tpacket_auxdata.  The macro renames our
 * definition to rpl_tpacket_auxdata so that it cannot clash with a definition
 * from the system headers, while the rest of this file keeps using the usual
 * name.  Both VLAN members are always present, whatever the header version. */
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
141
fa373af4
BP
142/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
143 *
144 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
145 * 2.6.32-431.29.2.el6.x86_64 (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
147 * if_link.h is not self-contained on those kernels. It is easiest to
148 * unconditionally define a replacement. */
149#ifndef IFLA_STATS64
337c9b99 150#define IFLA_STATS64 23
fa373af4
BP
151#endif
/* Unconditional replacement for struct rtnl_link_stats64.  The macro renames
 * our definition to rpl_rtnl_link_stats64 so that it cannot clash with a
 * definition from the system headers.  Every counter is 64 bits wide. */
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 181
/* Bits for 'cache_valid' in struct netdev_linux.  Each bit indicates that the
 * corresponding cached member(s) of the netdev currently hold valid data. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
c1c9c9c4
BP
193\f
194/* Traffic control. */
195
196/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
197 * network device.
198 *
199 * Each TC implementation subclasses this with whatever additional data it
200 * needs. */
c1c9c9c4
BP
201struct tc {
202 const struct tc_ops *ops;
93b13be8
BP
203 struct hmap queues; /* Contains "struct tc_queue"s.
204 * Read by generic TC layer.
205 * Written only by TC implementation. */
206};
c1c9c9c4 207
559eb230
BP
/* Static initializer for a 'struct tc' object TC that uses implementation
 * OPS: sets 'ops' to OPS and 'queues' to an empty hmap. */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
209
93b13be8
BP
210/* One traffic control queue.
211 *
212 * Each TC implementation subclasses this with whatever additional data it
213 * needs. */
214struct tc_queue {
215 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
216 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 217 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
218};
219
220/* A particular kind of traffic control. Each implementation generally maps to
221 * one particular Linux qdisc class.
222 *
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted. */
226struct tc_ops {
227 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
228 * This is null for tc_ops_default and tc_ops_other, for which there are no
229 * appropriate values. */
230 const char *linux_name;
231
232 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
233 const char *ovs_name;
234
235 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
236 * queues. The queues are numbered 0 through n_queues - 1. */
237 unsigned int n_queues;
238
239 /* Called to install this TC class on 'netdev'. The implementation should
240 * make the Netlink calls required to set up 'netdev' with the right qdisc
241 * and configure it according to 'details'. The implementation may assume
242 * that the current qdisc is the default; that is, there is no need for it
243 * to delete the current qdisc before installing itself.
244 *
245 * The contents of 'details' should be documented as valid for 'ovs_name'
246 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
247 * (which is built as ovs-vswitchd.conf.db(8)).
248 *
249 * This function must return 0 if and only if it sets 'netdev->tc' to an
250 * initialized 'struct tc'.
251 *
252 * (This function is null for tc_ops_other, which cannot be installed. For
253 * other TC classes it should always be nonnull.) */
79f1cbe9 254 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
255
256 /* Called when the netdev code determines (through a Netlink query) that
257 * this TC class's qdisc is installed on 'netdev', but we didn't install
258 * it ourselves and so don't know any of the details.
259 *
260 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
261 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
262 * implementation should parse the other attributes of 'nlmsg' as
263 * necessary to determine its configuration. If necessary it should also
264 * use Netlink queries to determine the configuration of queues on
265 * 'netdev'.
266 *
267 * This function must return 0 if and only if it sets 'netdev->tc' to an
268 * initialized 'struct tc'. */
269 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
270
271 /* Destroys the data structures allocated by the implementation as part of
272 * 'tc'. (This includes destroying 'tc->queues' by calling
273 * tc_destroy(tc).
274 *
275 * The implementation should not need to perform any Netlink calls. If
276 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
277 * (But it may not be desirable.)
278 *
279 * This function may be null if 'tc' is trivial. */
280 void (*tc_destroy)(struct tc *tc);
281
282 /* Retrieves details of 'netdev->tc' configuration into 'details'.
283 *
284 * The implementation should not need to perform any Netlink calls, because
285 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
286 * cached the configuration.
287 *
288 * The contents of 'details' should be documented as valid for 'ovs_name'
289 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
290 * (which is built as ovs-vswitchd.conf.db(8)).
291 *
292 * This function may be null if 'tc' is not configurable.
293 */
79f1cbe9 294 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
295
296 /* Reconfigures 'netdev->tc' according to 'details', performing any
297 * required Netlink calls to complete the reconfiguration.
298 *
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
302 *
303 * This function may be null if 'tc' is not configurable.
304 */
79f1cbe9 305 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 306
93b13be8
BP
307 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
308 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
309 *
310 * The contents of 'details' should be documented as valid for 'ovs_name'
311 * in the "other_config" column in the "Queue" table in
312 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
313 *
314 * The implementation should not need to perform any Netlink calls, because
315 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
316 * cached the queue configuration.
317 *
318 * This function may be null if 'tc' does not have queues ('n_queues' is
319 * 0). */
93b13be8 320 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 321 struct smap *details);
c1c9c9c4
BP
322
323 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
324 * 'details', perfoming any required Netlink calls to complete the
325 * reconfiguration. The caller ensures that 'queue_id' is less than
326 * 'n_queues'.
327 *
328 * The contents of 'details' should be documented as valid for 'ovs_name'
329 * in the "other_config" column in the "Queue" table in
330 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
331 *
332 * This function may be null if 'tc' does not have queues or its queues are
333 * not configurable. */
334 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 335 const struct smap *details);
c1c9c9c4 336
93b13be8
BP
337 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
338 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
339 *
340 * This function may be null if 'tc' does not have queues or its queues
341 * cannot be deleted. */
93b13be8 342 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 343
93b13be8
BP
344 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
345 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
346 *
347 * On success, initializes '*stats'.
348 *
349 * This function may be null if 'tc' does not have queues or if it cannot
350 * report queue statistics. */
93b13be8
BP
351 int (*class_get_stats)(const struct netdev *netdev,
352 const struct tc_queue *queue,
c1c9c9c4
BP
353 struct netdev_queue_stats *stats);
354
355 /* Extracts queue stats from 'nlmsg', which is a response to a
356 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
357 *
358 * This function may be null if 'tc' does not have queues or if it cannot
359 * report queue statistics. */
360 int (*class_dump_stats)(const struct netdev *netdev,
361 const struct ofpbuf *nlmsg,
362 netdev_dump_queue_stats_cb *cb, void *aux);
363};
364
365static void
366tc_init(struct tc *tc, const struct tc_ops *ops)
367{
368 tc->ops = ops;
93b13be8 369 hmap_init(&tc->queues);
c1c9c9c4
BP
370}
371
372static void
373tc_destroy(struct tc *tc)
374{
93b13be8 375 hmap_destroy(&tc->queues);
c1c9c9c4
BP
376}
377
378static const struct tc_ops tc_ops_htb;
a339aa81 379static const struct tc_ops tc_ops_hfsc;
677d9158
JV
380static const struct tc_ops tc_ops_codel;
381static const struct tc_ops tc_ops_fqcodel;
382static const struct tc_ops tc_ops_sfq;
c1c9c9c4
BP
383static const struct tc_ops tc_ops_default;
384static const struct tc_ops tc_ops_other;
385
559eb230 386static const struct tc_ops *const tcs[] = {
c1c9c9c4 387 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 388 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
389 &tc_ops_codel, /* Controlled delay */
390 &tc_ops_fqcodel, /* Fair queue controlled delay */
391 &tc_ops_sfq, /* Stochastic fair queueing */
c1c9c9c4
BP
392 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
393 &tc_ops_other, /* Some other qdisc. */
394 NULL
395};
149f577a 396
c1c9c9c4
BP
397static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
398static unsigned int tc_get_major(unsigned int handle);
399static unsigned int tc_get_minor(unsigned int handle);
400
401static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
402static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
403static unsigned int tc_buffer_per_jiffy(unsigned int rate);
404
405static struct tcmsg *tc_make_request(const struct netdev *, int type,
406 unsigned int flags, struct ofpbuf *);
407static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004 408static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
c7952afb
BP
409static int tc_add_policer(struct netdev *,
410 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
411
412static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
413 struct nlattr **options);
414static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
415 struct nlattr **options,
416 struct netdev_queue_stats *);
417static int tc_query_class(const struct netdev *,
418 unsigned int handle, unsigned int parent,
419 struct ofpbuf **replyp);
420static int tc_delete_class(const struct netdev *, unsigned int handle);
421
422static int tc_del_qdisc(struct netdev *netdev);
423static int tc_query_qdisc(const struct netdev *netdev);
424
425static int tc_calc_cell_log(unsigned int mtu);
426static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
427static void tc_put_rtab(struct ofpbuf *, uint16_t type,
428 const struct tc_ratespec *rate);
429static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
430\f
b5d57fc8
BP
431struct netdev_linux {
432 struct netdev up;
149f577a 433
86383816
BP
434 /* Protects all members below. */
435 struct ovs_mutex mutex;
436
149f577a 437 unsigned int cache_valid;
8b61709d 438
1670c579
EJ
439 bool miimon; /* Link status of last poll. */
440 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
441 struct timer miimon_timer;
442
8722022c
BP
443 /* The following are figured out "on demand" only. They are only valid
444 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
445 int ifindex;
446 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 447 struct in_addr address, netmask;
8b61709d
BP
448 struct in6_addr in6;
449 int mtu;
059e5f4f 450 unsigned int ifi_flags;
65c3058c 451 long long int carrier_resets;
80a86fbe
BP
452 uint32_t kbits_rate; /* Policing data. */
453 uint32_t kbits_burst;
bba1e6f3
PS
454 int vport_stats_error; /* Cached error code from vport_get_stats().
455 0 or an errno value. */
90a6637d 456 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 457 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 458 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 459 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 460 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 461
a00ca915
EJ
462 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
463 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
464 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 465
4f925bd3 466 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 467 struct tc *tc;
149f577a 468
d0d08f8a
BP
469 /* For devices of class netdev_tap_class only. */
470 int tap_fd;
8b61709d
BP
471};
472
f7791740
PS
473struct netdev_rxq_linux {
474 struct netdev_rxq up;
796223f5 475 bool is_tap;
5b7448ed 476 int fd;
149f577a 477};
8b61709d 478
8b61709d
BP
479/* This is set pretty low because we probably won't learn anything from the
480 * additional log messages. */
481static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
482
19c8e9c1
JS
483/* Polling miimon status for all ports causes performance degradation when
484 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
485 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
486 *
487 * Readers do not depend on this variable synchronizing with the related
488 * changes in the device miimon status, so we can use atomic_count. */
489static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 490
259e0b1a 491static void netdev_linux_run(void);
6f643e49 492
0b0544d7 493static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 494 int cmd, const char *cmd_name);
f1acd62b
BP
495static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
496 int cmd, const char *cmd_name);
b5d57fc8 497static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 498static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
499static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
500 enum netdev_flags on, enum netdev_flags *old_flagsp)
501 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
502static int do_get_ifindex(const char *netdev_name);
503static int get_ifindex(const struct netdev *, int *ifindexp);
504static int do_set_addr(struct netdev *netdev,
505 int ioctl_nr, const char *ioctl_name,
506 struct in_addr addr);
507static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 508static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
35eef899 509static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 510static int af_packet_sock(void);
19c8e9c1 511static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
512static void netdev_linux_miimon_run(void);
513static void netdev_linux_miimon_wait(void);
df1e5a3b 514static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 515
15b3596a
JG
516static bool
517is_netdev_linux_class(const struct netdev_class *netdev_class)
518{
259e0b1a 519 return netdev_class->run == netdev_linux_run;
15b3596a
JG
520}
521
796223f5
BP
522static bool
523is_tap_netdev(const struct netdev *netdev)
524{
b5d57fc8 525 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
526}
527
8b61709d
BP
528static struct netdev_linux *
529netdev_linux_cast(const struct netdev *netdev)
530{
b5d57fc8 531 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 532
180c6d0b 533 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 534}
796223f5 535
f7791740
PS
536static struct netdev_rxq_linux *
537netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 538{
9dc63482 539 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 540 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 541}
ff4ed3c9 542\f
cee87338 543static void netdev_linux_update(struct netdev_linux *netdev,
7e9dcc0f 544 const struct rtnetlink_change *)
86383816 545 OVS_REQUIRES(netdev->mutex);
cee87338 546static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
547 unsigned int ifi_flags, unsigned int mask)
548 OVS_REQUIRES(netdev->mutex);
cee87338
BP
549
550/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
551 * if no such socket could be created. */
552static struct nl_sock *
553netdev_linux_notify_sock(void)
554{
555 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
556 static struct nl_sock *sock;
557
558 if (ovsthread_once_start(&once)) {
559 int error;
560
561 error = nl_sock_create(NETLINK_ROUTE, &sock);
562 if (!error) {
563 error = nl_sock_join_mcgroup(sock, RTNLGRP_LINK);
564 if (error) {
565 nl_sock_destroy(sock);
566 sock = NULL;
567 }
568 }
569 ovsthread_once_done(&once);
570 }
571
572 return sock;
573}
574
19c8e9c1
JS
575static bool
576netdev_linux_miimon_enabled(void)
577{
812c272c 578 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
579}
580
8b61709d
BP
581static void
582netdev_linux_run(void)
583{
cee87338
BP
584 struct nl_sock *sock;
585 int error;
586
19c8e9c1
JS
587 if (netdev_linux_miimon_enabled()) {
588 netdev_linux_miimon_run();
589 }
cee87338
BP
590
591 sock = netdev_linux_notify_sock();
592 if (!sock) {
593 return;
594 }
595
596 do {
597 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
598 uint64_t buf_stub[4096 / 8];
599 struct ofpbuf buf;
600
601 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
602 error = nl_sock_recv(sock, &buf, false);
603 if (!error) {
7e9dcc0f 604 struct rtnetlink_change change;
cee87338 605
7e9dcc0f 606 if (rtnetlink_parse(&buf, &change)) {
cee87338
BP
607 struct netdev *netdev_ = netdev_from_name(change.ifname);
608 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
609 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
610
611 ovs_mutex_lock(&netdev->mutex);
cee87338 612 netdev_linux_update(netdev, &change);
86383816 613 ovs_mutex_unlock(&netdev->mutex);
cee87338 614 }
38e0065b 615 netdev_close(netdev_);
cee87338
BP
616 }
617 } else if (error == ENOBUFS) {
618 struct shash device_shash;
619 struct shash_node *node;
620
621 nl_sock_drain(sock);
622
623 shash_init(&device_shash);
624 netdev_get_devices(&netdev_linux_class, &device_shash);
625 SHASH_FOR_EACH (node, &device_shash) {
626 struct netdev *netdev_ = node->data;
627 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
628 unsigned int flags;
629
86383816 630 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
631 get_flags(netdev_, &flags);
632 netdev_linux_changed(netdev, flags, 0);
86383816
BP
633 ovs_mutex_unlock(&netdev->mutex);
634
cee87338
BP
635 netdev_close(netdev_);
636 }
637 shash_destroy(&device_shash);
638 } else if (error != EAGAIN) {
639 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
640 ovs_strerror(error));
641 }
642 ofpbuf_uninit(&buf);
643 } while (!error);
8b61709d
BP
644}
645
/* Arranges for the poll loop to wake up when netdev_linux_run() has work to
 * do: when miimon polling is due (if any device uses miimon) or when the
 * rtnetlink notification socket becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
659
ac4d3bcb 660static void
b5d57fc8
BP
661netdev_linux_changed(struct netdev_linux *dev,
662 unsigned int ifi_flags, unsigned int mask)
86383816 663 OVS_REQUIRES(dev->mutex)
ac4d3bcb 664{
3e912ffc 665 netdev_change_seq_changed(&dev->up);
8aa77183
BP
666
667 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
668 dev->carrier_resets++;
669 }
670 dev->ifi_flags = ifi_flags;
671
4f925bd3
PS
672 dev->cache_valid &= mask;
673}
674
675static void
b5d57fc8 676netdev_linux_update(struct netdev_linux *dev,
7e9dcc0f 677 const struct rtnetlink_change *change)
86383816 678 OVS_REQUIRES(dev->mutex)
4f925bd3
PS
679{
680 if (change->nlmsg_type == RTM_NEWLINK) {
681 /* Keep drv-info */
b5d57fc8 682 netdev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 683
c7b1b0a5 684 /* Update netdev from rtnl-change msg. */
90a6637d
PS
685 if (change->mtu) {
686 dev->mtu = change->mtu;
687 dev->cache_valid |= VALID_MTU;
688 dev->netdev_mtu_error = 0;
689 }
690
44445cac
PS
691 if (!eth_addr_is_zero(change->addr)) {
692 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
693 dev->cache_valid |= VALID_ETHERADDR;
694 dev->ether_addr_error = 0;
695 }
696
7e9dcc0f 697 dev->ifindex = change->if_index;
c7b1b0a5
PS
698 dev->cache_valid |= VALID_IFINDEX;
699 dev->get_ifindex_error = 0;
4f925bd3 700 } else {
b5d57fc8 701 netdev_linux_changed(dev, change->ifi_flags, 0);
4f925bd3 702 }
ac4d3bcb
EJ
703}
704
9dc63482
BP
705static struct netdev *
706netdev_linux_alloc(void)
707{
708 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
709 return &netdev->up;
710}
711
cee87338 712static void
9dc63482
BP
713netdev_linux_common_construct(struct netdev_linux *netdev)
714{
834d6caf 715 ovs_mutex_init(&netdev->mutex);
9dc63482
BP
716}
717
1f6e0fbd
BP
718/* Creates system and internal devices. */
719static int
9dc63482 720netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 721{
9dc63482 722 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1f6e0fbd
BP
723 int error;
724
cee87338 725 netdev_linux_common_construct(netdev);
1f6e0fbd 726
b5d57fc8
BP
727 error = get_flags(&netdev->up, &netdev->ifi_flags);
728 if (error == ENODEV) {
9dc63482 729 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 730 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
731 return ENODEV;
732 } else {
733 /* "Internal" netdevs have to be created as netdev objects before
734 * they exist in the kernel, because creating them in the kernel
735 * happens by passing a netdev object to dpif_port_add().
736 * Therefore, ignore the error. */
737 }
738 }
46415c90 739
a740f0de
JG
740 return 0;
741}
742
5b7448ed
JG
743/* For most types of netdevs we open the device for each call of
744 * netdev_open(). However, this is not the case with tap devices,
745 * since it is only possible to open the device once. In this
746 * situation we share a single file descriptor, and consequently
747 * buffers, across all readers. Therefore once data is read it will
748 * be unavailable to other reads for tap devices. */
a740f0de 749static int
9dc63482 750netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 751{
9dc63482 752 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 753 static const char tap_dev[] = "/dev/net/tun";
9dc63482 754 const char *name = netdev_->name;
a740f0de
JG
755 struct ifreq ifr;
756 int error;
757
cee87338 758 netdev_linux_common_construct(netdev);
1f6e0fbd 759
6c88d577 760 /* Open tap device. */
d0d08f8a
BP
761 netdev->tap_fd = open(tap_dev, O_RDWR);
762 if (netdev->tap_fd < 0) {
6c88d577 763 error = errno;
10a89ef0 764 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 765 return error;
6c88d577
JP
766 }
767
768 /* Create tap device. */
769 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 770 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 771 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 772 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 773 ovs_strerror(errno));
6c88d577 774 error = errno;
f61d8d29 775 goto error_close;
6c88d577
JP
776 }
777
778 /* Make non-blocking. */
d0d08f8a 779 error = set_nonblocking(netdev->tap_fd);
a740f0de 780 if (error) {
f61d8d29 781 goto error_close;
a740f0de
JG
782 }
783
784 return 0;
785
f61d8d29 786error_close:
d0d08f8a 787 close(netdev->tap_fd);
a740f0de
JG
788 return error;
789}
790
6c88d577 791static void
9dc63482 792netdev_linux_destruct(struct netdev *netdev_)
6c88d577 793{
b5d57fc8 794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 795
b5d57fc8
BP
796 if (netdev->tc && netdev->tc->ops->tc_destroy) {
797 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
798 }
799
d0d08f8a
BP
800 if (netdev_get_class(netdev_) == &netdev_tap_class
801 && netdev->tap_fd >= 0)
802 {
803 close(netdev->tap_fd);
6c88d577 804 }
86383816 805
19c8e9c1 806 if (netdev->miimon_interval > 0) {
812c272c 807 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
808 }
809
86383816 810 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
811}
812
9dc63482
BP
/* Frees the struct netdev_linux that embeds 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
819
f7791740
PS
820static struct netdev_rxq *
821netdev_linux_rxq_alloc(void)
9dc63482 822{
f7791740 823 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
824 return &rx->up;
825}
826
7b6b0ef4 827static int
f7791740 828netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 829{
f7791740 830 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 831 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 832 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 833 int error;
7b6b0ef4 834
86383816 835 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
836 rx->is_tap = is_tap_netdev(netdev_);
837 if (rx->is_tap) {
838 rx->fd = netdev->tap_fd;
796223f5
BP
839 } else {
840 struct sockaddr_ll sll;
b73c8518 841 int ifindex, val;
32383c3b 842 /* Result of tcpdump -dd inbound */
259e0b1a 843 static const struct sock_filter filt[] = {
32383c3b
MM
844 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
845 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
846 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
847 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
848 };
259e0b1a
BP
849 static const struct sock_fprog fprog = {
850 ARRAY_SIZE(filt), (struct sock_filter *) filt
851 };
7b6b0ef4 852
796223f5 853 /* Create file descriptor. */
9dc63482
BP
854 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
855 if (rx->fd < 0) {
796223f5 856 error = errno;
10a89ef0 857 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
858 goto error;
859 }
33d82a56 860
b73c8518
SH
861 val = 1;
862 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
863 error = errno;
864 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
865 netdev_get_name(netdev_), ovs_strerror(error));
866 goto error;
867 }
868
796223f5 869 /* Set non-blocking mode. */
9dc63482 870 error = set_nonblocking(rx->fd);
796223f5
BP
871 if (error) {
872 goto error;
873 }
7b6b0ef4 874
796223f5 875 /* Get ethernet device index. */
180c6d0b 876 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
877 if (error) {
878 goto error;
879 }
7b6b0ef4 880
796223f5
BP
881 /* Bind to specific ethernet device. */
882 memset(&sll, 0, sizeof sll);
883 sll.sll_family = AF_PACKET;
884 sll.sll_ifindex = ifindex;
b73c8518 885 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 886 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
887 error = errno;
888 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 889 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
890 goto error;
891 }
32383c3b
MM
892
893 /* Filter for only inbound packets. */
9dc63482 894 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
895 sizeof fprog);
896 if (error) {
897 error = errno;
259e0b1a 898 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 899 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
900 goto error;
901 }
7b6b0ef4 902 }
86383816 903 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 904
7b6b0ef4
BP
905 return 0;
906
907error:
9dc63482
BP
908 if (rx->fd >= 0) {
909 close(rx->fd);
7b6b0ef4 910 }
86383816 911 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
912 return error;
913}
914
796223f5 915static void
f7791740 916netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 917{
f7791740 918 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 919
796223f5
BP
920 if (!rx->is_tap) {
921 close(rx->fd);
8b61709d 922 }
9dc63482
BP
923}
924
/* Frees the 'struct netdev_rxq_linux' that contains 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 932
b73c8518
SH
933static ovs_be16
934auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux)
935{
936 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
937 return htons(aux->tp_vlan_tpid);
938 } else {
939 return htons(ETH_TYPE_VLAN);
940 }
941}
942
/* Returns true if 'aux' describes a VLAN tag: either its TCI is nonzero or
 * its status has TP_STATUS_VLAN_VALID set. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
948
796223f5 949static int
cf62fa4c 950netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 951{
b73c8518 952 size_t size;
796223f5 953 ssize_t retval;
b73c8518
SH
954 struct iovec iov;
955 struct cmsghdr *cmsg;
956 union {
957 struct cmsghdr cmsg;
958 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
959 } cmsg_buffer;
960 struct msghdr msgh;
961
962 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
963 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
964 size = dp_packet_tailroom(buffer);
b73c8518 965
cf62fa4c 966 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
967 iov.iov_len = size;
968 msgh.msg_name = NULL;
969 msgh.msg_namelen = 0;
970 msgh.msg_iov = &iov;
971 msgh.msg_iovlen = 1;
972 msgh.msg_control = &cmsg_buffer;
973 msgh.msg_controllen = sizeof cmsg_buffer;
974 msgh.msg_flags = 0;
8e8cddf7 975
796223f5 976 do {
b73c8518 977 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
978 } while (retval < 0 && errno == EINTR);
979
bfd3367b 980 if (retval < 0) {
b73c8518
SH
981 return errno;
982 } else if (retval > size) {
983 return EMSGSIZE;
984 }
985
cf62fa4c 986 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
987
988 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
989 const struct tpacket_auxdata *aux;
990
991 if (cmsg->cmsg_level != SOL_PACKET
992 || cmsg->cmsg_type != PACKET_AUXDATA
993 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
994 continue;
8b61709d 995 }
b73c8518
SH
996
997 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
998 if (auxdata_has_vlan_tci(aux)) {
999 if (retval < ETH_HEADER_LEN) {
1000 return EINVAL;
1001 }
1002
1003 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux),
1004 htons(aux->tp_vlan_tci));
1005 break;
1006 }
1007 }
1008
1009 return 0;
1010}
1011
/* Reads one packet from tap fd 'fd' into the tailroom of 'buffer'.
 *
 * Returns 0 on success, the errno value from read() on failure, or EMSGSIZE
 * if read() reports more bytes than the tailroom available. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    /* Retry if interrupted by a signal. */
    do {
        n_read = read(fd, dp_packet_data(buffer), room);
    } while (n_read < 0 && errno == EINTR);

    if (n_read < 0) {
        return errno;
    }
    if (n_read > room) {
        return EMSGSIZE;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1031
1032static int
e14deea0 1033netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
91088554 1034 int *c)
b73c8518 1035{
f7791740 1036 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1037 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1038 struct dp_packet *buffer;
df1e5a3b
PS
1039 ssize_t retval;
1040 int mtu;
1041
1042 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1043 mtu = ETH_PAYLOAD_MAX;
1044 }
1045
cf62fa4c 1046 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1047 DP_NETDEV_HEADROOM);
b73c8518 1048 retval = (rx->is_tap
f7791740
PS
1049 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1050 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1051
1052 if (retval) {
1053 if (retval != EAGAIN && retval != EMSGSIZE) {
1054 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
f7791740 1055 ovs_strerror(errno), netdev_rxq_get_name(rxq_));
df1e5a3b 1056 }
cf62fa4c 1057 dp_packet_delete(buffer);
df1e5a3b
PS
1058 } else {
1059 dp_packet_pad(buffer);
2bc1bbd2 1060 dp_packet_set_rss_hash(buffer, 0);
cf62fa4c 1061 packets[0] = buffer;
df1e5a3b 1062 *c = 1;
b73c8518
SH
1063 }
1064
1065 return retval;
8b61709d
BP
1066}
1067
8b61709d 1068static void
f7791740 1069netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1070{
f7791740 1071 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1072 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1073}
1074
8b61709d 1075static int
f7791740 1076netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1077{
f7791740 1078 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1079 if (rx->is_tap) {
8b61709d 1080 struct ifreq ifr;
f7791740 1081 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1082 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1083 if (error) {
1084 return error;
1085 }
796223f5 1086 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1087 return 0;
1088 } else {
796223f5 1089 return drain_rcvbuf(rx->fd);
8b61709d
BP
1090 }
1091}
1092
1093/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1094 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1095 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1096 * the packet is too big or too small to transmit on the device.
1097 *
1098 * The caller retains ownership of 'buffer' in all cases.
1099 *
1100 * The kernel maintains a packet transmission queue, so the caller is not
1101 * expected to do additional queuing of packets. */
1102static int
f00fa8cb 1103netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
e14deea0 1104 struct dp_packet **pkts, int cnt, bool may_steal)
8b61709d 1105{
f4fd623c
DDP
1106 int i;
1107 int error = 0;
40d26f04 1108
f4fd623c
DDP
1109 /* 'i' is incremented only if there's no error */
1110 for (i = 0; i < cnt;) {
cf62fa4c
PS
1111 const void *data = dp_packet_data(pkts[i]);
1112 size_t size = dp_packet_size(pkts[i]);
f23347ea 1113 ssize_t retval;
8b61709d 1114
796223f5 1115 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
1116 /* Use our AF_PACKET socket to send to this device. */
1117 struct sockaddr_ll sll;
1118 struct msghdr msg;
1119 struct iovec iov;
1120 int ifindex;
488d734d
BP
1121 int sock;
1122
1123 sock = af_packet_sock();
1124 if (sock < 0) {
c4c7a3d7 1125 return -sock;
488d734d 1126 }
f23347ea 1127
86383816
BP
1128 ifindex = netdev_get_ifindex(netdev_);
1129 if (ifindex < 0) {
1130 return -ifindex;
f23347ea 1131 }
8b61709d 1132
f23347ea
BP
1133 /* We don't bother setting most fields in sockaddr_ll because the
1134 * kernel ignores them for SOCK_RAW. */
1135 memset(&sll, 0, sizeof sll);
1136 sll.sll_family = AF_PACKET;
1137 sll.sll_ifindex = ifindex;
76c308b5 1138
ebc56baa 1139 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1140 iov.iov_len = size;
76c308b5 1141
f23347ea
BP
1142 msg.msg_name = &sll;
1143 msg.msg_namelen = sizeof sll;
1144 msg.msg_iov = &iov;
1145 msg.msg_iovlen = 1;
1146 msg.msg_control = NULL;
1147 msg.msg_controllen = 0;
1148 msg.msg_flags = 0;
1149
488d734d 1150 retval = sendmsg(sock, &msg, 0);
f23347ea 1151 } else {
796223f5
BP
1152 /* Use the tap fd to send to this device. This is essential for
1153 * tap devices, because packets sent to a tap device with an
1154 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1155 * tap device. This doesn't occur on other interface types
1156 * because we attach a socket filter to the rx socket. */
b5d57fc8 1157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1158
d0d08f8a 1159 retval = write(netdev->tap_fd, data, size);
f23347ea 1160 }
76c308b5 1161
8b61709d
BP
1162 if (retval < 0) {
1163 /* The Linux AF_PACKET implementation never blocks waiting for room
1164 * for packets, instead returning ENOBUFS. Translate this into
1165 * EAGAIN for the caller. */
f4fd623c
DDP
1166 error = errno == ENOBUFS ? EAGAIN : errno;
1167 if (error == EINTR) {
1168 /* continue without incrementing 'i', i.e. retry this packet */
8b61709d 1169 continue;
8b61709d 1170 }
f4fd623c 1171 break;
8b61709d 1172 } else if (retval != size) {
f4fd623c
DDP
1173 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1174 " of %"PRIuSIZE") on %s", retval, size,
1175 netdev_get_name(netdev_));
1176 error = EMSGSIZE;
1177 break;
1178 }
1179
1180 /* Process the next packet in the batch */
1181 i++;
1182 }
1183
1184 if (may_steal) {
1185 for (i = 0; i < cnt; i++) {
e14deea0 1186 dp_packet_delete(pkts[i]);
8b61709d
BP
1187 }
1188 }
f4fd623c
DDP
1189
1190 if (error && error != EAGAIN) {
1191 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1192 netdev_get_name(netdev_), ovs_strerror(error));
1193 }
1194
1195 return error;
1196
8b61709d
BP
1197}
1198
1199/* Registers with the poll loop to wake up from the next call to poll_block()
1200 * when the packet transmission queue has sufficient room to transmit a packet
1201 * with netdev_send().
1202 *
1203 * The kernel maintains a packet transmission queue, so the client is not
1204 * expected to do additional queuing of packets. Thus, this function is
1205 * unlikely to ever be used. It is included for completeness. */
1206static void
f00fa8cb 1207netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1208{
796223f5 1209 if (is_tap_netdev(netdev)) {
8b61709d
BP
1210 /* TAP device always accepts packets.*/
1211 poll_immediate_wake();
1212 }
1213}
1214
1215/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1216 * otherwise a positive errno value. */
1217static int
1218netdev_linux_set_etheraddr(struct netdev *netdev_,
1219 const uint8_t mac[ETH_ADDR_LEN])
1220{
b5d57fc8 1221 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1222 enum netdev_flags old_flags = 0;
eb395f2e
BP
1223 int error;
1224
86383816
BP
1225 ovs_mutex_lock(&netdev->mutex);
1226
b5d57fc8 1227 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1228 error = netdev->ether_addr_error;
1229 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1230 goto exit;
44445cac 1231 }
b5d57fc8 1232 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1233 }
1234
7eb1bd81 1235 /* Tap devices must be brought down before setting the address. */
796223f5 1236 if (is_tap_netdev(netdev_)) {
4f9f3f21 1237 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1238 }
44445cac
PS
1239 error = set_etheraddr(netdev_get_name(netdev_), mac);
1240 if (!error || error == ENODEV) {
b5d57fc8
BP
1241 netdev->ether_addr_error = error;
1242 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1243 if (!error) {
b5d57fc8 1244 memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 1245 }
8b61709d 1246 }
44445cac 1247
4f9f3f21
BP
1248 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1249 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1250 }
7eb1bd81 1251
86383816
BP
1252exit:
1253 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1254 return error;
1255}
1256
44445cac 1257/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1258static int
1259netdev_linux_get_etheraddr(const struct netdev *netdev_,
1260 uint8_t mac[ETH_ADDR_LEN])
1261{
b5d57fc8 1262 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1263 int error;
44445cac 1264
86383816 1265 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1266 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816
BP
1267 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1268 netdev->etheraddr);
b5d57fc8 1269 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1270 }
44445cac 1271
86383816
BP
1272 error = netdev->ether_addr_error;
1273 if (!error) {
b5d57fc8 1274 memcpy(mac, netdev->etheraddr, ETH_ADDR_LEN);
44445cac 1275 }
86383816 1276 ovs_mutex_unlock(&netdev->mutex);
44445cac 1277
86383816 1278 return error;
8b61709d
BP
1279}
1280
8b61709d 1281static int
73371c09 1282netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1283{
86383816
BP
1284 int error;
1285
b5d57fc8 1286 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1287 struct ifreq ifr;
90a6637d 1288
86383816 1289 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1290 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1291 netdev->mtu = ifr.ifr_mtu;
1292 netdev->cache_valid |= VALID_MTU;
8b61709d 1293 }
90a6637d 1294
86383816
BP
1295 error = netdev->netdev_mtu_error;
1296 if (!error) {
b5d57fc8 1297 *mtup = netdev->mtu;
90a6637d 1298 }
73371c09
BP
1299
1300 return error;
1301}
1302
1303/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1304 * in bytes, not including the hardware header; thus, this is typically 1500
1305 * bytes for Ethernet devices. */
1306static int
1307netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1308{
1309 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1310 int error;
1311
1312 ovs_mutex_lock(&netdev->mutex);
1313 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1314 ovs_mutex_unlock(&netdev->mutex);
1315
1316 return error;
8b61709d
BP
1317}
1318
9b020780
PS
1319/* Sets the maximum size of transmitted (MTU) for given device using linux
1320 * networking ioctl interface.
1321 */
1322static int
1323netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1324{
b5d57fc8 1325 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1326 struct ifreq ifr;
1327 int error;
1328
86383816 1329 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1330 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1331 error = netdev->netdev_mtu_error;
1332 if (error || netdev->mtu == mtu) {
1333 goto exit;
90a6637d 1334 }
b5d57fc8 1335 netdev->cache_valid &= ~VALID_MTU;
153e5481 1336 }
9b020780 1337 ifr.ifr_mtu = mtu;
259e0b1a
BP
1338 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1339 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1340 if (!error || error == ENODEV) {
b5d57fc8
BP
1341 netdev->netdev_mtu_error = error;
1342 netdev->mtu = ifr.ifr_mtu;
1343 netdev->cache_valid |= VALID_MTU;
9b020780 1344 }
86383816
BP
1345exit:
1346 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1347 return error;
9b020780
PS
1348}
1349
9ab3d9a3
BP
1350/* Returns the ifindex of 'netdev', if successful, as a positive number.
1351 * On failure, returns a negative errno value. */
1352static int
86383816 1353netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1354{
86383816 1355 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1356 int ifindex, error;
1357
86383816
BP
1358 ovs_mutex_lock(&netdev->mutex);
1359 error = get_ifindex(netdev_, &ifindex);
1360 ovs_mutex_unlock(&netdev->mutex);
1361
9ab3d9a3
BP
1362 return error ? -error : ifindex;
1363}
1364
8b61709d
BP
1365static int
1366netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1367{
b5d57fc8 1368 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1369
86383816 1370 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1371 if (netdev->miimon_interval > 0) {
1372 *carrier = netdev->miimon;
3a183124 1373 } else {
b5d57fc8 1374 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1375 }
86383816 1376 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1377
3a183124 1378 return 0;
8b61709d
BP
1379}
1380
65c3058c 1381static long long int
86383816 1382netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1383{
86383816
BP
1384 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1385 long long int carrier_resets;
1386
1387 ovs_mutex_lock(&netdev->mutex);
1388 carrier_resets = netdev->carrier_resets;
1389 ovs_mutex_unlock(&netdev->mutex);
1390
1391 return carrier_resets;
65c3058c
EJ
1392}
1393
63331829 1394static int
1670c579
EJ
1395netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1396 struct mii_ioctl_data *data)
63331829 1397{
63331829 1398 struct ifreq ifr;
782e6111 1399 int error;
63331829 1400
63331829 1401 memset(&ifr, 0, sizeof ifr);
782e6111 1402 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1403 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1404 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1405
782e6111
EJ
1406 return error;
1407}
1408
1409static int
1670c579 1410netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1411{
782e6111
EJ
1412 struct mii_ioctl_data data;
1413 int error;
63331829 1414
782e6111
EJ
1415 *miimon = false;
1416
1417 memset(&data, 0, sizeof data);
1670c579 1418 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1419 if (!error) {
1420 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1421 data.reg_num = MII_BMSR;
1670c579 1422 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1423 &data);
63331829
EJ
1424
1425 if (!error) {
782e6111 1426 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1427 } else {
1428 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1429 }
1430 } else {
1431 struct ethtool_cmd ecmd;
63331829
EJ
1432
1433 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1434 name);
1435
ab985a77 1436 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1437 memset(&ecmd, 0, sizeof ecmd);
1438 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1439 "ETHTOOL_GLINK");
1440 if (!error) {
782e6111
EJ
1441 struct ethtool_value eval;
1442
1443 memcpy(&eval, &ecmd, sizeof eval);
1444 *miimon = !!eval.data;
63331829
EJ
1445 } else {
1446 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1447 }
1448 }
1449
1450 return error;
1451}
1452
1670c579
EJ
1453static int
1454netdev_linux_set_miimon_interval(struct netdev *netdev_,
1455 long long int interval)
1456{
b5d57fc8 1457 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1458
86383816 1459 ovs_mutex_lock(&netdev->mutex);
1670c579 1460 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1461 if (netdev->miimon_interval != interval) {
19c8e9c1 1462 if (interval && !netdev->miimon_interval) {
812c272c 1463 atomic_count_inc(&miimon_cnt);
19c8e9c1 1464 } else if (!interval && netdev->miimon_interval) {
812c272c 1465 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1466 }
1467
b5d57fc8
BP
1468 netdev->miimon_interval = interval;
1469 timer_set_expired(&netdev->miimon_timer);
1670c579 1470 }
86383816 1471 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1472
1473 return 0;
1474}
1475
1476static void
1477netdev_linux_miimon_run(void)
1478{
1479 struct shash device_shash;
1480 struct shash_node *node;
1481
1482 shash_init(&device_shash);
b5d57fc8 1483 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1484 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1485 struct netdev *netdev = node->data;
1486 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1487 bool miimon;
1488
86383816
BP
1489 ovs_mutex_lock(&dev->mutex);
1490 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1491 netdev_linux_get_miimon(dev->up.name, &miimon);
1492 if (miimon != dev->miimon) {
1493 dev->miimon = miimon;
1494 netdev_linux_changed(dev, dev->ifi_flags, 0);
1495 }
1670c579 1496
86383816 1497 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1498 }
86383816 1499 ovs_mutex_unlock(&dev->mutex);
2f980d74 1500 netdev_close(netdev);
1670c579
EJ
1501 }
1502
1503 shash_destroy(&device_shash);
1504}
1505
1506static void
1507netdev_linux_miimon_wait(void)
1508{
1509 struct shash device_shash;
1510 struct shash_node *node;
1511
1512 shash_init(&device_shash);
b5d57fc8 1513 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1514 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1515 struct netdev *netdev = node->data;
1516 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1517
86383816 1518 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1519 if (dev->miimon_interval > 0) {
1520 timer_wait(&dev->miimon_timer);
1521 }
86383816 1522 ovs_mutex_unlock(&dev->mutex);
2f980d74 1523 netdev_close(netdev);
1670c579
EJ
1524 }
1525 shash_destroy(&device_shash);
1526}
1527
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_b = *b;

    *b = *a;
    *a = saved_b;
}
1535
c060c4cf
EJ
1536/* Copies 'src' into 'dst', performing format conversion in the process.
1537 *
1538 * 'src' is allowed to be misaligned. */
1539static void
1540netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1541 const struct ovs_vport_stats *src)
1542{
6a54dedc
BP
1543 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1544 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1545 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1546 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1547 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1548 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1549 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1550 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1551 dst->multicast = 0;
1552 dst->collisions = 0;
1553 dst->rx_length_errors = 0;
1554 dst->rx_over_errors = 0;
1555 dst->rx_crc_errors = 0;
1556 dst->rx_frame_errors = 0;
1557 dst->rx_fifo_errors = 0;
1558 dst->rx_missed_errors = 0;
1559 dst->tx_aborted_errors = 0;
1560 dst->tx_carrier_errors = 0;
1561 dst->tx_fifo_errors = 0;
1562 dst->tx_heartbeat_errors = 0;
1563 dst->tx_window_errors = 0;
1564}
1565
1566static int
1567get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1568{
93451a0a 1569 struct dpif_netlink_vport reply;
c060c4cf
EJ
1570 struct ofpbuf *buf;
1571 int error;
1572
93451a0a 1573 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1574 if (error) {
1575 return error;
1576 } else if (!reply.stats) {
1577 ofpbuf_delete(buf);
1578 return EOPNOTSUPP;
1579 }
1580
1581 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1582
1583 ofpbuf_delete(buf);
1584
1585 return 0;
1586}
1587
f613a0d7
PS
1588static void
1589get_stats_via_vport(const struct netdev *netdev_,
1590 struct netdev_stats *stats)
8b61709d 1591{
b5d57fc8 1592 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1593
b5d57fc8
BP
1594 if (!netdev->vport_stats_error ||
1595 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1596 int error;
7fbef77a 1597
c060c4cf 1598 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1599 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1600 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1601 "(%s)",
1602 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1603 }
b5d57fc8
BP
1604 netdev->vport_stats_error = error;
1605 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1606 }
f613a0d7 1607}
8b61709d 1608
f613a0d7
PS
1609/* Retrieves current device stats for 'netdev-linux'. */
1610static int
1611netdev_linux_get_stats(const struct netdev *netdev_,
1612 struct netdev_stats *stats)
1613{
b5d57fc8 1614 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1615 struct netdev_stats dev_stats;
1616 int error;
1617
86383816 1618 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1619 get_stats_via_vport(netdev_, stats);
35eef899 1620 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1621 if (error) {
86383816
BP
1622 if (!netdev->vport_stats_error) {
1623 error = 0;
f613a0d7 1624 }
86383816 1625 } else if (netdev->vport_stats_error) {
04c881eb 1626 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1627 *stats = dev_stats;
1628 } else {
04c881eb
AZ
1629 /* Use kernel netdev's packet and byte counts since vport's counters
1630 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1631 * enabled. */
1632 stats->rx_packets = dev_stats.rx_packets;
1633 stats->rx_bytes = dev_stats.rx_bytes;
1634 stats->tx_packets = dev_stats.tx_packets;
1635 stats->tx_bytes = dev_stats.tx_bytes;
1636
f613a0d7
PS
1637 stats->rx_errors += dev_stats.rx_errors;
1638 stats->tx_errors += dev_stats.tx_errors;
1639 stats->rx_dropped += dev_stats.rx_dropped;
1640 stats->tx_dropped += dev_stats.tx_dropped;
1641 stats->multicast += dev_stats.multicast;
1642 stats->collisions += dev_stats.collisions;
1643 stats->rx_length_errors += dev_stats.rx_length_errors;
1644 stats->rx_over_errors += dev_stats.rx_over_errors;
1645 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1646 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1647 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1648 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1649 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1650 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1651 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1652 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1653 stats->tx_window_errors += dev_stats.tx_window_errors;
1654 }
86383816
BP
1655 ovs_mutex_unlock(&netdev->mutex);
1656
1657 return error;
f613a0d7
PS
1658}
1659
1660/* Retrieves current device stats for 'netdev-tap' netdev or
1661 * netdev-internal. */
1662static int
15aee116 1663netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1664{
b5d57fc8 1665 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1666 struct netdev_stats dev_stats;
1667 int error;
1668
86383816 1669 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1670 get_stats_via_vport(netdev_, stats);
35eef899 1671 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1672 if (error) {
86383816
BP
1673 if (!netdev->vport_stats_error) {
1674 error = 0;
8b61709d 1675 }
86383816
BP
1676 } else if (netdev->vport_stats_error) {
1677 /* Transmit and receive stats will appear to be swapped relative to the
1678 * other ports since we are the one sending the data, not a remote
1679 * computer. For consistency, we swap them back here. This does not
1680 * apply if we are getting stats from the vport layer because it always
1681 * tracks stats from the perspective of the switch. */
fe6b0e03 1682
f613a0d7 1683 *stats = dev_stats;
92df599c
JG
1684 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1685 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1686 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1687 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1688 stats->rx_length_errors = 0;
1689 stats->rx_over_errors = 0;
1690 stats->rx_crc_errors = 0;
1691 stats->rx_frame_errors = 0;
1692 stats->rx_fifo_errors = 0;
1693 stats->rx_missed_errors = 0;
1694 stats->tx_aborted_errors = 0;
1695 stats->tx_carrier_errors = 0;
1696 stats->tx_fifo_errors = 0;
1697 stats->tx_heartbeat_errors = 0;
1698 stats->tx_window_errors = 0;
f613a0d7 1699 } else {
04c881eb
AZ
1700 /* Use kernel netdev's packet and byte counts since vport counters
1701 * do not reflect packet counts on the wire when GSO, TSO or GRO
1702 * are enabled. */
1703 stats->rx_packets = dev_stats.tx_packets;
1704 stats->rx_bytes = dev_stats.tx_bytes;
1705 stats->tx_packets = dev_stats.rx_packets;
1706 stats->tx_bytes = dev_stats.rx_bytes;
1707
f613a0d7
PS
1708 stats->rx_dropped += dev_stats.tx_dropped;
1709 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1710
f613a0d7
PS
1711 stats->rx_errors += dev_stats.tx_errors;
1712 stats->tx_errors += dev_stats.rx_errors;
1713
1714 stats->multicast += dev_stats.multicast;
1715 stats->collisions += dev_stats.collisions;
1716 }
86383816
BP
1717 ovs_mutex_unlock(&netdev->mutex);
1718
1719 return error;
8b61709d
BP
1720}
1721
bba1e6f3
PS
1722static int
1723netdev_internal_get_stats(const struct netdev *netdev_,
1724 struct netdev_stats *stats)
1725{
b5d57fc8 1726 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1727 int error;
bba1e6f3 1728
86383816 1729 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1730 get_stats_via_vport(netdev_, stats);
86383816
BP
1731 error = netdev->vport_stats_error;
1732 ovs_mutex_unlock(&netdev->mutex);
1733
1734 return error;
bba1e6f3
PS
1735}
1736
51f87458 1737static void
b5d57fc8 1738netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1739{
1740 struct ethtool_cmd ecmd;
6c038611 1741 uint32_t speed;
8b61709d
BP
1742 int error;
1743
b5d57fc8 1744 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1745 return;
1746 }
1747
ab985a77 1748 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1749 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1750 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1751 ETHTOOL_GSET, "ETHTOOL_GSET");
1752 if (error) {
51f87458 1753 goto out;
8b61709d
BP
1754 }
1755
1756 /* Supported features. */
b5d57fc8 1757 netdev->supported = 0;
8b61709d 1758 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1759 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1760 }
1761 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1762 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1763 }
1764 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1765 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1766 }
1767 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1768 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1769 }
1770 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1771 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1772 }
1773 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
b5d57fc8 1774 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1775 }
1776 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
b5d57fc8 1777 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1778 }
1779 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1780 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1781 }
1782 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1783 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1784 }
1785 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1786 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1787 }
1788 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1789 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1790 }
1791 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1792 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1793 }
1794
1795 /* Advertised features. */
b5d57fc8 1796 netdev->advertised = 0;
8b61709d 1797 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1798 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1799 }
1800 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1801 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1802 }
1803 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1804 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1805 }
1806 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1807 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1808 }
1809 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1810 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1811 }
1812 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
b5d57fc8 1813 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1814 }
1815 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
b5d57fc8 1816 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1817 }
1818 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1819 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1820 }
1821 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1822 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1823 }
1824 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1825 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1826 }
1827 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1828 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1829 }
1830 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1831 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1832 }
1833
1834 /* Current settings. */
2a529ead 1835 speed = ecmd.speed;
6c038611 1836 if (speed == SPEED_10) {
b5d57fc8 1837 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1838 } else if (speed == SPEED_100) {
b5d57fc8 1839 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1840 } else if (speed == SPEED_1000) {
b5d57fc8 1841 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1842 } else if (speed == SPEED_10000) {
b5d57fc8 1843 netdev->current = NETDEV_F_10GB_FD;
6c038611 1844 } else if (speed == 40000) {
b5d57fc8 1845 netdev->current = NETDEV_F_40GB_FD;
6c038611 1846 } else if (speed == 100000) {
b5d57fc8 1847 netdev->current = NETDEV_F_100GB_FD;
6c038611 1848 } else if (speed == 1000000) {
b5d57fc8 1849 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1850 } else {
b5d57fc8 1851 netdev->current = 0;
8b61709d
BP
1852 }
1853
1854 if (ecmd.port == PORT_TP) {
b5d57fc8 1855 netdev->current |= NETDEV_F_COPPER;
8b61709d 1856 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1857 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1858 }
1859
1860 if (ecmd.autoneg) {
b5d57fc8 1861 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1862 }
1863
51f87458 1864out:
b5d57fc8
BP
1865 netdev->cache_valid |= VALID_FEATURES;
1866 netdev->get_features_error = error;
51f87458
PS
1867}
1868
887ed8b2
BP
1869/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1870 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1871 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1872static int
1873netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1874 enum netdev_features *current,
1875 enum netdev_features *advertised,
1876 enum netdev_features *supported,
1877 enum netdev_features *peer)
51f87458 1878{
b5d57fc8 1879 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1880 int error;
51f87458 1881
86383816 1882 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1883 netdev_linux_read_features(netdev);
b5d57fc8
BP
1884 if (!netdev->get_features_error) {
1885 *current = netdev->current;
1886 *advertised = netdev->advertised;
1887 *supported = netdev->supported;
887ed8b2 1888 *peer = 0; /* XXX */
51f87458 1889 }
86383816
BP
1890 error = netdev->get_features_error;
1891 ovs_mutex_unlock(&netdev->mutex);
1892
1893 return error;
8b61709d
BP
1894}
1895
1896/* Set the features advertised by 'netdev' to 'advertise'. */
1897static int
86383816 1898netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 1899 enum netdev_features advertise)
8b61709d 1900{
86383816 1901 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
1902 struct ethtool_cmd ecmd;
1903 int error;
1904
86383816
BP
1905 ovs_mutex_lock(&netdev->mutex);
1906
ab985a77 1907 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1908 memset(&ecmd, 0, sizeof ecmd);
86383816 1909 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
1910 ETHTOOL_GSET, "ETHTOOL_GSET");
1911 if (error) {
86383816 1912 goto exit;
8b61709d
BP
1913 }
1914
1915 ecmd.advertising = 0;
6c038611 1916 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1917 ecmd.advertising |= ADVERTISED_10baseT_Half;
1918 }
6c038611 1919 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1920 ecmd.advertising |= ADVERTISED_10baseT_Full;
1921 }
6c038611 1922 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1923 ecmd.advertising |= ADVERTISED_100baseT_Half;
1924 }
6c038611 1925 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1926 ecmd.advertising |= ADVERTISED_100baseT_Full;
1927 }
6c038611 1928 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1929 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1930 }
6c038611 1931 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1932 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1933 }
6c038611 1934 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1935 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1936 }
6c038611 1937 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1938 ecmd.advertising |= ADVERTISED_TP;
1939 }
6c038611 1940 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1941 ecmd.advertising |= ADVERTISED_FIBRE;
1942 }
6c038611 1943 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1944 ecmd.advertising |= ADVERTISED_Autoneg;
1945 }
6c038611 1946 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1947 ecmd.advertising |= ADVERTISED_Pause;
1948 }
6c038611 1949 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1950 ecmd.advertising |= ADVERTISED_Asym_Pause;
1951 }
ab985a77 1952 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
1953 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
1954 ETHTOOL_SSET, "ETHTOOL_SSET");
1955
1956exit:
1957 ovs_mutex_unlock(&netdev->mutex);
1958 return error;
8b61709d
BP
1959}
1960
f8500004
JP
1961/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1962 * successful, otherwise a positive errno value. */
8b61709d 1963static int
b5d57fc8 1964netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
1965 uint32_t kbits_rate, uint32_t kbits_burst)
1966{
b5d57fc8
BP
1967 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1968 const char *netdev_name = netdev_get_name(netdev_);
f8500004 1969 int error;
8b61709d 1970
80a86fbe
BP
1971 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1972 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1973 : kbits_burst); /* Stick with user-specified value. */
1974
86383816 1975 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1976 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
1977 error = netdev->netdev_policing_error;
1978 if (error || (netdev->kbits_rate == kbits_rate &&
1979 netdev->kbits_burst == kbits_burst)) {
c9f71668 1980 /* Assume that settings haven't changed since we last set them. */
86383816 1981 goto out;
c9f71668 1982 }
b5d57fc8 1983 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1984 }
1985
ac8c3412 1986 COVERAGE_INC(netdev_set_policing);
f8500004 1987 /* Remove any existing ingress qdisc. */
b5d57fc8 1988 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
1989 if (error) {
1990 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 1991 netdev_name, ovs_strerror(error));
c9f71668 1992 goto out;
f8500004
JP
1993 }
1994
8b61709d 1995 if (kbits_rate) {
b5d57fc8 1996 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
1997 if (error) {
1998 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 1999 netdev_name, ovs_strerror(error));
c9f71668 2000 goto out;
8b61709d
BP
2001 }
2002
b5d57fc8 2003 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2004 if (error){
2005 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2006 netdev_name, ovs_strerror(error));
c9f71668 2007 goto out;
8b61709d 2008 }
8b61709d
BP
2009 }
2010
b5d57fc8
BP
2011 netdev->kbits_rate = kbits_rate;
2012 netdev->kbits_burst = kbits_burst;
f8500004 2013
c9f71668
PS
2014out:
2015 if (!error || error == ENODEV) {
b5d57fc8
BP
2016 netdev->netdev_policing_error = error;
2017 netdev->cache_valid |= VALID_POLICING;
c9f71668 2018 }
86383816 2019 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2020 return error;
8b61709d
BP
2021}
2022
c1c9c9c4
BP
2023static int
2024netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2025 struct sset *types)
c1c9c9c4 2026{
559eb230 2027 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2028
2029 for (opsp = tcs; *opsp != NULL; opsp++) {
2030 const struct tc_ops *ops = *opsp;
2031 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2032 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2033 }
2034 }
2035 return 0;
2036}
2037
2038static const struct tc_ops *
2039tc_lookup_ovs_name(const char *name)
2040{
559eb230 2041 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2042
2043 for (opsp = tcs; *opsp != NULL; opsp++) {
2044 const struct tc_ops *ops = *opsp;
2045 if (!strcmp(name, ops->ovs_name)) {
2046 return ops;
2047 }
2048 }
2049 return NULL;
2050}
2051
2052static const struct tc_ops *
2053tc_lookup_linux_name(const char *name)
2054{
559eb230 2055 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2056
2057 for (opsp = tcs; *opsp != NULL; opsp++) {
2058 const struct tc_ops *ops = *opsp;
2059 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2060 return ops;
2061 }
2062 }
2063 return NULL;
2064}
2065
93b13be8 2066static struct tc_queue *
b5d57fc8 2067tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2068 size_t hash)
2069{
b5d57fc8 2070 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2071 struct tc_queue *queue;
2072
b5d57fc8 2073 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2074 if (queue->queue_id == queue_id) {
2075 return queue;
2076 }
2077 }
2078 return NULL;
2079}
2080
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The bucket is selected with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t bucket = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, bucket);
}
2086
c1c9c9c4
BP
2087static int
2088netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2089 const char *type,
2090 struct netdev_qos_capabilities *caps)
2091{
2092 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2093 if (!ops) {
2094 return EOPNOTSUPP;
2095 }
2096 caps->n_queues = ops->n_queues;
2097 return 0;
2098}
2099
2100static int
b5d57fc8 2101netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2102 const char **typep, struct smap *details)
c1c9c9c4 2103{
b5d57fc8 2104 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2105 int error;
2106
86383816 2107 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2108 error = tc_query_qdisc(netdev_);
86383816
BP
2109 if (!error) {
2110 *typep = netdev->tc->ops->ovs_name;
2111 error = (netdev->tc->ops->qdisc_get
2112 ? netdev->tc->ops->qdisc_get(netdev_, details)
2113 : 0);
c1c9c9c4 2114 }
86383816 2115 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2116
86383816 2117 return error;
c1c9c9c4
BP
2118}
2119
2120static int
b5d57fc8 2121netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2122 const char *type, const struct smap *details)
c1c9c9c4 2123{
b5d57fc8 2124 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2125 const struct tc_ops *new_ops;
2126 int error;
2127
2128 new_ops = tc_lookup_ovs_name(type);
2129 if (!new_ops || !new_ops->tc_install) {
2130 return EOPNOTSUPP;
2131 }
2132
86383816 2133 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2134 error = tc_query_qdisc(netdev_);
c1c9c9c4 2135 if (error) {
86383816 2136 goto exit;
c1c9c9c4
BP
2137 }
2138
b5d57fc8 2139 if (new_ops == netdev->tc->ops) {
86383816 2140 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2141 } else {
2142 /* Delete existing qdisc. */
b5d57fc8 2143 error = tc_del_qdisc(netdev_);
c1c9c9c4 2144 if (error) {
86383816 2145 goto exit;
c1c9c9c4 2146 }
b5d57fc8 2147 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2148
2149 /* Install new qdisc. */
b5d57fc8
BP
2150 error = new_ops->tc_install(netdev_, details);
2151 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2152 }
86383816
BP
2153
2154exit:
2155 ovs_mutex_unlock(&netdev->mutex);
2156 return error;
c1c9c9c4
BP
2157}
2158
2159static int
b5d57fc8 2160netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2161 unsigned int queue_id, struct smap *details)
c1c9c9c4 2162{
b5d57fc8 2163 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2164 int error;
2165
86383816 2166 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2167 error = tc_query_qdisc(netdev_);
86383816 2168 if (!error) {
b5d57fc8 2169 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2170 error = (queue
b5d57fc8 2171 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2172 : ENOENT);
c1c9c9c4 2173 }
86383816
BP
2174 ovs_mutex_unlock(&netdev->mutex);
2175
2176 return error;
c1c9c9c4
BP
2177}
2178
2179static int
b5d57fc8 2180netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2181 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2182{
b5d57fc8 2183 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2184 int error;
2185
86383816 2186 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2187 error = tc_query_qdisc(netdev_);
86383816
BP
2188 if (!error) {
2189 error = (queue_id < netdev->tc->ops->n_queues
2190 && netdev->tc->ops->class_set
2191 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2192 : EINVAL);
c1c9c9c4 2193 }
86383816 2194 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2195
86383816 2196 return error;
c1c9c9c4
BP
2197}
2198
2199static int
b5d57fc8 2200netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2201{
b5d57fc8 2202 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2203 int error;
2204
86383816 2205 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2206 error = tc_query_qdisc(netdev_);
86383816
BP
2207 if (!error) {
2208 if (netdev->tc->ops->class_delete) {
2209 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2210 error = (queue
2211 ? netdev->tc->ops->class_delete(netdev_, queue)
2212 : ENOENT);
2213 } else {
2214 error = EINVAL;
2215 }
c1c9c9c4 2216 }
86383816
BP
2217 ovs_mutex_unlock(&netdev->mutex);
2218
2219 return error;
c1c9c9c4
BP
2220}
2221
2222static int
b5d57fc8 2223netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2224 unsigned int queue_id,
2225 struct netdev_queue_stats *stats)
2226{
b5d57fc8 2227 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2228 int error;
2229
86383816 2230 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2231 error = tc_query_qdisc(netdev_);
86383816
BP
2232 if (!error) {
2233 if (netdev->tc->ops->class_get_stats) {
2234 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2235 if (queue) {
2236 stats->created = queue->created;
2237 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2238 stats);
2239 } else {
2240 error = ENOENT;
2241 }
2242 } else {
2243 error = EOPNOTSUPP;
6dc34a0d 2244 }
c1c9c9c4 2245 }
86383816
BP
2246 ovs_mutex_unlock(&netdev->mutex);
2247
2248 return error;
c1c9c9c4
BP
2249}
2250
d57695d7
JS
/* State of a Netlink dump of traffic control classes, set up by
 * start_queue_dump() and torn down by finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;        /* The dump in progress. */
    struct ofpbuf buf;          /* Buffer handed to nl_dump_next(). */
};
2255
23a98ffe 2256static bool
d57695d7 2257start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2258{
2259 struct ofpbuf request;
2260 struct tcmsg *tcmsg;
2261
2262 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2263 if (!tcmsg) {
2264 return false;
2265 }
3c4de644 2266 tcmsg->tcm_parent = 0;
d57695d7 2267 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2268 ofpbuf_uninit(&request);
d57695d7
JS
2269
2270 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2271 return true;
c1c9c9c4
BP
2272}
2273
d57695d7
JS
2274static int
2275finish_queue_dump(struct queue_dump_state *state)
2276{
2277 ofpbuf_uninit(&state->buf);
2278 return nl_dump_done(&state->dump);
2279}
2280
89454bf4
BP
/* Iteration state for netdev_linux_queue_dump_start(), _next(), and
 * _done(): a snapshot of queue IDs plus a cursor into it. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of 'n_queues' queue IDs. */
    size_t cur_queue;           /* Index of the next ID to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2286
c1c9c9c4 2287static int
89454bf4 2288netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2289{
89454bf4 2290 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2291 int error;
2292
86383816 2293 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2294 error = tc_query_qdisc(netdev_);
86383816
BP
2295 if (!error) {
2296 if (netdev->tc->ops->class_get) {
89454bf4
BP
2297 struct netdev_linux_queue_state *state;
2298 struct tc_queue *queue;
2299 size_t i;
2300
2301 *statep = state = xmalloc(sizeof *state);
2302 state->n_queues = hmap_count(&netdev->tc->queues);
2303 state->cur_queue = 0;
2304 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2305
2306 i = 0;
2307 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2308 state->queues[i++] = queue->queue_id;
86383816 2309 }
c1c9c9c4 2310 } else {
86383816 2311 error = EOPNOTSUPP;
c1c9c9c4
BP
2312 }
2313 }
86383816 2314 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2315
86383816 2316 return error;
c1c9c9c4
BP
2317}
2318
89454bf4
BP
2319static int
2320netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2321 unsigned int *queue_idp, struct smap *details)
2322{
2323 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2324 struct netdev_linux_queue_state *state = state_;
2325 int error = EOF;
2326
2327 ovs_mutex_lock(&netdev->mutex);
2328 while (state->cur_queue < state->n_queues) {
2329 unsigned int queue_id = state->queues[state->cur_queue++];
2330 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2331
2332 if (queue) {
2333 *queue_idp = queue_id;
2334 error = netdev->tc->ops->class_get(netdev_, queue, details);
2335 break;
2336 }
2337 }
2338 ovs_mutex_unlock(&netdev->mutex);
2339
2340 return error;
2341}
2342
2343static int
2344netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2345 void *state_)
2346{
2347 struct netdev_linux_queue_state *state = state_;
2348
2349 free(state->queues);
2350 free(state);
2351 return 0;
2352}
2353
c1c9c9c4 2354static int
b5d57fc8 2355netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2356 netdev_dump_queue_stats_cb *cb, void *aux)
2357{
b5d57fc8 2358 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2359 int error;
2360
86383816 2361 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2362 error = tc_query_qdisc(netdev_);
86383816 2363 if (!error) {
d57695d7 2364 struct queue_dump_state state;
c1c9c9c4 2365
86383816
BP
2366 if (!netdev->tc->ops->class_dump_stats) {
2367 error = EOPNOTSUPP;
d57695d7 2368 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2369 error = ENODEV;
2370 } else {
2371 struct ofpbuf msg;
2372 int retval;
2373
d57695d7 2374 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2375 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2376 cb, aux);
2377 if (retval) {
2378 error = retval;
2379 }
2380 }
2381
d57695d7 2382 retval = finish_queue_dump(&state);
86383816
BP
2383 if (retval) {
2384 error = retval;
2385 }
c1c9c9c4
BP
2386 }
2387 }
86383816 2388 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2389
86383816 2390 return error;
c1c9c9c4
BP
2391}
2392
8b61709d 2393static int
f1acd62b
BP
2394netdev_linux_get_in4(const struct netdev *netdev_,
2395 struct in_addr *address, struct in_addr *netmask)
8b61709d 2396{
b5d57fc8 2397 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2398 int error;
149f577a 2399
86383816 2400 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2401 if (!(netdev->cache_valid & VALID_IN4)) {
b5d57fc8 2402 error = netdev_linux_get_ipv4(netdev_, &netdev->address,
8b61709d 2403 SIOCGIFADDR, "SIOCGIFADDR");
86383816
BP
2404 if (!error) {
2405 error = netdev_linux_get_ipv4(netdev_, &netdev->netmask,
2406 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2407 if (!error) {
2408 netdev->cache_valid |= VALID_IN4;
2409 }
8b61709d 2410 }
86383816
BP
2411 } else {
2412 error = 0;
2413 }
8b61709d 2414
86383816
BP
2415 if (!error) {
2416 if (netdev->address.s_addr != INADDR_ANY) {
2417 *address = netdev->address;
2418 *netmask = netdev->netmask;
2419 } else {
2420 error = EADDRNOTAVAIL;
f1acd62b 2421 }
8b61709d 2422 }
86383816
BP
2423 ovs_mutex_unlock(&netdev->mutex);
2424
2425 return error;
8b61709d
BP
2426}
2427
8b61709d 2428static int
f1acd62b
BP
2429netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2430 struct in_addr netmask)
8b61709d 2431{
b5d57fc8 2432 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2433 int error;
2434
86383816 2435 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2436 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2437 if (!error) {
b5d57fc8
BP
2438 netdev->cache_valid |= VALID_IN4;
2439 netdev->address = address;
2440 netdev->netmask = netmask;
f1acd62b 2441 if (address.s_addr != INADDR_ANY) {
8b61709d 2442 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2443 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2444 }
2445 }
86383816
BP
2446 ovs_mutex_unlock(&netdev->mutex);
2447
8b61709d
BP
2448 return error;
2449}
2450
/* Parses 'line' with ovs_scan() as 16 two-digit hexadecimal bytes followed by
 * four skipped hexadecimal fields and a name of at most 16 characters.
 * Stores the bytes in 'in6' and the name in 'ifname'.
 *
 * Returns the result of ovs_scan(): true if 'line' matched the format. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    bool parsed;

#define X8 "%2"SCNx8
    parsed = ovs_scan(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &octet[0], &octet[1], &octet[2], &octet[3],
                      &octet[4], &octet[5], &octet[6], &octet[7],
                      &octet[8], &octet[9], &octet[10], &octet[11],
                      &octet[12], &octet[13], &octet[14], &octet[15],
                      ifname);
    return parsed;
}
2466
2467/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2468 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2469static int
2470netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2471{
b5d57fc8 2472 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
2473
2474 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2475 if (!(netdev->cache_valid & VALID_IN6)) {
8b61709d
BP
2476 FILE *file;
2477 char line[128];
2478
b5d57fc8 2479 netdev->in6 = in6addr_any;
8b61709d
BP
2480
2481 file = fopen("/proc/net/if_inet6", "r");
2482 if (file != NULL) {
2483 const char *name = netdev_get_name(netdev_);
2484 while (fgets(line, sizeof line, file)) {
2a022368 2485 struct in6_addr in6_tmp;
8b61709d 2486 char ifname[16 + 1];
2a022368 2487 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2488 && !strcmp(name, ifname))
2489 {
b5d57fc8 2490 netdev->in6 = in6_tmp;
8b61709d
BP
2491 break;
2492 }
2493 }
2494 fclose(file);
2495 }
b5d57fc8 2496 netdev->cache_valid |= VALID_IN6;
8b61709d 2497 }
b5d57fc8 2498 *in6 = netdev->in6;
86383816
BP
2499 ovs_mutex_unlock(&netdev->mutex);
2500
8b61709d
BP
2501 return 0;
2502}
2503
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * remaining bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Clear the destination first so nothing stale survives the copy. */
    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memcpy(sa, &in4, sizeof in4);
}
2516
2517static int
2518do_set_addr(struct netdev *netdev,
2519 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2520{
2521 struct ifreq ifr;
149f577a 2522
259e0b1a
BP
2523 make_in4_sockaddr(&ifr.ifr_addr, addr);
2524 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2525 ioctl_name);
8b61709d
BP
2526}
2527
2528/* Adds 'router' as a default IP gateway. */
2529static int
67a4917b 2530netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2531{
2532 struct in_addr any = { INADDR_ANY };
2533 struct rtentry rt;
2534 int error;
2535
2536 memset(&rt, 0, sizeof rt);
2537 make_in4_sockaddr(&rt.rt_dst, any);
2538 make_in4_sockaddr(&rt.rt_gateway, router);
2539 make_in4_sockaddr(&rt.rt_genmask, any);
2540 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2541 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2542 if (error) {
10a89ef0 2543 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2544 }
2545 return error;
2546}
2547
f1acd62b
BP
2548static int
2549netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2550 char **netdev_name)
2551{
2552 static const char fn[] = "/proc/net/route";
2553 FILE *stream;
2554 char line[256];
2555 int ln;
2556
2557 *netdev_name = NULL;
2558 stream = fopen(fn, "r");
2559 if (stream == NULL) {
10a89ef0 2560 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2561 return errno;
2562 }
2563
2564 ln = 0;
2565 while (fgets(line, sizeof line, stream)) {
2566 if (++ln >= 2) {
2567 char iface[17];
dbba996b 2568 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2569 int refcnt, metric, mtu;
2570 unsigned int flags, use, window, irtt;
2571
c2c28dfd
BP
2572 if (!ovs_scan(line,
2573 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2574 " %d %u %u\n",
2575 iface, &dest, &gateway, &flags, &refcnt,
2576 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2577 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2578 fn, ln, line);
2579 continue;
2580 }
2581 if (!(flags & RTF_UP)) {
2582 /* Skip routes that aren't up. */
2583 continue;
2584 }
2585
2586 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2587 * network byte order, so we don't need need any endian
f1acd62b
BP
2588 * conversions here. */
2589 if ((dest & mask) == (host->s_addr & mask)) {
2590 if (!gateway) {
2591 /* The host is directly reachable. */
2592 next_hop->s_addr = 0;
2593 } else {
2594 /* To reach the host, we must go through a gateway. */
2595 next_hop->s_addr = gateway;
2596 }
2597 *netdev_name = xstrdup(iface);
2598 fclose(stream);
2599 return 0;
2600 }
2601 }
2602 }
2603
2604 fclose(stream);
2605 return ENXIO;
2606}
2607
e210037e 2608static int
b5d57fc8 2609netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2610{
b5d57fc8 2611 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2612 int error = 0;
2613
86383816 2614 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2615 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2616 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2617
2618 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2619 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2620 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2621 cmd,
2622 ETHTOOL_GDRVINFO,
2623 "ETHTOOL_GDRVINFO");
2624 if (!error) {
b5d57fc8 2625 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2626 }
2627 }
e210037e 2628
e210037e 2629 if (!error) {
b5d57fc8
BP
2630 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2631 smap_add(smap, "driver_version", netdev->drvinfo.version);
2632 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2633 }
86383816
BP
2634 ovs_mutex_unlock(&netdev->mutex);
2635
e210037e
AE
2636 return error;
2637}
2638
4f925bd3 2639static int
275707c3
EJ
2640netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2641 struct smap *smap)
4f925bd3 2642{
79f1cbe9 2643 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2644 return 0;
2645}
2646
8b61709d
BP
2647/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2648 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2649 * returns 0. Otherwise, it returns a positive errno value; in particular,
2650 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2651static int
2652netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2653 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2654{
2655 struct arpreq r;
c100e025 2656 struct sockaddr_in sin;
8b61709d
BP
2657 int retval;
2658
2659 memset(&r, 0, sizeof r);
f2cc621b 2660 memset(&sin, 0, sizeof sin);
c100e025
BP
2661 sin.sin_family = AF_INET;
2662 sin.sin_addr.s_addr = ip;
2663 sin.sin_port = 0;
2664 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2665 r.arp_ha.sa_family = ARPHRD_ETHER;
2666 r.arp_flags = 0;
71d7c22f 2667 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2668 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2669 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2670 if (!retval) {
2671 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2672 } else if (retval != ENXIO) {
2673 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2674 netdev_get_name(netdev), IP_ARGS(ip),
2675 ovs_strerror(retval));
8b61709d
BP
2676 }
2677 return retval;
2678}
2679
2680static int
2681nd_to_iff_flags(enum netdev_flags nd)
2682{
2683 int iff = 0;
2684 if (nd & NETDEV_UP) {
2685 iff |= IFF_UP;
2686 }
2687 if (nd & NETDEV_PROMISC) {
2688 iff |= IFF_PROMISC;
2689 }
7ba19d41
AC
2690 if (nd & NETDEV_LOOPBACK) {
2691 iff |= IFF_LOOPBACK;
2692 }
8b61709d
BP
2693 return iff;
2694}
2695
2696static int
2697iff_to_nd_flags(int iff)
2698{
2699 enum netdev_flags nd = 0;
2700 if (iff & IFF_UP) {
2701 nd |= NETDEV_UP;
2702 }
2703 if (iff & IFF_PROMISC) {
2704 nd |= NETDEV_PROMISC;
2705 }
7ba19d41
AC
2706 if (iff & IFF_LOOPBACK) {
2707 nd |= NETDEV_LOOPBACK;
2708 }
8b61709d
BP
2709 return nd;
2710}
2711
2712static int
4f9f3f21
BP
2713update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2714 enum netdev_flags on, enum netdev_flags *old_flagsp)
2715 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2716{
2717 int old_flags, new_flags;
c37d4da4
EJ
2718 int error = 0;
2719
b5d57fc8 2720 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2721 *old_flagsp = iff_to_nd_flags(old_flags);
2722 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2723 if (new_flags != old_flags) {
4f9f3f21
BP
2724 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2725 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2726 }
4f9f3f21
BP
2727
2728 return error;
2729}
2730
2731static int
2732netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2733 enum netdev_flags on, enum netdev_flags *old_flagsp)
2734{
2735 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2736 int error;
2737
2738 ovs_mutex_lock(&netdev->mutex);
2739 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2740 ovs_mutex_unlock(&netdev->mutex);
2741
8b61709d
BP
2742 return error;
2743}
2744
/* Expands to a brace-enclosed initializer, used below to define the
 * 'struct netdev_class' instances of this file.
 *
 * The initializer is positional (no designators), so its entries must stay in
 * the same order as the members of the structure being initialized.  The
 * entries are the same for every class except the five supplied as
 * arguments:
 *
 *   NAME          first entry of the initializer (a string at the call
 *                 sites).
 *   CONSTRUCT     entry between netdev_linux_alloc and netdev_linux_destruct.
 *   GET_STATS     entry after netdev_linux_set_miimon_interval.
 *   GET_FEATURES  entry before netdev_linux_set_advertisements; may be NULL.
 *   GET_STATUS    entry after netdev_linux_get_next_hop. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_multiq */                \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2815
/* The "system" netdev class: regular construct, stats, features and status
 * handlers. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);
c3827f61
BP
2823
/* The "tap" netdev class: differs from "system" in its constructor and its
 * statistics handler. */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);
c3827f61
BP
2831
/* The "internal" netdev class: uses its own statistics and status handlers
 * and provides no get_features implementation. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
8b61709d 2839\f
677d9158
JV
2840
/* CoDel traffic control class. */

/* Value placed in the n_queues slot of tc_ops_codel. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET 1
#define TCA_CODEL_LIMIT 2
#define TCA_CODEL_INTERVAL 3

/* Userspace record of a CoDel qdisc.  The three parameters mirror the
 * TCA_CODEL_TARGET, TCA_CODEL_LIMIT and TCA_CODEL_INTERVAL Netlink
 * attributes and the "target", "limit" and "interval" detail keys.
 * NOTE(review): units are whatever the kernel attributes use (presumably
 * microseconds for target/interval, packets for limit) -- confirm. */
struct codel {
    struct tc tc;               /* Embedded base object; see codel_get__(). */
    uint32_t target;
    uint32_t limit;
    uint32_t interval;
};
2857
2858static struct codel *
2859codel_get__(const struct netdev *netdev_)
2860{
2861 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2862 return CONTAINER_OF(netdev->tc, struct codel, tc);
2863}
2864
2865static void
2866codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2867 uint32_t interval)
2868{
2869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2870 struct codel *codel;
2871
2872 codel = xmalloc(sizeof *codel);
2873 tc_init(&codel->tc, &tc_ops_codel);
2874 codel->target = target;
2875 codel->limit = limit;
2876 codel->interval = interval;
2877
2878 netdev->tc = &codel->tc;
2879}
2880
2881static int
2882codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2883 uint32_t interval)
2884{
2885 size_t opt_offset;
2886 struct ofpbuf request;
2887 struct tcmsg *tcmsg;
2888 uint32_t otarget, olimit, ointerval;
2889 int error;
2890
2891 tc_del_qdisc(netdev);
2892
2893 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2894 NLM_F_EXCL | NLM_F_CREATE, &request);
2895 if (!tcmsg) {
2896 return ENODEV;
2897 }
2898 tcmsg->tcm_handle = tc_make_handle(1, 0);
2899 tcmsg->tcm_parent = TC_H_ROOT;
2900
2901 otarget = target ? target : 5000;
2902 olimit = limit ? limit : 10240;
2903 ointerval = interval ? interval : 100000;
2904
2905 nl_msg_put_string(&request, TCA_KIND, "codel");
2906 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2907 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2908 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2909 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2910 nl_msg_end_nested(&request, opt_offset);
2911
2912 error = tc_transact(&request, NULL);
2913 if (error) {
2914 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2915 "target %u, limit %u, interval %u error %d(%s)",
2916 netdev_get_name(netdev),
2917 otarget, olimit, ointerval,
2918 error, ovs_strerror(error));
2919 }
2920 return error;
2921}
2922
2923static void
2924codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2925 const struct smap *details, struct codel *codel)
2926{
2927 const char *target_s;
2928 const char *limit_s;
2929 const char *interval_s;
2930
2931 target_s = smap_get(details, "target");
2932 limit_s = smap_get(details, "limit");
2933 interval_s = smap_get(details, "interval");
2934
2935 codel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
2936 codel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
2937 codel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
2938
2939 if (!codel->target) {
2940 codel->target = 5000;
2941 }
2942 if (!codel->limit) {
2943 codel->limit = 10240;
2944 }
2945 if (!codel->interval) {
2946 codel->interval = 100000;
2947 }
2948}
2949
2950static int
2951codel_tc_install(struct netdev *netdev, const struct smap *details)
2952{
2953 int error;
2954 struct codel codel;
2955
2956 codel_parse_qdisc_details__(netdev, details, &codel);
2957 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2958 codel.interval);
2959 if (!error) {
2960 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2961 }
2962 return error;
2963}
2964
2965static int
2966codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2967{
2968 static const struct nl_policy tca_codel_policy[] = {
2969 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2970 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2971 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2972 };
2973
2974 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2975
2976 if (!nl_parse_nested(nl_options, tca_codel_policy,
2977 attrs, ARRAY_SIZE(tca_codel_policy))) {
2978 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2979 return EPROTO;
2980 }
2981
2982 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2983 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2984 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
2985 return 0;
2986}
2987
2988static int
2989codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
2990{
2991 struct nlattr *nlattr;
2992 const char * kind;
2993 int error;
2994 struct codel codel;
2995
2996 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
2997 if (error != 0) {
2998 return error;
2999 }
3000
3001 error = codel_parse_tca_options__(nlattr, &codel);
3002 if (error != 0) {
3003 return error;
3004 }
3005
3006 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3007 return 0;
3008}
3009
3010
3011static void
3012codel_tc_destroy(struct tc *tc)
3013{
3014 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3015 tc_destroy(tc);
3016 free(codel);
3017}
3018
3019static int
3020codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3021{
3022 const struct codel *codel = codel_get__(netdev);
3023 smap_add_format(details, "target", "%u", codel->target);
3024 smap_add_format(details, "limit", "%u", codel->limit);
3025 smap_add_format(details, "interval", "%u", codel->interval);
3026 return 0;
3027}
3028
3029static int
3030codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3031{
3032 struct codel codel;
3033
3034 codel_parse_qdisc_details__(netdev, details, &codel);
3035 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3036 codel_get__(netdev)->target = codel.target;
3037 codel_get__(netdev)->limit = codel.limit;
3038 codel_get__(netdev)->interval = codel.interval;
3039 return 0;
3040}
3041
/* Operations table for the CoDel qdisc.  The initializer is positional, so
 * the entry order must match the member order of 'struct tc_ops' (declared
 * outside this section). */
static const struct tc_ops tc_ops_codel = {
    "codel",                    /* linux_name */
    "linux-codel",              /* ovs_name */
    CODEL_N_QUEUES,             /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    /* Remaining slots are unset.  NOTE(review): presumably the per-queue
     * (class) callbacks, unused because n_queues is 0 -- confirm against
     * 'struct tc_ops'. */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3057\f
3058/* FQ-CoDel traffic control class. */
3059
/* Value placed in the n_queues slot of tc_ops_fqcodel. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET 1
#define TCA_FQ_CODEL_LIMIT 2
#define TCA_FQ_CODEL_INTERVAL 3
#define TCA_FQ_CODEL_ECN 4
#define TCA_FQ_CODEL_FLOWS 5
#define TCA_FQ_CODEL_QUANTUM 6

/* Userspace record of an FQ-CoDel qdisc.  The parameters mirror the
 * TCA_FQ_CODEL_* Netlink attributes of the same name and the "target",
 * "limit", "interval", "flows" and "quantum" detail keys.  (The ECN
 * attribute is defined above but not tracked here.) */
struct fqcodel {
    struct tc tc;               /* Embedded base object; see fqcodel_get__(). */
    uint32_t target;
    uint32_t limit;
    uint32_t interval;
    uint32_t flows;
    uint32_t quantum;
};
3081
3082static struct fqcodel *
3083fqcodel_get__(const struct netdev *netdev_)
3084{
3085 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3086 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3087}
3088
3089static void
3090fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3091 uint32_t interval, uint32_t flows, uint32_t quantum)
3092{
3093 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3094 struct fqcodel *fqcodel;
3095
3096 fqcodel = xmalloc(sizeof *fqcodel);
3097 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3098 fqcodel->target = target;
3099 fqcodel->limit = limit;
3100 fqcodel->interval = interval;
3101 fqcodel->flows = flows;
3102 fqcodel->quantum = quantum;
3103
3104 netdev->tc = &fqcodel->tc;
3105}
3106
3107static int
3108fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3109 uint32_t interval, uint32_t flows, uint32_t quantum)
3110{
3111 size_t opt_offset;
3112 struct ofpbuf request;
3113 struct tcmsg *tcmsg;
3114 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3115 int error;
3116
3117 tc_del_qdisc(netdev);
3118
3119 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3120 NLM_F_EXCL | NLM_F_CREATE, &request);
3121 if (!tcmsg) {
3122 return ENODEV;
3123 }
3124 tcmsg->tcm_handle = tc_make_handle(1, 0);
3125 tcmsg->tcm_parent = TC_H_ROOT;
3126
3127 otarget = target ? target : 5000;
3128 olimit = limit ? limit : 10240;
3129 ointerval = interval ? interval : 100000;
3130 oflows = flows ? flows : 1024;
3131 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3132 not mtu */
3133
3134 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3135 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3136 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3137 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3138 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3139 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3140 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3141 nl_msg_end_nested(&request, opt_offset);
3142
3143 error = tc_transact(&request, NULL);
3144 if (error) {
3145 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3146 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3147 netdev_get_name(netdev),
3148 otarget, olimit, ointerval, oflows, oquantum,
3149 error, ovs_strerror(error));
3150 }
3151 return error;
3152}
3153
3154static void
3155fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3156 const struct smap *details, struct fqcodel *fqcodel)
3157{
3158 const char *target_s;
3159 const char *limit_s;
3160 const char *interval_s;
3161 const char *flows_s;
3162 const char *quantum_s;
3163
3164 target_s = smap_get(details, "target");
3165 limit_s = smap_get(details, "limit");
3166 interval_s = smap_get(details, "interval");
3167 flows_s = smap_get(details, "flows");
3168 quantum_s = smap_get(details, "quantum");
3169 fqcodel->target = target_s ? strtoull(target_s, NULL, 10) : 0;
3170 fqcodel->limit = limit_s ? strtoull(limit_s, NULL, 10) : 0;
3171 fqcodel->interval = interval_s ? strtoull(interval_s, NULL, 10) : 0;
3172 fqcodel->flows = flows_s ? strtoull(flows_s, NULL, 10) : 0;
3173 fqcodel->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3174 if (!fqcodel->target) {
3175 fqcodel->target = 5000;
3176 }
3177 if (!fqcodel->limit) {
3178 fqcodel->limit = 10240;
3179 }
3180 if (!fqcodel->interval) {
3181 fqcodel->interval = 1000000;
3182 }
3183 if (!fqcodel->flows) {
3184 fqcodel->flows = 1024;
3185 }
3186 if (!fqcodel->quantum) {
3187 fqcodel->quantum = 1514;
3188 }
3189}
3190
3191static int
3192fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3193{
3194 int error;
3195 struct fqcodel fqcodel;
3196
3197 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3198 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3199 fqcodel.interval, fqcodel.flows,
3200 fqcodel.quantum);
3201 if (!error) {
3202 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3203 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3204 }
3205 return error;
3206}
3207
3208static int
3209fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3210{
3211 static const struct nl_policy tca_fqcodel_policy[] = {
3212 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3213 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3214 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3215 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3216 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3217 };
3218
3219 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3220
3221 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3222 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3223 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3224 return EPROTO;
3225 }
3226
3227 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3228 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3229 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3230 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3231 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3232 return 0;
3233}
3234
3235static int
3236fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3237{
3238 struct nlattr *nlattr;
3239 const char * kind;
3240 int error;
3241 struct fqcodel fqcodel;
3242
3243 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3244 if (error != 0) {
3245 return error;
3246 }
3247
3248 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3249 if (error != 0) {
3250 return error;
3251 }
3252
3253 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3254 fqcodel.flows, fqcodel.quantum);
3255 return 0;
3256}
3257
3258static void
3259fqcodel_tc_destroy(struct tc *tc)
3260{
3261 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3262 tc_destroy(tc);
3263 free(fqcodel);
3264}
3265
3266static int
3267fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3268{
3269 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3270 smap_add_format(details, "target", "%u", fqcodel->target);
3271 smap_add_format(details, "limit", "%u", fqcodel->limit);
3272 smap_add_format(details, "interval", "%u", fqcodel->interval);
3273 smap_add_format(details, "flows", "%u", fqcodel->flows);
3274 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3275 return 0;
3276}
3277
3278static int
3279fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3280{
3281 struct fqcodel fqcodel;
3282
3283 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3284 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3285 fqcodel.flows, fqcodel.quantum);
3286 fqcodel_get__(netdev)->target = fqcodel.target;
3287 fqcodel_get__(netdev)->limit = fqcodel.limit;
3288 fqcodel_get__(netdev)->interval = fqcodel.interval;
3289 fqcodel_get__(netdev)->flows = fqcodel.flows;
3290 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3291 return 0;
3292}
3293
/* Operations table for the FQ-CoDel qdisc.  The initializer is positional,
 * so the entry order must match the member order of 'struct tc_ops'
 * (declared outside this section). */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                 /* linux_name */
    "linux-fq_codel",           /* ovs_name */
    FQCODEL_N_QUEUES,           /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    /* Remaining slots are unset.  NOTE(review): presumably the per-queue
     * (class) callbacks, unused because n_queues is 0 -- confirm against
     * 'struct tc_ops'. */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3309\f
3310/* SFQ traffic control class. */
3311
/* Value placed in the n_queues slot of tc_ops_sfq. */
#define SFQ_N_QUEUES 0x0000

/* Userspace record of an SFQ qdisc.  'quantum' and 'perturb' correspond to
 * the 'quantum' and 'perturb_period' members of 'struct tc_sfq_qopt' and to
 * the "quantum" and "perturb" detail keys. */
struct sfq {
    struct tc tc;               /* Embedded base object; see sfq_get__(). */
    uint32_t quantum;
    uint32_t perturb;
};
3319
3320static struct sfq *
3321sfq_get__(const struct netdev *netdev_)
3322{
3323 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3324 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3325}
3326
3327static void
3328sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3329{
3330 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3331 struct sfq *sfq;
3332
3333 sfq = xmalloc(sizeof *sfq);
3334 tc_init(&sfq->tc, &tc_ops_sfq);
3335 sfq->perturb = perturb;
3336 sfq->quantum = quantum;
3337
3338 netdev->tc = &sfq->tc;
3339}
3340
3341static int
3342sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3343{
3344 struct tc_sfq_qopt opt;
3345 struct ofpbuf request;
3346 struct tcmsg *tcmsg;
3347 int mtu;
3348 int mtu_error, error;
3349 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3350
3351 tc_del_qdisc(netdev);
3352
3353 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3354 NLM_F_EXCL | NLM_F_CREATE, &request);
3355 if (!tcmsg) {
3356 return ENODEV;
3357 }
3358 tcmsg->tcm_handle = tc_make_handle(1, 0);
3359 tcmsg->tcm_parent = TC_H_ROOT;
3360
3361 memset(&opt, 0, sizeof opt);
3362 if (!quantum) {
3363 if (!mtu_error) {
3364 opt.quantum = mtu; /* if we cannot find mtu, use default */
3365 }
3366 } else {
3367 opt.quantum = quantum;
3368 }
3369
3370 if (!perturb) {
3371 opt.perturb_period = 10;
3372 } else {
3373 opt.perturb_period = perturb;
3374 }
3375
3376 nl_msg_put_string(&request, TCA_KIND, "sfq");
3377 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3378
3379 error = tc_transact(&request, NULL);
3380 if (error) {
3381 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3382 "quantum %u, perturb %u error %d(%s)",
3383 netdev_get_name(netdev),
3384 opt.quantum, opt.perturb_period,
3385 error, ovs_strerror(error));
3386 }
3387 return error;
3388}
3389
3390static void
3391sfq_parse_qdisc_details__(struct netdev *netdev,
3392 const struct smap *details, struct sfq *sfq)
3393{
3394 const char *perturb_s;
3395 const char *quantum_s;
3396 int mtu;
3397 int mtu_error;
3398
3399 perturb_s = smap_get(details, "perturb");
3400 quantum_s = smap_get(details, "quantum");
3401 sfq->perturb = perturb_s ? strtoull(perturb_s, NULL, 10) : 0;
3402 sfq->quantum = quantum_s ? strtoull(quantum_s, NULL, 10) : 0;
3403 if (!sfq->perturb) {
3404 sfq->perturb = 10;
3405 }
3406
3407 if (!sfq->quantum) {
3408 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3409 if (!mtu_error) {
3410 sfq->quantum = mtu;
3411 } else {
3412 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3413 "device without mtu");
3414 return;
3415 }
3416 }
3417}
3418
3419static int
3420sfq_tc_install(struct netdev *netdev, const struct smap *details)
3421{
3422 int error;
3423 struct sfq sfq;
3424
3425 sfq_parse_qdisc_details__(netdev, details, &sfq);
3426 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3427 if (!error) {
3428 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3429 }
3430 return error;
3431}
3432
3433static int
3434sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3435{
3436 const struct tc_sfq_qopt *sfq;
3437 struct nlattr *nlattr;
3438 const char * kind;
3439 int error;
3440
3441 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3442 if (error == 0) {
3443 sfq = nl_attr_get(nlattr);
3444 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3445 return 0;
3446 }
3447
3448 return error;
3449}
3450
3451static void
3452sfq_tc_destroy(struct tc *tc)
3453{
3454 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3455 tc_destroy(tc);
3456 free(sfq);
3457}
3458
3459static int
3460sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3461{
3462 const struct sfq *sfq = sfq_get__(netdev);
3463 smap_add_format(details, "quantum", "%u", sfq->quantum);
3464 smap_add_format(details, "perturb", "%u", sfq->perturb);
3465 return 0;
3466}
3467
3468static int
3469sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3470{
3471 struct sfq sfq;
3472
3473 sfq_parse_qdisc_details__(netdev, details, &sfq);
3474 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3475 sfq_get__(netdev)->quantum = sfq.quantum;
3476 sfq_get__(netdev)->perturb = sfq.perturb;
3477 return 0;
3478}
3479
/* Operations table for the SFQ qdisc.  The initializer is positional, so the
 * entry order must match the member order of 'struct tc_ops' (declared
 * outside this section). */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    /* Remaining slots are unset.  NOTE(review): presumably the per-queue
     * (class) callbacks, unused because n_queues is 0 -- confirm against
     * 'struct tc_ops'. */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3495\f
c1c9c9c4 3496/* HTB traffic control class. */
559843ed 3497
/* Upper bound on HTB queue numbering: htb_parse_tcmsg__() accepts class
 * minor numbers 1...HTB_N_QUEUES and maps them to queue IDs
 * 0...HTB_N_QUEUES - 1. */
#define HTB_N_QUEUES 0xf000
/* Value written to the qdisc's 'rate2quantum' option; also used when deciding
 * whether a class needs an explicit quantum (see htb_setup_class__()). */
#define HTB_RATE2QUANTUM 10

/* Userspace record of an HTB qdisc. */
struct htb {
    struct tc tc;               /* Embedded base object; see htb_get__(). */
    unsigned int max_rate;      /* In bytes/s. */
};

/* Userspace record of one HTB class (queue). */
struct htb_class {
    struct tc_queue tc_queue;   /* Embedded base; see htb_class_cast__(). */
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
8b61709d 3513
c1c9c9c4 3514static struct htb *
b5d57fc8 3515htb_get__(const struct netdev *netdev_)
c1c9c9c4 3516{
b5d57fc8
BP
3517 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3518 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3519}
3520
24045e35 3521static void
b5d57fc8 3522htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3523{
b5d57fc8 3524 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3525 struct htb *htb;
3526
3527 htb = xmalloc(sizeof *htb);
3528 tc_init(&htb->tc, &tc_ops_htb);
3529 htb->max_rate = max_rate;
3530
b5d57fc8 3531 netdev->tc = &htb->tc;
c1c9c9c4
BP
3532}
3533
3534/* Create an HTB qdisc.
3535 *
a339aa81 3536 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3537static int
3538htb_setup_qdisc__(struct netdev *netdev)
3539{
3540 size_t opt_offset;
3541 struct tc_htb_glob opt;
3542 struct ofpbuf request;
3543 struct tcmsg *tcmsg;
3544
3545 tc_del_qdisc(netdev);
3546
3547 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3548 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3549 if (!tcmsg) {
3550 return ENODEV;
3551 }
c1c9c9c4
BP
3552 tcmsg->tcm_handle = tc_make_handle(1, 0);
3553 tcmsg->tcm_parent = TC_H_ROOT;
3554
3555 nl_msg_put_string(&request, TCA_KIND, "htb");
3556
3557 memset(&opt, 0, sizeof opt);
4f631ccd 3558 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3559 opt.version = 3;
4ecf12d5 3560 opt.defcls = 1;
c1c9c9c4
BP
3561
3562 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3563 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3564 nl_msg_end_nested(&request, opt_offset);
3565
3566 return tc_transact(&request, NULL);
3567}
3568
3569/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3570 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3571static int
3572htb_setup_class__(struct netdev *netdev, unsigned int handle,
3573 unsigned int parent, struct htb_class *class)
3574{
3575 size_t opt_offset;
3576 struct tc_htb_opt opt;
3577 struct ofpbuf request;
3578 struct tcmsg *tcmsg;
3579 int error;
3580 int mtu;
3581
73371c09 3582 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3583 if (error) {
f915f1a8
BP
3584 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3585 netdev_get_name(netdev));
9b020780 3586 return error;
f915f1a8 3587 }
c1c9c9c4
BP
3588
3589 memset(&opt, 0, sizeof opt);
3590 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3591 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3592 /* Makes sure the quantum is at least MTU. Setting quantum will
3593 * make htb ignore the r2q for this class. */
3594 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3595 opt.quantum = mtu;
3596 }
c1c9c9c4
BP
3597 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3598 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3599 opt.prio = class->priority;
3600
3601 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
3602 if (!tcmsg) {
3603 return ENODEV;
3604 }
c1c9c9c4
BP
3605 tcmsg->tcm_handle = handle;
3606 tcmsg->tcm_parent = parent;
3607
3608 nl_msg_put_string(&request, TCA_KIND, "htb");
3609 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3610 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3611 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3612 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3613 nl_msg_end_nested(&request, opt_offset);
3614
3615 error = tc_transact(&request, NULL);
3616 if (error) {
3617 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3618 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3619 netdev_get_name(netdev),
3620 tc_get_major(handle), tc_get_minor(handle),
3621 tc_get_major(parent), tc_get_minor(parent),
3622 class->min_rate, class->max_rate,
10a89ef0 3623 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3624 }
3625 return error;
3626}
3627
3628/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3629 * description of them into 'details'. The description complies with the
3630 * specification given in the vswitch database documentation for linux-htb
3631 * queue details. */
3632static int
3633htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3634{
3635 static const struct nl_policy tca_htb_policy[] = {
3636 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3637 .min_len = sizeof(struct tc_htb_opt) },
3638 };
3639
3640 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3641 const struct tc_htb_opt *htb;
3642
3643 if (!nl_parse_nested(nl_options, tca_htb_policy,
3644 attrs, ARRAY_SIZE(tca_htb_policy))) {
3645 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3646 return EPROTO;
3647 }
3648
3649 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3650 class->min_rate = htb->rate.rate;
3651 class->max_rate = htb->ceil.rate;
3652 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3653 class->priority = htb->prio;
3654 return 0;
3655}
3656
3657static int
3658htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3659 struct htb_class *options,
3660 struct netdev_queue_stats *stats)
3661{
3662 struct nlattr *nl_options;
3663 unsigned int handle;
3664 int error;
3665
3666 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3667 if (!error && queue_id) {
17ee3c1f
BP
3668 unsigned int major = tc_get_major(handle);
3669 unsigned int minor = tc_get_minor(handle);
3670 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3671 *queue_id = minor - 1;
c1c9c9c4
BP
3672 } else {
3673 error = EPROTO;
3674 }
3675 }
3676 if (!error && options) {
3677 error = htb_parse_tca_options__(nl_options, options);
3678 }
3679 return error;
3680}
3681
3682static void
73371c09 3683htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3684 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3685{
73371c09 3686 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3687 const char *max_rate_s;
3688
79f1cbe9 3689 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
3690 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3691 if (!hc->max_rate) {
a00ca915 3692 enum netdev_features current;
c1c9c9c4 3693
73371c09
BP
3694 netdev_linux_read_features(netdev);
3695 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3696 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3697 }
3698 hc->min_rate = hc->max_rate;
3699 hc->burst = 0;
3700 hc->priority = 0;
3701}
3702
3703static int
3704htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3705 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3706{
3707 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
3708 const char *min_rate_s = smap_get(details, "min-rate");
3709 const char *max_rate_s = smap_get(details, "max-rate");
3710 const char *burst_s = smap_get(details, "burst");
3711 const char *priority_s = smap_get(details, "priority");
9b020780 3712 int mtu, error;
c1c9c9c4 3713
73371c09 3714 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3715 if (error) {
f915f1a8
BP
3716 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3717 netdev_get_name(netdev));
9b020780 3718 return error;
f915f1a8
BP
3719 }
3720
4f104611
EJ
3721 /* HTB requires at least an mtu sized min-rate to send any traffic even
3722 * on uncongested links. */
c45ab5e9 3723 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 3724 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3725 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3726
3727 /* max-rate */
3728 hc->max_rate = (max_rate_s
3729 ? strtoull(max_rate_s, NULL, 10) / 8
3730 : htb->max_rate);
3731 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3732 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3733
3734 /* burst
3735 *
3736 * According to hints in the documentation that I've read, it is important
3737 * that 'burst' be at least as big as the largest frame that might be
3738 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3739 * but having it a bit too small is a problem. Since netdev_get_mtu()
3740 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3741 * the MTU. We actually add 64, instead of 14, as a guard against
3742 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
3743 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
3744 hc->burst = MAX(hc->burst, mtu + 64);
3745
3746 /* priority */
3747 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
3748
3749 return 0;
3750}
3751
/* Queries class 'handle' (under 'parent') on 'netdev' and parses the reply
 * into 'options' and 'stats' via htb_parse_tcmsg__().  The reply buffer is
 * deleted before returning.
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or
 * htb_parse_tcmsg__(). */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3767
3768static int
79f1cbe9 3769htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3770{
3771 int error;
3772
3773 error = htb_setup_qdisc__(netdev);
3774 if (!error) {
3775 struct htb_class hc;
3776
3777 htb_parse_qdisc_details__(netdev, details, &hc);
3778 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3779 tc_make_handle(1, 0), &hc);
3780 if (!error) {
3781 htb_install__(netdev, hc.max_rate);
3782 }
3783 }
3784 return error;
3785}
3786
93b13be8
BP
3787static struct htb_class *
3788htb_class_cast__(const struct tc_queue *queue)
3789{
3790 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3791}
3792
c1c9c9c4
BP
3793static void
3794htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3795 const struct htb_class *hc)
3796{
3797 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3798 size_t hash = hash_int(queue_id, 0);
3799 struct tc_queue *queue;
c1c9c9c4
BP
3800 struct htb_class *hcp;
3801
93b13be8
BP
3802 queue = tc_find_queue__(netdev, queue_id, hash);
3803 if (queue) {
3804 hcp = htb_class_cast__(queue);
3805 } else {
c1c9c9c4 3806 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3807 queue = &hcp->tc_queue;
3808 queue->queue_id = queue_id;
6dc34a0d 3809 queue->created = time_msec();
93b13be8 3810 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3811 }
93b13be8
BP
3812
3813 hcp->min_rate = hc->min_rate;
3814 hcp->max_rate = hc->max_rate;
3815 hcp->burst = hc->burst;
3816 hcp->priority = hc->priority;
c1c9c9c4
BP
3817}
3818
3819static int
3820htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3821{
c1c9c9c4 3822 struct ofpbuf msg;
d57695d7 3823 struct queue_dump_state state;
c1c9c9c4 3824 struct htb_class hc;
c1c9c9c4
BP
3825
3826 /* Get qdisc options. */
3827 hc.max_rate = 0;
3828 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3829 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3830
3831 /* Get queues. */
d57695d7 3832 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3833 return ENODEV;
3834 }
d57695d7 3835 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3836 unsigned int queue_id;
3837
3838 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3839 htb_update_queue__(netdev, queue_id, &hc);
3840 }
3841 }
d57695d7 3842 finish_queue_dump(&state);
c1c9c9c4
BP
3843
3844 return 0;
3845}
3846
3847static void
3848htb_tc_destroy(struct tc *tc)
3849{
3850 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 3851 struct htb_class *hc, *next;
c1c9c9c4 3852
4e8e4213 3853 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 3854 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
3855 free(hc);
3856 }
3857 tc_destroy(tc);
3858 free(htb);
3859}
3860
3861static int
79f1cbe9 3862htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3863{
3864 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3865 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3866 return 0;
3867}
3868
3869static int
79f1cbe9 3870htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3871{
3872 struct htb_class hc;
3873 int error;
3874
3875 htb_parse_qdisc_details__(netdev, details, &hc);
3876 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3877 tc_make_handle(1, 0), &hc);
3878 if (!error) {
3879 htb_get__(netdev)->max_rate = hc.max_rate;
3880 }
3881 return error;
3882}
3883
3884static int
93b13be8 3885htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3886 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3887{
93b13be8 3888 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3889
79f1cbe9 3890 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3891 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3892 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3893 }
79f1cbe9 3894 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3895 if (hc->priority) {
79f1cbe9 3896 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3897 }
3898 return 0;
3899}
3900
3901static int
3902htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3903 const struct smap *details)
c1c9c9c4
BP
3904{
3905 struct htb_class hc;
3906 int error;
3907
3908 error = htb_parse_class_details__(netdev, details, &hc);
3909 if (error) {
3910 return error;
3911 }
3912
17ee3c1f 3913 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3914 tc_make_handle(1, 0xfffe), &hc);
3915 if (error) {
3916 return error;
3917 }
3918
3919 htb_update_queue__(netdev, queue_id, &hc);
3920 return 0;
3921}
3922
3923static int
93b13be8 3924htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3925{
93b13be8 3926 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3927 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3928 int error;
3929
93b13be8 3930 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3931 if (!error) {
93b13be8 3932 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3933 free(hc);
c1c9c9c4
BP
3934 }
3935 return error;
3936}
3937
3938static int
93b13be8 3939htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3940 struct netdev_queue_stats *stats)
3941{
93b13be8 3942 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3943 tc_make_handle(1, 0xfffe), NULL, stats);
3944}
3945
3946static int
3947htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3948 const struct ofpbuf *nlmsg,
3949 netdev_dump_queue_stats_cb *cb, void *aux)
3950{
3951 struct netdev_queue_stats stats;
17ee3c1f 3952 unsigned int handle, major, minor;
c1c9c9c4
BP
3953 int error;
3954
3955 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3956 if (error) {
3957 return error;
3958 }
3959
17ee3c1f
BP
3960 major = tc_get_major(handle);
3961 minor = tc_get_minor(handle);
3962 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3963 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3964 }
3965 return 0;
3966}
3967
3968static const struct tc_ops tc_ops_htb = {
3969 "htb", /* linux_name */
3970 "linux-htb", /* ovs_name */
3971 HTB_N_QUEUES, /* n_queues */
3972 htb_tc_install,
3973 htb_tc_load,
3974 htb_tc_destroy,
3975 htb_qdisc_get,
3976 htb_qdisc_set,
3977 htb_class_get,
3978 htb_class_set,
3979 htb_class_delete,
3980 htb_class_get_stats,
3981 htb_class_dump_stats
3982};
3983\f
a339aa81
EJ
3984/* "linux-hfsc" traffic control class. */
3985
3986#define HFSC_N_QUEUES 0xf000
3987
3988struct hfsc {
3989 struct tc tc;
3990 uint32_t max_rate;
3991};
3992
3993struct hfsc_class {
3994 struct tc_queue tc_queue;
3995 uint32_t min_rate;
3996 uint32_t max_rate;
3997};
3998
3999static struct hfsc *
b5d57fc8 4000hfsc_get__(const struct netdev *netdev_)
a339aa81 4001{
b5d57fc8
BP
4002 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4003 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4004}
4005
4006static struct hfsc_class *
4007hfsc_class_cast__(const struct tc_queue *queue)
4008{
4009 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4010}
4011
24045e35 4012static void
b5d57fc8 4013hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4014{
b5d57fc8 4015 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4016 struct hfsc *hfsc;
4017
a339aa81
EJ
4018 hfsc = xmalloc(sizeof *hfsc);
4019 tc_init(&hfsc->tc, &tc_ops_hfsc);
4020 hfsc->max_rate = max_rate;
b5d57fc8 4021 netdev->tc = &hfsc->tc;
a339aa81
EJ
4022}
4023
4024static void
4025hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4026 const struct hfsc_class *hc)
4027{
4028 size_t hash;
4029 struct hfsc *hfsc;
4030 struct hfsc_class *hcp;
4031 struct tc_queue *queue;
4032
4033 hfsc = hfsc_get__(netdev);
4034 hash = hash_int(queue_id, 0);
4035
4036 queue = tc_find_queue__(netdev, queue_id, hash);
4037 if (queue) {
4038 hcp = hfsc_class_cast__(queue);
4039 } else {
4040 hcp = xmalloc(sizeof *hcp);
4041 queue = &hcp->tc_queue;
4042 queue->queue_id = queue_id;
6dc34a0d 4043 queue->created = time_msec();
a339aa81
EJ
4044 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4045 }
4046
4047 hcp->min_rate = hc->min_rate;
4048 hcp->max_rate = hc->max_rate;
4049}
4050
4051static int
4052hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4053{
4054 const struct tc_service_curve *rsc, *fsc, *usc;
4055 static const struct nl_policy tca_hfsc_policy[] = {
4056 [TCA_HFSC_RSC] = {
4057 .type = NL_A_UNSPEC,
4058 .optional = false,
4059 .min_len = sizeof(struct tc_service_curve),
4060 },
4061 [TCA_HFSC_FSC] = {
4062 .type = NL_A_UNSPEC,
4063 .optional = false,
4064 .min_len = sizeof(struct tc_service_curve),
4065 },
4066 [TCA_HFSC_USC] = {
4067 .type = NL_A_UNSPEC,
4068 .optional = false,
4069 .min_len = sizeof(struct tc_service_curve),
4070 },
4071 };
4072 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4073
4074 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4075 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4076 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4077 return EPROTO;
4078 }
4079
4080 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4081 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4082 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4083
4084 if (rsc->m1 != 0 || rsc->d != 0 ||
4085 fsc->m1 != 0 || fsc->d != 0 ||
4086 usc->m1 != 0 || usc->d != 0) {
4087 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4088 "Non-linear service curves are not supported.");
4089 return EPROTO;
4090 }
4091
4092 if (rsc->m2 != fsc->m2) {
4093 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4094 "Real-time service curves are not supported ");
4095 return EPROTO;
4096 }
4097
4098 if (rsc->m2 > usc->m2) {
4099 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4100 "Min-rate service curve is greater than "
4101 "the max-rate service curve.");
4102 return EPROTO;
4103 }
4104
4105 class->min_rate = fsc->m2;
4106 class->max_rate = usc->m2;
4107 return 0;
4108}
4109
4110static int
4111hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4112 struct hfsc_class *options,
4113 struct netdev_queue_stats *stats)
4114{
4115 int error;
4116 unsigned int handle;
4117 struct nlattr *nl_options;
4118
4119 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4120 if (error) {
4121 return error;
4122 }
4123
4124 if (queue_id) {
4125 unsigned int major, minor;
4126
4127 major = tc_get_major(handle);
4128 minor = tc_get_minor(handle);
4129 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4130 *queue_id = minor - 1;
4131 } else {
4132 return EPROTO;
4133 }
4134 }
4135
4136 if (options) {
4137 error = hfsc_parse_tca_options__(nl_options, options);
4138 }
4139
4140 return error;
4141}
4142
4143static int
4144hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4145 unsigned int parent, struct hfsc_class *options,
4146 struct netdev_queue_stats *stats)
4147{
4148 int error;
4149 struct ofpbuf *reply;
4150
4151 error = tc_query_class(netdev, handle, parent, &reply);
4152 if (error) {
4153 return error;
4154 }
4155
4156 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4157 ofpbuf_delete(reply);
4158 return error;
4159}
4160
4161static void
73371c09 4162hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4163 struct hfsc_class *class)
4164{
73371c09 4165 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4166 uint32_t max_rate;
4167 const char *max_rate_s;
4168
79f1cbe9 4169 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
4170 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
4171
4172 if (!max_rate) {
a00ca915 4173 enum netdev_features current;
a339aa81 4174
73371c09
BP
4175 netdev_linux_read_features(netdev);
4176 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4177 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4178 }
4179
4180 class->min_rate = max_rate;
4181 class->max_rate = max_rate;
4182}
4183
4184static int
4185hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4186 const struct smap *details,
a339aa81
EJ
4187 struct hfsc_class * class)
4188{
4189 const struct hfsc *hfsc;
4190 uint32_t min_rate, max_rate;
4191 const char *min_rate_s, *max_rate_s;
4192
4193 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
4194 min_rate_s = smap_get(details, "min-rate");
4195 max_rate_s = smap_get(details, "max-rate");
a339aa81 4196
c45ab5e9 4197 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 4198 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4199 min_rate = MIN(min_rate, hfsc->max_rate);
4200
4201 max_rate = (max_rate_s
4202 ? strtoull(max_rate_s, NULL, 10) / 8
4203 : hfsc->max_rate);
4204 max_rate = MAX(max_rate, min_rate);
4205 max_rate = MIN(max_rate, hfsc->max_rate);
4206
4207 class->min_rate = min_rate;
4208 class->max_rate = max_rate;
4209
4210 return 0;
4211}
4212
4213/* Create an HFSC qdisc.
4214 *
4215 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4216static int
4217hfsc_setup_qdisc__(struct netdev * netdev)
4218{
4219 struct tcmsg *tcmsg;
4220 struct ofpbuf request;
4221 struct tc_hfsc_qopt opt;
4222
4223 tc_del_qdisc(netdev);
4224
4225 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4226 NLM_F_EXCL | NLM_F_CREATE, &request);
4227
4228 if (!tcmsg) {
4229 return ENODEV;
4230 }
4231
4232 tcmsg->tcm_handle = tc_make_handle(1, 0);
4233 tcmsg->tcm_parent = TC_H_ROOT;
4234
4235 memset(&opt, 0, sizeof opt);
4236 opt.defcls = 1;
4237
4238 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4239 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4240
4241 return tc_transact(&request, NULL);
4242}
4243
4244/* Create an HFSC class.
4245 *
4246 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4247 * sc rate <min_rate> ul rate <max_rate>" */
4248static int
4249hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4250 unsigned int parent, struct hfsc_class *class)
4251{
4252 int error;
4253 size_t opt_offset;
4254 struct tcmsg *tcmsg;
4255 struct ofpbuf request;
4256 struct tc_service_curve min, max;
4257
4258 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4259
4260 if (!tcmsg) {
4261 return ENODEV;
4262 }
4263
4264 tcmsg->tcm_handle = handle;
4265 tcmsg->tcm_parent = parent;
4266
4267 min.m1 = 0;
4268 min.d = 0;
4269 min.m2 = class->min_rate;
4270
4271 max.m1 = 0;
4272 max.d = 0;
4273 max.m2 = class->max_rate;
4274
4275 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4276 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4277 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4278 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4279 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4280 nl_msg_end_nested(&request, opt_offset);
4281
4282 error = tc_transact(&request, NULL);
4283 if (error) {
4284 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4285 "min-rate %ubps, max-rate %ubps (%s)",
4286 netdev_get_name(netdev),
4287 tc_get_major(handle), tc_get_minor(handle),
4288 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4289 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4290 }
4291
4292 return error;
4293}
4294
4295static int
79f1cbe9 4296hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4297{
4298 int error;
4299 struct hfsc_class class;
4300
4301 error = hfsc_setup_qdisc__(netdev);
4302
4303 if (error) {
4304 return error;
4305 }
4306
4307 hfsc_parse_qdisc_details__(netdev, details, &class);
4308 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4309 tc_make_handle(1, 0), &class);
4310
4311 if (error) {
4312 return error;
4313 }
4314
4315 hfsc_install__(netdev, class.max_rate);
4316 return 0;
4317}
4318
4319static int
4320hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4321{
4322 struct ofpbuf msg;
d57695d7 4323 struct queue_dump_state state;
a339aa81
EJ
4324 struct hfsc_class hc;
4325
4326 hc.max_rate = 0;
4327 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4328 hfsc_install__(netdev, hc.max_rate);
a339aa81 4329
d57695d7 4330 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4331 return ENODEV;
4332 }
4333
d57695d7 4334 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4335 unsigned int queue_id;
4336
4337 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4338 hfsc_update_queue__(netdev, queue_id, &hc);
4339 }
4340 }
4341
d57695d7 4342 finish_queue_dump(&state);
a339aa81
EJ
4343 return 0;
4344}
4345
4346static void
4347hfsc_tc_destroy(struct tc *tc)
4348{
4349 struct hfsc *hfsc;
4350 struct hfsc_class *hc, *next;
4351
4352 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4353
4354 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4355 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4356 free(hc);
4357 }
4358
4359 tc_destroy(tc);
4360 free(hfsc);
4361}
4362
4363static int
79f1cbe9 4364hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4365{
4366 const struct hfsc *hfsc;
4367 hfsc = hfsc_get__(netdev);
79f1cbe9 4368 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4369 return 0;
4370}
4371
4372static int
79f1cbe9 4373hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4374{
4375 int error;
4376 struct hfsc_class class;
4377
4378 hfsc_parse_qdisc_details__(netdev, details, &class);
4379 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4380 tc_make_handle(1, 0), &class);
4381
4382 if (!error) {
4383 hfsc_get__(netdev)->max_rate = class.max_rate;
4384 }
4385
4386 return error;
4387}
4388
4389static int
4390hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4391 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4392{
4393 const struct hfsc_class *hc;
4394
4395 hc = hfsc_class_cast__(queue);
79f1cbe9 4396 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4397 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4398 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4399 }
4400 return 0;
4401}
4402
4403static int
4404hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4405 const struct smap *details)
a339aa81
EJ
4406{
4407 int error;
4408 struct hfsc_class class;
4409
4410 error = hfsc_parse_class_details__(netdev, details, &class);
4411 if (error) {
4412 return error;
4413 }
4414
4415 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4416 tc_make_handle(1, 0xfffe), &class);
4417 if (error) {
4418 return error;
4419 }
4420
4421 hfsc_update_queue__(netdev, queue_id, &class);
4422 return 0;
4423}
4424
4425static int
4426hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4427{
4428 int error;
4429 struct hfsc *hfsc;
4430 struct hfsc_class *hc;
4431
4432 hc = hfsc_class_cast__(queue);
4433 hfsc = hfsc_get__(netdev);
4434
4435 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4436 if (!error) {
4437 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4438 free(hc);
4439 }
4440 return error;
4441}
4442
4443static int
4444hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4445 struct netdev_queue_stats *stats)
4446{
4447 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4448 tc_make_handle(1, 0xfffe), NULL, stats);
4449}
4450
4451static int
4452hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4453 const struct ofpbuf *nlmsg,
4454 netdev_dump_queue_stats_cb *cb, void *aux)
4455{
4456 struct netdev_queue_stats stats;
4457 unsigned int handle, major, minor;
4458 int error;
4459
4460 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4461 if (error) {
4462 return error;
4463 }
4464
4465 major = tc_get_major(handle);
4466 minor = tc_get_minor(handle);
4467 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4468 (*cb)(minor - 1, &stats, aux);
4469 }
4470 return 0;
4471}
4472
4473static const struct tc_ops tc_ops_hfsc = {
4474 "hfsc", /* linux_name */
4475 "linux-hfsc", /* ovs_name */
4476 HFSC_N_QUEUES, /* n_queues */
4477 hfsc_tc_install, /* tc_install */
4478 hfsc_tc_load, /* tc_load */
4479 hfsc_tc_destroy, /* tc_destroy */
4480 hfsc_qdisc_get, /* qdisc_get */
4481 hfsc_qdisc_set, /* qdisc_set */
4482 hfsc_class_get, /* class_get */
4483 hfsc_class_set, /* class_set */
4484 hfsc_class_delete, /* class_delete */
4485 hfsc_class_get_stats, /* class_get_stats */
4486 hfsc_class_dump_stats /* class_dump_stats */
4487};
4488\f
c1c9c9c4
BP
4489/* "linux-default" traffic control class.
4490 *
4491 * This class represents the default, unnamed Linux qdisc. It corresponds to
4492 * the "" (empty string) QoS type in the OVS database. */
4493
4494static void
b5d57fc8 4495default_install__(struct netdev *netdev_)
c1c9c9c4 4496{
b5d57fc8 4497 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4498 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4499
559eb230
BP
4500 /* Nothing but a tc class implementation is allowed to write to a tc. This
4501 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4502 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4503}
4504
4505static int
4506default_tc_install(struct netdev *netdev,
79f1cbe9 4507 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4508{
4509 default_install__(netdev);
4510 return 0;
4511}
4512
4513static int
4514default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4515{
4516 default_install__(netdev);
4517 return 0;
4518}
4519
4520static const struct tc_ops tc_ops_default = {
4521 NULL, /* linux_name */
4522 "", /* ovs_name */
4523 0, /* n_queues */
4524 default_tc_install,
4525 default_tc_load,
4526 NULL, /* tc_destroy */
4527 NULL, /* qdisc_get */
4528 NULL, /* qdisc_set */
4529 NULL, /* class_get */
4530 NULL, /* class_set */
4531 NULL, /* class_delete */
4532 NULL, /* class_get_stats */
4533 NULL /* class_dump_stats */
4534};
4535\f
4536/* "linux-other" traffic control class.
4537 *
4538 * */
4539
4540static int
b5d57fc8 4541other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4542{
b5d57fc8 4543 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4544 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4545
559eb230
BP
4546 /* Nothing but a tc class implementation is allowed to write to a tc. This
4547 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4548 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4549 return 0;
4550}
4551
4552static const struct tc_ops tc_ops_other = {
4553 NULL, /* linux_name */
4554 "linux-other", /* ovs_name */
4555 0, /* n_queues */
4556 NULL, /* tc_install */
4557 other_tc_load,
4558 NULL, /* tc_destroy */
4559 NULL, /* qdisc_get */
4560 NULL, /* qdisc_set */
4561 NULL, /* class_get */
4562 NULL, /* class_set */
4563 NULL, /* class_delete */
4564 NULL, /* class_get_stats */
4565 NULL /* class_dump_stats */
4566};
4567\f
4568/* Traffic control. */
4569
4570/* Number of kernel "tc" ticks per second. */
4571static double ticks_per_s;
4572
4573/* Number of kernel "jiffies" per second. This is used for the purpose of
4574 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4575 * one jiffy's worth of data.
4576 *
4577 * There are two possibilities here:
4578 *
4579 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4580 * approximate range of 100 to 1024. That means that we really need to
4581 * make sure that the qdisc can buffer that much data.
4582 *
4583 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4584 * has finely granular timers and there's no need to fudge additional room
4585 * for buffers. (There's no extra effort needed to implement that: the
4586 * large 'buffer_hz' is used as a divisor, so practically any number will
4587 * come out as 0 in the division. Small integer results in the case of
4588 * really high dividends won't have any real effect anyhow.)
4589 */
4590static unsigned int buffer_hz;
4591
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
4598
/* Extracts the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
4605
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
4612
4613static struct tcmsg *
4614tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4615 struct ofpbuf *request)
4616{
4617 struct tcmsg *tcmsg;
4618 int ifindex;
4619 int error;
4620
4621 error = get_ifindex(netdev, &ifindex);
4622 if (error) {
4623 return NULL;
4624 }
4625
4626 ofpbuf_init(request, 512);
4627 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4628 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4629 tcmsg->tcm_family = AF_UNSPEC;
4630 tcmsg->tcm_ifindex = ifindex;
4631 /* Caller should fill in tcmsg->tcm_handle. */
4632 /* Caller should fill in tcmsg->tcm_parent. */
4633
4634 return tcmsg;
4635}
4636
4637static int
4638tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4639{
a88b4e04 4640 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
4641 ofpbuf_uninit(request);
4642 return error;
4643}
4644
f8500004
JP
4645/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4646 * policing configuration.
4647 *
4648 * This function is equivalent to running the following when 'add' is true:
4649 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4650 *
4651 * This function is equivalent to running the following when 'add' is false:
4652 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4653 *
4654 * The configuration and stats may be seen with the following command:
4655 * /sbin/tc -s qdisc show dev <devname>
4656 *
4657 * Returns 0 if successful, otherwise a positive errno value.
4658 */
4659static int
4660tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4661{
4662 struct ofpbuf request;
4663 struct tcmsg *tcmsg;
4664 int error;
4665 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4666 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4667
4668 tcmsg = tc_make_request(netdev, type, flags, &request);
4669 if (!tcmsg) {
4670 return ENODEV;
4671 }
4672 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4673 tcmsg->tcm_parent = TC_H_INGRESS;
4674 nl_msg_put_string(&request, TCA_KIND, "ingress");
4675 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4676
4677 error = tc_transact(&request, NULL);
4678 if (error) {
4679 /* If we're deleting the qdisc, don't worry about some of the
4680 * error conditions. */
4681 if (!add && (error == ENOENT || error == EINVAL)) {
4682 return 0;
4683 }
4684 return error;
4685 }
4686
4687 return 0;
4688}
4689
4690/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4691 * of 'kbits_burst'.
4692 *
4693 * This function is equivalent to running:
4694 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4695 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4696 * mtu 65535 drop
4697 *
4698 * The configuration and stats may be seen with the following command:
c7952afb 4699 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4700 *
4701 * Returns 0 if successful, otherwise a positive errno value.
4702 */
4703static int
c7952afb
BP
4704tc_add_policer(struct netdev *netdev,
4705 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4706{
4707 struct tc_police tc_police;
4708 struct ofpbuf request;
4709 struct tcmsg *tcmsg;
4710 size_t basic_offset;
4711 size_t police_offset;
4712 int error;
4713 int mtu = 65535;
4714
4715 memset(&tc_police, 0, sizeof tc_police);
4716 tc_police.action = TC_POLICE_SHOT;
4717 tc_police.mtu = mtu;
1aca400c 4718 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb
BP
4719
4720 /* The following appears wrong in two ways:
4721 *
4722 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4723 * arguments (or at least consistently "bytes" as both or "bits" as
4724 * both), but this supplies bytes for the first argument and bits for the
4725 * second.
4726 *
4727 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4728 *
4729 * However if you "fix" those problems then "tc filter show ..." shows
4730 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4731 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4732 * tc's point of view. Whatever. */
4733 tc_police.burst = tc_bytes_to_ticks(
4734 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024);
f8500004
JP
4735
4736 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4737 NLM_F_EXCL | NLM_F_CREATE, &request);
4738 if (!tcmsg) {
4739 return ENODEV;
4740 }
4741 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4742 tcmsg->tcm_info = tc_make_handle(49,
4743 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4744
4745 nl_msg_put_string(&request, TCA_KIND, "basic");
4746 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4747 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4748 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4749 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4750 nl_msg_end_nested(&request, police_offset);
4751 nl_msg_end_nested(&request, basic_offset);
4752
4753 error = tc_transact(&request, NULL);
4754 if (error) {
4755 return error;
4756 }
4757
4758 return 0;
4759}
4760
c1c9c9c4
BP
4761static void
4762read_psched(void)
4763{
4764 /* The values in psched are not individually very meaningful, but they are
4765 * important. The tables below show some values seen in the wild.
4766 *
4767 * Some notes:
4768 *
4769 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4770 * (Before that, there are hints that it was 1000000000.)
4771 *
4772 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4773 * above.
4774 *
4775 * /proc/net/psched
4776 * -----------------------------------
4777 * [1] 000c8000 000f4240 000f4240 00000064
4778 * [2] 000003e8 00000400 000f4240 3b9aca00
4779 * [3] 000003e8 00000400 000f4240 3b9aca00
4780 * [4] 000003e8 00000400 000f4240 00000064
4781 * [5] 000003e8 00000040 000f4240 3b9aca00
4782 * [6] 000003e8 00000040 000f4240 000000f9
4783 *
4784 * a b c d ticks_per_s buffer_hz
4785 * ------- --------- ---------- ------------- ----------- -------------
4786 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4787 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4788 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4789 * [4] 1,000 1,024 1,000,000 100 976,562 100
4790 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4791 * [6] 1,000 64 1,000,000 249 15,625,000 249
4792 *
4793 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4794 * [2] 2.6.26-1-686-bigmem from Debian lenny
4795 * [3] 2.6.26-2-sparc64 from Debian lenny
4796 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4797 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4798 * [6] 2.6.34 from kernel.org on KVM
4799 */
23882115 4800 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4801 static const char fn[] = "/proc/net/psched";
4802 unsigned int a, b, c, d;
4803 FILE *stream;
4804
23882115
BP
4805 if (!ovsthread_once_start(&once)) {
4806 return;
4807 }
4808
c1c9c9c4
BP
4809 ticks_per_s = 1.0;
4810 buffer_hz = 100;
4811
4812 stream = fopen(fn, "r");
4813 if (!stream) {
10a89ef0 4814 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4815 goto exit;
c1c9c9c4
BP
4816 }
4817
4818 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4819 VLOG_WARN("%s: read failed", fn);
4820 fclose(stream);
23882115 4821 goto exit;
c1c9c9c4
BP
4822 }
4823 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4824 fclose(stream);
4825
4826 if (!a || !c) {
4827 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4828 goto exit;
c1c9c9c4
BP
4829 }
4830
4831 ticks_per_s = (double) a * c / b;
4832 if (c == 1000000) {
4833 buffer_hz = d;
4834 } else {
4835 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4836 fn, a, b, c, d);
4837 }
4838 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4839
4840exit:
4841 ovsthread_once_done(&once);
c1c9c9c4
BP
4842}
4843
4844/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4845 * rate of 'rate' bytes per second. */
4846static unsigned int
4847tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4848{
23882115 4849 read_psched();
c1c9c9c4
BP
4850 return (rate * ticks) / ticks_per_s;
4851}
4852
4853/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4854 * rate of 'rate' bytes per second. */
4855static unsigned int
4856tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4857{
23882115 4858 read_psched();
015c93a4 4859 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4860}
4861
4862/* Returns the number of bytes that need to be reserved for qdisc buffering at
4863 * a transmission rate of 'rate' bytes per second. */
4864static unsigned int
4865tc_buffer_per_jiffy(unsigned int rate)
4866{
23882115 4867 read_psched();
c1c9c9c4
BP
4868 return rate / buffer_hz;
4869}
4870
4871/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4872 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4873 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4874 * stores NULL into it if it is absent.
4875 *
4876 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4877 * 'msg'.
4878 *
4879 * Returns 0 if successful, otherwise a positive errno value. */
4880static int
4881tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4882 struct nlattr **options)
4883{
4884 static const struct nl_policy tca_policy[] = {
4885 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4886 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4887 };
4888 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4889
4890 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4891 tca_policy, ta, ARRAY_SIZE(ta))) {
4892 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4893 goto error;
4894 }
4895
4896 if (kind) {
4897 *kind = nl_attr_get_string(ta[TCA_KIND]);
4898 }
4899
4900 if (options) {
4901 *options = ta[TCA_OPTIONS];
4902 }
4903
4904 return 0;
4905
4906error:
4907 if (kind) {
4908 *kind = NULL;
4909 }
4910 if (options) {
4911 *options = NULL;
4912 }
4913 return EPROTO;
4914}
4915
4916/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4917 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4918 * into '*options', and its queue statistics into '*stats'. Any of the output
4919 * arguments may be null.
4920 *
4921 * Returns 0 if successful, otherwise a positive errno value. */
4922static int
4923tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4924 struct nlattr **options, struct netdev_queue_stats *stats)
4925{
4926 static const struct nl_policy tca_policy[] = {
4927 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4928 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4929 };
4930 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4931
4932 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4933 tca_policy, ta, ARRAY_SIZE(ta))) {
4934 VLOG_WARN_RL(&rl, "failed to parse class message");
4935 goto error;
4936 }
4937
4938 if (handlep) {
4939 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4940 *handlep = tc->tcm_handle;
4941 }
4942
4943 if (options) {
4944 *options = ta[TCA_OPTIONS];
4945 }
4946
4947 if (stats) {
4948 const struct gnet_stats_queue *gsq;
4949 struct gnet_stats_basic gsb;
4950
4951 static const struct nl_policy stats_policy[] = {
4952 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4953 .min_len = sizeof gsb },
4954 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4955 .min_len = sizeof *gsq },
4956 };
4957 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4958
4959 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4960 sa, ARRAY_SIZE(sa))) {
4961 VLOG_WARN_RL(&rl, "failed to parse class stats");
4962 goto error;
4963 }
4964
4965 /* Alignment issues screw up the length of struct gnet_stats_basic on
4966 * some arch/bitsize combinations. Newer versions of Linux have a
4967 * struct gnet_stats_basic_packed, but we can't depend on that. The
4968 * easiest thing to do is just to make a copy. */
4969 memset(&gsb, 0, sizeof gsb);
4970 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4971 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4972 stats->tx_bytes = gsb.bytes;
4973 stats->tx_packets = gsb.packets;
4974
4975 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4976 stats->tx_errors = gsq->drops;
4977 }
4978
4979 return 0;
4980
4981error:
4982 if (options) {
4983 *options = NULL;
4984 }
4985 if (stats) {
4986 memset(stats, 0, sizeof *stats);
4987 }
4988 return EPROTO;
4989}
4990
4991/* Queries the kernel for class with identifier 'handle' and parent 'parent'
4992 * on 'netdev'. */
4993static int
4994tc_query_class(const struct netdev *netdev,
4995 unsigned int handle, unsigned int parent,
4996 struct ofpbuf **replyp)
4997{
4998 struct ofpbuf request;
4999 struct tcmsg *tcmsg;
5000 int error;
5001
5002 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
5003 if (!tcmsg) {
5004 return ENODEV;
5005 }
c1c9c9c4
BP
5006 tcmsg->tcm_handle = handle;
5007 tcmsg->tcm_parent = parent;
5008
5009 error = tc_transact(&request, replyp);
5010 if (error) {
5011 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5012 netdev_get_name(netdev),
5013 tc_get_major(handle), tc_get_minor(handle),
5014 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5015 ovs_strerror(error));
c1c9c9c4
BP
5016 }
5017 return error;
5018}
5019
5020/* Equivalent to "tc class del dev <name> handle <handle>". */
5021static int
5022tc_delete_class(const struct netdev *netdev, unsigned int handle)
5023{
5024 struct ofpbuf request;
5025 struct tcmsg *tcmsg;
5026 int error;
5027
5028 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5029 if (!tcmsg) {
5030 return ENODEV;
5031 }
c1c9c9c4
BP
5032 tcmsg->tcm_handle = handle;
5033 tcmsg->tcm_parent = 0;
5034
5035 error = tc_transact(&request, NULL);
5036 if (error) {
5037 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5038 netdev_get_name(netdev),
5039 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5040 ovs_strerror(error));
c1c9c9c4
BP
5041 }
5042 return error;
5043}
5044
5045/* Equivalent to "tc qdisc del dev <name> root". */
5046static int
b5d57fc8 5047tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5048{
b5d57fc8 5049 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5050 struct ofpbuf request;
5051 struct tcmsg *tcmsg;
5052 int error;
5053
b5d57fc8 5054 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5055 if (!tcmsg) {
5056 return ENODEV;
5057 }
c1c9c9c4
BP
5058 tcmsg->tcm_handle = tc_make_handle(1, 0);
5059 tcmsg->tcm_parent = TC_H_ROOT;
5060
5061 error = tc_transact(&request, NULL);
5062 if (error == EINVAL) {
5063 /* EINVAL probably means that the default qdisc was in use, in which
5064 * case we've accomplished our purpose. */
5065 error = 0;
5066 }
b5d57fc8
BP
5067 if (!error && netdev->tc) {
5068 if (netdev->tc->ops->tc_destroy) {
5069 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5070 }
b5d57fc8 5071 netdev->tc = NULL;
c1c9c9c4
BP
5072 }
5073 return error;
5074}
5075
ac3e3aaa
BP
5076static bool
5077getqdisc_is_safe(void)
5078{
5079 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5080 static bool safe = false;
5081
5082 if (ovsthread_once_start(&once)) {
5083 struct utsname utsname;
5084 int major, minor;
5085
5086 if (uname(&utsname) == -1) {
5087 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5088 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5089 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5090 } else if (major < 2 || (major == 2 && minor < 35)) {
5091 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5092 utsname.release);
5093 } else {
5094 safe = true;
5095 }
5096 ovsthread_once_done(&once);
5097 }
5098 return safe;
5099}
5100
c1c9c9c4
BP
5101/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5102 * kernel to determine what they are. Returns 0 if successful, otherwise a
5103 * positive errno value. */
5104static int
b5d57fc8 5105tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5106{
b5d57fc8 5107 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5108 struct ofpbuf request, *qdisc;
5109 const struct tc_ops *ops;
5110 struct tcmsg *tcmsg;
5111 int load_error;
5112 int error;
5113
b5d57fc8 5114 if (netdev->tc) {
c1c9c9c4
BP
5115 return 0;
5116 }
5117
5118 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5119 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5120 * 2.6.35 without that fix backported to it.
5121 *
5122 * To avoid the OOPS, we must not make a request that would attempt to dump
5123 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5124 * few others. There are a few ways that I can see to do this, but most of
5125 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5126 * technique chosen here is to assume that any non-default qdisc that we
5127 * create will have a class with handle 1:0. The built-in qdiscs only have
5128 * a class with handle 0:0.
5129 *
ac3e3aaa
BP
5130 * On Linux 2.6.35+ we use the straightforward method because it allows us
5131 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5132 * in such a case we get no response at all from the kernel (!) if a
5133 * builtin qdisc is in use (which is later caught by "!error &&
5134 * !qdisc->size"). */
b5d57fc8 5135 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
5136 if (!tcmsg) {
5137 return ENODEV;
5138 }
ac3e3aaa
BP
5139 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5140 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5141
5142 /* Figure out what tc class to instantiate. */
5143 error = tc_transact(&request, &qdisc);
ac3e3aaa 5144 if (!error && qdisc->size) {
c1c9c9c4
BP
5145 const char *kind;
5146
5147 error = tc_parse_qdisc(qdisc, &kind, NULL);
5148 if (error) {
5149 ops = &tc_ops_other;
5150 } else {
5151 ops = tc_lookup_linux_name(kind);
5152 if (!ops) {
5153 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5154 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5155
5156 ops = &tc_ops_other;
5157 }
5158 }
ac3e3aaa
BP
5159 } else if ((!error && !qdisc->size) || error == ENOENT) {
5160 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5161 * set up by some other entity that doesn't have a handle 1:0. We will
5162 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5163 ops = &tc_ops_default;
5164 error = 0;
5165 } else {
5166 /* Who knows? Maybe the device got deleted. */
5167 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5168 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5169 ops = &tc_ops_other;
5170 }
5171
5172 /* Instantiate it. */
b5d57fc8
BP
5173 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5174 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5175 ofpbuf_delete(qdisc);
5176
5177 return error ? error : load_error;
5178}
5179
5180/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5181 approximate the time to transmit packets of various lengths. For an MTU of
5182 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5183 represents two possible packet lengths; for a MTU of 513 through 1024, four
5184 possible lengths; and so on.
5185
5186 Returns, for the specified 'mtu', the number of bits that packet lengths
5187 need to be shifted right to fit within such a 256-entry table. */
5188static int
5189tc_calc_cell_log(unsigned int mtu)
5190{
5191 int cell_log;
5192
5193 if (!mtu) {
5194 mtu = ETH_PAYLOAD_MAX;
5195 }
5196 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5197
5198 for (cell_log = 0; mtu >= 256; cell_log++) {
5199 mtu >>= 1;
5200 }
5201
5202 return cell_log;
5203}
5204
5205/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5206 * of 'mtu'. */
5207static void
5208tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5209{
5210 memset(rate, 0, sizeof *rate);
5211 rate->cell_log = tc_calc_cell_log(mtu);
5212 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5213 /* rate->cell_align = 0; */ /* distro headers. */
5214 rate->mpu = ETH_TOTAL_MIN;
5215 rate->rate = Bps;
5216}
5217
5218/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5219 * attribute of the specified "type".
5220 *
5221 * See tc_calc_cell_log() above for a description of "rtab"s. */
5222static void
5223tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5224{
5225 uint32_t *rtab;
5226 unsigned int i;
5227
5228 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5229 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5230 unsigned packet_size = (i + 1) << rate->cell_log;
5231 if (packet_size < rate->mpu) {
5232 packet_size = rate->mpu;
5233 }
5234 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5235 }
5236}
5237
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, in
 * ticks, for a rate of 'Bps' bytes per second and the given 'mtu'.
 *
 * 'burst_bytes' is the burst size the user asked for; pass 0 if none was
 * requested.  The burst actually used is never smaller than one jiffy's worth
 * of buffering at 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 5248\f
aaf2fb1a
BP
5249/* Linux-only functions declared in netdev-linux.h */
5250
5251/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5252 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5253int
5254netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5255 const char *flag_name, bool enable)
5256{
5257 const char *netdev_name = netdev_get_name(netdev);
5258 struct ethtool_value evalue;
5259 uint32_t new_flags;
5260 int error;
5261
ab985a77 5262 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5263 memset(&evalue, 0, sizeof evalue);
5264 error = netdev_linux_do_ethtool(netdev_name,
5265 (struct ethtool_cmd *)&evalue,
5266 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5267 if (error) {
5268 return error;
5269 }
5270
ab985a77 5271 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
5272 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5273 error = netdev_linux_do_ethtool(netdev_name,
5274 (struct ethtool_cmd *)&evalue,
5275 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5276 if (error) {
5277 return error;
5278 }
5279
ab985a77 5280 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5281 memset(&evalue, 0, sizeof evalue);
5282 error = netdev_linux_do_ethtool(netdev_name,
5283 (struct ethtool_cmd *)&evalue,
5284 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5285 if (error) {
5286 return error;
5287 }
5288
5289 if (new_flags != evalue.data) {
5290 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5291 "device %s failed", enable ? "enable" : "disable",
5292 flag_name, netdev_name);
5293 return EOPNOTSUPP;
5294 }
5295
5296 return 0;
5297}
5298\f
5299/* Utility functions. */
5300
d3980822 5301/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5302static void
d3980822
BP
5303netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5304 const struct rtnl_link_stats *src)
5305{
f613a0d7
PS
5306 dst->rx_packets = src->rx_packets;
5307 dst->tx_packets = src->tx_packets;
5308 dst->rx_bytes = src->rx_bytes;
5309 dst->tx_bytes = src->tx_bytes;
5310 dst->rx_errors = src->rx_errors;
5311 dst->tx_errors = src->tx_errors;
5312 dst->rx_dropped = src->rx_dropped;
5313 dst->tx_dropped = src->tx_dropped;
5314 dst->multicast = src->multicast;
5315 dst->collisions = src->collisions;
5316 dst->rx_length_errors = src->rx_length_errors;
5317 dst->rx_over_errors = src->rx_over_errors;
5318 dst->rx_crc_errors = src->rx_crc_errors;
5319 dst->rx_frame_errors = src->rx_frame_errors;
5320 dst->rx_fifo_errors = src->rx_fifo_errors;
5321 dst->rx_missed_errors = src->rx_missed_errors;
5322 dst->tx_aborted_errors = src->tx_aborted_errors;
5323 dst->tx_carrier_errors = src->tx_carrier_errors;
5324 dst->tx_fifo_errors = src->tx_fifo_errors;
5325 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5326 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5327}
5328
337c9b99
BP
5329/* Copies 'src' into 'dst', performing format conversion in the process. */
5330static void
5331netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5332 const struct rtnl_link_stats64 *src)
5333{
5334 dst->rx_packets = src->rx_packets;
5335 dst->tx_packets = src->tx_packets;
5336 dst->rx_bytes = src->rx_bytes;
5337 dst->tx_bytes = src->tx_bytes;
5338 dst->rx_errors = src->rx_errors;
5339 dst->tx_errors = src->tx_errors;
5340 dst->rx_dropped = src->rx_dropped;
5341 dst->tx_dropped = src->tx_dropped;
5342 dst->multicast = src->multicast;
5343 dst->collisions = src->collisions;
5344 dst->rx_length_errors = src->rx_length_errors;
5345 dst->rx_over_errors = src->rx_over_errors;
5346 dst->rx_crc_errors = src->rx_crc_errors;
5347 dst->rx_frame_errors = src->rx_frame_errors;
5348 dst->rx_fifo_errors = src->rx_fifo_errors;
5349 dst->rx_missed_errors = src->rx_missed_errors;
5350 dst->tx_aborted_errors = src->tx_aborted_errors;
5351 dst->tx_carrier_errors = src->tx_carrier_errors;
5352 dst->tx_fifo_errors = src->tx_fifo_errors;
5353 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5354 dst->tx_window_errors = src->tx_window_errors;
5355}
5356
c1c9c9c4 5357static int
35eef899 5358get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5359{
c1c9c9c4
BP
5360 struct ofpbuf request;
5361 struct ofpbuf *reply;
c1c9c9c4
BP
5362 int error;
5363
5364 ofpbuf_init(&request, 0);
13a24df8
BP
5365 nl_msg_put_nlmsghdr(&request,
5366 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5367 RTM_GETLINK, NLM_F_REQUEST);
5368 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5369 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5370 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5371 ofpbuf_uninit(&request);
5372 if (error) {
5373 return error;
5374 }
5375
13a24df8 5376 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5377 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5378 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5379 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5380 error = 0;
5381 } else {
337c9b99
BP
5382 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5383 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5384 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5385 error = 0;
5386 } else {
5387 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5388 error = EPROTO;
5389 }
13a24df8
BP
5390 }
5391 } else {
5392 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5393 error = EPROTO;
c1c9c9c4 5394 }
8b61709d 5395
8b61709d 5396
576e26d7 5397 ofpbuf_delete(reply);
35eef899 5398 return error;
8b61709d 5399}
c1c9c9c4 5400
3a183124 5401static int
b5d57fc8 5402get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5403{
5404 struct ifreq ifr;
5405 int error;
5406
755be9ea 5407 *flags = 0;
259e0b1a 5408 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5409 if (!error) {
5410 *flags = ifr.ifr_flags;
5411 }
8b61709d
BP
5412 return error;
5413}
5414
5415static int
4b609110 5416set_flags(const char *name, unsigned int flags)
8b61709d
BP
5417{
5418 struct ifreq ifr;
5419
5420 ifr.ifr_flags = flags;
259e0b1a 5421 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5422}
5423
5424static int
5425do_get_ifindex(const char *netdev_name)
5426{
5427 struct ifreq ifr;
259e0b1a 5428 int error;
8b61709d 5429
71d7c22f 5430 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5431 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5432
5433 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5434 if (error) {
8b61709d 5435 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
5436 netdev_name, ovs_strerror(error));
5437 return -error;
8b61709d
BP
5438 }
5439 return ifr.ifr_ifindex;
5440}
5441
5442static int
5443get_ifindex(const struct netdev *netdev_, int *ifindexp)
5444{
b5d57fc8 5445 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5446
b5d57fc8 5447 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 5448 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5449
8b61709d 5450 if (ifindex < 0) {
b5d57fc8
BP
5451 netdev->get_ifindex_error = -ifindex;
5452 netdev->ifindex = 0;
c7b1b0a5 5453 } else {
b5d57fc8
BP
5454 netdev->get_ifindex_error = 0;
5455 netdev->ifindex = ifindex;
8b61709d 5456 }
b5d57fc8 5457 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5458 }
c7b1b0a5 5459
b5d57fc8
BP
5460 *ifindexp = netdev->ifindex;
5461 return netdev->get_ifindex_error;
8b61709d
BP
5462}
5463
5464static int
5465get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
5466{
5467 struct ifreq ifr;
5468 int hwaddr_family;
259e0b1a 5469 int error;
8b61709d
BP
5470
5471 memset(&ifr, 0, sizeof ifr);
71d7c22f 5472 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5473 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5474 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5475 if (error) {
78857dfb
BP
5476 /* ENODEV probably means that a vif disappeared asynchronously and
5477 * hasn't been removed from the database yet, so reduce the log level
5478 * to INFO for that case. */
259e0b1a 5479 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5480 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5481 netdev_name, ovs_strerror(error));
5482 return error;
8b61709d
BP
5483 }
5484 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5485 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
5486 VLOG_WARN("%s device has unknown hardware address family %d",
5487 netdev_name, hwaddr_family);
5488 }
5489 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5490 return 0;
5491}
5492
5493static int
44445cac 5494set_etheraddr(const char *netdev_name,
8b61709d
BP
5495 const uint8_t mac[ETH_ADDR_LEN])
5496{
5497 struct ifreq ifr;
259e0b1a 5498 int error;
8b61709d
BP
5499
5500 memset(&ifr, 0, sizeof ifr);
71d7c22f 5501 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5502 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
5503 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
5504 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5505 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5506 if (error) {
8b61709d 5507 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5508 netdev_name, ovs_strerror(error));
8b61709d 5509 }
259e0b1a 5510 return error;
8b61709d
BP
5511}
5512
5513static int
0b0544d7 5514netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5515 int cmd, const char *cmd_name)
5516{
5517 struct ifreq ifr;
259e0b1a 5518 int error;
8b61709d
BP
5519
5520 memset(&ifr, 0, sizeof ifr);
71d7c22f 5521 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5522 ifr.ifr_data = (caddr_t) ecmd;
5523
5524 ecmd->cmd = cmd;
259e0b1a
BP
5525 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5526 if (error) {
5527 if (error != EOPNOTSUPP) {
8b61709d 5528 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5529 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5530 } else {
5531 /* The device doesn't support this operation. That's pretty
5532 * common, so there's no point in logging anything. */
5533 }
8b61709d 5534 }
259e0b1a 5535 return error;
8b61709d 5536}
f1acd62b
BP
5537
5538static int
5539netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
5540 int cmd, const char *cmd_name)
5541{
5542 struct ifreq ifr;
5543 int error;
5544
5545 ifr.ifr_addr.sa_family = AF_INET;
259e0b1a 5546 error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b 5547 if (!error) {
db5a1019
AW
5548 const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
5549 &ifr.ifr_addr);
f1acd62b
BP
5550 *ip = sin->sin_addr;
5551 }
5552 return error;
5553}
488d734d
BP
5554
5555/* Returns an AF_PACKET raw socket or a negative errno value. */
5556static int
5557af_packet_sock(void)
5558{
23882115
BP
5559 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5560 static int sock;
488d734d 5561
23882115 5562 if (ovsthread_once_start(&once)) {
488d734d
BP
5563 sock = socket(AF_PACKET, SOCK_RAW, 0);
5564 if (sock >= 0) {
8450059e
BP
5565 int error = set_nonblocking(sock);
5566 if (error) {
5567 close(sock);
5568 sock = -error;
5569 }
488d734d
BP
5570 } else {
5571 sock = -errno;
10a89ef0
BP
5572 VLOG_ERR("failed to create packet socket: %s",
5573 ovs_strerror(errno));
488d734d 5574 }
23882115 5575 ovsthread_once_done(&once);
488d734d
BP
5576 }
5577
5578 return sock;
5579}