]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
bridge: Always "up" internal devices.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
e0edde6f 2 * Copyright (c) 2009, 2010, 2011, 2012 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
8b61709d 21#include <assert.h>
e9e28be3 22#include <errno.h>
8b61709d
BP
23#include <fcntl.h>
24#include <arpa/inet.h>
25#include <inttypes.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d
BP
35#include <linux/sockios.h>
36#include <linux/version.h>
37#include <sys/types.h>
38#include <sys/ioctl.h>
39#include <sys/socket.h>
40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
43#include <net/if_packet.h>
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
9fe3b9a2 52#include "dpif-linux.h"
8b61709d
BP
53#include "dynamic-string.h"
54#include "fatal-signal.h"
93b13be8
BP
55#include "hash.h"
56#include "hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
e9e28be3 59#include "netlink.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d
BP
63#include "openflow/openflow.h"
64#include "packets.h"
65#include "poll-loop.h"
21d6e22e 66#include "rtnetlink-link.h"
8b61709d
BP
67#include "socket-util.h"
68#include "shash.h"
19993ef3 69#include "sset.h"
1670c579 70#include "timer.h"
e9e28be3 71#include "vlog.h"
5136ce49 72
d98e6007 73VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 74
d76f09ea
BP
75COVERAGE_DEFINE(netdev_set_policing);
76COVERAGE_DEFINE(netdev_arp_lookup);
77COVERAGE_DEFINE(netdev_get_ifindex);
78COVERAGE_DEFINE(netdev_get_hwaddr);
79COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
80COVERAGE_DEFINE(netdev_get_ethtool);
81COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 82
8b61709d
BP
83\f
84/* These were introduced in Linux 2.6.14, so they might be missing if we have
85 * old headers. */
86#ifndef ADVERTISED_Pause
87#define ADVERTISED_Pause (1 << 13)
88#endif
89#ifndef ADVERTISED_Asym_Pause
90#define ADVERTISED_Asym_Pause (1 << 14)
91#endif
92
e47bd51a
JP
93/* These were introduced in Linux 2.6.24, so they might be missing if we
94 * have old headers. */
95#ifndef ETHTOOL_GFLAGS
96#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
97#endif
98#ifndef ETHTOOL_SFLAGS
99#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100#endif
101
c1c9c9c4
BP
102/* This was introduced in Linux 2.6.25, so it might be missing if we have old
103 * headers. */
104#ifndef TC_RTAB_SIZE
105#define TC_RTAB_SIZE 1024
106#endif
107
2ee6545f 108static struct nln_notifier *netdev_linux_cache_notifier = NULL;
46415c90 109static int cache_notifier_refcount;
8b61709d
BP
110
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A set bit means that the
 * corresponding lazily fetched value cached in the device is currently
 * usable. */
enum {
    VALID_IFINDEX           = 0x001,
    VALID_ETHERADDR         = 0x002,
    VALID_IN4               = 0x004,
    VALID_IN6               = 0x008,
    VALID_MTU               = 0x010,
    VALID_POLICING          = 0x020,
    VALID_VPORT_STAT_ERROR  = 0x040,
    VALID_DRVINFO           = 0x080,
    VALID_FEATURES          = 0x100,
};
122
149f577a
JG
/* Per-device state that only tap devices use. */
struct tap_state {
    int fd;                     /* Descriptor obtained by opening the tun/tap
                                 * control device when the tap is created. */
    bool opened;                /* Set once the first netdev_open() caller has
                                 * been handed 'fd'. */
};
c1c9c9c4
BP
127\f
128/* Traffic control. */
129
130/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
131 * network device.
132 *
133 * Each TC implementation subclasses this with whatever additional data it
134 * needs. */
c1c9c9c4
BP
135struct tc {
136 const struct tc_ops *ops;
93b13be8
BP
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
140};
c1c9c9c4 141
93b13be8
BP
142/* One traffic control queue.
143 *
144 * Each TC implementation subclasses this with whatever additional data it
145 * needs. */
146struct tc_queue {
147 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
148 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
149};
150
151/* A particular kind of traffic control. Each implementation generally maps to
152 * one particular Linux qdisc class.
153 *
154 * The functions below return 0 if successful or a positive errno value on
155 * failure, except where otherwise noted. All of them must be provided, except
156 * where otherwise noted. */
157struct tc_ops {
158 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
159 * This is null for tc_ops_default and tc_ops_other, for which there are no
160 * appropriate values. */
161 const char *linux_name;
162
163 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
164 const char *ovs_name;
165
166 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
167 * queues. The queues are numbered 0 through n_queues - 1. */
168 unsigned int n_queues;
169
170 /* Called to install this TC class on 'netdev'. The implementation should
171 * make the Netlink calls required to set up 'netdev' with the right qdisc
172 * and configure it according to 'details'. The implementation may assume
173 * that the current qdisc is the default; that is, there is no need for it
174 * to delete the current qdisc before installing itself.
175 *
176 * The contents of 'details' should be documented as valid for 'ovs_name'
177 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
178 * (which is built as ovs-vswitchd.conf.db(8)).
179 *
180 * This function must return 0 if and only if it sets 'netdev->tc' to an
181 * initialized 'struct tc'.
182 *
183 * (This function is null for tc_ops_other, which cannot be installed. For
184 * other TC classes it should always be nonnull.) */
79f1cbe9 185 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
186
187 /* Called when the netdev code determines (through a Netlink query) that
188 * this TC class's qdisc is installed on 'netdev', but we didn't install
189 * it ourselves and so don't know any of the details.
190 *
191 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
192 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
193 * implementation should parse the other attributes of 'nlmsg' as
194 * necessary to determine its configuration. If necessary it should also
195 * use Netlink queries to determine the configuration of queues on
196 * 'netdev'.
197 *
198 * This function must return 0 if and only if it sets 'netdev->tc' to an
199 * initialized 'struct tc'. */
200 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
201
202 /* Destroys the data structures allocated by the implementation as part of
203 * 'tc'. (This includes destroying 'tc->queues' by calling
204 * tc_destroy(tc).
205 *
206 * The implementation should not need to perform any Netlink calls. If
207 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
208 * (But it may not be desirable.)
209 *
210 * This function may be null if 'tc' is trivial. */
211 void (*tc_destroy)(struct tc *tc);
212
213 /* Retrieves details of 'netdev->tc' configuration into 'details'.
214 *
215 * The implementation should not need to perform any Netlink calls, because
216 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
217 * cached the configuration.
218 *
219 * The contents of 'details' should be documented as valid for 'ovs_name'
220 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
221 * (which is built as ovs-vswitchd.conf.db(8)).
222 *
223 * This function may be null if 'tc' is not configurable.
224 */
79f1cbe9 225 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
226
227 /* Reconfigures 'netdev->tc' according to 'details', performing any
228 * required Netlink calls to complete the reconfiguration.
229 *
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
233 *
234 * This function may be null if 'tc' is not configurable.
235 */
79f1cbe9 236 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 237
93b13be8
BP
238 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
239 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
240 *
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "Queue" table in
243 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
244 *
245 * The implementation should not need to perform any Netlink calls, because
246 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
247 * cached the queue configuration.
248 *
249 * This function may be null if 'tc' does not have queues ('n_queues' is
250 * 0). */
93b13be8 251 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 252 struct smap *details);
c1c9c9c4
BP
253
254 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
255 * 'details', perfoming any required Netlink calls to complete the
256 * reconfiguration. The caller ensures that 'queue_id' is less than
257 * 'n_queues'.
258 *
259 * The contents of 'details' should be documented as valid for 'ovs_name'
260 * in the "other_config" column in the "Queue" table in
261 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
262 *
263 * This function may be null if 'tc' does not have queues or its queues are
264 * not configurable. */
265 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 266 const struct smap *details);
c1c9c9c4 267
93b13be8
BP
268 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
269 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
270 *
271 * This function may be null if 'tc' does not have queues or its queues
272 * cannot be deleted. */
93b13be8 273 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 274
93b13be8
BP
275 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
276 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
277 *
278 * On success, initializes '*stats'.
279 *
280 * This function may be null if 'tc' does not have queues or if it cannot
281 * report queue statistics. */
93b13be8
BP
282 int (*class_get_stats)(const struct netdev *netdev,
283 const struct tc_queue *queue,
c1c9c9c4
BP
284 struct netdev_queue_stats *stats);
285
286 /* Extracts queue stats from 'nlmsg', which is a response to a
287 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
288 *
289 * This function may be null if 'tc' does not have queues or if it cannot
290 * report queue statistics. */
291 int (*class_dump_stats)(const struct netdev *netdev,
292 const struct ofpbuf *nlmsg,
293 netdev_dump_queue_stats_cb *cb, void *aux);
294};
295
296static void
297tc_init(struct tc *tc, const struct tc_ops *ops)
298{
299 tc->ops = ops;
93b13be8 300 hmap_init(&tc->queues);
c1c9c9c4
BP
301}
302
303static void
304tc_destroy(struct tc *tc)
305{
93b13be8 306 hmap_destroy(&tc->queues);
c1c9c9c4
BP
307}
308
309static const struct tc_ops tc_ops_htb;
a339aa81 310static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
311static const struct tc_ops tc_ops_default;
312static const struct tc_ops tc_ops_other;
313
314static const struct tc_ops *tcs[] = {
315 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 316 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
317 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
318 &tc_ops_other, /* Some other qdisc. */
319 NULL
320};
149f577a 321
c1c9c9c4
BP
322static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
323static unsigned int tc_get_major(unsigned int handle);
324static unsigned int tc_get_minor(unsigned int handle);
325
326static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
327static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
328static unsigned int tc_buffer_per_jiffy(unsigned int rate);
329
330static struct tcmsg *tc_make_request(const struct netdev *, int type,
331 unsigned int flags, struct ofpbuf *);
332static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004
JP
333static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
334static int tc_add_policer(struct netdev *netdev, int kbits_rate,
335 int kbits_burst);
c1c9c9c4
BP
336
337static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
338 struct nlattr **options);
339static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
340 struct nlattr **options,
341 struct netdev_queue_stats *);
342static int tc_query_class(const struct netdev *,
343 unsigned int handle, unsigned int parent,
344 struct ofpbuf **replyp);
345static int tc_delete_class(const struct netdev *, unsigned int handle);
346
347static int tc_del_qdisc(struct netdev *netdev);
348static int tc_query_qdisc(const struct netdev *netdev);
349
350static int tc_calc_cell_log(unsigned int mtu);
351static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
352static void tc_put_rtab(struct ofpbuf *, uint16_t type,
353 const struct tc_ratespec *rate);
354static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
355\f
149f577a
JG
356struct netdev_dev_linux {
357 struct netdev_dev netdev_dev;
358
8b61709d 359 struct shash_node *shash_node;
149f577a 360 unsigned int cache_valid;
ac4d3bcb 361 unsigned int change_seq;
8b61709d 362
1670c579
EJ
363 bool miimon; /* Link status of last poll. */
364 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
365 struct timer miimon_timer;
366
8722022c
BP
367 /* The following are figured out "on demand" only. They are only valid
368 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
369 int ifindex;
370 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 371 struct in_addr address, netmask;
8b61709d
BP
372 struct in6_addr in6;
373 int mtu;
059e5f4f 374 unsigned int ifi_flags;
65c3058c 375 long long int carrier_resets;
80a86fbe
BP
376 uint32_t kbits_rate; /* Policing data. */
377 uint32_t kbits_burst;
bba1e6f3
PS
378 int vport_stats_error; /* Cached error code from vport_get_stats().
379 0 or an errno value. */
90a6637d 380 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 381 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 382 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 383 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 384 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 385
a00ca915
EJ
386 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
387 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
388 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
90a6637d 390
4f925bd3 391 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 392 struct tc *tc;
149f577a
JG
393
394 union {
395 struct tap_state tap;
396 } state;
8b61709d
BP
397};
398
149f577a
JG
399struct netdev_linux {
400 struct netdev netdev;
5b7448ed 401 int fd;
149f577a 402};
8b61709d 403
76c308b5
BP
404/* Sockets used for ioctl operations. */
405static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 406
ff4ed3c9
BP
407/* A Netlink routing socket that is not subscribed to any multicast groups. */
408static struct nl_sock *rtnl_sock;
409
8b61709d
BP
410/* This is set pretty low because we probably won't learn anything from the
411 * additional log messages. */
412static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
413
15b3596a 414static int netdev_linux_init(void);
6f643e49 415
0b0544d7 416static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 417 int cmd, const char *cmd_name);
149f577a
JG
418static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
419 const char *cmd_name);
f1acd62b
BP
420static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
421 int cmd, const char *cmd_name);
059e5f4f
EJ
422static int get_flags(const struct netdev_dev *, unsigned int *flags);
423static int set_flags(struct netdev *, unsigned int flags);
8b61709d
BP
424static int do_get_ifindex(const char *netdev_name);
425static int get_ifindex(const struct netdev *, int *ifindexp);
426static int do_set_addr(struct netdev *netdev,
427 int ioctl_nr, const char *ioctl_name,
428 struct in_addr addr);
429static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 430static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
8b61709d
BP
431static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
432static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 433static int af_packet_sock(void);
1670c579
EJ
434static void netdev_linux_miimon_run(void);
435static void netdev_linux_miimon_wait(void);
8b61709d 436
15b3596a
JG
437static bool
438is_netdev_linux_class(const struct netdev_class *netdev_class)
439{
440 return netdev_class->init == netdev_linux_init;
441}
442
149f577a
JG
443static struct netdev_dev_linux *
444netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 445{
15b3596a
JG
446 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
447 assert(is_netdev_linux_class(netdev_class));
448
149f577a 449 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
450}
451
8b61709d
BP
452static struct netdev_linux *
453netdev_linux_cast(const struct netdev *netdev)
454{
15b3596a
JG
455 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
456 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
457 assert(is_netdev_linux_class(netdev_class));
458
8b61709d
BP
459 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
460}
ff4ed3c9 461\f
8b61709d
BP
462static int
463netdev_linux_init(void)
464{
465 static int status = -1;
466 if (status < 0) {
ff4ed3c9 467 /* Create AF_INET socket. */
8b61709d
BP
468 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
469 status = af_inet_sock >= 0 ? 0 : errno;
470 if (status) {
471 VLOG_ERR("failed to create inet socket: %s", strerror(status));
472 }
ff4ed3c9
BP
473
474 /* Create rtnetlink socket. */
475 if (!status) {
cceb11f5 476 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
477 if (status) {
478 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
479 strerror(status));
480 }
481 }
8b61709d
BP
482 }
483 return status;
484}
485
/* Periodic work for the Linux netdev classes: runs the rtnetlink link
 * notifier first and then the MII monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
492
/* Counterpart of netdev_linux_run(): invokes the wait functions of the
 * rtnetlink link notifier and of the MII monitor, in that order. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
499
4f925bd3
PS
500static int
501netdev_linux_get_drvinfo(struct netdev_dev_linux *netdev_dev)
502{
503
504 int error;
505
506 if (netdev_dev->cache_valid & VALID_DRVINFO) {
507 return 0;
508 }
509
ab985a77 510 COVERAGE_INC(netdev_get_ethtool);
4f925bd3
PS
511 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
512 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
513 (struct ethtool_cmd *)&netdev_dev->drvinfo,
514 ETHTOOL_GDRVINFO,
515 "ETHTOOL_GDRVINFO");
516 if (!error) {
517 netdev_dev->cache_valid |= VALID_DRVINFO;
518 }
519 return error;
520}
521
ac4d3bcb 522static void
4f925bd3
PS
523netdev_dev_linux_changed(struct netdev_dev_linux *dev,
524 unsigned int ifi_flags,
525 unsigned int mask)
ac4d3bcb
EJ
526{
527 dev->change_seq++;
528 if (!dev->change_seq) {
529 dev->change_seq++;
530 }
8aa77183
BP
531
532 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
533 dev->carrier_resets++;
534 }
535 dev->ifi_flags = ifi_flags;
536
4f925bd3
PS
537 dev->cache_valid &= mask;
538}
539
540static void
541netdev_dev_linux_update(struct netdev_dev_linux *dev,
542 const struct rtnetlink_link_change *change)
543{
544 if (change->nlmsg_type == RTM_NEWLINK) {
545 /* Keep drv-info */
546 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 547
c7b1b0a5 548 /* Update netdev from rtnl-change msg. */
90a6637d
PS
549 if (change->mtu) {
550 dev->mtu = change->mtu;
551 dev->cache_valid |= VALID_MTU;
552 dev->netdev_mtu_error = 0;
553 }
554
44445cac
PS
555 if (!eth_addr_is_zero(change->addr)) {
556 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
557 dev->cache_valid |= VALID_ETHERADDR;
558 dev->ether_addr_error = 0;
559 }
560
c7b1b0a5
PS
561 dev->ifindex = change->ifi_index;
562 dev->cache_valid |= VALID_IFINDEX;
563 dev->get_ifindex_error = 0;
564
4f925bd3
PS
565 } else {
566 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
567 }
ac4d3bcb
EJ
568}
569
8b61709d 570static void
21d6e22e 571netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 572 void *aux OVS_UNUSED)
8b61709d 573{
149f577a 574 struct netdev_dev_linux *dev;
8b61709d 575 if (change) {
46415c90
JG
576 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
577 if (base_dev) {
15b3596a
JG
578 const struct netdev_class *netdev_class =
579 netdev_dev_get_class(base_dev);
580
581 if (is_netdev_linux_class(netdev_class)) {
582 dev = netdev_dev_linux_cast(base_dev);
4f925bd3 583 netdev_dev_linux_update(dev, change);
15b3596a 584 }
8b61709d
BP
585 }
586 } else {
46415c90 587 struct shash device_shash;
8b61709d 588 struct shash_node *node;
46415c90
JG
589
590 shash_init(&device_shash);
591 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
592 SHASH_FOR_EACH (node, &device_shash) {
059e5f4f 593 unsigned int flags;
3a183124 594
149f577a 595 dev = node->data;
3a183124 596
755be9ea 597 get_flags(&dev->netdev_dev, &flags);
4f925bd3 598 netdev_dev_linux_changed(dev, flags, 0);
8b61709d 599 }
46415c90 600 shash_destroy(&device_shash);
8b61709d
BP
601 }
602}
603
604static int
1f6e0fbd 605cache_notifier_ref(void)
6c88d577 606{
46415c90 607 if (!cache_notifier_refcount) {
2ee6545f
EJ
608 assert(!netdev_linux_cache_notifier);
609
610 netdev_linux_cache_notifier =
611 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
612
613 if (!netdev_linux_cache_notifier) {
614 return EINVAL;
149f577a
JG
615 }
616 }
46415c90 617 cache_notifier_refcount++;
6c88d577 618
1f6e0fbd
BP
619 return 0;
620}
621
622static void
623cache_notifier_unref(void)
624{
625 assert(cache_notifier_refcount > 0);
626 if (!--cache_notifier_refcount) {
627 assert(netdev_linux_cache_notifier);
628 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
629 netdev_linux_cache_notifier = NULL;
630 }
631}
632
633/* Creates system and internal devices. */
634static int
635netdev_linux_create(const struct netdev_class *class, const char *name,
636 struct netdev_dev **netdev_devp)
637{
638 struct netdev_dev_linux *netdev_dev;
639 int error;
640
641 error = cache_notifier_ref();
642 if (error) {
643 return error;
644 }
645
149f577a 646 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 647 netdev_dev->change_seq = 1;
de5cdb90 648 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
c37d4da4 649 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
46415c90 650
149f577a 651 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
652 return 0;
653}
654
5b7448ed
JG
655/* For most types of netdevs we open the device for each call of
656 * netdev_open(). However, this is not the case with tap devices,
657 * since it is only possible to open the device once. In this
658 * situation we share a single file descriptor, and consequently
659 * buffers, across all readers. Therefore once data is read it will
660 * be unavailable to other reads for tap devices. */
a740f0de 661static int
b8dcf5e9 662netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
de5cdb90 663 const char *name, struct netdev_dev **netdev_devp)
a740f0de 664{
149f577a 665 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
666 struct tap_state *state;
667 static const char tap_dev[] = "/dev/net/tun";
668 struct ifreq ifr;
669 int error;
670
149f577a
JG
671 netdev_dev = xzalloc(sizeof *netdev_dev);
672 state = &netdev_dev->state.tap;
a740f0de 673
1f6e0fbd
BP
674 error = cache_notifier_ref();
675 if (error) {
676 goto error;
677 }
678
6c88d577 679 /* Open tap device. */
149f577a
JG
680 state->fd = open(tap_dev, O_RDWR);
681 if (state->fd < 0) {
6c88d577
JP
682 error = errno;
683 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
1f6e0fbd 684 goto error_unref_notifier;
6c88d577
JP
685 }
686
687 /* Create tap device. */
688 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 689 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 690 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
691 VLOG_WARN("%s: creating tap device failed: %s", name,
692 strerror(errno));
693 error = errno;
1f6e0fbd 694 goto error_unref_notifier;
6c88d577
JP
695 }
696
697 /* Make non-blocking. */
149f577a 698 error = set_nonblocking(state->fd);
a740f0de 699 if (error) {
1f6e0fbd 700 goto error_unref_notifier;
a740f0de
JG
701 }
702
de5cdb90 703 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
149f577a 704 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
705 return 0;
706
1f6e0fbd
BP
707error_unref_notifier:
708 cache_notifier_unref();
a740f0de 709error:
149f577a 710 free(netdev_dev);
a740f0de
JG
711 return error;
712}
713
a740f0de 714static void
149f577a 715destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 716{
149f577a
JG
717 struct tap_state *state = &netdev_dev->state.tap;
718
719 if (state->fd >= 0) {
720 close(state->fd);
a740f0de
JG
721 }
722}
723
149f577a 724/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 725static void
149f577a 726netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 727{
149f577a 728 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 729 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 730
c1c9c9c4
BP
731 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
732 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
733 }
734
1f6e0fbd 735 if (class == &netdev_tap_class) {
149f577a 736 destroy_tap(netdev_dev);
6c88d577 737 }
658797c8 738 free(netdev_dev);
1f6e0fbd
BP
739
740 cache_notifier_unref();
6c88d577
JP
741}
742
8b61709d 743static int
7b6b0ef4 744netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d 745{
5b7448ed 746 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
747 struct netdev_linux *netdev;
748 enum netdev_flags flags;
749 int error;
750
751 /* Allocate network device. */
ec6fde61 752 netdev = xzalloc(sizeof *netdev);
49a6a163 753 netdev->fd = -1;
5b7448ed 754 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 755
c3827f61
BP
756 /* Verify that the device really exists, by attempting to read its flags.
757 * (The flags might be cached, in which case this won't actually do an
758 * ioctl.)
759 *
760 * Don't do this for "internal" netdevs, though, because those have to be
761 * created as netdev objects before they exist in the kernel, because
762 * creating them in the kernel happens by passing a netdev object to
763 * dpif_port_add(). */
764 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
765 error = netdev_get_flags(&netdev->netdev, &flags);
766 if (error == ENODEV) {
767 goto error;
768 }
8b61709d
BP
769 }
770
61b999dd
JG
771 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
772 !netdev_dev->state.tap.opened) {
773
774 /* We assume that the first user of the tap device is the primary user
775 * and give them the tap FD. Subsequent users probably just expect
776 * this to be a system device so open it normally to avoid send/receive
777 * directions appearing to be reversed. */
5b7448ed 778 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 779 netdev_dev->state.tap.opened = true;
8b61709d
BP
780 }
781
782 *netdevp = &netdev->netdev;
783 return 0;
784
785error:
149f577a 786 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
787 return error;
788}
789
790/* Closes and destroys 'netdev'. */
791static void
792netdev_linux_close(struct netdev *netdev_)
793{
794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
795
49a6a163 796 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 797 close(netdev->fd);
8b61709d
BP
798 }
799 free(netdev);
800}
e9e28be3 801
7b6b0ef4
BP
802static int
803netdev_linux_listen(struct netdev *netdev_)
804{
805 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
806 struct sockaddr_ll sll;
807 int ifindex;
808 int error;
809 int fd;
810
811 if (netdev->fd >= 0) {
812 return 0;
813 }
814
815 /* Create file descriptor. */
816 fd = socket(PF_PACKET, SOCK_RAW, 0);
817 if (fd < 0) {
818 error = errno;
819 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
820 goto error;
821 }
822
823 /* Set non-blocking mode. */
824 error = set_nonblocking(fd);
825 if (error) {
826 goto error;
827 }
828
829 /* Get ethernet device index. */
830 error = get_ifindex(&netdev->netdev, &ifindex);
831 if (error) {
832 goto error;
833 }
834
835 /* Bind to specific ethernet device. */
836 memset(&sll, 0, sizeof sll);
837 sll.sll_family = AF_PACKET;
838 sll.sll_ifindex = ifindex;
839 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
840 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
841 error = errno;
842 VLOG_ERR("%s: failed to bind raw socket (%s)",
843 netdev_get_name(netdev_), strerror(error));
844 goto error;
845 }
846
847 netdev->fd = fd;
848 return 0;
849
850error:
851 if (fd >= 0) {
852 close(fd);
853 }
854 return error;
855}
856
8b61709d
BP
857static int
858netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
859{
860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
861
5b7448ed 862 if (netdev->fd < 0) {
7b6b0ef4 863 /* Device is not listening. */
c0e5f6ca 864 return -EAGAIN;
8b61709d
BP
865 }
866
867 for (;;) {
8e8cddf7
BP
868 ssize_t retval;
869
870 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
871 ? read(netdev->fd, data, size)
872 : recv(netdev->fd, data, size, MSG_TRUNC));
0e15264f
BP
873 if (retval >= 0) {
874 return retval <= size ? retval : -EMSGSIZE;
8b61709d
BP
875 } else if (errno != EINTR) {
876 if (errno != EAGAIN) {
877 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
878 strerror(errno), netdev_get_name(netdev_));
879 }
c0e5f6ca 880 return -errno;
8b61709d
BP
881 }
882 }
883}
884
885/* Registers with the poll loop to wake up from the next call to poll_block()
886 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
887static void
888netdev_linux_recv_wait(struct netdev *netdev_)
889{
890 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
891 if (netdev->fd >= 0) {
892 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
893 }
894}
895
896/* Discards all packets waiting to be received from 'netdev'. */
897static int
898netdev_linux_drain(struct netdev *netdev_)
899{
900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 901 if (netdev->fd < 0) {
8b61709d 902 return 0;
5b7448ed 903 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 904 struct ifreq ifr;
149f577a 905 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
906 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
907 if (error) {
908 return error;
909 }
5b7448ed 910 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
911 return 0;
912 } else {
5b7448ed 913 return drain_rcvbuf(netdev->fd);
8b61709d
BP
914 }
915}
916
917/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
918 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
919 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
920 * the packet is too big or too small to transmit on the device.
921 *
922 * The caller retains ownership of 'buffer' in all cases.
923 *
924 * The kernel maintains a packet transmission queue, so the caller is not
925 * expected to do additional queuing of packets. */
926static int
927netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
928{
f23347ea
BP
929 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
930 for (;;) {
931 ssize_t retval;
8b61709d 932
f23347ea
BP
933 if (netdev->fd < 0) {
934 /* Use our AF_PACKET socket to send to this device. */
935 struct sockaddr_ll sll;
936 struct msghdr msg;
937 struct iovec iov;
938 int ifindex;
939 int error;
488d734d
BP
940 int sock;
941
942 sock = af_packet_sock();
943 if (sock < 0) {
944 return sock;
945 }
f23347ea
BP
946
947 error = get_ifindex(netdev_, &ifindex);
948 if (error) {
949 return error;
950 }
8b61709d 951
f23347ea
BP
952 /* We don't bother setting most fields in sockaddr_ll because the
953 * kernel ignores them for SOCK_RAW. */
954 memset(&sll, 0, sizeof sll);
955 sll.sll_family = AF_PACKET;
956 sll.sll_ifindex = ifindex;
76c308b5 957
ebc56baa 958 iov.iov_base = CONST_CAST(void *, data);
f23347ea 959 iov.iov_len = size;
76c308b5 960
f23347ea
BP
961 msg.msg_name = &sll;
962 msg.msg_namelen = sizeof sll;
963 msg.msg_iov = &iov;
964 msg.msg_iovlen = 1;
965 msg.msg_control = NULL;
966 msg.msg_controllen = 0;
967 msg.msg_flags = 0;
968
488d734d 969 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
970 } else {
971 /* Use the netdev's own fd to send to this device. This is
972 * essential for tap devices, because packets sent to a tap device
973 * with an AF_PACKET socket will loop back to be *received* again
974 * on the tap device. */
975 retval = write(netdev->fd, data, size);
976 }
76c308b5 977
8b61709d
BP
978 if (retval < 0) {
979 /* The Linux AF_PACKET implementation never blocks waiting for room
980 * for packets, instead returning ENOBUFS. Translate this into
981 * EAGAIN for the caller. */
982 if (errno == ENOBUFS) {
983 return EAGAIN;
984 } else if (errno == EINTR) {
985 continue;
986 } else if (errno != EAGAIN) {
987 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
988 netdev_get_name(netdev_), strerror(errno));
989 }
990 return errno;
991 } else if (retval != size) {
992 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
993 "%zu) on %s", retval, size, netdev_get_name(netdev_));
994 return EMSGSIZE;
995 } else {
996 return 0;
997 }
998 }
999}
1000
1001/* Registers with the poll loop to wake up from the next call to poll_block()
1002 * when the packet transmission queue has sufficient room to transmit a packet
1003 * with netdev_send().
1004 *
1005 * The kernel maintains a packet transmission queue, so the client is not
1006 * expected to do additional queuing of packets. Thus, this function is
1007 * unlikely to ever be used. It is included for completeness. */
1008static void
1009netdev_linux_send_wait(struct netdev *netdev_)
1010{
1011 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 1012 if (netdev->fd < 0) {
8b61709d 1013 /* Nothing to do. */
5b7448ed
JG
1014 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
1015 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
1016 } else {
1017 /* TAP device always accepts packets.*/
1018 poll_immediate_wake();
1019 }
1020}
1021
1022/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1023 * otherwise a positive errno value. */
1024static int
1025netdev_linux_set_etheraddr(struct netdev *netdev_,
1026 const uint8_t mac[ETH_ADDR_LEN])
1027{
149f577a
JG
1028 struct netdev_dev_linux *netdev_dev =
1029 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
1030 int error;
1031
44445cac
PS
1032 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1033 if (netdev_dev->ether_addr_error) {
1034 return netdev_dev->ether_addr_error;
1035 }
1036 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1037 return 0;
1038 }
1039 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1040 }
1041
1042 error = set_etheraddr(netdev_get_name(netdev_), mac);
1043 if (!error || error == ENODEV) {
1044 netdev_dev->ether_addr_error = error;
1045 netdev_dev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1046 if (!error) {
149f577a 1047 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 1048 }
8b61709d 1049 }
44445cac 1050
8b61709d
BP
1051 return error;
1052}
1053
44445cac 1054/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1055static int
1056netdev_linux_get_etheraddr(const struct netdev *netdev_,
1057 uint8_t mac[ETH_ADDR_LEN])
1058{
149f577a
JG
1059 struct netdev_dev_linux *netdev_dev =
1060 netdev_dev_linux_cast(netdev_get_dev(netdev_));
44445cac 1061
149f577a 1062 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 1063 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 1064 netdev_dev->etheraddr);
44445cac
PS
1065
1066 netdev_dev->ether_addr_error = error;
149f577a 1067 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 1068 }
44445cac
PS
1069
1070 if (!netdev_dev->ether_addr_error) {
1071 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1072 }
1073
1074 return netdev_dev->ether_addr_error;
8b61709d
BP
1075}
1076
1077/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1078 * in bytes, not including the hardware header; thus, this is typically 1500
1079 * bytes for Ethernet devices. */
1080static int
1081netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1082{
149f577a
JG
1083 struct netdev_dev_linux *netdev_dev =
1084 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1085 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
1086 struct ifreq ifr;
1087 int error;
1088
149f577a
JG
1089 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1090 SIOCGIFMTU, "SIOCGIFMTU");
90a6637d
PS
1091
1092 netdev_dev->netdev_mtu_error = error;
149f577a
JG
1093 netdev_dev->mtu = ifr.ifr_mtu;
1094 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 1095 }
90a6637d
PS
1096
1097 if (!netdev_dev->netdev_mtu_error) {
1098 *mtup = netdev_dev->mtu;
1099 }
1100 return netdev_dev->netdev_mtu_error;
8b61709d
BP
1101}
1102
9b020780
PS
1103/* Sets the maximum size of transmitted (MTU) for given device using linux
1104 * networking ioctl interface.
1105 */
1106static int
1107netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1108{
1109 struct netdev_dev_linux *netdev_dev =
1110 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1111 struct ifreq ifr;
1112 int error;
1113
90a6637d
PS
1114 if (netdev_dev->cache_valid & VALID_MTU) {
1115 if (netdev_dev->netdev_mtu_error) {
1116 return netdev_dev->netdev_mtu_error;
1117 }
1118 if (netdev_dev->mtu == mtu) {
1119 return 0;
1120 }
1121 netdev_dev->cache_valid &= ~VALID_MTU;
153e5481 1122 }
9b020780
PS
1123 ifr.ifr_mtu = mtu;
1124 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1125 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d
PS
1126 if (!error || error == ENODEV) {
1127 netdev_dev->netdev_mtu_error = error;
1128 netdev_dev->mtu = ifr.ifr_mtu;
1129 netdev_dev->cache_valid |= VALID_MTU;
9b020780 1130 }
90a6637d 1131 return error;
9b020780
PS
1132}
1133
/* Looks up the ifindex of 'netdev'.  On success returns it as a positive
 * number; on failure returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1144
8b61709d
BP
1145static int
1146netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1147{
149f577a
JG
1148 struct netdev_dev_linux *netdev_dev =
1149 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1150
1670c579
EJ
1151 if (netdev_dev->miimon_interval > 0) {
1152 *carrier = netdev_dev->miimon;
3a183124 1153 } else {
c37d4da4 1154 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1155 }
8b61709d 1156
3a183124 1157 return 0;
8b61709d
BP
1158}
1159
65c3058c
EJ
1160static long long int
1161netdev_linux_get_carrier_resets(const struct netdev *netdev)
1162{
1163 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1164}
1165
63331829 1166static int
1670c579
EJ
1167netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1168 struct mii_ioctl_data *data)
63331829 1169{
63331829 1170 struct ifreq ifr;
782e6111 1171 int error;
63331829 1172
63331829 1173 memset(&ifr, 0, sizeof ifr);
782e6111 1174 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1175 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1176 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1177
782e6111
EJ
1178 return error;
1179}
1180
1181static int
1670c579 1182netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1183{
782e6111
EJ
1184 struct mii_ioctl_data data;
1185 int error;
63331829 1186
782e6111
EJ
1187 *miimon = false;
1188
1189 memset(&data, 0, sizeof data);
1670c579 1190 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1191 if (!error) {
1192 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1193 data.reg_num = MII_BMSR;
1670c579 1194 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1195 &data);
63331829
EJ
1196
1197 if (!error) {
782e6111 1198 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1199 } else {
1200 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1201 }
1202 } else {
1203 struct ethtool_cmd ecmd;
63331829
EJ
1204
1205 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1206 name);
1207
ab985a77 1208 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1209 memset(&ecmd, 0, sizeof ecmd);
1210 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1211 "ETHTOOL_GLINK");
1212 if (!error) {
782e6111
EJ
1213 struct ethtool_value eval;
1214
1215 memcpy(&eval, &ecmd, sizeof eval);
1216 *miimon = !!eval.data;
63331829
EJ
1217 } else {
1218 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1219 }
1220 }
1221
1222 return error;
1223}
1224
1670c579
EJ
1225static int
1226netdev_linux_set_miimon_interval(struct netdev *netdev_,
1227 long long int interval)
1228{
1229 struct netdev_dev_linux *netdev_dev;
1230
1231 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1232
1233 interval = interval > 0 ? MAX(interval, 100) : 0;
1234 if (netdev_dev->miimon_interval != interval) {
1235 netdev_dev->miimon_interval = interval;
1236 timer_set_expired(&netdev_dev->miimon_timer);
1237 }
1238
1239 return 0;
1240}
1241
1242static void
1243netdev_linux_miimon_run(void)
1244{
1245 struct shash device_shash;
1246 struct shash_node *node;
1247
1248 shash_init(&device_shash);
1249 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1250 SHASH_FOR_EACH (node, &device_shash) {
1251 struct netdev_dev_linux *dev = node->data;
1252 bool miimon;
1253
1254 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1255 continue;
1256 }
1257
1258 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1259 if (miimon != dev->miimon) {
1670c579 1260 dev->miimon = miimon;
4f925bd3 1261 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1670c579
EJ
1262 }
1263
1264 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1265 }
1266
1267 shash_destroy(&device_shash);
1268}
1269
1270static void
1271netdev_linux_miimon_wait(void)
1272{
1273 struct shash device_shash;
1274 struct shash_node *node;
1275
1276 shash_init(&device_shash);
1277 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1278 SHASH_FOR_EACH (node, &device_shash) {
1279 struct netdev_dev_linux *dev = node->data;
1280
1281 if (dev->miimon_interval > 0) {
1282 timer_wait(&dev->miimon_timer);
1283 }
1284 }
1285 shash_destroy(&device_shash);
1286}
1287
8b61709d
BP
1288/* Check whether we can we use RTM_GETLINK to get network device statistics.
1289 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1290 * enabled. */
1291static bool
1292check_for_working_netlink_stats(void)
1293{
1294 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1295 * preferable, so if that works, we'll use it. */
1296 int ifindex = do_get_ifindex("lo");
1297 if (ifindex < 0) {
1298 VLOG_WARN("failed to get ifindex for lo, "
1299 "obtaining netdev stats from proc");
1300 return false;
1301 } else {
1302 struct netdev_stats stats;
1303 int error = get_stats_via_netlink(ifindex, &stats);
1304 if (!error) {
1305 VLOG_DBG("obtaining netdev stats via rtnetlink");
1306 return true;
1307 } else {
1308 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1309 "via proc (you are probably running a pre-2.6.19 "
1310 "kernel)", strerror(error));
1311 return false;
1312 }
1313 }
1314}
1315
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved_b = *b;

    *b = *a;
    *a = saved_b;
}
1323
f613a0d7
PS
1324static void
1325get_stats_via_vport(const struct netdev *netdev_,
1326 struct netdev_stats *stats)
8b61709d 1327{
149f577a
JG
1328 struct netdev_dev_linux *netdev_dev =
1329 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1330
bba1e6f3
PS
1331 if (!netdev_dev->vport_stats_error ||
1332 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1333 int error;
7fbef77a
JG
1334
1335 error = netdev_vport_get_stats(netdev_, stats);
f613a0d7 1336 if (error) {
a57a8488
BP
1337 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1338 "(%s)", netdev_get_name(netdev_), strerror(error));
f613a0d7 1339 }
bba1e6f3
PS
1340 netdev_dev->vport_stats_error = error;
1341 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1342 }
f613a0d7 1343}
8b61709d 1344
f613a0d7
PS
1345static int
1346netdev_linux_sys_get_stats(const struct netdev *netdev_,
1347 struct netdev_stats *stats)
1348{
1349 static int use_netlink_stats = -1;
1350 int error;
1351
1352 if (use_netlink_stats < 0) {
1353 use_netlink_stats = check_for_working_netlink_stats();
1354 }
1355
1356 if (use_netlink_stats) {
1357 int ifindex;
1358
1359 error = get_ifindex(netdev_, &ifindex);
1360 if (!error) {
1361 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1362 }
f613a0d7
PS
1363 } else {
1364 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1365 }
7fbef77a 1366
f613a0d7
PS
1367 if (error) {
1368 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1369 netdev_get_name(netdev_), error);
1370 }
1371 return error;
1372
1373}
1374
1375/* Retrieves current device stats for 'netdev-linux'. */
1376static int
1377netdev_linux_get_stats(const struct netdev *netdev_,
1378 struct netdev_stats *stats)
1379{
1380 struct netdev_dev_linux *netdev_dev =
1381 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1382 struct netdev_stats dev_stats;
1383 int error;
1384
1385 get_stats_via_vport(netdev_, stats);
1386
1387 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1388
1389 if (error) {
bba1e6f3 1390 if (netdev_dev->vport_stats_error) {
f613a0d7 1391 return error;
7fbef77a 1392 } else {
f613a0d7
PS
1393 return 0;
1394 }
1395 }
1396
bba1e6f3 1397 if (netdev_dev->vport_stats_error) {
f613a0d7
PS
1398 /* stats not available from OVS then use ioctl stats. */
1399 *stats = dev_stats;
1400 } else {
1401 stats->rx_errors += dev_stats.rx_errors;
1402 stats->tx_errors += dev_stats.tx_errors;
1403 stats->rx_dropped += dev_stats.rx_dropped;
1404 stats->tx_dropped += dev_stats.tx_dropped;
1405 stats->multicast += dev_stats.multicast;
1406 stats->collisions += dev_stats.collisions;
1407 stats->rx_length_errors += dev_stats.rx_length_errors;
1408 stats->rx_over_errors += dev_stats.rx_over_errors;
1409 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1410 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1411 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1412 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1413 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1414 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1415 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1416 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1417 stats->tx_window_errors += dev_stats.tx_window_errors;
1418 }
1419 return 0;
1420}
1421
1422/* Retrieves current device stats for 'netdev-tap' netdev or
1423 * netdev-internal. */
1424static int
bba1e6f3 1425netdev_tap_get_stats(const struct netdev *netdev_,
f613a0d7
PS
1426 struct netdev_stats *stats)
1427{
1428 struct netdev_dev_linux *netdev_dev =
1429 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1430 struct netdev_stats dev_stats;
1431 int error;
1432
1433 get_stats_via_vport(netdev_, stats);
1434
1435 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1436 if (error) {
bba1e6f3 1437 if (netdev_dev->vport_stats_error) {
f613a0d7
PS
1438 return error;
1439 } else {
1440 return 0;
8b61709d 1441 }
8b61709d 1442 }
fe6b0e03
JG
1443
1444 /* If this port is an internal port then the transmit and receive stats
1445 * will appear to be swapped relative to the other ports since we are the
1446 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1447 * them back here. This does not apply if we are getting stats from the
1448 * vport layer because it always tracks stats from the perspective of the
1449 * switch. */
bba1e6f3 1450 if (netdev_dev->vport_stats_error) {
f613a0d7 1451 *stats = dev_stats;
92df599c
JG
1452 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1453 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1454 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1455 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1456 stats->rx_length_errors = 0;
1457 stats->rx_over_errors = 0;
1458 stats->rx_crc_errors = 0;
1459 stats->rx_frame_errors = 0;
1460 stats->rx_fifo_errors = 0;
1461 stats->rx_missed_errors = 0;
1462 stats->tx_aborted_errors = 0;
1463 stats->tx_carrier_errors = 0;
1464 stats->tx_fifo_errors = 0;
1465 stats->tx_heartbeat_errors = 0;
1466 stats->tx_window_errors = 0;
f613a0d7
PS
1467 } else {
1468 stats->rx_dropped += dev_stats.tx_dropped;
1469 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1470
f613a0d7
PS
1471 stats->rx_errors += dev_stats.tx_errors;
1472 stats->tx_errors += dev_stats.rx_errors;
1473
1474 stats->multicast += dev_stats.multicast;
1475 stats->collisions += dev_stats.collisions;
1476 }
1477 return 0;
8b61709d
BP
1478}
1479
bba1e6f3
PS
1480static int
1481netdev_internal_get_stats(const struct netdev *netdev_,
1482 struct netdev_stats *stats)
1483{
1484 struct netdev_dev_linux *netdev_dev =
1485 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1486
1487 get_stats_via_vport(netdev_, stats);
1488 return netdev_dev->vport_stats_error;
1489}
1490
51f87458
PS
1491static void
1492netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
8b61709d
BP
1493{
1494 struct ethtool_cmd ecmd;
6c038611 1495 uint32_t speed;
8b61709d
BP
1496 int error;
1497
51f87458
PS
1498 if (netdev_dev->cache_valid & VALID_FEATURES) {
1499 return;
1500 }
1501
ab985a77 1502 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1503 memset(&ecmd, 0, sizeof ecmd);
51f87458 1504 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
8b61709d
BP
1505 ETHTOOL_GSET, "ETHTOOL_GSET");
1506 if (error) {
51f87458 1507 goto out;
8b61709d
BP
1508 }
1509
1510 /* Supported features. */
51f87458 1511 netdev_dev->supported = 0;
8b61709d 1512 if (ecmd.supported & SUPPORTED_10baseT_Half) {
51f87458 1513 netdev_dev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1514 }
1515 if (ecmd.supported & SUPPORTED_10baseT_Full) {
51f87458 1516 netdev_dev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1517 }
1518 if (ecmd.supported & SUPPORTED_100baseT_Half) {
51f87458 1519 netdev_dev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1520 }
1521 if (ecmd.supported & SUPPORTED_100baseT_Full) {
51f87458 1522 netdev_dev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1523 }
1524 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
51f87458 1525 netdev_dev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1526 }
1527 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
51f87458 1528 netdev_dev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1529 }
1530 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
51f87458 1531 netdev_dev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1532 }
1533 if (ecmd.supported & SUPPORTED_TP) {
51f87458 1534 netdev_dev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1535 }
1536 if (ecmd.supported & SUPPORTED_FIBRE) {
51f87458 1537 netdev_dev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1538 }
1539 if (ecmd.supported & SUPPORTED_Autoneg) {
51f87458 1540 netdev_dev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1541 }
1542 if (ecmd.supported & SUPPORTED_Pause) {
51f87458 1543 netdev_dev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1544 }
1545 if (ecmd.supported & SUPPORTED_Asym_Pause) {
51f87458 1546 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1547 }
1548
1549 /* Advertised features. */
51f87458 1550 netdev_dev->advertised = 0;
8b61709d 1551 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
51f87458 1552 netdev_dev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1553 }
1554 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
51f87458 1555 netdev_dev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1556 }
1557 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
51f87458 1558 netdev_dev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1559 }
1560 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
51f87458 1561 netdev_dev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1562 }
1563 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
51f87458 1564 netdev_dev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1565 }
1566 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
51f87458 1567 netdev_dev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1568 }
1569 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
51f87458 1570 netdev_dev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1571 }
1572 if (ecmd.advertising & ADVERTISED_TP) {
51f87458 1573 netdev_dev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1574 }
1575 if (ecmd.advertising & ADVERTISED_FIBRE) {
51f87458 1576 netdev_dev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1577 }
1578 if (ecmd.advertising & ADVERTISED_Autoneg) {
51f87458 1579 netdev_dev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1580 }
1581 if (ecmd.advertising & ADVERTISED_Pause) {
51f87458 1582 netdev_dev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1583 }
1584 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
51f87458 1585 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1586 }
1587
1588 /* Current settings. */
2a529ead 1589 speed = ecmd.speed;
6c038611 1590 if (speed == SPEED_10) {
51f87458 1591 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1592 } else if (speed == SPEED_100) {
51f87458 1593 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1594 } else if (speed == SPEED_1000) {
51f87458 1595 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1596 } else if (speed == SPEED_10000) {
51f87458 1597 netdev_dev->current = NETDEV_F_10GB_FD;
6c038611 1598 } else if (speed == 40000) {
51f87458 1599 netdev_dev->current = NETDEV_F_40GB_FD;
6c038611 1600 } else if (speed == 100000) {
51f87458 1601 netdev_dev->current = NETDEV_F_100GB_FD;
6c038611 1602 } else if (speed == 1000000) {
51f87458 1603 netdev_dev->current = NETDEV_F_1TB_FD;
8b61709d 1604 } else {
51f87458 1605 netdev_dev->current = 0;
8b61709d
BP
1606 }
1607
1608 if (ecmd.port == PORT_TP) {
51f87458 1609 netdev_dev->current |= NETDEV_F_COPPER;
8b61709d 1610 } else if (ecmd.port == PORT_FIBRE) {
51f87458 1611 netdev_dev->current |= NETDEV_F_FIBER;
8b61709d
BP
1612 }
1613
1614 if (ecmd.autoneg) {
51f87458 1615 netdev_dev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1616 }
1617
1618 /* Peer advertisements. */
51f87458 1619 netdev_dev->peer = 0; /* XXX */
8b61709d 1620
51f87458
PS
1621out:
1622 netdev_dev->cache_valid |= VALID_FEATURES;
1623 netdev_dev->get_features_error = error;
1624}
1625
1626/* Stores the features supported by 'netdev' into each of '*current',
1627 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1628 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1629 * errno value. */
1630static int
1631netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1632 enum netdev_features *current,
1633 enum netdev_features *advertised,
1634 enum netdev_features *supported,
1635 enum netdev_features *peer)
51f87458
PS
1636{
1637 struct netdev_dev_linux *netdev_dev =
1638 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1639
1640 netdev_linux_read_features(netdev_dev);
1641
1642 if (!netdev_dev->get_features_error) {
1643 *current = netdev_dev->current;
1644 *advertised = netdev_dev->advertised;
1645 *supported = netdev_dev->supported;
1646 *peer = netdev_dev->peer;
1647 }
1648 return netdev_dev->get_features_error;
8b61709d
BP
1649}
1650
1651/* Set the features advertised by 'netdev' to 'advertise'. */
1652static int
6c038611
BP
1653netdev_linux_set_advertisements(struct netdev *netdev,
1654 enum netdev_features advertise)
8b61709d
BP
1655{
1656 struct ethtool_cmd ecmd;
1657 int error;
1658
ab985a77 1659 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1660 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1661 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1662 ETHTOOL_GSET, "ETHTOOL_GSET");
1663 if (error) {
1664 return error;
1665 }
1666
1667 ecmd.advertising = 0;
6c038611 1668 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1669 ecmd.advertising |= ADVERTISED_10baseT_Half;
1670 }
6c038611 1671 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1672 ecmd.advertising |= ADVERTISED_10baseT_Full;
1673 }
6c038611 1674 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1675 ecmd.advertising |= ADVERTISED_100baseT_Half;
1676 }
6c038611 1677 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1678 ecmd.advertising |= ADVERTISED_100baseT_Full;
1679 }
6c038611 1680 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1681 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1682 }
6c038611 1683 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1684 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1685 }
6c038611 1686 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1687 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1688 }
6c038611 1689 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1690 ecmd.advertising |= ADVERTISED_TP;
1691 }
6c038611 1692 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1693 ecmd.advertising |= ADVERTISED_FIBRE;
1694 }
6c038611 1695 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1696 ecmd.advertising |= ADVERTISED_Autoneg;
1697 }
6c038611 1698 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1699 ecmd.advertising |= ADVERTISED_Pause;
1700 }
6c038611 1701 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1702 ecmd.advertising |= ADVERTISED_Asym_Pause;
1703 }
ab985a77 1704 COVERAGE_INC(netdev_set_ethtool);
0b0544d7 1705 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1706 ETHTOOL_SSET, "ETHTOOL_SSET");
1707}
1708
f8500004
JP
1709/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1710 * successful, otherwise a positive errno value. */
8b61709d
BP
1711static int
1712netdev_linux_set_policing(struct netdev *netdev,
1713 uint32_t kbits_rate, uint32_t kbits_burst)
1714{
80a86fbe
BP
1715 struct netdev_dev_linux *netdev_dev =
1716 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d 1717 const char *netdev_name = netdev_get_name(netdev);
f8500004 1718 int error;
8b61709d 1719
8e460221 1720
80a86fbe
BP
1721 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1722 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1723 : kbits_burst); /* Stick with user-specified value. */
1724
c9f71668
PS
1725 if (netdev_dev->cache_valid & VALID_POLICING) {
1726 if (netdev_dev->netdev_policing_error) {
1727 return netdev_dev->netdev_policing_error;
1728 }
1729
1730 if (netdev_dev->kbits_rate == kbits_rate &&
1731 netdev_dev->kbits_burst == kbits_burst) {
1732 /* Assume that settings haven't changed since we last set them. */
1733 return 0;
1734 }
1735 netdev_dev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1736 }
1737
ac8c3412 1738 COVERAGE_INC(netdev_set_policing);
f8500004
JP
1739 /* Remove any existing ingress qdisc. */
1740 error = tc_add_del_ingress_qdisc(netdev, false);
1741 if (error) {
1742 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1743 netdev_name, strerror(error));
c9f71668 1744 goto out;
f8500004
JP
1745 }
1746
8b61709d 1747 if (kbits_rate) {
f8500004
JP
1748 error = tc_add_del_ingress_qdisc(netdev, true);
1749 if (error) {
1750 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1751 netdev_name, strerror(error));
c9f71668 1752 goto out;
8b61709d
BP
1753 }
1754
f8500004
JP
1755 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1756 if (error){
1757 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1758 netdev_name, strerror(error));
c9f71668 1759 goto out;
8b61709d 1760 }
8b61709d
BP
1761 }
1762
f8500004
JP
1763 netdev_dev->kbits_rate = kbits_rate;
1764 netdev_dev->kbits_burst = kbits_burst;
f8500004 1765
c9f71668
PS
1766out:
1767 if (!error || error == ENODEV) {
1768 netdev_dev->netdev_policing_error = error;
1769 netdev_dev->cache_valid |= VALID_POLICING;
1770 }
1771 return error;
8b61709d
BP
1772}
1773
c1c9c9c4
BP
1774static int
1775netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1776 struct sset *types)
c1c9c9c4
BP
1777{
1778 const struct tc_ops **opsp;
1779
1780 for (opsp = tcs; *opsp != NULL; opsp++) {
1781 const struct tc_ops *ops = *opsp;
1782 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1783 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1784 }
1785 }
1786 return 0;
1787}
1788
1789static const struct tc_ops *
1790tc_lookup_ovs_name(const char *name)
1791{
1792 const struct tc_ops **opsp;
1793
1794 for (opsp = tcs; *opsp != NULL; opsp++) {
1795 const struct tc_ops *ops = *opsp;
1796 if (!strcmp(name, ops->ovs_name)) {
1797 return ops;
1798 }
1799 }
1800 return NULL;
1801}
1802
1803static const struct tc_ops *
1804tc_lookup_linux_name(const char *name)
1805{
1806 const struct tc_ops **opsp;
1807
1808 for (opsp = tcs; *opsp != NULL; opsp++) {
1809 const struct tc_ops *ops = *opsp;
1810 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1811 return ops;
1812 }
1813 }
1814 return NULL;
1815}
1816
93b13be8
BP
1817static struct tc_queue *
1818tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1819 size_t hash)
1820{
1821 struct netdev_dev_linux *netdev_dev =
1822 netdev_dev_linux_cast(netdev_get_dev(netdev));
1823 struct tc_queue *queue;
1824
4e8e4213 1825 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1826 if (queue->queue_id == queue_id) {
1827 return queue;
1828 }
1829 }
1830 return NULL;
1831}
1832
/* Returns the queue with id 'queue_id' in 'netdev''s traffic control state,
 * or NULL if there is none.  The bucket is selected by hashing 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1838
c1c9c9c4
BP
1839static int
1840netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1841 const char *type,
1842 struct netdev_qos_capabilities *caps)
1843{
1844 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1845 if (!ops) {
1846 return EOPNOTSUPP;
1847 }
1848 caps->n_queues = ops->n_queues;
1849 return 0;
1850}
1851
1852static int
1853netdev_linux_get_qos(const struct netdev *netdev,
79f1cbe9 1854 const char **typep, struct smap *details)
c1c9c9c4
BP
1855{
1856 struct netdev_dev_linux *netdev_dev =
1857 netdev_dev_linux_cast(netdev_get_dev(netdev));
1858 int error;
1859
1860 error = tc_query_qdisc(netdev);
1861 if (error) {
1862 return error;
1863 }
1864
1865 *typep = netdev_dev->tc->ops->ovs_name;
1866 return (netdev_dev->tc->ops->qdisc_get
1867 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1868 : 0);
1869}
1870
1871static int
1872netdev_linux_set_qos(struct netdev *netdev,
79f1cbe9 1873 const char *type, const struct smap *details)
c1c9c9c4
BP
1874{
1875 struct netdev_dev_linux *netdev_dev =
1876 netdev_dev_linux_cast(netdev_get_dev(netdev));
1877 const struct tc_ops *new_ops;
1878 int error;
1879
1880 new_ops = tc_lookup_ovs_name(type);
1881 if (!new_ops || !new_ops->tc_install) {
1882 return EOPNOTSUPP;
1883 }
1884
1885 error = tc_query_qdisc(netdev);
1886 if (error) {
1887 return error;
1888 }
1889
1890 if (new_ops == netdev_dev->tc->ops) {
1891 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1892 } else {
1893 /* Delete existing qdisc. */
1894 error = tc_del_qdisc(netdev);
1895 if (error) {
1896 return error;
1897 }
1898 assert(netdev_dev->tc == NULL);
1899
1900 /* Install new qdisc. */
1901 error = new_ops->tc_install(netdev, details);
1902 assert((error == 0) == (netdev_dev->tc != NULL));
1903
1904 return error;
1905 }
1906}
1907
1908static int
1909netdev_linux_get_queue(const struct netdev *netdev,
79f1cbe9 1910 unsigned int queue_id, struct smap *details)
c1c9c9c4
BP
1911{
1912 struct netdev_dev_linux *netdev_dev =
1913 netdev_dev_linux_cast(netdev_get_dev(netdev));
1914 int error;
1915
1916 error = tc_query_qdisc(netdev);
1917 if (error) {
1918 return error;
93b13be8
BP
1919 } else {
1920 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1921 return (queue
1922 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1923 : ENOENT);
c1c9c9c4 1924 }
c1c9c9c4
BP
1925}
1926
1927static int
1928netdev_linux_set_queue(struct netdev *netdev,
79f1cbe9 1929 unsigned int queue_id, const struct smap *details)
c1c9c9c4
BP
1930{
1931 struct netdev_dev_linux *netdev_dev =
1932 netdev_dev_linux_cast(netdev_get_dev(netdev));
1933 int error;
1934
1935 error = tc_query_qdisc(netdev);
1936 if (error) {
1937 return error;
1938 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1939 || !netdev_dev->tc->ops->class_set) {
1940 return EINVAL;
1941 }
1942
1943 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1944}
1945
1946static int
1947netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1948{
1949 struct netdev_dev_linux *netdev_dev =
1950 netdev_dev_linux_cast(netdev_get_dev(netdev));
1951 int error;
1952
1953 error = tc_query_qdisc(netdev);
1954 if (error) {
1955 return error;
1956 } else if (!netdev_dev->tc->ops->class_delete) {
1957 return EINVAL;
93b13be8
BP
1958 } else {
1959 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1960 return (queue
1961 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1962 : ENOENT);
c1c9c9c4 1963 }
c1c9c9c4
BP
1964}
1965
1966static int
1967netdev_linux_get_queue_stats(const struct netdev *netdev,
1968 unsigned int queue_id,
1969 struct netdev_queue_stats *stats)
1970{
1971 struct netdev_dev_linux *netdev_dev =
1972 netdev_dev_linux_cast(netdev_get_dev(netdev));
1973 int error;
1974
1975 error = tc_query_qdisc(netdev);
1976 if (error) {
1977 return error;
c1c9c9c4
BP
1978 } else if (!netdev_dev->tc->ops->class_get_stats) {
1979 return EOPNOTSUPP;
93b13be8
BP
1980 } else {
1981 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1982 return (queue
1983 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1984 : ENOENT);
c1c9c9c4 1985 }
c1c9c9c4
BP
1986}
1987
23a98ffe 1988static bool
c1c9c9c4
BP
1989start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1990{
1991 struct ofpbuf request;
1992 struct tcmsg *tcmsg;
1993
1994 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1995 if (!tcmsg) {
1996 return false;
1997 }
3c4de644 1998 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1999 nl_dump_start(dump, rtnl_sock, &request);
2000 ofpbuf_uninit(&request);
23a98ffe 2001 return true;
c1c9c9c4
BP
2002}
2003
2004static int
2005netdev_linux_dump_queues(const struct netdev *netdev,
2006 netdev_dump_queues_cb *cb, void *aux)
2007{
2008 struct netdev_dev_linux *netdev_dev =
2009 netdev_dev_linux_cast(netdev_get_dev(netdev));
f486e840 2010 struct tc_queue *queue, *next_queue;
79f1cbe9 2011 struct smap details;
c1c9c9c4 2012 int last_error;
c1c9c9c4
BP
2013 int error;
2014
2015 error = tc_query_qdisc(netdev);
2016 if (error) {
2017 return error;
2018 } else if (!netdev_dev->tc->ops->class_get) {
2019 return EOPNOTSUPP;
2020 }
2021
2022 last_error = 0;
79f1cbe9 2023 smap_init(&details);
f486e840
BP
2024 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2025 &netdev_dev->tc->queues) {
79f1cbe9 2026 smap_clear(&details);
c1c9c9c4 2027
93b13be8 2028 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 2029 if (!error) {
93b13be8 2030 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
2031 } else {
2032 last_error = error;
2033 }
2034 }
79f1cbe9 2035 smap_destroy(&details);
c1c9c9c4
BP
2036
2037 return last_error;
2038}
2039
2040static int
2041netdev_linux_dump_queue_stats(const struct netdev *netdev,
2042 netdev_dump_queue_stats_cb *cb, void *aux)
2043{
2044 struct netdev_dev_linux *netdev_dev =
2045 netdev_dev_linux_cast(netdev_get_dev(netdev));
2046 struct nl_dump dump;
2047 struct ofpbuf msg;
2048 int last_error;
2049 int error;
2050
2051 error = tc_query_qdisc(netdev);
2052 if (error) {
2053 return error;
2054 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2055 return EOPNOTSUPP;
2056 }
2057
2058 last_error = 0;
23a98ffe
BP
2059 if (!start_queue_dump(netdev, &dump)) {
2060 return ENODEV;
2061 }
c1c9c9c4
BP
2062 while (nl_dump_next(&dump, &msg)) {
2063 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2064 if (error) {
2065 last_error = error;
2066 }
2067 }
2068
2069 error = nl_dump_done(&dump);
2070 return error ? error : last_error;
2071}
2072
8b61709d 2073static int
f1acd62b
BP
2074netdev_linux_get_in4(const struct netdev *netdev_,
2075 struct in_addr *address, struct in_addr *netmask)
8b61709d 2076{
149f577a
JG
2077 struct netdev_dev_linux *netdev_dev =
2078 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2079
2080 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
2081 int error;
2082
149f577a 2083 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
2084 SIOCGIFADDR, "SIOCGIFADDR");
2085 if (error) {
2086 return error;
2087 }
2088
149f577a 2089 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
2090 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2091 if (error) {
2092 return error;
2093 }
2094
149f577a 2095 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 2096 }
149f577a
JG
2097 *address = netdev_dev->address;
2098 *netmask = netdev_dev->netmask;
f1acd62b 2099 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
2100}
2101
8b61709d 2102static int
f1acd62b
BP
2103netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2104 struct in_addr netmask)
8b61709d 2105{
149f577a
JG
2106 struct netdev_dev_linux *netdev_dev =
2107 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
2108 int error;
2109
f1acd62b 2110 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2111 if (!error) {
149f577a
JG
2112 netdev_dev->cache_valid |= VALID_IN4;
2113 netdev_dev->address = address;
2114 netdev_dev->netmask = netmask;
f1acd62b 2115 if (address.s_addr != INADDR_ANY) {
8b61709d 2116 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2117 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2118 }
2119 }
2120 return error;
2121}
2122
/* Parses 'line' in the format the caller reads from /proc/net/if_inet6: 32
 * hex digits of address, four hex fields that are ignored, and an interface
 * name of at most 16 characters.
 *
 * Stores the 16 address bytes in '*in6' and the name in 'ifname'.  Returns
 * true only if all 17 fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *octet = in6->s6_addr;
    int n_converted;

#define X8 "%2"SCNx8
    n_converted = sscanf(line,
                         " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                         "%*x %*x %*x %*x %16s\n",
                         octet + 0, octet + 1, octet + 2, octet + 3,
                         octet + 4, octet + 5, octet + 6, octet + 7,
                         octet + 8, octet + 9, octet + 10, octet + 11,
                         octet + 12, octet + 13, octet + 14, octet + 15,
                         ifname);
    return n_converted == 17;
}
2138
2139/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2140 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2141static int
2142netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2143{
149f577a
JG
2144 struct netdev_dev_linux *netdev_dev =
2145 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2146 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2147 FILE *file;
2148 char line[128];
2149
149f577a 2150 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2151
2152 file = fopen("/proc/net/if_inet6", "r");
2153 if (file != NULL) {
2154 const char *name = netdev_get_name(netdev_);
2155 while (fgets(line, sizeof line, file)) {
2a022368 2156 struct in6_addr in6_tmp;
8b61709d 2157 char ifname[16 + 1];
2a022368 2158 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2159 && !strcmp(name, ifname))
2160 {
2a022368 2161 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2162 break;
2163 }
2164 }
2165 fclose(file);
2166 }
149f577a 2167 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2168 }
149f577a 2169 *in6 = netdev_dev->in6;
8b61709d
BP
2170 return 0;
2171}
2172
/* Fills '*sa' with an AF_INET socket address holding 'addr' and port 0.  All
 * other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2185
2186static int
2187do_set_addr(struct netdev *netdev,
2188 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2189{
2190 struct ifreq ifr;
71d7c22f 2191 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2192 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2193
2194 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2195 ioctl_name);
8b61709d
BP
2196}
2197
2198/* Adds 'router' as a default IP gateway. */
2199static int
67a4917b 2200netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2201{
2202 struct in_addr any = { INADDR_ANY };
2203 struct rtentry rt;
2204 int error;
2205
2206 memset(&rt, 0, sizeof rt);
2207 make_in4_sockaddr(&rt.rt_dst, any);
2208 make_in4_sockaddr(&rt.rt_gateway, router);
2209 make_in4_sockaddr(&rt.rt_genmask, any);
2210 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2211 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2212 if (error) {
2213 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2214 }
2215 return error;
2216}
2217
f1acd62b
BP
2218static int
2219netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2220 char **netdev_name)
2221{
2222 static const char fn[] = "/proc/net/route";
2223 FILE *stream;
2224 char line[256];
2225 int ln;
2226
2227 *netdev_name = NULL;
2228 stream = fopen(fn, "r");
2229 if (stream == NULL) {
2230 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2231 return errno;
2232 }
2233
2234 ln = 0;
2235 while (fgets(line, sizeof line, stream)) {
2236 if (++ln >= 2) {
2237 char iface[17];
dbba996b 2238 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2239 int refcnt, metric, mtu;
2240 unsigned int flags, use, window, irtt;
2241
2242 if (sscanf(line,
2243 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2244 " %d %u %u\n",
2245 iface, &dest, &gateway, &flags, &refcnt,
2246 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2247
d295e8e9 2248 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2249 fn, ln, line);
2250 continue;
2251 }
2252 if (!(flags & RTF_UP)) {
2253 /* Skip routes that aren't up. */
2254 continue;
2255 }
2256
2257 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2258 * network byte order, so we don't need need any endian
f1acd62b
BP
2259 * conversions here. */
2260 if ((dest & mask) == (host->s_addr & mask)) {
2261 if (!gateway) {
2262 /* The host is directly reachable. */
2263 next_hop->s_addr = 0;
2264 } else {
2265 /* To reach the host, we must go through a gateway. */
2266 next_hop->s_addr = gateway;
2267 }
2268 *netdev_name = xstrdup(iface);
2269 fclose(stream);
2270 return 0;
2271 }
2272 }
2273 }
2274
2275 fclose(stream);
2276 return ENXIO;
2277}
2278
e210037e 2279static int
79f1cbe9 2280netdev_linux_get_drv_info(const struct netdev *netdev, struct smap *smap)
e210037e 2281{
e210037e 2282 int error;
4f925bd3
PS
2283 struct netdev_dev_linux *netdev_dev =
2284 netdev_dev_linux_cast(netdev_get_dev(netdev));
e210037e 2285
4f925bd3 2286 error = netdev_linux_get_drvinfo(netdev_dev);
e210037e 2287 if (!error) {
79f1cbe9
EJ
2288 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2289 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2290 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
e210037e 2291 }
e210037e
AE
2292 return error;
2293}
2294
4f925bd3 2295static int
79f1cbe9
EJ
2296netdev_internal_get_drv_info(const struct netdev *netdev OVS_UNUSED,
2297 struct smap *smap)
4f925bd3 2298{
79f1cbe9 2299 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2300 return 0;
2301}
2302
8b61709d
BP
2303/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2304 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2305 * returns 0. Otherwise, it returns a positive errno value; in particular,
2306 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2307static int
2308netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2309 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2310{
2311 struct arpreq r;
c100e025 2312 struct sockaddr_in sin;
8b61709d
BP
2313 int retval;
2314
2315 memset(&r, 0, sizeof r);
f2cc621b 2316 memset(&sin, 0, sizeof sin);
c100e025
BP
2317 sin.sin_family = AF_INET;
2318 sin.sin_addr.s_addr = ip;
2319 sin.sin_port = 0;
2320 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2321 r.arp_ha.sa_family = ARPHRD_ETHER;
2322 r.arp_flags = 0;
71d7c22f 2323 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2324 COVERAGE_INC(netdev_arp_lookup);
2325 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2326 if (!retval) {
2327 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2328 } else if (retval != ENXIO) {
2329 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2330 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2331 }
2332 return retval;
2333}
2334
2335static int
2336nd_to_iff_flags(enum netdev_flags nd)
2337{
2338 int iff = 0;
2339 if (nd & NETDEV_UP) {
2340 iff |= IFF_UP;
2341 }
2342 if (nd & NETDEV_PROMISC) {
2343 iff |= IFF_PROMISC;
2344 }
2345 return iff;
2346}
2347
2348static int
2349iff_to_nd_flags(int iff)
2350{
2351 enum netdev_flags nd = 0;
2352 if (iff & IFF_UP) {
2353 nd |= NETDEV_UP;
2354 }
2355 if (iff & IFF_PROMISC) {
2356 nd |= NETDEV_PROMISC;
2357 }
2358 return nd;
2359}
2360
2361static int
2362netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2363 enum netdev_flags on, enum netdev_flags *old_flagsp)
2364{
c37d4da4 2365 struct netdev_dev_linux *netdev_dev;
8b61709d 2366 int old_flags, new_flags;
c37d4da4
EJ
2367 int error = 0;
2368
2369 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2370 old_flags = netdev_dev->ifi_flags;
2371 *old_flagsp = iff_to_nd_flags(old_flags);
2372 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2373 if (new_flags != old_flags) {
2374 error = set_flags(netdev, new_flags);
2375 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
8b61709d
BP
2376 }
2377 return error;
2378}
2379
ac4d3bcb
EJ
2380static unsigned int
2381netdev_linux_change_seq(const struct netdev *netdev)
2382{
2383 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2384}
2385
/* Expands to a brace-enclosed initializer for a 'struct netdev_class' that
 * shares the netdev_linux_* implementations.  Only the class name and the
 * create, get-stats, set-stats, get-features, and get-status entries vary
 * between the classes defined with it. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    /* Receive path. */                                         \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    /* Transmit path. */                                        \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    /* Link properties and statistics. */                       \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    /* Policing, QoS, and queues. */                            \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    /* Addressing, routing, and status. */                      \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2449
2450const struct netdev_class netdev_linux_class =
2451 NETDEV_LINUX_CLASS(
2452 "system",
2453 netdev_linux_create,
f613a0d7 2454 netdev_linux_get_stats,
4f925bd3 2455 NULL, /* set_stats */
51f87458 2456 netdev_linux_get_features,
2c2ea5a8 2457 netdev_linux_get_drv_info);
c3827f61
BP
2458
2459const struct netdev_class netdev_tap_class =
2460 NETDEV_LINUX_CLASS(
2461 "tap",
2462 netdev_linux_create_tap,
bba1e6f3 2463 netdev_tap_get_stats,
4f925bd3 2464 NULL, /* set_stats */
51f87458 2465 netdev_linux_get_features,
2c2ea5a8 2466 netdev_linux_get_drv_info);
c3827f61
BP
2467
2468const struct netdev_class netdev_internal_class =
2469 NETDEV_LINUX_CLASS(
2470 "internal",
2471 netdev_linux_create,
bba1e6f3 2472 netdev_internal_get_stats,
4f925bd3 2473 netdev_vport_set_stats,
51f87458 2474 NULL, /* get_features */
2c2ea5a8 2475 netdev_internal_get_drv_info);
8b61709d 2476\f
c1c9c9c4 2477/* HTB traffic control class. */
559843ed 2478
c1c9c9c4 2479#define HTB_N_QUEUES 0xf000
8b61709d 2480
c1c9c9c4
BP
2481struct htb {
2482 struct tc tc;
2483 unsigned int max_rate; /* In bytes/s. */
2484};
8b61709d 2485
c1c9c9c4 2486struct htb_class {
93b13be8 2487 struct tc_queue tc_queue;
c1c9c9c4
BP
2488 unsigned int min_rate; /* In bytes/s. */
2489 unsigned int max_rate; /* In bytes/s. */
2490 unsigned int burst; /* In bytes. */
2491 unsigned int priority; /* Lower values are higher priorities. */
2492};
8b61709d 2493
c1c9c9c4
BP
2494static struct htb *
2495htb_get__(const struct netdev *netdev)
2496{
2497 struct netdev_dev_linux *netdev_dev =
2498 netdev_dev_linux_cast(netdev_get_dev(netdev));
2499 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2500}
2501
24045e35 2502static void
c1c9c9c4
BP
2503htb_install__(struct netdev *netdev, uint64_t max_rate)
2504{
2505 struct netdev_dev_linux *netdev_dev =
2506 netdev_dev_linux_cast(netdev_get_dev(netdev));
2507 struct htb *htb;
2508
2509 htb = xmalloc(sizeof *htb);
2510 tc_init(&htb->tc, &tc_ops_htb);
2511 htb->max_rate = max_rate;
2512
2513 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2514}
2515
2516/* Create an HTB qdisc.
2517 *
a339aa81 2518 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2519static int
2520htb_setup_qdisc__(struct netdev *netdev)
2521{
2522 size_t opt_offset;
2523 struct tc_htb_glob opt;
2524 struct ofpbuf request;
2525 struct tcmsg *tcmsg;
2526
2527 tc_del_qdisc(netdev);
2528
2529 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2530 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2531 if (!tcmsg) {
2532 return ENODEV;
2533 }
c1c9c9c4
BP
2534 tcmsg->tcm_handle = tc_make_handle(1, 0);
2535 tcmsg->tcm_parent = TC_H_ROOT;
2536
2537 nl_msg_put_string(&request, TCA_KIND, "htb");
2538
2539 memset(&opt, 0, sizeof opt);
2540 opt.rate2quantum = 10;
2541 opt.version = 3;
4ecf12d5 2542 opt.defcls = 1;
c1c9c9c4
BP
2543
2544 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2545 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2546 nl_msg_end_nested(&request, opt_offset);
2547
2548 return tc_transact(&request, NULL);
2549}
2550
2551/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2552 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2553static int
2554htb_setup_class__(struct netdev *netdev, unsigned int handle,
2555 unsigned int parent, struct htb_class *class)
2556{
2557 size_t opt_offset;
2558 struct tc_htb_opt opt;
2559 struct ofpbuf request;
2560 struct tcmsg *tcmsg;
2561 int error;
2562 int mtu;
2563
9b020780
PS
2564 error = netdev_get_mtu(netdev, &mtu);
2565 if (error) {
f915f1a8
BP
2566 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2567 netdev_get_name(netdev));
9b020780 2568 return error;
f915f1a8 2569 }
c1c9c9c4
BP
2570
2571 memset(&opt, 0, sizeof opt);
2572 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2573 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2574 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2575 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2576 opt.prio = class->priority;
2577
2578 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2579 if (!tcmsg) {
2580 return ENODEV;
2581 }
c1c9c9c4
BP
2582 tcmsg->tcm_handle = handle;
2583 tcmsg->tcm_parent = parent;
2584
2585 nl_msg_put_string(&request, TCA_KIND, "htb");
2586 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2587 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2588 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2589 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2590 nl_msg_end_nested(&request, opt_offset);
2591
2592 error = tc_transact(&request, NULL);
2593 if (error) {
2594 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2595 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2596 netdev_get_name(netdev),
2597 tc_get_major(handle), tc_get_minor(handle),
2598 tc_get_major(parent), tc_get_minor(parent),
2599 class->min_rate, class->max_rate,
2600 class->burst, class->priority, strerror(error));
2601 }
2602 return error;
2603}
2604
2605/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2606 * description of them into 'details'. The description complies with the
2607 * specification given in the vswitch database documentation for linux-htb
2608 * queue details. */
2609static int
2610htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2611{
2612 static const struct nl_policy tca_htb_policy[] = {
2613 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2614 .min_len = sizeof(struct tc_htb_opt) },
2615 };
2616
2617 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2618 const struct tc_htb_opt *htb;
2619
2620 if (!nl_parse_nested(nl_options, tca_htb_policy,
2621 attrs, ARRAY_SIZE(tca_htb_policy))) {
2622 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2623 return EPROTO;
2624 }
2625
2626 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2627 class->min_rate = htb->rate.rate;
2628 class->max_rate = htb->ceil.rate;
2629 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2630 class->priority = htb->prio;
2631 return 0;
2632}
2633
2634static int
2635htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2636 struct htb_class *options,
2637 struct netdev_queue_stats *stats)
2638{
2639 struct nlattr *nl_options;
2640 unsigned int handle;
2641 int error;
2642
2643 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2644 if (!error && queue_id) {
17ee3c1f
BP
2645 unsigned int major = tc_get_major(handle);
2646 unsigned int minor = tc_get_minor(handle);
2647 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2648 *queue_id = minor - 1;
c1c9c9c4
BP
2649 } else {
2650 error = EPROTO;
2651 }
2652 }
2653 if (!error && options) {
2654 error = htb_parse_tca_options__(nl_options, options);
2655 }
2656 return error;
2657}
2658
2659static void
2660htb_parse_qdisc_details__(struct netdev *netdev,
79f1cbe9 2661 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2662{
2663 const char *max_rate_s;
2664
79f1cbe9 2665 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
2666 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2667 if (!hc->max_rate) {
a00ca915 2668 enum netdev_features current;
c1c9c9c4
BP
2669
2670 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 2671 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
2672 }
2673 hc->min_rate = hc->max_rate;
2674 hc->burst = 0;
2675 hc->priority = 0;
2676}
2677
2678static int
2679htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 2680 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2681{
2682 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
2683 const char *min_rate_s = smap_get(details, "min-rate");
2684 const char *max_rate_s = smap_get(details, "max-rate");
2685 const char *burst_s = smap_get(details, "burst");
2686 const char *priority_s = smap_get(details, "priority");
9b020780 2687 int mtu, error;
c1c9c9c4 2688
9b020780
PS
2689 error = netdev_get_mtu(netdev, &mtu);
2690 if (error) {
f915f1a8
BP
2691 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2692 netdev_get_name(netdev));
9b020780 2693 return error;
f915f1a8
BP
2694 }
2695
4f104611
EJ
2696 /* HTB requires at least an mtu sized min-rate to send any traffic even
2697 * on uncongested links. */
c45ab5e9 2698 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2699 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2700 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2701
2702 /* max-rate */
2703 hc->max_rate = (max_rate_s
2704 ? strtoull(max_rate_s, NULL, 10) / 8
2705 : htb->max_rate);
2706 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2707 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2708
2709 /* burst
2710 *
2711 * According to hints in the documentation that I've read, it is important
2712 * that 'burst' be at least as big as the largest frame that might be
2713 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2714 * but having it a bit too small is a problem. Since netdev_get_mtu()
2715 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2716 * the MTU. We actually add 64, instead of 14, as a guard against
2717 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2718 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2719 hc->burst = MAX(hc->burst, mtu + 64);
2720
2721 /* priority */
2722 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2723
2724 return 0;
2725}
2726
/* Queries class 'handle' (under 'parent') on 'netdev' with tc_query_class()
 * and parses the reply into 'options' and 'stats' via htb_parse_tcmsg__().
 *
 * Returns 0 on success, otherwise the error from the query or from
 * parsing. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2742
2743static int
79f1cbe9 2744htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2745{
2746 int error;
2747
2748 error = htb_setup_qdisc__(netdev);
2749 if (!error) {
2750 struct htb_class hc;
2751
2752 htb_parse_qdisc_details__(netdev, details, &hc);
2753 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2754 tc_make_handle(1, 0), &hc);
2755 if (!error) {
2756 htb_install__(netdev, hc.max_rate);
2757 }
2758 }
2759 return error;
2760}
2761
93b13be8
BP
2762static struct htb_class *
2763htb_class_cast__(const struct tc_queue *queue)
2764{
2765 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2766}
2767
c1c9c9c4
BP
2768static void
2769htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2770 const struct htb_class *hc)
2771{
2772 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2773 size_t hash = hash_int(queue_id, 0);
2774 struct tc_queue *queue;
c1c9c9c4
BP
2775 struct htb_class *hcp;
2776
93b13be8
BP
2777 queue = tc_find_queue__(netdev, queue_id, hash);
2778 if (queue) {
2779 hcp = htb_class_cast__(queue);
2780 } else {
c1c9c9c4 2781 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2782 queue = &hcp->tc_queue;
2783 queue->queue_id = queue_id;
2784 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2785 }
93b13be8
BP
2786
2787 hcp->min_rate = hc->min_rate;
2788 hcp->max_rate = hc->max_rate;
2789 hcp->burst = hc->burst;
2790 hcp->priority = hc->priority;
c1c9c9c4
BP
2791}
2792
2793static int
2794htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2795{
c1c9c9c4
BP
2796 struct ofpbuf msg;
2797 struct nl_dump dump;
2798 struct htb_class hc;
c1c9c9c4
BP
2799
2800 /* Get qdisc options. */
2801 hc.max_rate = 0;
2802 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2803 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2804
2805 /* Get queues. */
23a98ffe
BP
2806 if (!start_queue_dump(netdev, &dump)) {
2807 return ENODEV;
2808 }
c1c9c9c4
BP
2809 while (nl_dump_next(&dump, &msg)) {
2810 unsigned int queue_id;
2811
2812 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2813 htb_update_queue__(netdev, queue_id, &hc);
2814 }
2815 }
2816 nl_dump_done(&dump);
2817
2818 return 0;
2819}
2820
2821static void
2822htb_tc_destroy(struct tc *tc)
2823{
2824 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2825 struct htb_class *hc, *next;
c1c9c9c4 2826
4e8e4213 2827 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2828 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2829 free(hc);
2830 }
2831 tc_destroy(tc);
2832 free(htb);
2833}
2834
2835static int
79f1cbe9 2836htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
2837{
2838 const struct htb *htb = htb_get__(netdev);
79f1cbe9 2839 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
2840 return 0;
2841}
2842
2843static int
79f1cbe9 2844htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2845{
2846 struct htb_class hc;
2847 int error;
2848
2849 htb_parse_qdisc_details__(netdev, details, &hc);
2850 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2851 tc_make_handle(1, 0), &hc);
2852 if (!error) {
2853 htb_get__(netdev)->max_rate = hc.max_rate;
2854 }
2855 return error;
2856}
2857
2858static int
93b13be8 2859htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 2860 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 2861{
93b13be8 2862 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2863
79f1cbe9 2864 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 2865 if (hc->min_rate != hc->max_rate) {
79f1cbe9 2866 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 2867 }
79f1cbe9 2868 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 2869 if (hc->priority) {
79f1cbe9 2870 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
2871 }
2872 return 0;
2873}
2874
2875static int
2876htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 2877 const struct smap *details)
c1c9c9c4
BP
2878{
2879 struct htb_class hc;
2880 int error;
2881
2882 error = htb_parse_class_details__(netdev, details, &hc);
2883 if (error) {
2884 return error;
2885 }
2886
17ee3c1f 2887 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2888 tc_make_handle(1, 0xfffe), &hc);
2889 if (error) {
2890 return error;
2891 }
2892
2893 htb_update_queue__(netdev, queue_id, &hc);
2894 return 0;
2895}
2896
2897static int
93b13be8 2898htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2899{
93b13be8 2900 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2901 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2902 int error;
2903
93b13be8 2904 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2905 if (!error) {
93b13be8 2906 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2907 free(hc);
c1c9c9c4
BP
2908 }
2909 return error;
2910}
2911
2912static int
93b13be8 2913htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2914 struct netdev_queue_stats *stats)
2915{
93b13be8 2916 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2917 tc_make_handle(1, 0xfffe), NULL, stats);
2918}
2919
2920static int
2921htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2922 const struct ofpbuf *nlmsg,
2923 netdev_dump_queue_stats_cb *cb, void *aux)
2924{
2925 struct netdev_queue_stats stats;
17ee3c1f 2926 unsigned int handle, major, minor;
c1c9c9c4
BP
2927 int error;
2928
2929 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2930 if (error) {
2931 return error;
2932 }
2933
17ee3c1f
BP
2934 major = tc_get_major(handle);
2935 minor = tc_get_minor(handle);
2936 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2937 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2938 }
2939 return 0;
2940}
2941
2942static const struct tc_ops tc_ops_htb = {
2943 "htb", /* linux_name */
2944 "linux-htb", /* ovs_name */
2945 HTB_N_QUEUES, /* n_queues */
2946 htb_tc_install,
2947 htb_tc_load,
2948 htb_tc_destroy,
2949 htb_qdisc_get,
2950 htb_qdisc_set,
2951 htb_class_get,
2952 htb_class_set,
2953 htb_class_delete,
2954 htb_class_get_stats,
2955 htb_class_dump_stats
2956};
2957\f
a339aa81
EJ
2958/* "linux-hfsc" traffic control class. */
2959
2960#define HFSC_N_QUEUES 0xf000
2961
2962struct hfsc {
2963 struct tc tc;
2964 uint32_t max_rate;
2965};
2966
2967struct hfsc_class {
2968 struct tc_queue tc_queue;
2969 uint32_t min_rate;
2970 uint32_t max_rate;
2971};
2972
2973static struct hfsc *
2974hfsc_get__(const struct netdev *netdev)
2975{
2976 struct netdev_dev_linux *netdev_dev;
2977 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2978 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2979}
2980
2981static struct hfsc_class *
2982hfsc_class_cast__(const struct tc_queue *queue)
2983{
2984 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2985}
2986
24045e35 2987static void
a339aa81
EJ
2988hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2989{
2990 struct netdev_dev_linux * netdev_dev;
2991 struct hfsc *hfsc;
2992
2993 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2994 hfsc = xmalloc(sizeof *hfsc);
2995 tc_init(&hfsc->tc, &tc_ops_hfsc);
2996 hfsc->max_rate = max_rate;
2997 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
2998}
2999
3000static void
3001hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3002 const struct hfsc_class *hc)
3003{
3004 size_t hash;
3005 struct hfsc *hfsc;
3006 struct hfsc_class *hcp;
3007 struct tc_queue *queue;
3008
3009 hfsc = hfsc_get__(netdev);
3010 hash = hash_int(queue_id, 0);
3011
3012 queue = tc_find_queue__(netdev, queue_id, hash);
3013 if (queue) {
3014 hcp = hfsc_class_cast__(queue);
3015 } else {
3016 hcp = xmalloc(sizeof *hcp);
3017 queue = &hcp->tc_queue;
3018 queue->queue_id = queue_id;
3019 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3020 }
3021
3022 hcp->min_rate = hc->min_rate;
3023 hcp->max_rate = hc->max_rate;
3024}
3025
3026static int
3027hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3028{
3029 const struct tc_service_curve *rsc, *fsc, *usc;
3030 static const struct nl_policy tca_hfsc_policy[] = {
3031 [TCA_HFSC_RSC] = {
3032 .type = NL_A_UNSPEC,
3033 .optional = false,
3034 .min_len = sizeof(struct tc_service_curve),
3035 },
3036 [TCA_HFSC_FSC] = {
3037 .type = NL_A_UNSPEC,
3038 .optional = false,
3039 .min_len = sizeof(struct tc_service_curve),
3040 },
3041 [TCA_HFSC_USC] = {
3042 .type = NL_A_UNSPEC,
3043 .optional = false,
3044 .min_len = sizeof(struct tc_service_curve),
3045 },
3046 };
3047 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3048
3049 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3050 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3051 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3052 return EPROTO;
3053 }
3054
3055 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3056 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3057 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3058
3059 if (rsc->m1 != 0 || rsc->d != 0 ||
3060 fsc->m1 != 0 || fsc->d != 0 ||
3061 usc->m1 != 0 || usc->d != 0) {
3062 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3063 "Non-linear service curves are not supported.");
3064 return EPROTO;
3065 }
3066
3067 if (rsc->m2 != fsc->m2) {
3068 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3069 "Real-time service curves are not supported ");
3070 return EPROTO;
3071 }
3072
3073 if (rsc->m2 > usc->m2) {
3074 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3075 "Min-rate service curve is greater than "
3076 "the max-rate service curve.");
3077 return EPROTO;
3078 }
3079
3080 class->min_rate = fsc->m2;
3081 class->max_rate = usc->m2;
3082 return 0;
3083}
3084
3085static int
3086hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3087 struct hfsc_class *options,
3088 struct netdev_queue_stats *stats)
3089{
3090 int error;
3091 unsigned int handle;
3092 struct nlattr *nl_options;
3093
3094 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3095 if (error) {
3096 return error;
3097 }
3098
3099 if (queue_id) {
3100 unsigned int major, minor;
3101
3102 major = tc_get_major(handle);
3103 minor = tc_get_minor(handle);
3104 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3105 *queue_id = minor - 1;
3106 } else {
3107 return EPROTO;
3108 }
3109 }
3110
3111 if (options) {
3112 error = hfsc_parse_tca_options__(nl_options, options);
3113 }
3114
3115 return error;
3116}
3117
3118static int
3119hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3120 unsigned int parent, struct hfsc_class *options,
3121 struct netdev_queue_stats *stats)
3122{
3123 int error;
3124 struct ofpbuf *reply;
3125
3126 error = tc_query_class(netdev, handle, parent, &reply);
3127 if (error) {
3128 return error;
3129 }
3130
3131 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3132 ofpbuf_delete(reply);
3133 return error;
3134}
3135
3136static void
79f1cbe9 3137hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
a339aa81
EJ
3138 struct hfsc_class *class)
3139{
3140 uint32_t max_rate;
3141 const char *max_rate_s;
3142
79f1cbe9 3143 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
3144 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3145
3146 if (!max_rate) {
a00ca915 3147 enum netdev_features current;
a339aa81
EJ
3148
3149 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 3150 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
3151 }
3152
3153 class->min_rate = max_rate;
3154 class->max_rate = max_rate;
3155}
3156
3157static int
3158hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 3159 const struct smap *details,
a339aa81
EJ
3160 struct hfsc_class * class)
3161{
3162 const struct hfsc *hfsc;
3163 uint32_t min_rate, max_rate;
3164 const char *min_rate_s, *max_rate_s;
3165
3166 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
3167 min_rate_s = smap_get(details, "min-rate");
3168 max_rate_s = smap_get(details, "max-rate");
a339aa81 3169
c45ab5e9 3170 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3171 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3172 min_rate = MIN(min_rate, hfsc->max_rate);
3173
3174 max_rate = (max_rate_s
3175 ? strtoull(max_rate_s, NULL, 10) / 8
3176 : hfsc->max_rate);
3177 max_rate = MAX(max_rate, min_rate);
3178 max_rate = MIN(max_rate, hfsc->max_rate);
3179
3180 class->min_rate = min_rate;
3181 class->max_rate = max_rate;
3182
3183 return 0;
3184}
3185
3186/* Create an HFSC qdisc.
3187 *
3188 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3189static int
3190hfsc_setup_qdisc__(struct netdev * netdev)
3191{
3192 struct tcmsg *tcmsg;
3193 struct ofpbuf request;
3194 struct tc_hfsc_qopt opt;
3195
3196 tc_del_qdisc(netdev);
3197
3198 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3199 NLM_F_EXCL | NLM_F_CREATE, &request);
3200
3201 if (!tcmsg) {
3202 return ENODEV;
3203 }
3204
3205 tcmsg->tcm_handle = tc_make_handle(1, 0);
3206 tcmsg->tcm_parent = TC_H_ROOT;
3207
3208 memset(&opt, 0, sizeof opt);
3209 opt.defcls = 1;
3210
3211 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3212 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3213
3214 return tc_transact(&request, NULL);
3215}
3216
3217/* Create an HFSC class.
3218 *
3219 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3220 * sc rate <min_rate> ul rate <max_rate>" */
3221static int
3222hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3223 unsigned int parent, struct hfsc_class *class)
3224{
3225 int error;
3226 size_t opt_offset;
3227 struct tcmsg *tcmsg;
3228 struct ofpbuf request;
3229 struct tc_service_curve min, max;
3230
3231 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3232
3233 if (!tcmsg) {
3234 return ENODEV;
3235 }
3236
3237 tcmsg->tcm_handle = handle;
3238 tcmsg->tcm_parent = parent;
3239
3240 min.m1 = 0;
3241 min.d = 0;
3242 min.m2 = class->min_rate;
3243
3244 max.m1 = 0;
3245 max.d = 0;
3246 max.m2 = class->max_rate;
3247
3248 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3249 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3250 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3251 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3252 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3253 nl_msg_end_nested(&request, opt_offset);
3254
3255 error = tc_transact(&request, NULL);
3256 if (error) {
3257 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3258 "min-rate %ubps, max-rate %ubps (%s)",
3259 netdev_get_name(netdev),
3260 tc_get_major(handle), tc_get_minor(handle),
3261 tc_get_major(parent), tc_get_minor(parent),
3262 class->min_rate, class->max_rate, strerror(error));
3263 }
3264
3265 return error;
3266}
3267
3268static int
79f1cbe9 3269hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3270{
3271 int error;
3272 struct hfsc_class class;
3273
3274 error = hfsc_setup_qdisc__(netdev);
3275
3276 if (error) {
3277 return error;
3278 }
3279
3280 hfsc_parse_qdisc_details__(netdev, details, &class);
3281 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3282 tc_make_handle(1, 0), &class);
3283
3284 if (error) {
3285 return error;
3286 }
3287
3288 hfsc_install__(netdev, class.max_rate);
3289 return 0;
3290}
3291
3292static int
3293hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3294{
3295 struct ofpbuf msg;
a339aa81
EJ
3296 struct nl_dump dump;
3297 struct hfsc_class hc;
3298
3299 hc.max_rate = 0;
3300 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3301 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3302
3303 if (!start_queue_dump(netdev, &dump)) {
3304 return ENODEV;
3305 }
3306
3307 while (nl_dump_next(&dump, &msg)) {
3308 unsigned int queue_id;
3309
3310 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3311 hfsc_update_queue__(netdev, queue_id, &hc);
3312 }
3313 }
3314
3315 nl_dump_done(&dump);
3316 return 0;
3317}
3318
3319static void
3320hfsc_tc_destroy(struct tc *tc)
3321{
3322 struct hfsc *hfsc;
3323 struct hfsc_class *hc, *next;
3324
3325 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3326
3327 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3328 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3329 free(hc);
3330 }
3331
3332 tc_destroy(tc);
3333 free(hfsc);
3334}
3335
3336static int
79f1cbe9 3337hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
3338{
3339 const struct hfsc *hfsc;
3340 hfsc = hfsc_get__(netdev);
79f1cbe9 3341 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
3342 return 0;
3343}
3344
3345static int
79f1cbe9 3346hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3347{
3348 int error;
3349 struct hfsc_class class;
3350
3351 hfsc_parse_qdisc_details__(netdev, details, &class);
3352 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3353 tc_make_handle(1, 0), &class);
3354
3355 if (!error) {
3356 hfsc_get__(netdev)->max_rate = class.max_rate;
3357 }
3358
3359 return error;
3360}
3361
3362static int
3363hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3364 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
3365{
3366 const struct hfsc_class *hc;
3367
3368 hc = hfsc_class_cast__(queue);
79f1cbe9 3369 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 3370 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3371 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
3372 }
3373 return 0;
3374}
3375
3376static int
3377hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3378 const struct smap *details)
a339aa81
EJ
3379{
3380 int error;
3381 struct hfsc_class class;
3382
3383 error = hfsc_parse_class_details__(netdev, details, &class);
3384 if (error) {
3385 return error;
3386 }
3387
3388 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3389 tc_make_handle(1, 0xfffe), &class);
3390 if (error) {
3391 return error;
3392 }
3393
3394 hfsc_update_queue__(netdev, queue_id, &class);
3395 return 0;
3396}
3397
3398static int
3399hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3400{
3401 int error;
3402 struct hfsc *hfsc;
3403 struct hfsc_class *hc;
3404
3405 hc = hfsc_class_cast__(queue);
3406 hfsc = hfsc_get__(netdev);
3407
3408 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3409 if (!error) {
3410 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3411 free(hc);
3412 }
3413 return error;
3414}
3415
3416static int
3417hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3418 struct netdev_queue_stats *stats)
3419{
3420 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3421 tc_make_handle(1, 0xfffe), NULL, stats);
3422}
3423
3424static int
3425hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3426 const struct ofpbuf *nlmsg,
3427 netdev_dump_queue_stats_cb *cb, void *aux)
3428{
3429 struct netdev_queue_stats stats;
3430 unsigned int handle, major, minor;
3431 int error;
3432
3433 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3434 if (error) {
3435 return error;
3436 }
3437
3438 major = tc_get_major(handle);
3439 minor = tc_get_minor(handle);
3440 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3441 (*cb)(minor - 1, &stats, aux);
3442 }
3443 return 0;
3444}
3445
3446static const struct tc_ops tc_ops_hfsc = {
3447 "hfsc", /* linux_name */
3448 "linux-hfsc", /* ovs_name */
3449 HFSC_N_QUEUES, /* n_queues */
3450 hfsc_tc_install, /* tc_install */
3451 hfsc_tc_load, /* tc_load */
3452 hfsc_tc_destroy, /* tc_destroy */
3453 hfsc_qdisc_get, /* qdisc_get */
3454 hfsc_qdisc_set, /* qdisc_set */
3455 hfsc_class_get, /* class_get */
3456 hfsc_class_set, /* class_set */
3457 hfsc_class_delete, /* class_delete */
3458 hfsc_class_get_stats, /* class_get_stats */
3459 hfsc_class_dump_stats /* class_dump_stats */
3460};
3461\f
c1c9c9c4
BP
3462/* "linux-default" traffic control class.
3463 *
3464 * This class represents the default, unnamed Linux qdisc. It corresponds to
3465 * the "" (empty string) QoS type in the OVS database. */
3466
3467static void
3468default_install__(struct netdev *netdev)
3469{
3470 struct netdev_dev_linux *netdev_dev =
3471 netdev_dev_linux_cast(netdev_get_dev(netdev));
3472 static struct tc *tc;
3473
3474 if (!tc) {
3475 tc = xmalloc(sizeof *tc);
3476 tc_init(tc, &tc_ops_default);
3477 }
3478 netdev_dev->tc = tc;
3479}
3480
3481static int
3482default_tc_install(struct netdev *netdev,
79f1cbe9 3483 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
3484{
3485 default_install__(netdev);
3486 return 0;
3487}
3488
3489static int
3490default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3491{
3492 default_install__(netdev);
3493 return 0;
3494}
3495
3496static const struct tc_ops tc_ops_default = {
3497 NULL, /* linux_name */
3498 "", /* ovs_name */
3499 0, /* n_queues */
3500 default_tc_install,
3501 default_tc_load,
3502 NULL, /* tc_destroy */
3503 NULL, /* qdisc_get */
3504 NULL, /* qdisc_set */
3505 NULL, /* class_get */
3506 NULL, /* class_set */
3507 NULL, /* class_delete */
3508 NULL, /* class_get_stats */
3509 NULL /* class_dump_stats */
3510};
3511\f
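/* "linux-other" traffic control class.
 *
 * This class can only be loaded, never installed or configured: its
 * operations table provides 'tc_load' and nothing else, and it has no
 * queues. */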
3515
3516static int
3517other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3518{
3519 struct netdev_dev_linux *netdev_dev =
3520 netdev_dev_linux_cast(netdev_get_dev(netdev));
3521 static struct tc *tc;
3522
3523 if (!tc) {
3524 tc = xmalloc(sizeof *tc);
3525 tc_init(tc, &tc_ops_other);
3526 }
3527 netdev_dev->tc = tc;
3528 return 0;
3529}
3530
3531static const struct tc_ops tc_ops_other = {
3532 NULL, /* linux_name */
3533 "linux-other", /* ovs_name */
3534 0, /* n_queues */
3535 NULL, /* tc_install */
3536 other_tc_load,
3537 NULL, /* tc_destroy */
3538 NULL, /* qdisc_get */
3539 NULL, /* qdisc_set */
3540 NULL, /* class_get */
3541 NULL, /* class_set */
3542 NULL, /* class_delete */
3543 NULL, /* class_get_stats */
3544 NULL /* class_dump_stats */
3545};
3546\f
3547/* Traffic control. */
3548
3549/* Number of kernel "tc" ticks per second. */
3550static double ticks_per_s;
3551
3552/* Number of kernel "jiffies" per second. This is used for the purpose of
3553 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3554 * one jiffy's worth of data.
3555 *
3556 * There are two possibilities here:
3557 *
3558 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3559 * approximate range of 100 to 1024. That means that we really need to
3560 * make sure that the qdisc can buffer that much data.
3561 *
3562 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3563 * has finely granular timers and there's no need to fudge additional room
3564 * for buffers. (There's no extra effort needed to implement that: the
3565 * large 'buffer_hz' is used as a divisor, so practically any number will
3566 * come out as 0 in the division. Small integer results in the case of
3567 * really high dividends won't have any real effect anyhow.)
3568 */
3569static unsigned int buffer_hz;
3570
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'.  'major'
 * occupies the upper 16 bits and 'minor' the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3577
/* Extracts the major number (the upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3584
/* Extracts the minor number (the lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3591
3592static struct tcmsg *
3593tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3594 struct ofpbuf *request)
3595{
3596 struct tcmsg *tcmsg;
3597 int ifindex;
3598 int error;
3599
3600 error = get_ifindex(netdev, &ifindex);
3601 if (error) {
3602 return NULL;
3603 }
3604
3605 ofpbuf_init(request, 512);
3606 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3607 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3608 tcmsg->tcm_family = AF_UNSPEC;
3609 tcmsg->tcm_ifindex = ifindex;
3610 /* Caller should fill in tcmsg->tcm_handle. */
3611 /* Caller should fill in tcmsg->tcm_parent. */
3612
3613 return tcmsg;
3614}
3615
3616static int
3617tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3618{
3619 int error = nl_sock_transact(rtnl_sock, request, replyp);
3620 ofpbuf_uninit(request);
3621 return error;
3622}
3623
f8500004
JP
3624/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3625 * policing configuration.
3626 *
3627 * This function is equivalent to running the following when 'add' is true:
3628 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3629 *
3630 * This function is equivalent to running the following when 'add' is false:
3631 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3632 *
3633 * The configuration and stats may be seen with the following command:
3634 * /sbin/tc -s qdisc show dev <devname>
3635 *
3636 * Returns 0 if successful, otherwise a positive errno value.
3637 */
3638static int
3639tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3640{
3641 struct ofpbuf request;
3642 struct tcmsg *tcmsg;
3643 int error;
3644 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3645 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3646
3647 tcmsg = tc_make_request(netdev, type, flags, &request);
3648 if (!tcmsg) {
3649 return ENODEV;
3650 }
3651 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3652 tcmsg->tcm_parent = TC_H_INGRESS;
3653 nl_msg_put_string(&request, TCA_KIND, "ingress");
3654 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3655
3656 error = tc_transact(&request, NULL);
3657 if (error) {
3658 /* If we're deleting the qdisc, don't worry about some of the
3659 * error conditions. */
3660 if (!add && (error == ENOENT || error == EINVAL)) {
3661 return 0;
3662 }
3663 return error;
3664 }
3665
3666 return 0;
3667}
3668
3669/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3670 * of 'kbits_burst'.
3671 *
3672 * This function is equivalent to running:
3673 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3674 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3675 * mtu 65535 drop
3676 *
3677 * The configuration and stats may be seen with the following command:
3678 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3679 *
3680 * Returns 0 if successful, otherwise a positive errno value.
3681 */
3682static int
3683tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3684{
3685 struct tc_police tc_police;
3686 struct ofpbuf request;
3687 struct tcmsg *tcmsg;
3688 size_t basic_offset;
3689 size_t police_offset;
3690 int error;
3691 int mtu = 65535;
3692
3693 memset(&tc_police, 0, sizeof tc_police);
3694 tc_police.action = TC_POLICE_SHOT;
3695 tc_police.mtu = mtu;
3696 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3697 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3698 kbits_burst * 1024);
3699
3700 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3701 NLM_F_EXCL | NLM_F_CREATE, &request);
3702 if (!tcmsg) {
3703 return ENODEV;
3704 }
3705 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3706 tcmsg->tcm_info = tc_make_handle(49,
3707 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3708
3709 nl_msg_put_string(&request, TCA_KIND, "basic");
3710 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3711 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3712 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3713 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3714 nl_msg_end_nested(&request, police_offset);
3715 nl_msg_end_nested(&request, basic_offset);
3716
3717 error = tc_transact(&request, NULL);
3718 if (error) {
3719 return error;
3720 }
3721
3722 return 0;
3723}
3724
c1c9c9c4
BP
3725static void
3726read_psched(void)
3727{
3728 /* The values in psched are not individually very meaningful, but they are
3729 * important. The tables below show some values seen in the wild.
3730 *
3731 * Some notes:
3732 *
3733 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3734 * (Before that, there are hints that it was 1000000000.)
3735 *
3736 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3737 * above.
3738 *
3739 * /proc/net/psched
3740 * -----------------------------------
3741 * [1] 000c8000 000f4240 000f4240 00000064
3742 * [2] 000003e8 00000400 000f4240 3b9aca00
3743 * [3] 000003e8 00000400 000f4240 3b9aca00
3744 * [4] 000003e8 00000400 000f4240 00000064
3745 * [5] 000003e8 00000040 000f4240 3b9aca00
3746 * [6] 000003e8 00000040 000f4240 000000f9
3747 *
3748 * a b c d ticks_per_s buffer_hz
3749 * ------- --------- ---------- ------------- ----------- -------------
3750 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3751 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3752 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3753 * [4] 1,000 1,024 1,000,000 100 976,562 100
3754 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3755 * [6] 1,000 64 1,000,000 249 15,625,000 249
3756 *
3757 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3758 * [2] 2.6.26-1-686-bigmem from Debian lenny
3759 * [3] 2.6.26-2-sparc64 from Debian lenny
3760 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3761 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3762 * [6] 2.6.34 from kernel.org on KVM
3763 */
3764 static const char fn[] = "/proc/net/psched";
3765 unsigned int a, b, c, d;
3766 FILE *stream;
3767
3768 ticks_per_s = 1.0;
3769 buffer_hz = 100;
3770
3771 stream = fopen(fn, "r");
3772 if (!stream) {
3773 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3774 return;
3775 }
3776
3777 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3778 VLOG_WARN("%s: read failed", fn);
3779 fclose(stream);
3780 return;
3781 }
3782 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3783 fclose(stream);
3784
3785 if (!a || !c) {
3786 VLOG_WARN("%s: invalid scheduler parameters", fn);
3787 return;
3788 }
3789
3790 ticks_per_s = (double) a * c / b;
3791 if (c == 1000000) {
3792 buffer_hz = d;
3793 } else {
3794 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3795 fn, a, b, c, d);
3796 }
3797 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3798}
3799
3800/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3801 * rate of 'rate' bytes per second. */
3802static unsigned int
3803tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3804{
3805 if (!buffer_hz) {
3806 read_psched();
3807 }
3808 return (rate * ticks) / ticks_per_s;
3809}
3810
3811/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3812 * rate of 'rate' bytes per second. */
3813static unsigned int
3814tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3815{
3816 if (!buffer_hz) {
3817 read_psched();
3818 }
015c93a4 3819 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3820}
3821
3822/* Returns the number of bytes that need to be reserved for qdisc buffering at
3823 * a transmission rate of 'rate' bytes per second. */
3824static unsigned int
3825tc_buffer_per_jiffy(unsigned int rate)
3826{
3827 if (!buffer_hz) {
3828 read_psched();
3829 }
3830 return rate / buffer_hz;
3831}
3832
3833/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3834 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3835 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3836 * stores NULL into it if it is absent.
3837 *
3838 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3839 * 'msg'.
3840 *
3841 * Returns 0 if successful, otherwise a positive errno value. */
3842static int
3843tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3844 struct nlattr **options)
3845{
3846 static const struct nl_policy tca_policy[] = {
3847 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3848 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3849 };
3850 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3851
3852 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3853 tca_policy, ta, ARRAY_SIZE(ta))) {
3854 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3855 goto error;
3856 }
3857
3858 if (kind) {
3859 *kind = nl_attr_get_string(ta[TCA_KIND]);
3860 }
3861
3862 if (options) {
3863 *options = ta[TCA_OPTIONS];
3864 }
3865
3866 return 0;
3867
3868error:
3869 if (kind) {
3870 *kind = NULL;
3871 }
3872 if (options) {
3873 *options = NULL;
3874 }
3875 return EPROTO;
3876}
3877
3878/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3879 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3880 * into '*options', and its queue statistics into '*stats'. Any of the output
3881 * arguments may be null.
3882 *
3883 * Returns 0 if successful, otherwise a positive errno value. */
3884static int
3885tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3886 struct nlattr **options, struct netdev_queue_stats *stats)
3887{
3888 static const struct nl_policy tca_policy[] = {
3889 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3890 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3891 };
3892 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3893
3894 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3895 tca_policy, ta, ARRAY_SIZE(ta))) {
3896 VLOG_WARN_RL(&rl, "failed to parse class message");
3897 goto error;
3898 }
3899
3900 if (handlep) {
3901 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3902 *handlep = tc->tcm_handle;
3903 }
3904
3905 if (options) {
3906 *options = ta[TCA_OPTIONS];
3907 }
3908
3909 if (stats) {
3910 const struct gnet_stats_queue *gsq;
3911 struct gnet_stats_basic gsb;
3912
3913 static const struct nl_policy stats_policy[] = {
3914 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3915 .min_len = sizeof gsb },
3916 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3917 .min_len = sizeof *gsq },
3918 };
3919 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3920
3921 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3922 sa, ARRAY_SIZE(sa))) {
3923 VLOG_WARN_RL(&rl, "failed to parse class stats");
3924 goto error;
3925 }
3926
3927 /* Alignment issues screw up the length of struct gnet_stats_basic on
3928 * some arch/bitsize combinations. Newer versions of Linux have a
3929 * struct gnet_stats_basic_packed, but we can't depend on that. The
3930 * easiest thing to do is just to make a copy. */
3931 memset(&gsb, 0, sizeof gsb);
3932 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3933 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3934 stats->tx_bytes = gsb.bytes;
3935 stats->tx_packets = gsb.packets;
3936
3937 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3938 stats->tx_errors = gsq->drops;
3939 }
3940
3941 return 0;
3942
3943error:
3944 if (options) {
3945 *options = NULL;
3946 }
3947 if (stats) {
3948 memset(stats, 0, sizeof *stats);
3949 }
3950 return EPROTO;
3951}
3952
3953/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3954 * on 'netdev'. */
3955static int
3956tc_query_class(const struct netdev *netdev,
3957 unsigned int handle, unsigned int parent,
3958 struct ofpbuf **replyp)
3959{
3960 struct ofpbuf request;
3961 struct tcmsg *tcmsg;
3962 int error;
3963
3964 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3965 if (!tcmsg) {
3966 return ENODEV;
3967 }
c1c9c9c4
BP
3968 tcmsg->tcm_handle = handle;
3969 tcmsg->tcm_parent = parent;
3970
3971 error = tc_transact(&request, replyp);
3972 if (error) {
3973 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3974 netdev_get_name(netdev),
3975 tc_get_major(handle), tc_get_minor(handle),
3976 tc_get_major(parent), tc_get_minor(parent),
3977 strerror(error));
3978 }
3979 return error;
3980}
3981
3982/* Equivalent to "tc class del dev <name> handle <handle>". */
3983static int
3984tc_delete_class(const struct netdev *netdev, unsigned int handle)
3985{
3986 struct ofpbuf request;
3987 struct tcmsg *tcmsg;
3988 int error;
3989
3990 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3991 if (!tcmsg) {
3992 return ENODEV;
3993 }
c1c9c9c4
BP
3994 tcmsg->tcm_handle = handle;
3995 tcmsg->tcm_parent = 0;
3996
3997 error = tc_transact(&request, NULL);
3998 if (error) {
3999 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4000 netdev_get_name(netdev),
4001 tc_get_major(handle), tc_get_minor(handle),
4002 strerror(error));
4003 }
4004 return error;
4005}
4006
4007/* Equivalent to "tc qdisc del dev <name> root". */
4008static int
4009tc_del_qdisc(struct netdev *netdev)
4010{
4011 struct netdev_dev_linux *netdev_dev =
4012 netdev_dev_linux_cast(netdev_get_dev(netdev));
4013 struct ofpbuf request;
4014 struct tcmsg *tcmsg;
4015 int error;
4016
4017 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
4018 if (!tcmsg) {
4019 return ENODEV;
4020 }
c1c9c9c4
BP
4021 tcmsg->tcm_handle = tc_make_handle(1, 0);
4022 tcmsg->tcm_parent = TC_H_ROOT;
4023
4024 error = tc_transact(&request, NULL);
4025 if (error == EINVAL) {
4026 /* EINVAL probably means that the default qdisc was in use, in which
4027 * case we've accomplished our purpose. */
4028 error = 0;
4029 }
4030 if (!error && netdev_dev->tc) {
4031 if (netdev_dev->tc->ops->tc_destroy) {
4032 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4033 }
4034 netdev_dev->tc = NULL;
4035 }
4036 return error;
4037}
4038
4039/* If 'netdev''s qdisc type and parameters are not yet known, queries the
4040 * kernel to determine what they are. Returns 0 if successful, otherwise a
4041 * positive errno value. */
4042static int
4043tc_query_qdisc(const struct netdev *netdev)
4044{
4045 struct netdev_dev_linux *netdev_dev =
4046 netdev_dev_linux_cast(netdev_get_dev(netdev));
4047 struct ofpbuf request, *qdisc;
4048 const struct tc_ops *ops;
4049 struct tcmsg *tcmsg;
4050 int load_error;
4051 int error;
4052
4053 if (netdev_dev->tc) {
4054 return 0;
4055 }
4056
4057 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4058 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4059 * 2.6.35 without that fix backported to it.
4060 *
4061 * To avoid the OOPS, we must not make a request that would attempt to dump
4062 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4063 * few others. There are a few ways that I can see to do this, but most of
4064 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4065 * technique chosen here is to assume that any non-default qdisc that we
4066 * create will have a class with handle 1:0. The built-in qdiscs only have
4067 * a class with handle 0:0.
4068 *
4069 * We could check for Linux 2.6.35+ and use a more straightforward method
4070 * there. */
4071 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
4072 if (!tcmsg) {
4073 return ENODEV;
4074 }
c1c9c9c4
BP
4075 tcmsg->tcm_handle = tc_make_handle(1, 0);
4076 tcmsg->tcm_parent = 0;
4077
4078 /* Figure out what tc class to instantiate. */
4079 error = tc_transact(&request, &qdisc);
4080 if (!error) {
4081 const char *kind;
4082
4083 error = tc_parse_qdisc(qdisc, &kind, NULL);
4084 if (error) {
4085 ops = &tc_ops_other;
4086 } else {
4087 ops = tc_lookup_linux_name(kind);
4088 if (!ops) {
4089 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4090 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4091
4092 ops = &tc_ops_other;
4093 }
4094 }
4095 } else if (error == ENOENT) {
4096 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4097 * other entity that doesn't have a handle 1:0. We will assume
4098 * that it's the system default qdisc. */
4099 ops = &tc_ops_default;
4100 error = 0;
4101 } else {
4102 /* Who knows? Maybe the device got deleted. */
4103 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4104 netdev_get_name(netdev), strerror(error));
4105 ops = &tc_ops_other;
4106 }
4107
4108 /* Instantiate it. */
ebc56baa 4109 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
c1c9c9c4
BP
4110 assert((load_error == 0) == (netdev_dev->tc != NULL));
4111 ofpbuf_delete(qdisc);
4112
4113 return error ? error : load_error;
4114}
4115
4116/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4117 approximate the time to transmit packets of various lengths. For an MTU of
4118 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4119 represents two possible packet lengths; for a MTU of 513 through 1024, four
4120 possible lengths; and so on.
4121
4122 Returns, for the specified 'mtu', the number of bits that packet lengths
4123 need to be shifted right to fit within such a 256-entry table. */
4124static int
4125tc_calc_cell_log(unsigned int mtu)
4126{
4127 int cell_log;
4128
4129 if (!mtu) {
4130 mtu = ETH_PAYLOAD_MAX;
4131 }
4132 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4133
4134 for (cell_log = 0; mtu >= 256; cell_log++) {
4135 mtu >>= 1;
4136 }
4137
4138 return cell_log;
4139}
4140
4141/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4142 * of 'mtu'. */
4143static void
4144tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4145{
4146 memset(rate, 0, sizeof *rate);
4147 rate->cell_log = tc_calc_cell_log(mtu);
4148 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4149 /* rate->cell_align = 0; */ /* distro headers. */
4150 rate->mpu = ETH_TOTAL_MIN;
4151 rate->rate = Bps;
4152}
4153
4154/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4155 * attribute of the specified "type".
4156 *
4157 * See tc_calc_cell_log() above for a description of "rtab"s. */
4158static void
4159tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4160{
4161 uint32_t *rtab;
4162 unsigned int i;
4163
4164 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4165 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4166 unsigned packet_size = (i + 1) << rate->cell_log;
4167 if (packet_size < rate->mpu) {
4168 packet_size = rate->mpu;
4169 }
4170 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4171 }
4172}
4173
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  Pass 0 for 'burst_bytes' if no particular
 * value was requested: the burst is never allowed to drop below one jiffy's
 * worth of data plus one MTU. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (!(burst > floor_bytes)) {
        burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 4184\f
aaf2fb1a
BP
4185/* Linux-only functions declared in netdev-linux.h */
4186
025e874a
BP
4187/* Returns a fd for an AF_INET socket or a negative errno value. */
4188int
4189netdev_linux_get_af_inet_sock(void)
4190{
4191 int error = netdev_linux_init();
4192 return error ? -error : af_inet_sock;
4193}
4194
aaf2fb1a
BP
4195/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4196 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4197int
4198netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4199 const char *flag_name, bool enable)
4200{
4201 const char *netdev_name = netdev_get_name(netdev);
4202 struct ethtool_value evalue;
4203 uint32_t new_flags;
4204 int error;
4205
ab985a77 4206 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4207 memset(&evalue, 0, sizeof evalue);
4208 error = netdev_linux_do_ethtool(netdev_name,
4209 (struct ethtool_cmd *)&evalue,
4210 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4211 if (error) {
4212 return error;
4213 }
4214
ab985a77 4215 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
4216 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4217 error = netdev_linux_do_ethtool(netdev_name,
4218 (struct ethtool_cmd *)&evalue,
4219 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4220 if (error) {
4221 return error;
4222 }
4223
ab985a77 4224 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4225 memset(&evalue, 0, sizeof evalue);
4226 error = netdev_linux_do_ethtool(netdev_name,
4227 (struct ethtool_cmd *)&evalue,
4228 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4229 if (error) {
4230 return error;
4231 }
4232
4233 if (new_flags != evalue.data) {
4234 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4235 "device %s failed", enable ? "enable" : "disable",
4236 flag_name, netdev_name);
4237 return EOPNOTSUPP;
4238 }
4239
4240 return 0;
4241}
4242\f
4243/* Utility functions. */
4244
d3980822 4245/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4246static void
d3980822
BP
4247netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4248 const struct rtnl_link_stats *src)
4249{
f613a0d7
PS
4250 dst->rx_packets = src->rx_packets;
4251 dst->tx_packets = src->tx_packets;
4252 dst->rx_bytes = src->rx_bytes;
4253 dst->tx_bytes = src->tx_bytes;
4254 dst->rx_errors = src->rx_errors;
4255 dst->tx_errors = src->tx_errors;
4256 dst->rx_dropped = src->rx_dropped;
4257 dst->tx_dropped = src->tx_dropped;
4258 dst->multicast = src->multicast;
4259 dst->collisions = src->collisions;
4260 dst->rx_length_errors = src->rx_length_errors;
4261 dst->rx_over_errors = src->rx_over_errors;
4262 dst->rx_crc_errors = src->rx_crc_errors;
4263 dst->rx_frame_errors = src->rx_frame_errors;
4264 dst->rx_fifo_errors = src->rx_fifo_errors;
4265 dst->rx_missed_errors = src->rx_missed_errors;
4266 dst->tx_aborted_errors = src->tx_aborted_errors;
4267 dst->tx_carrier_errors = src->tx_carrier_errors;
4268 dst->tx_fifo_errors = src->tx_fifo_errors;
4269 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4270 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4271}
4272
c1c9c9c4
BP
4273static int
4274get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4275{
4276 /* Policy for RTNLGRP_LINK messages.
4277 *
4278 * There are *many* more fields in these messages, but currently we only
4279 * care about these fields. */
4280 static const struct nl_policy rtnlgrp_link_policy[] = {
4281 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4282 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4283 .min_len = sizeof(struct rtnl_link_stats) },
4284 };
4285
4286 struct ofpbuf request;
4287 struct ofpbuf *reply;
4288 struct ifinfomsg *ifi;
c1c9c9c4
BP
4289 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4290 int error;
4291
4292 ofpbuf_init(&request, 0);
4293 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4294 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4295 ifi->ifi_family = PF_UNSPEC;
4296 ifi->ifi_index = ifindex;
4297 error = nl_sock_transact(rtnl_sock, &request, &reply);
4298 ofpbuf_uninit(&request);
4299 if (error) {
4300 return error;
4301 }
4302
4303 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4304 rtnlgrp_link_policy,
4305 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4306 ofpbuf_delete(reply);
4307 return EPROTO;
4308 }
4309
4310 if (!attrs[IFLA_STATS]) {
4311 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4312 ofpbuf_delete(reply);
4313 return EPROTO;
4314 }
8b61709d 4315
d3980822 4316 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4317
576e26d7
BP
4318 ofpbuf_delete(reply);
4319
8b61709d
BP
4320 return 0;
4321}
4322
4323static int
4324get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4325{
4326 static const char fn[] = "/proc/net/dev";
4327 char line[1024];
4328 FILE *stream;
4329 int ln;
4330
4331 stream = fopen(fn, "r");
4332 if (!stream) {
4333 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4334 return errno;
4335 }
4336
4337 ln = 0;
4338 while (fgets(line, sizeof line, stream)) {
4339 if (++ln >= 3) {
4340 char devname[16];
4341#define X64 "%"SCNu64
4342 if (sscanf(line,
4343 " %15[^:]:"
4344 X64 X64 X64 X64 X64 X64 X64 "%*u"
4345 X64 X64 X64 X64 X64 X64 X64 "%*u",
4346 devname,
4347 &stats->rx_bytes,
4348 &stats->rx_packets,
4349 &stats->rx_errors,
4350 &stats->rx_dropped,
4351 &stats->rx_fifo_errors,
4352 &stats->rx_frame_errors,
4353 &stats->multicast,
4354 &stats->tx_bytes,
4355 &stats->tx_packets,
4356 &stats->tx_errors,
4357 &stats->tx_dropped,
4358 &stats->tx_fifo_errors,
4359 &stats->collisions,
4360 &stats->tx_carrier_errors) != 15) {
4361 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4362 } else if (!strcmp(devname, netdev_name)) {
4363 stats->rx_length_errors = UINT64_MAX;
4364 stats->rx_over_errors = UINT64_MAX;
4365 stats->rx_crc_errors = UINT64_MAX;
4366 stats->rx_missed_errors = UINT64_MAX;
4367 stats->tx_aborted_errors = UINT64_MAX;
4368 stats->tx_heartbeat_errors = UINT64_MAX;
4369 stats->tx_window_errors = UINT64_MAX;
4370 fclose(stream);
4371 return 0;
4372 }
4373 }
4374 }
4375 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4376 fclose(stream);
4377 return ENODEV;
4378}
c1c9c9c4 4379
3a183124 4380static int
059e5f4f 4381get_flags(const struct netdev_dev *dev, unsigned int *flags)
8b61709d
BP
4382{
4383 struct ifreq ifr;
4384 int error;
4385
755be9ea
EJ
4386 *flags = 0;
4387 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
149f577a 4388 "SIOCGIFFLAGS");
755be9ea
EJ
4389 if (!error) {
4390 *flags = ifr.ifr_flags;
4391 }
8b61709d
BP
4392 return error;
4393}
4394
4395static int
059e5f4f 4396set_flags(struct netdev *netdev, unsigned int flags)
8b61709d
BP
4397{
4398 struct ifreq ifr;
4399
4400 ifr.ifr_flags = flags;
149f577a
JG
4401 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4402 "SIOCSIFFLAGS");
8b61709d
BP
4403}
4404
4405static int
4406do_get_ifindex(const char *netdev_name)
4407{
4408 struct ifreq ifr;
4409
71d7c22f 4410 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4411 COVERAGE_INC(netdev_get_ifindex);
4412 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4413 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4414 netdev_name, strerror(errno));
4415 return -errno;
4416 }
4417 return ifr.ifr_ifindex;
4418}
4419
4420static int
4421get_ifindex(const struct netdev *netdev_, int *ifindexp)
4422{
149f577a
JG
4423 struct netdev_dev_linux *netdev_dev =
4424 netdev_dev_linux_cast(netdev_get_dev(netdev_));
c7b1b0a5 4425
149f577a 4426 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d 4427 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 4428
8b61709d 4429 if (ifindex < 0) {
c7b1b0a5
PS
4430 netdev_dev->get_ifindex_error = -ifindex;
4431 netdev_dev->ifindex = 0;
4432 } else {
4433 netdev_dev->get_ifindex_error = 0;
4434 netdev_dev->ifindex = ifindex;
8b61709d 4435 }
149f577a 4436 netdev_dev->cache_valid |= VALID_IFINDEX;
8b61709d 4437 }
c7b1b0a5 4438
149f577a 4439 *ifindexp = netdev_dev->ifindex;
c7b1b0a5 4440 return netdev_dev->get_ifindex_error;
8b61709d
BP
4441}
4442
4443static int
4444get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4445{
4446 struct ifreq ifr;
4447 int hwaddr_family;
4448
4449 memset(&ifr, 0, sizeof ifr);
71d7c22f 4450 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4451 COVERAGE_INC(netdev_get_hwaddr);
4452 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4453 /* ENODEV probably means that a vif disappeared asynchronously and
4454 * hasn't been removed from the database yet, so reduce the log level
4455 * to INFO for that case. */
4456 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4457 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4458 netdev_name, strerror(errno));
8b61709d
BP
4459 return errno;
4460 }
4461 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4462 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4463 VLOG_WARN("%s device has unknown hardware address family %d",
4464 netdev_name, hwaddr_family);
4465 }
4466 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4467 return 0;
4468}
4469
4470static int
44445cac 4471set_etheraddr(const char *netdev_name,
8b61709d
BP
4472 const uint8_t mac[ETH_ADDR_LEN])
4473{
4474 struct ifreq ifr;
4475
4476 memset(&ifr, 0, sizeof ifr);
71d7c22f 4477 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 4478 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
4479 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4480 COVERAGE_INC(netdev_set_hwaddr);
4481 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4482 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4483 netdev_name, strerror(errno));
4484 return errno;
4485 }
4486 return 0;
4487}
4488
4489static int
0b0544d7 4490netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4491 int cmd, const char *cmd_name)
4492{
4493 struct ifreq ifr;
4494
4495 memset(&ifr, 0, sizeof ifr);
71d7c22f 4496 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4497 ifr.ifr_data = (caddr_t) ecmd;
4498
4499 ecmd->cmd = cmd;
8b61709d
BP
4500 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4501 return 0;
4502 } else {
4503 if (errno != EOPNOTSUPP) {
4504 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4505 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4506 } else {
4507 /* The device doesn't support this operation. That's pretty
4508 * common, so there's no point in logging anything. */
4509 }
4510 return errno;
4511 }
4512}
4513
4514static int
149f577a
JG
4515netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4516 const char *cmd_name)
8b61709d 4517{
71d7c22f 4518 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4519 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4520 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4521 strerror(errno));
8b61709d
BP
4522 return errno;
4523 }
4524 return 0;
4525}
f1acd62b
BP
4526
4527static int
4528netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4529 int cmd, const char *cmd_name)
4530{
4531 struct ifreq ifr;
4532 int error;
4533
4534 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4535 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4536 if (!error) {
4537 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4538 *ip = sin->sin_addr;
4539 }
4540 return error;
4541}
488d734d
BP
4542
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created (and made nonblocking) on the first call.  The
 * outcome, whether a descriptor or a negative errno value, is remembered and
 * returned by all later calls. */
static int
af_packet_sock(void)
{
    /* INT_MIN marks "not yet attempted". */
    static int sock = INT_MIN;

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(sock);
    }

    return sock;
}