]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
netdev-linux: Maintain carrier flag constantly.
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
782e6111 2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
8b61709d 21#include <assert.h>
e9e28be3 22#include <errno.h>
8b61709d
BP
23#include <fcntl.h>
24#include <arpa/inet.h>
25#include <inttypes.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d 28#include <linux/if_tun.h>
a740f0de 29#include <linux/ip.h>
8b61709d
BP
30#include <linux/types.h>
31#include <linux/ethtool.h>
63331829 32#include <linux/mii.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d
BP
35#include <linux/sockios.h>
36#include <linux/version.h>
37#include <sys/types.h>
38#include <sys/ioctl.h>
39#include <sys/socket.h>
40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
43#include <net/if_packet.h>
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
9fe3b9a2 52#include "dpif-linux.h"
8b61709d
BP
53#include "dynamic-string.h"
54#include "fatal-signal.h"
93b13be8
BP
55#include "hash.h"
56#include "hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
e9e28be3 59#include "netlink.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d
BP
63#include "openflow/openflow.h"
64#include "packets.h"
65#include "poll-loop.h"
21d6e22e 66#include "rtnetlink-link.h"
8b61709d
BP
67#include "socket-util.h"
68#include "shash.h"
19993ef3 69#include "sset.h"
1670c579 70#include "timer.h"
e9e28be3 71#include "vlog.h"
5136ce49 72
d98e6007 73VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
74
75COVERAGE_DEFINE(netdev_get_vlan_vid);
76COVERAGE_DEFINE(netdev_set_policing);
77COVERAGE_DEFINE(netdev_arp_lookup);
78COVERAGE_DEFINE(netdev_get_ifindex);
79COVERAGE_DEFINE(netdev_get_hwaddr);
80COVERAGE_DEFINE(netdev_set_hwaddr);
81COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
82\f
83/* These were introduced in Linux 2.6.14, so they might be missing if we have
84 * old headers. */
85#ifndef ADVERTISED_Pause
86#define ADVERTISED_Pause (1 << 13)
87#endif
88#ifndef ADVERTISED_Asym_Pause
89#define ADVERTISED_Asym_Pause (1 << 14)
90#endif
91
e47bd51a
JP
92/* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94#ifndef ETHTOOL_GFLAGS
95#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96#endif
97#ifndef ETHTOOL_SFLAGS
98#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
99#endif
100
c1c9c9c4
BP
101/* This was introduced in Linux 2.6.25, so it might be missing if we have old
102 * headers. */
103#ifndef TC_RTAB_SIZE
104#define TC_RTAB_SIZE 1024
105#endif
106
2ee6545f 107static struct nln_notifier *netdev_linux_cache_notifier = NULL;
46415c90 108static int cache_notifier_refcount;
8b61709d
BP
109
/* Bits for the 'cache_valid' member of struct netdev_dev_linux.  A set bit
 * means that the corresponding cached value may be used without querying the
 * kernel again. */
enum {
    VALID_IFINDEX          = 0x01,
    VALID_ETHERADDR        = 0x02,
    VALID_IN4              = 0x04,
    VALID_IN6              = 0x08,
    VALID_MTU              = 0x10,
    VALID_POLICING         = 0x20,
    VALID_HAVE_VPORT_STATS = 0x40
};
119
149f577a
JG
/* Per-device state for tap devices. */
struct tap_state {
    int fd;                     /* File descriptor obtained by opening
                                 * /dev/net/tun when the tap is created. */
    bool opened;                /* True once netdev_linux_open() has handed
                                 * 'fd' to its first user. */
};
c1c9c9c4
BP
124\f
125/* Traffic control. */
126
127/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
128 * network device.
129 *
130 * Each TC implementation subclasses this with whatever additional data it
131 * needs. */
c1c9c9c4
BP
132struct tc {
133 const struct tc_ops *ops;
93b13be8
BP
134 struct hmap queues; /* Contains "struct tc_queue"s.
135 * Read by generic TC layer.
136 * Written only by TC implementation. */
137};
c1c9c9c4 138
93b13be8
BP
139/* One traffic control queue.
140 *
141 * Each TC implementation subclasses this with whatever additional data it
142 * needs. */
143struct tc_queue {
144 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
145 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
146};
147
148/* A particular kind of traffic control. Each implementation generally maps to
149 * one particular Linux qdisc class.
150 *
151 * The functions below return 0 if successful or a positive errno value on
152 * failure, except where otherwise noted. All of them must be provided, except
153 * where otherwise noted. */
154struct tc_ops {
155 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
156 * This is null for tc_ops_default and tc_ops_other, for which there are no
157 * appropriate values. */
158 const char *linux_name;
159
160 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
161 const char *ovs_name;
162
163 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
164 * queues. The queues are numbered 0 through n_queues - 1. */
165 unsigned int n_queues;
166
167 /* Called to install this TC class on 'netdev'. The implementation should
168 * make the Netlink calls required to set up 'netdev' with the right qdisc
169 * and configure it according to 'details'. The implementation may assume
170 * that the current qdisc is the default; that is, there is no need for it
171 * to delete the current qdisc before installing itself.
172 *
173 * The contents of 'details' should be documented as valid for 'ovs_name'
174 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
175 * (which is built as ovs-vswitchd.conf.db(8)).
176 *
177 * This function must return 0 if and only if it sets 'netdev->tc' to an
178 * initialized 'struct tc'.
179 *
180 * (This function is null for tc_ops_other, which cannot be installed. For
181 * other TC classes it should always be nonnull.) */
182 int (*tc_install)(struct netdev *netdev, const struct shash *details);
183
184 /* Called when the netdev code determines (through a Netlink query) that
185 * this TC class's qdisc is installed on 'netdev', but we didn't install
186 * it ourselves and so don't know any of the details.
187 *
188 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
189 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
190 * implementation should parse the other attributes of 'nlmsg' as
191 * necessary to determine its configuration. If necessary it should also
192 * use Netlink queries to determine the configuration of queues on
193 * 'netdev'.
194 *
195 * This function must return 0 if and only if it sets 'netdev->tc' to an
196 * initialized 'struct tc'. */
197 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198
199 /* Destroys the data structures allocated by the implementation as part of
200 * 'tc'. (This includes destroying 'tc->queues' by calling
201 * tc_destroy(tc).
202 *
203 * The implementation should not need to perform any Netlink calls. If
204 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
205 * (But it may not be desirable.)
206 *
207 * This function may be null if 'tc' is trivial. */
208 void (*tc_destroy)(struct tc *tc);
209
210 /* Retrieves details of 'netdev->tc' configuration into 'details'.
211 *
212 * The implementation should not need to perform any Netlink calls, because
213 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
214 * cached the configuration.
215 *
216 * The contents of 'details' should be documented as valid for 'ovs_name'
217 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
218 * (which is built as ovs-vswitchd.conf.db(8)).
219 *
220 * This function may be null if 'tc' is not configurable.
221 */
222 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
223
224 /* Reconfigures 'netdev->tc' according to 'details', performing any
225 * required Netlink calls to complete the reconfiguration.
226 *
227 * The contents of 'details' should be documented as valid for 'ovs_name'
228 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
229 * (which is built as ovs-vswitchd.conf.db(8)).
230 *
231 * This function may be null if 'tc' is not configurable.
232 */
233 int (*qdisc_set)(struct netdev *, const struct shash *details);
234
93b13be8
BP
235 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
236 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
237 *
238 * The contents of 'details' should be documented as valid for 'ovs_name'
239 * in the "other_config" column in the "Queue" table in
240 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241 *
242 * The implementation should not need to perform any Netlink calls, because
243 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
244 * cached the queue configuration.
245 *
246 * This function may be null if 'tc' does not have queues ('n_queues' is
247 * 0). */
93b13be8 248 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
249 struct shash *details);
250
251 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
252 * 'details', perfoming any required Netlink calls to complete the
253 * reconfiguration. The caller ensures that 'queue_id' is less than
254 * 'n_queues'.
255 *
256 * The contents of 'details' should be documented as valid for 'ovs_name'
257 * in the "other_config" column in the "Queue" table in
258 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259 *
260 * This function may be null if 'tc' does not have queues or its queues are
261 * not configurable. */
262 int (*class_set)(struct netdev *, unsigned int queue_id,
263 const struct shash *details);
264
93b13be8
BP
265 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
266 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
267 *
268 * This function may be null if 'tc' does not have queues or its queues
269 * cannot be deleted. */
93b13be8 270 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 271
93b13be8
BP
272 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
273 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
274 *
275 * On success, initializes '*stats'.
276 *
277 * This function may be null if 'tc' does not have queues or if it cannot
278 * report queue statistics. */
93b13be8
BP
279 int (*class_get_stats)(const struct netdev *netdev,
280 const struct tc_queue *queue,
c1c9c9c4
BP
281 struct netdev_queue_stats *stats);
282
283 /* Extracts queue stats from 'nlmsg', which is a response to a
284 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285 *
286 * This function may be null if 'tc' does not have queues or if it cannot
287 * report queue statistics. */
288 int (*class_dump_stats)(const struct netdev *netdev,
289 const struct ofpbuf *nlmsg,
290 netdev_dump_queue_stats_cb *cb, void *aux);
291};
292
293static void
294tc_init(struct tc *tc, const struct tc_ops *ops)
295{
296 tc->ops = ops;
93b13be8 297 hmap_init(&tc->queues);
c1c9c9c4
BP
298}
299
300static void
301tc_destroy(struct tc *tc)
302{
93b13be8 303 hmap_destroy(&tc->queues);
c1c9c9c4
BP
304}
305
306static const struct tc_ops tc_ops_htb;
a339aa81 307static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
308static const struct tc_ops tc_ops_default;
309static const struct tc_ops tc_ops_other;
310
311static const struct tc_ops *tcs[] = {
312 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 313 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
314 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
315 &tc_ops_other, /* Some other qdisc. */
316 NULL
317};
149f577a 318
c1c9c9c4
BP
319static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
320static unsigned int tc_get_major(unsigned int handle);
321static unsigned int tc_get_minor(unsigned int handle);
322
323static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
324static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
325static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326
327static struct tcmsg *tc_make_request(const struct netdev *, int type,
328 unsigned int flags, struct ofpbuf *);
329static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330
331static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
332 struct nlattr **options);
333static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
334 struct nlattr **options,
335 struct netdev_queue_stats *);
336static int tc_query_class(const struct netdev *,
337 unsigned int handle, unsigned int parent,
338 struct ofpbuf **replyp);
339static int tc_delete_class(const struct netdev *, unsigned int handle);
340
341static int tc_del_qdisc(struct netdev *netdev);
342static int tc_query_qdisc(const struct netdev *netdev);
343
344static int tc_calc_cell_log(unsigned int mtu);
345static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
346static void tc_put_rtab(struct ofpbuf *, uint16_t type,
347 const struct tc_ratespec *rate);
348static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
349\f
149f577a
JG
350struct netdev_dev_linux {
351 struct netdev_dev netdev_dev;
352
8b61709d 353 struct shash_node *shash_node;
149f577a 354 unsigned int cache_valid;
ac4d3bcb 355 unsigned int change_seq;
8b61709d 356
1670c579
EJ
357 bool miimon; /* Link status of last poll. */
358 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
359 struct timer miimon_timer;
360
8722022c
BP
361 /* The following are figured out "on demand" only. They are only valid
362 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
363 int ifindex;
364 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 365 struct in_addr address, netmask;
8b61709d
BP
366 struct in6_addr in6;
367 int mtu;
3a183124 368 bool carrier;
80a86fbe
BP
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
7fbef77a 371 bool have_vport_stats;
c1c9c9c4 372 struct tc *tc;
149f577a
JG
373
374 union {
375 struct tap_state tap;
376 } state;
8b61709d
BP
377};
378
149f577a
JG
379struct netdev_linux {
380 struct netdev netdev;
5b7448ed 381 int fd;
149f577a 382};
8b61709d 383
76c308b5
BP
384/* Sockets used for ioctl operations. */
385static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 386
ff4ed3c9
BP
387/* A Netlink routing socket that is not subscribed to any multicast groups. */
388static struct nl_sock *rtnl_sock;
389
8b61709d
BP
390/* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
393
15b3596a 394static int netdev_linux_init(void);
6f643e49 395
0b0544d7 396static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 397 int cmd, const char *cmd_name);
149f577a
JG
398static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
f1acd62b
BP
400static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
8b61709d
BP
402static int get_flags(const struct netdev *, int *flagsp);
403static int set_flags(struct netdev *, int flags);
404static int do_get_ifindex(const char *netdev_name);
405static int get_ifindex(const struct netdev *, int *ifindexp);
406static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
3a183124 414static int get_carrier_via_sysfs(const char *name, bool *carrier);
488d734d 415static int af_packet_sock(void);
1670c579
EJ
416static void netdev_linux_miimon_run(void);
417static void netdev_linux_miimon_wait(void);
8b61709d 418
15b3596a
JG
419static bool
420is_netdev_linux_class(const struct netdev_class *netdev_class)
421{
422 return netdev_class->init == netdev_linux_init;
423}
424
149f577a
JG
425static struct netdev_dev_linux *
426netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 427{
15b3596a
JG
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
430
149f577a 431 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
432}
433
8b61709d
BP
434static struct netdev_linux *
435netdev_linux_cast(const struct netdev *netdev)
436{
15b3596a
JG
437 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
440
8b61709d
BP
441 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
442}
ff4ed3c9 443\f
8b61709d
BP
444static int
445netdev_linux_init(void)
446{
447 static int status = -1;
448 if (status < 0) {
ff4ed3c9 449 /* Create AF_INET socket. */
8b61709d
BP
450 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451 status = af_inet_sock >= 0 ? 0 : errno;
452 if (status) {
453 VLOG_ERR("failed to create inet socket: %s", strerror(status));
454 }
ff4ed3c9
BP
455
456 /* Create rtnetlink socket. */
457 if (!status) {
cceb11f5 458 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
459 if (status) {
460 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
461 strerror(status));
462 }
463 }
8b61709d
BP
464 }
465 return status;
466}
467
/* Periodic work for the Linux netdev classes: runs the rtnetlink link
 * notifier and then the MII monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
474
/* Counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link notifier and of the MII monitor. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
481
ac4d3bcb
EJ
482static void
483netdev_dev_linux_changed(struct netdev_dev_linux *dev)
484{
485 dev->change_seq++;
486 if (!dev->change_seq) {
487 dev->change_seq++;
488 }
489 dev->cache_valid = 0;
490}
491
8b61709d 492static void
21d6e22e 493netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 494 void *aux OVS_UNUSED)
8b61709d 495{
149f577a 496 struct netdev_dev_linux *dev;
8b61709d 497 if (change) {
46415c90
JG
498 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
499 if (base_dev) {
15b3596a
JG
500 const struct netdev_class *netdev_class =
501 netdev_dev_get_class(base_dev);
502
503 if (is_netdev_linux_class(netdev_class)) {
504 dev = netdev_dev_linux_cast(base_dev);
3a183124
EJ
505
506 if (dev->carrier != change->running) {
507 dev->carrier = change->running;
508 }
509
ac4d3bcb 510 netdev_dev_linux_changed(dev);
15b3596a 511 }
8b61709d
BP
512 }
513 } else {
46415c90 514 struct shash device_shash;
8b61709d 515 struct shash_node *node;
46415c90
JG
516
517 shash_init(&device_shash);
518 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
519 SHASH_FOR_EACH (node, &device_shash) {
3a183124
EJ
520 bool carrier;
521
149f577a 522 dev = node->data;
3a183124
EJ
523
524 get_carrier_via_sysfs(node->name, &carrier);
525 if (dev->carrier != carrier) {
526 dev->carrier = carrier;
527 }
528
ac4d3bcb 529 netdev_dev_linux_changed(dev);
8b61709d 530 }
46415c90 531 shash_destroy(&device_shash);
8b61709d
BP
532 }
533}
534
c3827f61 535/* Creates system and internal devices. */
8b61709d 536static int
de5cdb90
BP
537netdev_linux_create(const struct netdev_class *class, const char *name,
538 struct netdev_dev **netdev_devp)
6c88d577 539{
149f577a 540 struct netdev_dev_linux *netdev_dev;
6c88d577 541
46415c90 542 if (!cache_notifier_refcount) {
2ee6545f
EJ
543 assert(!netdev_linux_cache_notifier);
544
545 netdev_linux_cache_notifier =
546 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
547
548 if (!netdev_linux_cache_notifier) {
549 return EINVAL;
149f577a
JG
550 }
551 }
46415c90 552 cache_notifier_refcount++;
6c88d577 553
149f577a 554 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 555 netdev_dev->change_seq = 1;
de5cdb90 556 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
3a183124 557 get_carrier_via_sysfs(name, &netdev_dev->carrier);
46415c90 558
149f577a 559 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
560 return 0;
561}
562
5b7448ed
JG
563/* For most types of netdevs we open the device for each call of
564 * netdev_open(). However, this is not the case with tap devices,
565 * since it is only possible to open the device once. In this
566 * situation we share a single file descriptor, and consequently
567 * buffers, across all readers. Therefore once data is read it will
568 * be unavailable to other reads for tap devices. */
a740f0de 569static int
b8dcf5e9 570netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
de5cdb90 571 const char *name, struct netdev_dev **netdev_devp)
a740f0de 572{
149f577a 573 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
574 struct tap_state *state;
575 static const char tap_dev[] = "/dev/net/tun";
576 struct ifreq ifr;
577 int error;
578
149f577a
JG
579 netdev_dev = xzalloc(sizeof *netdev_dev);
580 state = &netdev_dev->state.tap;
a740f0de 581
6c88d577 582 /* Open tap device. */
149f577a
JG
583 state->fd = open(tap_dev, O_RDWR);
584 if (state->fd < 0) {
6c88d577
JP
585 error = errno;
586 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
587 goto error;
588 }
589
590 /* Create tap device. */
591 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 592 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 593 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
594 VLOG_WARN("%s: creating tap device failed: %s", name,
595 strerror(errno));
596 error = errno;
597 goto error;
598 }
599
600 /* Make non-blocking. */
149f577a 601 error = set_nonblocking(state->fd);
a740f0de
JG
602 if (error) {
603 goto error;
604 }
605
de5cdb90 606 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
149f577a 607 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
608 return 0;
609
610error:
149f577a 611 free(netdev_dev);
a740f0de
JG
612 return error;
613}
614
a740f0de 615static void
149f577a 616destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 617{
149f577a
JG
618 struct tap_state *state = &netdev_dev->state.tap;
619
620 if (state->fd >= 0) {
621 close(state->fd);
a740f0de
JG
622 }
623}
624
149f577a 625/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 626static void
149f577a 627netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 628{
149f577a 629 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 630 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 631
c1c9c9c4
BP
632 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
633 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
634 }
635
d2bb2799 636 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 637 cache_notifier_refcount--;
149f577a 638
46415c90 639 if (!cache_notifier_refcount) {
2ee6545f
EJ
640 assert(netdev_linux_cache_notifier);
641 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
642 netdev_linux_cache_notifier = NULL;
149f577a 643 }
d2bb2799 644 } else if (class == &netdev_tap_class) {
149f577a 645 destroy_tap(netdev_dev);
d2bb2799
BP
646 } else {
647 NOT_REACHED();
6c88d577 648 }
149f577a 649
658797c8 650 free(netdev_dev);
6c88d577
JP
651}
652
8b61709d 653static int
7b6b0ef4 654netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d 655{
5b7448ed 656 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
657 struct netdev_linux *netdev;
658 enum netdev_flags flags;
659 int error;
660
661 /* Allocate network device. */
ec6fde61 662 netdev = xzalloc(sizeof *netdev);
49a6a163 663 netdev->fd = -1;
5b7448ed 664 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 665
c3827f61
BP
666 /* Verify that the device really exists, by attempting to read its flags.
667 * (The flags might be cached, in which case this won't actually do an
668 * ioctl.)
669 *
670 * Don't do this for "internal" netdevs, though, because those have to be
671 * created as netdev objects before they exist in the kernel, because
672 * creating them in the kernel happens by passing a netdev object to
673 * dpif_port_add(). */
674 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
675 error = netdev_get_flags(&netdev->netdev, &flags);
676 if (error == ENODEV) {
677 goto error;
678 }
8b61709d
BP
679 }
680
61b999dd
JG
681 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
682 !netdev_dev->state.tap.opened) {
683
684 /* We assume that the first user of the tap device is the primary user
685 * and give them the tap FD. Subsequent users probably just expect
686 * this to be a system device so open it normally to avoid send/receive
687 * directions appearing to be reversed. */
5b7448ed 688 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 689 netdev_dev->state.tap.opened = true;
8b61709d
BP
690 }
691
692 *netdevp = &netdev->netdev;
693 return 0;
694
695error:
149f577a 696 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
697 return error;
698}
699
700/* Closes and destroys 'netdev'. */
701static void
702netdev_linux_close(struct netdev *netdev_)
703{
704 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
705
49a6a163 706 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 707 close(netdev->fd);
8b61709d
BP
708 }
709 free(netdev);
710}
e9e28be3 711
7b6b0ef4
BP
712static int
713netdev_linux_listen(struct netdev *netdev_)
714{
715 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
716 struct sockaddr_ll sll;
717 int ifindex;
718 int error;
719 int fd;
720
721 if (netdev->fd >= 0) {
722 return 0;
723 }
724
725 /* Create file descriptor. */
726 fd = socket(PF_PACKET, SOCK_RAW, 0);
727 if (fd < 0) {
728 error = errno;
729 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
730 goto error;
731 }
732
733 /* Set non-blocking mode. */
734 error = set_nonblocking(fd);
735 if (error) {
736 goto error;
737 }
738
739 /* Get ethernet device index. */
740 error = get_ifindex(&netdev->netdev, &ifindex);
741 if (error) {
742 goto error;
743 }
744
745 /* Bind to specific ethernet device. */
746 memset(&sll, 0, sizeof sll);
747 sll.sll_family = AF_PACKET;
748 sll.sll_ifindex = ifindex;
749 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
750 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
751 error = errno;
752 VLOG_ERR("%s: failed to bind raw socket (%s)",
753 netdev_get_name(netdev_), strerror(error));
754 goto error;
755 }
756
757 netdev->fd = fd;
758 return 0;
759
760error:
761 if (fd >= 0) {
762 close(fd);
763 }
764 return error;
765}
766
8b61709d
BP
767static int
768netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
769{
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
771
5b7448ed 772 if (netdev->fd < 0) {
7b6b0ef4 773 /* Device is not listening. */
c0e5f6ca 774 return -EAGAIN;
8b61709d
BP
775 }
776
777 for (;;) {
5b7448ed 778 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
779 if (retval >= 0) {
780 return retval;
781 } else if (errno != EINTR) {
782 if (errno != EAGAIN) {
783 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
784 strerror(errno), netdev_get_name(netdev_));
785 }
c0e5f6ca 786 return -errno;
8b61709d
BP
787 }
788 }
789}
790
791/* Registers with the poll loop to wake up from the next call to poll_block()
792 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
793static void
794netdev_linux_recv_wait(struct netdev *netdev_)
795{
796 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
797 if (netdev->fd >= 0) {
798 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
799 }
800}
801
802/* Discards all packets waiting to be received from 'netdev'. */
803static int
804netdev_linux_drain(struct netdev *netdev_)
805{
806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 807 if (netdev->fd < 0) {
8b61709d 808 return 0;
5b7448ed 809 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 810 struct ifreq ifr;
149f577a 811 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
812 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
813 if (error) {
814 return error;
815 }
5b7448ed 816 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
817 return 0;
818 } else {
5b7448ed 819 return drain_rcvbuf(netdev->fd);
8b61709d
BP
820 }
821}
822
823/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
824 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
825 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
826 * the packet is too big or too small to transmit on the device.
827 *
828 * The caller retains ownership of 'buffer' in all cases.
829 *
830 * The kernel maintains a packet transmission queue, so the caller is not
831 * expected to do additional queuing of packets. */
832static int
833netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
834{
f23347ea
BP
835 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
836 for (;;) {
837 ssize_t retval;
8b61709d 838
f23347ea
BP
839 if (netdev->fd < 0) {
840 /* Use our AF_PACKET socket to send to this device. */
841 struct sockaddr_ll sll;
842 struct msghdr msg;
843 struct iovec iov;
844 int ifindex;
845 int error;
488d734d
BP
846 int sock;
847
848 sock = af_packet_sock();
849 if (sock < 0) {
850 return sock;
851 }
f23347ea
BP
852
853 error = get_ifindex(netdev_, &ifindex);
854 if (error) {
855 return error;
856 }
8b61709d 857
f23347ea
BP
858 /* We don't bother setting most fields in sockaddr_ll because the
859 * kernel ignores them for SOCK_RAW. */
860 memset(&sll, 0, sizeof sll);
861 sll.sll_family = AF_PACKET;
862 sll.sll_ifindex = ifindex;
76c308b5 863
f23347ea
BP
864 iov.iov_base = (void *) data;
865 iov.iov_len = size;
76c308b5 866
f23347ea
BP
867 msg.msg_name = &sll;
868 msg.msg_namelen = sizeof sll;
869 msg.msg_iov = &iov;
870 msg.msg_iovlen = 1;
871 msg.msg_control = NULL;
872 msg.msg_controllen = 0;
873 msg.msg_flags = 0;
874
488d734d 875 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
876 } else {
877 /* Use the netdev's own fd to send to this device. This is
878 * essential for tap devices, because packets sent to a tap device
879 * with an AF_PACKET socket will loop back to be *received* again
880 * on the tap device. */
881 retval = write(netdev->fd, data, size);
882 }
76c308b5 883
8b61709d
BP
884 if (retval < 0) {
885 /* The Linux AF_PACKET implementation never blocks waiting for room
886 * for packets, instead returning ENOBUFS. Translate this into
887 * EAGAIN for the caller. */
888 if (errno == ENOBUFS) {
889 return EAGAIN;
890 } else if (errno == EINTR) {
891 continue;
892 } else if (errno != EAGAIN) {
893 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
894 netdev_get_name(netdev_), strerror(errno));
895 }
896 return errno;
897 } else if (retval != size) {
898 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
899 "%zu) on %s", retval, size, netdev_get_name(netdev_));
900 return EMSGSIZE;
901 } else {
902 return 0;
903 }
904 }
905}
906
907/* Registers with the poll loop to wake up from the next call to poll_block()
908 * when the packet transmission queue has sufficient room to transmit a packet
909 * with netdev_send().
910 *
911 * The kernel maintains a packet transmission queue, so the client is not
912 * expected to do additional queuing of packets. Thus, this function is
913 * unlikely to ever be used. It is included for completeness. */
914static void
915netdev_linux_send_wait(struct netdev *netdev_)
916{
917 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 918 if (netdev->fd < 0) {
8b61709d 919 /* Nothing to do. */
5b7448ed
JG
920 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
921 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
922 } else {
923 /* TAP device always accepts packets.*/
924 poll_immediate_wake();
925 }
926}
927
928/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
929 * otherwise a positive errno value. */
930static int
931netdev_linux_set_etheraddr(struct netdev *netdev_,
932 const uint8_t mac[ETH_ADDR_LEN])
933{
149f577a
JG
934 struct netdev_dev_linux *netdev_dev =
935 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
936 int error;
937
149f577a
JG
938 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
939 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
940 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
941 if (!error) {
149f577a
JG
942 netdev_dev->cache_valid |= VALID_ETHERADDR;
943 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
944 }
945 } else {
946 error = 0;
8b61709d
BP
947 }
948 return error;
949}
950
951/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
952 * free the returned buffer. */
953static int
954netdev_linux_get_etheraddr(const struct netdev *netdev_,
955 uint8_t mac[ETH_ADDR_LEN])
956{
149f577a
JG
957 struct netdev_dev_linux *netdev_dev =
958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
959 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 960 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 961 netdev_dev->etheraddr);
8b61709d
BP
962 if (error) {
963 return error;
964 }
149f577a 965 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 966 }
149f577a 967 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
968 return 0;
969}
970
971/* Returns the maximum size of transmitted (and received) packets on 'netdev',
972 * in bytes, not including the hardware header; thus, this is typically 1500
973 * bytes for Ethernet devices. */
974static int
975netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
976{
149f577a
JG
977 struct netdev_dev_linux *netdev_dev =
978 netdev_dev_linux_cast(netdev_get_dev(netdev_));
979 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
980 struct ifreq ifr;
981 int error;
982
149f577a
JG
983 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
984 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
985 if (error) {
986 return error;
987 }
149f577a
JG
988 netdev_dev->mtu = ifr.ifr_mtu;
989 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 990 }
149f577a 991 *mtup = netdev_dev->mtu;
8b61709d
BP
992 return 0;
993}
994
9b020780
PS
995/* Sets the maximum size of transmitted (MTU) for given device using linux
996 * networking ioctl interface.
997 */
998static int
999netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1000{
1001 struct netdev_dev_linux *netdev_dev =
1002 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1003 struct ifreq ifr;
1004 int error;
1005
1006 ifr.ifr_mtu = mtu;
1007 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1008 SIOCSIFMTU, "SIOCSIFMTU");
1009 if (error) {
1010 return error;
1011 }
1012
1013 netdev_dev->mtu = ifr.ifr_mtu;
1014 netdev_dev->cache_valid |= VALID_MTU;
1015 return 0;
1016}
1017
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev'.  On success returns the ifindex obtained
 * from get_ifindex(); on failure returns the negation of the error that
 * get_ifindex() reported. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int retval = get_ifindex(netdev, &ifindex);

    if (retval) {
        return -retval;
    }
    return ifindex;
}
1028
8b61709d
BP
1029static int
1030netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1031{
149f577a
JG
1032 struct netdev_dev_linux *netdev_dev =
1033 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1034
1670c579
EJ
1035 if (netdev_dev->miimon_interval > 0) {
1036 *carrier = netdev_dev->miimon;
3a183124
EJ
1037 } else {
1038 *carrier = netdev_dev->carrier;
8b61709d 1039 }
8b61709d 1040
3a183124 1041 return 0;
8b61709d
BP
1042}
1043
63331829 1044static int
1670c579
EJ
1045netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1046 struct mii_ioctl_data *data)
63331829 1047{
63331829 1048 struct ifreq ifr;
782e6111 1049 int error;
63331829 1050
63331829 1051 memset(&ifr, 0, sizeof ifr);
782e6111 1052 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1053 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1054 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1055
782e6111
EJ
1056 return error;
1057}
1058
1059static int
1670c579 1060netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1061{
782e6111
EJ
1062 struct mii_ioctl_data data;
1063 int error;
63331829 1064
782e6111
EJ
1065 *miimon = false;
1066
1067 memset(&data, 0, sizeof data);
1670c579 1068 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1069 if (!error) {
1070 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1071 data.reg_num = MII_BMSR;
1670c579 1072 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1073 &data);
63331829
EJ
1074
1075 if (!error) {
782e6111 1076 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1077 } else {
1078 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1079 }
1080 } else {
1081 struct ethtool_cmd ecmd;
63331829
EJ
1082
1083 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1084 name);
1085
1086 memset(&ecmd, 0, sizeof ecmd);
1087 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1088 "ETHTOOL_GLINK");
1089 if (!error) {
782e6111
EJ
1090 struct ethtool_value eval;
1091
1092 memcpy(&eval, &ecmd, sizeof eval);
1093 *miimon = !!eval.data;
63331829
EJ
1094 } else {
1095 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1096 }
1097 }
1098
1099 return error;
1100}
1101
1670c579
EJ
1102static int
1103netdev_linux_set_miimon_interval(struct netdev *netdev_,
1104 long long int interval)
1105{
1106 struct netdev_dev_linux *netdev_dev;
1107
1108 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1109
1110 interval = interval > 0 ? MAX(interval, 100) : 0;
1111 if (netdev_dev->miimon_interval != interval) {
1112 netdev_dev->miimon_interval = interval;
1113 timer_set_expired(&netdev_dev->miimon_timer);
1114 }
1115
1116 return 0;
1117}
1118
1119static void
1120netdev_linux_miimon_run(void)
1121{
1122 struct shash device_shash;
1123 struct shash_node *node;
1124
1125 shash_init(&device_shash);
1126 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1127 SHASH_FOR_EACH (node, &device_shash) {
1128 struct netdev_dev_linux *dev = node->data;
1129 bool miimon;
1130
1131 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1132 continue;
1133 }
1134
1135 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1136 if (miimon != dev->miimon) {
1670c579 1137 dev->miimon = miimon;
ac4d3bcb 1138 netdev_dev_linux_changed(dev);
1670c579
EJ
1139 }
1140
1141 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1142 }
1143
1144 shash_destroy(&device_shash);
1145}
1146
1147static void
1148netdev_linux_miimon_wait(void)
1149{
1150 struct shash device_shash;
1151 struct shash_node *node;
1152
1153 shash_init(&device_shash);
1154 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1155 SHASH_FOR_EACH (node, &device_shash) {
1156 struct netdev_dev_linux *dev = node->data;
1157
1158 if (dev->miimon_interval > 0) {
1159 timer_wait(&dev->miimon_timer);
1160 }
1161 }
1162 shash_destroy(&device_shash);
1163}
1164
8b61709d
BP
1165/* Check whether we can we use RTM_GETLINK to get network device statistics.
1166 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1167 * enabled. */
1168static bool
1169check_for_working_netlink_stats(void)
1170{
1171 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1172 * preferable, so if that works, we'll use it. */
1173 int ifindex = do_get_ifindex("lo");
1174 if (ifindex < 0) {
1175 VLOG_WARN("failed to get ifindex for lo, "
1176 "obtaining netdev stats from proc");
1177 return false;
1178 } else {
1179 struct netdev_stats stats;
1180 int error = get_stats_via_netlink(ifindex, &stats);
1181 if (!error) {
1182 VLOG_DBG("obtaining netdev stats via rtnetlink");
1183 return true;
1184 } else {
1185 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1186 "via proc (you are probably running a pre-2.6.19 "
1187 "kernel)", strerror(error));
1188 return false;
1189 }
1190 }
1191}
1192
92df599c
JG
/* Exchanges the values stored at 'a' and 'b'.  Safe when 'a' == 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1200
f613a0d7
PS
1201static void
1202get_stats_via_vport(const struct netdev *netdev_,
1203 struct netdev_stats *stats)
8b61709d 1204{
149f577a
JG
1205 struct netdev_dev_linux *netdev_dev =
1206 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1207
7fbef77a
JG
1208 if (netdev_dev->have_vport_stats ||
1209 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
f613a0d7 1210 int error;
7fbef77a
JG
1211
1212 error = netdev_vport_get_stats(netdev_, stats);
f613a0d7
PS
1213 if (error) {
1214 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1215 netdev_get_name(netdev_), error);
1216 }
7fbef77a
JG
1217 netdev_dev->have_vport_stats = !error;
1218 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1219 }
f613a0d7 1220}
8b61709d 1221
f613a0d7
PS
1222static int
1223netdev_linux_sys_get_stats(const struct netdev *netdev_,
1224 struct netdev_stats *stats)
1225{
1226 static int use_netlink_stats = -1;
1227 int error;
1228
1229 if (use_netlink_stats < 0) {
1230 use_netlink_stats = check_for_working_netlink_stats();
1231 }
1232
1233 if (use_netlink_stats) {
1234 int ifindex;
1235
1236 error = get_ifindex(netdev_, &ifindex);
1237 if (!error) {
1238 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1239 }
f613a0d7
PS
1240 } else {
1241 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1242 }
7fbef77a 1243
f613a0d7
PS
1244 if (error) {
1245 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1246 netdev_get_name(netdev_), error);
1247 }
1248 return error;
1249
1250}
1251
1252/* Retrieves current device stats for 'netdev-linux'. */
1253static int
1254netdev_linux_get_stats(const struct netdev *netdev_,
1255 struct netdev_stats *stats)
1256{
1257 struct netdev_dev_linux *netdev_dev =
1258 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1259 struct netdev_stats dev_stats;
1260 int error;
1261
1262 get_stats_via_vport(netdev_, stats);
1263
1264 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1265
1266 if (error) {
1267 if (!netdev_dev->have_vport_stats) {
1268 return error;
7fbef77a 1269 } else {
f613a0d7
PS
1270 return 0;
1271 }
1272 }
1273
1274 if (!netdev_dev->have_vport_stats) {
1275 /* stats not available from OVS then use ioctl stats. */
1276 *stats = dev_stats;
1277 } else {
1278 stats->rx_errors += dev_stats.rx_errors;
1279 stats->tx_errors += dev_stats.tx_errors;
1280 stats->rx_dropped += dev_stats.rx_dropped;
1281 stats->tx_dropped += dev_stats.tx_dropped;
1282 stats->multicast += dev_stats.multicast;
1283 stats->collisions += dev_stats.collisions;
1284 stats->rx_length_errors += dev_stats.rx_length_errors;
1285 stats->rx_over_errors += dev_stats.rx_over_errors;
1286 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1287 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1288 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1289 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1290 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1291 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1292 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1293 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1294 stats->tx_window_errors += dev_stats.tx_window_errors;
1295 }
1296 return 0;
1297}
1298
1299/* Retrieves current device stats for 'netdev-tap' netdev or
1300 * netdev-internal. */
1301static int
1302netdev_pseudo_get_stats(const struct netdev *netdev_,
1303 struct netdev_stats *stats)
1304{
1305 struct netdev_dev_linux *netdev_dev =
1306 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1307 struct netdev_stats dev_stats;
1308 int error;
1309
1310 get_stats_via_vport(netdev_, stats);
1311
1312 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1313 if (error) {
1314 if (!netdev_dev->have_vport_stats) {
1315 return error;
1316 } else {
1317 return 0;
8b61709d 1318 }
8b61709d 1319 }
fe6b0e03
JG
1320
1321 /* If this port is an internal port then the transmit and receive stats
1322 * will appear to be swapped relative to the other ports since we are the
1323 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1324 * them back here. This does not apply if we are getting stats from the
1325 * vport layer because it always tracks stats from the perspective of the
1326 * switch. */
f613a0d7
PS
1327 if (!netdev_dev->have_vport_stats) {
1328 *stats = dev_stats;
92df599c
JG
1329 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1330 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1331 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1332 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1333 stats->rx_length_errors = 0;
1334 stats->rx_over_errors = 0;
1335 stats->rx_crc_errors = 0;
1336 stats->rx_frame_errors = 0;
1337 stats->rx_fifo_errors = 0;
1338 stats->rx_missed_errors = 0;
1339 stats->tx_aborted_errors = 0;
1340 stats->tx_carrier_errors = 0;
1341 stats->tx_fifo_errors = 0;
1342 stats->tx_heartbeat_errors = 0;
1343 stats->tx_window_errors = 0;
f613a0d7
PS
1344 } else {
1345 stats->rx_dropped += dev_stats.tx_dropped;
1346 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1347
f613a0d7
PS
1348 stats->rx_errors += dev_stats.tx_errors;
1349 stats->tx_errors += dev_stats.rx_errors;
1350
1351 stats->multicast += dev_stats.multicast;
1352 stats->collisions += dev_stats.collisions;
1353 }
1354 return 0;
8b61709d
BP
1355}
1356
1357/* Stores the features supported by 'netdev' into each of '*current',
1358 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1359 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1360 * successful, otherwise a positive errno value. */
8b61709d 1361static int
6f2f5cce 1362netdev_linux_get_features(const struct netdev *netdev,
8b61709d
BP
1363 uint32_t *current, uint32_t *advertised,
1364 uint32_t *supported, uint32_t *peer)
1365{
1366 struct ethtool_cmd ecmd;
1367 int error;
1368
1369 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1370 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1371 ETHTOOL_GSET, "ETHTOOL_GSET");
1372 if (error) {
1373 return error;
1374 }
1375
1376 /* Supported features. */
1377 *supported = 0;
1378 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1379 *supported |= OFPPF_10MB_HD;
1380 }
1381 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1382 *supported |= OFPPF_10MB_FD;
1383 }
1384 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1385 *supported |= OFPPF_100MB_HD;
1386 }
1387 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1388 *supported |= OFPPF_100MB_FD;
1389 }
1390 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1391 *supported |= OFPPF_1GB_HD;
1392 }
1393 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1394 *supported |= OFPPF_1GB_FD;
1395 }
1396 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1397 *supported |= OFPPF_10GB_FD;
1398 }
1399 if (ecmd.supported & SUPPORTED_TP) {
1400 *supported |= OFPPF_COPPER;
1401 }
1402 if (ecmd.supported & SUPPORTED_FIBRE) {
1403 *supported |= OFPPF_FIBER;
1404 }
1405 if (ecmd.supported & SUPPORTED_Autoneg) {
1406 *supported |= OFPPF_AUTONEG;
1407 }
1408 if (ecmd.supported & SUPPORTED_Pause) {
1409 *supported |= OFPPF_PAUSE;
1410 }
1411 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1412 *supported |= OFPPF_PAUSE_ASYM;
1413 }
1414
1415 /* Advertised features. */
1416 *advertised = 0;
1417 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1418 *advertised |= OFPPF_10MB_HD;
1419 }
1420 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1421 *advertised |= OFPPF_10MB_FD;
1422 }
1423 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1424 *advertised |= OFPPF_100MB_HD;
1425 }
1426 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1427 *advertised |= OFPPF_100MB_FD;
1428 }
1429 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1430 *advertised |= OFPPF_1GB_HD;
1431 }
1432 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1433 *advertised |= OFPPF_1GB_FD;
1434 }
1435 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1436 *advertised |= OFPPF_10GB_FD;
1437 }
1438 if (ecmd.advertising & ADVERTISED_TP) {
1439 *advertised |= OFPPF_COPPER;
1440 }
1441 if (ecmd.advertising & ADVERTISED_FIBRE) {
1442 *advertised |= OFPPF_FIBER;
1443 }
1444 if (ecmd.advertising & ADVERTISED_Autoneg) {
1445 *advertised |= OFPPF_AUTONEG;
1446 }
1447 if (ecmd.advertising & ADVERTISED_Pause) {
1448 *advertised |= OFPPF_PAUSE;
1449 }
1450 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1451 *advertised |= OFPPF_PAUSE_ASYM;
1452 }
1453
1454 /* Current settings. */
1455 if (ecmd.speed == SPEED_10) {
1456 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1457 } else if (ecmd.speed == SPEED_100) {
1458 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1459 } else if (ecmd.speed == SPEED_1000) {
1460 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1461 } else if (ecmd.speed == SPEED_10000) {
1462 *current = OFPPF_10GB_FD;
1463 } else {
1464 *current = 0;
1465 }
1466
1467 if (ecmd.port == PORT_TP) {
1468 *current |= OFPPF_COPPER;
1469 } else if (ecmd.port == PORT_FIBRE) {
1470 *current |= OFPPF_FIBER;
1471 }
1472
1473 if (ecmd.autoneg) {
1474 *current |= OFPPF_AUTONEG;
1475 }
1476
1477 /* Peer advertisements. */
1478 *peer = 0; /* XXX */
1479
1480 return 0;
1481}
1482
1483/* Set the features advertised by 'netdev' to 'advertise'. */
1484static int
1485netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1486{
1487 struct ethtool_cmd ecmd;
1488 int error;
1489
1490 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1491 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1492 ETHTOOL_GSET, "ETHTOOL_GSET");
1493 if (error) {
1494 return error;
1495 }
1496
1497 ecmd.advertising = 0;
1498 if (advertise & OFPPF_10MB_HD) {
1499 ecmd.advertising |= ADVERTISED_10baseT_Half;
1500 }
1501 if (advertise & OFPPF_10MB_FD) {
1502 ecmd.advertising |= ADVERTISED_10baseT_Full;
1503 }
1504 if (advertise & OFPPF_100MB_HD) {
1505 ecmd.advertising |= ADVERTISED_100baseT_Half;
1506 }
1507 if (advertise & OFPPF_100MB_FD) {
1508 ecmd.advertising |= ADVERTISED_100baseT_Full;
1509 }
1510 if (advertise & OFPPF_1GB_HD) {
1511 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1512 }
1513 if (advertise & OFPPF_1GB_FD) {
1514 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1515 }
1516 if (advertise & OFPPF_10GB_FD) {
1517 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1518 }
1519 if (advertise & OFPPF_COPPER) {
1520 ecmd.advertising |= ADVERTISED_TP;
1521 }
1522 if (advertise & OFPPF_FIBER) {
1523 ecmd.advertising |= ADVERTISED_FIBRE;
1524 }
1525 if (advertise & OFPPF_AUTONEG) {
1526 ecmd.advertising |= ADVERTISED_Autoneg;
1527 }
1528 if (advertise & OFPPF_PAUSE) {
1529 ecmd.advertising |= ADVERTISED_Pause;
1530 }
1531 if (advertise & OFPPF_PAUSE_ASYM) {
1532 ecmd.advertising |= ADVERTISED_Asym_Pause;
1533 }
0b0544d7 1534 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1535 ETHTOOL_SSET, "ETHTOOL_SSET");
1536}
1537
1538/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1539 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1540 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1541 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1542 * sets '*vlan_vid' to -1. */
1543static int
1544netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1545{
1546 const char *netdev_name = netdev_get_name(netdev);
1547 struct ds line = DS_EMPTY_INITIALIZER;
1548 FILE *stream = NULL;
1549 int error;
1550 char *fn;
1551
1552 COVERAGE_INC(netdev_get_vlan_vid);
1553 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1554 stream = fopen(fn, "r");
1555 if (!stream) {
1556 error = errno;
1557 goto done;
1558 }
1559
1560 if (ds_get_line(&line, stream)) {
1561 if (ferror(stream)) {
1562 error = errno;
1563 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1564 } else {
1565 error = EPROTO;
1566 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1567 }
1568 goto done;
1569 }
1570
1571 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1572 error = EPROTO;
1573 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1574 fn, ds_cstr(&line));
1575 goto done;
1576 }
1577
1578 error = 0;
1579
1580done:
1581 free(fn);
1582 if (stream) {
1583 fclose(stream);
1584 }
1585 ds_destroy(&line);
1586 if (error) {
1587 *vlan_vid = -1;
1588 }
1589 return error;
1590}
1591
/* Shell command templates used to install ingress policing.
 *
 * POLICE_ADD_CMD takes the device name (%s).  POLICE_CONFIG_CMD takes the
 * device name (%s) followed by the rate in kbit and the burst in k; both of
 * those are passed as uint32_t, so they are formatted with %u rather than %d
 * to keep values above INT_MAX from being printed as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %ukbit burst %uk mtu 65535 drop flowid :1"
8b61709d 1594
8e460221 1595/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1596 * positive errno value.
1597 *
1598 * This function is equivalent to running
1599 * /sbin/tc qdisc del dev %s handle ffff: ingress
1600 * but it is much, much faster.
1601 */
8e460221
BP
1602static int
1603netdev_linux_remove_policing(struct netdev *netdev)
1604{
80a86fbe
BP
1605 struct netdev_dev_linux *netdev_dev =
1606 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1607 const char *netdev_name = netdev_get_name(netdev);
8e460221 1608
6f42c8ea 1609 struct ofpbuf request;
6f42c8ea 1610 struct tcmsg *tcmsg;
6f42c8ea
BP
1611 int error;
1612
c1c9c9c4 1613 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1614 if (!tcmsg) {
1615 return ENODEV;
1616 }
c1c9c9c4 1617 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1618 tcmsg->tcm_parent = TC_H_INGRESS;
1619 nl_msg_put_string(&request, TCA_KIND, "ingress");
1620 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1621
1622 error = tc_transact(&request, NULL);
4d10512c 1623 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1624 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1625 netdev_name, strerror(error));
1626 return error;
1627 }
1628
80a86fbe
BP
1629 netdev_dev->kbits_rate = 0;
1630 netdev_dev->kbits_burst = 0;
1631 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1632 return 0;
1633}
1634
8b61709d
BP
1635/* Attempts to set input rate limiting (policing) policy. */
1636static int
1637netdev_linux_set_policing(struct netdev *netdev,
1638 uint32_t kbits_rate, uint32_t kbits_burst)
1639{
80a86fbe
BP
1640 struct netdev_dev_linux *netdev_dev =
1641 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1642 const char *netdev_name = netdev_get_name(netdev);
1643 char command[1024];
1644
1645 COVERAGE_INC(netdev_set_policing);
8e460221 1646
80a86fbe
BP
1647 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1648 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1649 : kbits_burst); /* Stick with user-specified value. */
1650
1651 if (netdev_dev->cache_valid & VALID_POLICING
1652 && netdev_dev->kbits_rate == kbits_rate
1653 && netdev_dev->kbits_burst == kbits_burst) {
1654 /* Assume that settings haven't changed since we last set them. */
1655 return 0;
1656 }
1657
8e460221 1658 netdev_linux_remove_policing(netdev);
8b61709d 1659 if (kbits_rate) {
8b61709d
BP
1660 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1661 if (system(command) != 0) {
1662 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1663 return -1;
1664 }
1665
1666 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1667 kbits_rate, kbits_burst);
1668 if (system(command) != 0) {
1669 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1670 netdev_name);
1671 return -1;
1672 }
80a86fbe
BP
1673
1674 netdev_dev->kbits_rate = kbits_rate;
1675 netdev_dev->kbits_burst = kbits_burst;
1676 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1677 }
1678
1679 return 0;
1680}
1681
c1c9c9c4
BP
1682static int
1683netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1684 struct sset *types)
c1c9c9c4
BP
1685{
1686 const struct tc_ops **opsp;
1687
1688 for (opsp = tcs; *opsp != NULL; opsp++) {
1689 const struct tc_ops *ops = *opsp;
1690 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1691 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1692 }
1693 }
1694 return 0;
1695}
1696
1697static const struct tc_ops *
1698tc_lookup_ovs_name(const char *name)
1699{
1700 const struct tc_ops **opsp;
1701
1702 for (opsp = tcs; *opsp != NULL; opsp++) {
1703 const struct tc_ops *ops = *opsp;
1704 if (!strcmp(name, ops->ovs_name)) {
1705 return ops;
1706 }
1707 }
1708 return NULL;
1709}
1710
1711static const struct tc_ops *
1712tc_lookup_linux_name(const char *name)
1713{
1714 const struct tc_ops **opsp;
1715
1716 for (opsp = tcs; *opsp != NULL; opsp++) {
1717 const struct tc_ops *ops = *opsp;
1718 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1719 return ops;
1720 }
1721 }
1722 return NULL;
1723}
1724
93b13be8
BP
1725static struct tc_queue *
1726tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1727 size_t hash)
1728{
1729 struct netdev_dev_linux *netdev_dev =
1730 netdev_dev_linux_cast(netdev_get_dev(netdev));
1731 struct tc_queue *queue;
1732
4e8e4213 1733 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1734 if (queue->queue_id == queue_id) {
1735 return queue;
1736 }
1737 }
1738 return NULL;
1739}
1740
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is
 * none.  Queues are hashed by hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1746
c1c9c9c4
BP
1747static int
1748netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1749 const char *type,
1750 struct netdev_qos_capabilities *caps)
1751{
1752 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1753 if (!ops) {
1754 return EOPNOTSUPP;
1755 }
1756 caps->n_queues = ops->n_queues;
1757 return 0;
1758}
1759
1760static int
1761netdev_linux_get_qos(const struct netdev *netdev,
1762 const char **typep, struct shash *details)
1763{
1764 struct netdev_dev_linux *netdev_dev =
1765 netdev_dev_linux_cast(netdev_get_dev(netdev));
1766 int error;
1767
1768 error = tc_query_qdisc(netdev);
1769 if (error) {
1770 return error;
1771 }
1772
1773 *typep = netdev_dev->tc->ops->ovs_name;
1774 return (netdev_dev->tc->ops->qdisc_get
1775 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1776 : 0);
1777}
1778
1779static int
1780netdev_linux_set_qos(struct netdev *netdev,
1781 const char *type, const struct shash *details)
1782{
1783 struct netdev_dev_linux *netdev_dev =
1784 netdev_dev_linux_cast(netdev_get_dev(netdev));
1785 const struct tc_ops *new_ops;
1786 int error;
1787
1788 new_ops = tc_lookup_ovs_name(type);
1789 if (!new_ops || !new_ops->tc_install) {
1790 return EOPNOTSUPP;
1791 }
1792
1793 error = tc_query_qdisc(netdev);
1794 if (error) {
1795 return error;
1796 }
1797
1798 if (new_ops == netdev_dev->tc->ops) {
1799 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1800 } else {
1801 /* Delete existing qdisc. */
1802 error = tc_del_qdisc(netdev);
1803 if (error) {
1804 return error;
1805 }
1806 assert(netdev_dev->tc == NULL);
1807
1808 /* Install new qdisc. */
1809 error = new_ops->tc_install(netdev, details);
1810 assert((error == 0) == (netdev_dev->tc != NULL));
1811
1812 return error;
1813 }
1814}
1815
1816static int
1817netdev_linux_get_queue(const struct netdev *netdev,
1818 unsigned int queue_id, struct shash *details)
1819{
1820 struct netdev_dev_linux *netdev_dev =
1821 netdev_dev_linux_cast(netdev_get_dev(netdev));
1822 int error;
1823
1824 error = tc_query_qdisc(netdev);
1825 if (error) {
1826 return error;
93b13be8
BP
1827 } else {
1828 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1829 return (queue
1830 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1831 : ENOENT);
c1c9c9c4 1832 }
c1c9c9c4
BP
1833}
1834
1835static int
1836netdev_linux_set_queue(struct netdev *netdev,
1837 unsigned int queue_id, const struct shash *details)
1838{
1839 struct netdev_dev_linux *netdev_dev =
1840 netdev_dev_linux_cast(netdev_get_dev(netdev));
1841 int error;
1842
1843 error = tc_query_qdisc(netdev);
1844 if (error) {
1845 return error;
1846 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1847 || !netdev_dev->tc->ops->class_set) {
1848 return EINVAL;
1849 }
1850
1851 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1852}
1853
1854static int
1855netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1856{
1857 struct netdev_dev_linux *netdev_dev =
1858 netdev_dev_linux_cast(netdev_get_dev(netdev));
1859 int error;
1860
1861 error = tc_query_qdisc(netdev);
1862 if (error) {
1863 return error;
1864 } else if (!netdev_dev->tc->ops->class_delete) {
1865 return EINVAL;
93b13be8
BP
1866 } else {
1867 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1868 return (queue
1869 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1870 : ENOENT);
c1c9c9c4 1871 }
c1c9c9c4
BP
1872}
1873
1874static int
1875netdev_linux_get_queue_stats(const struct netdev *netdev,
1876 unsigned int queue_id,
1877 struct netdev_queue_stats *stats)
1878{
1879 struct netdev_dev_linux *netdev_dev =
1880 netdev_dev_linux_cast(netdev_get_dev(netdev));
1881 int error;
1882
1883 error = tc_query_qdisc(netdev);
1884 if (error) {
1885 return error;
c1c9c9c4
BP
1886 } else if (!netdev_dev->tc->ops->class_get_stats) {
1887 return EOPNOTSUPP;
93b13be8
BP
1888 } else {
1889 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1890 return (queue
1891 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1892 : ENOENT);
c1c9c9c4 1893 }
c1c9c9c4
BP
1894}
1895
23a98ffe 1896static bool
c1c9c9c4
BP
1897start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1898{
1899 struct ofpbuf request;
1900 struct tcmsg *tcmsg;
1901
1902 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1903 if (!tcmsg) {
1904 return false;
1905 }
3c4de644 1906 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1907 nl_dump_start(dump, rtnl_sock, &request);
1908 ofpbuf_uninit(&request);
23a98ffe 1909 return true;
c1c9c9c4
BP
1910}
1911
1912static int
1913netdev_linux_dump_queues(const struct netdev *netdev,
1914 netdev_dump_queues_cb *cb, void *aux)
1915{
1916 struct netdev_dev_linux *netdev_dev =
1917 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1918 struct tc_queue *queue;
c1c9c9c4
BP
1919 struct shash details;
1920 int last_error;
c1c9c9c4
BP
1921 int error;
1922
1923 error = tc_query_qdisc(netdev);
1924 if (error) {
1925 return error;
1926 } else if (!netdev_dev->tc->ops->class_get) {
1927 return EOPNOTSUPP;
1928 }
1929
1930 last_error = 0;
1931 shash_init(&details);
4e8e4213 1932 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1933 shash_clear(&details);
1934
93b13be8 1935 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1936 if (!error) {
93b13be8 1937 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1938 } else {
1939 last_error = error;
1940 }
1941 }
1942 shash_destroy(&details);
1943
1944 return last_error;
1945}
1946
1947static int
1948netdev_linux_dump_queue_stats(const struct netdev *netdev,
1949 netdev_dump_queue_stats_cb *cb, void *aux)
1950{
1951 struct netdev_dev_linux *netdev_dev =
1952 netdev_dev_linux_cast(netdev_get_dev(netdev));
1953 struct nl_dump dump;
1954 struct ofpbuf msg;
1955 int last_error;
1956 int error;
1957
1958 error = tc_query_qdisc(netdev);
1959 if (error) {
1960 return error;
1961 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1962 return EOPNOTSUPP;
1963 }
1964
1965 last_error = 0;
23a98ffe
BP
1966 if (!start_queue_dump(netdev, &dump)) {
1967 return ENODEV;
1968 }
c1c9c9c4
BP
1969 while (nl_dump_next(&dump, &msg)) {
1970 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1971 if (error) {
1972 last_error = error;
1973 }
1974 }
1975
1976 error = nl_dump_done(&dump);
1977 return error ? error : last_error;
1978}
1979
8b61709d 1980static int
f1acd62b
BP
1981netdev_linux_get_in4(const struct netdev *netdev_,
1982 struct in_addr *address, struct in_addr *netmask)
8b61709d 1983{
149f577a
JG
1984 struct netdev_dev_linux *netdev_dev =
1985 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1986
1987 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1988 int error;
1989
149f577a 1990 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1991 SIOCGIFADDR, "SIOCGIFADDR");
1992 if (error) {
1993 return error;
1994 }
1995
149f577a 1996 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1997 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1998 if (error) {
1999 return error;
2000 }
2001
149f577a 2002 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 2003 }
149f577a
JG
2004 *address = netdev_dev->address;
2005 *netmask = netdev_dev->netmask;
f1acd62b 2006 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
2007}
2008
8b61709d 2009static int
f1acd62b
BP
2010netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2011 struct in_addr netmask)
8b61709d 2012{
149f577a
JG
2013 struct netdev_dev_linux *netdev_dev =
2014 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
2015 int error;
2016
f1acd62b 2017 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2018 if (!error) {
149f577a
JG
2019 netdev_dev->cache_valid |= VALID_IN4;
2020 netdev_dev->address = address;
2021 netdev_dev->netmask = netmask;
f1acd62b 2022 if (address.s_addr != INADDR_ANY) {
8b61709d 2023 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2024 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2025 }
2026 }
2027 return error;
2028}
2029
/* Parses 'line' as sixteen two-digit hex bytes (the IPv6 address), four hex
 * fields that are skipped, and an interface name of at most 16 characters.
 *
 * Stores the address bytes in 'in6' and the name in 'ifname'.  Returns true
 * only if all seventeen stored fields were converted. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " " X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      b, b + 1, b + 2, b + 3,
                      b + 4, b + 5, b + 6, b + 7,
                      b + 8, b + 9, b + 10, b + 11,
                      b + 12, b + 13, b + 14, b + 15,
                      ifname);
    return n_fields == 17;
}
2045
2046/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2047 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2048static int
2049netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2050{
149f577a
JG
2051 struct netdev_dev_linux *netdev_dev =
2052 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2053 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2054 FILE *file;
2055 char line[128];
2056
149f577a 2057 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2058
2059 file = fopen("/proc/net/if_inet6", "r");
2060 if (file != NULL) {
2061 const char *name = netdev_get_name(netdev_);
2062 while (fgets(line, sizeof line, file)) {
2a022368 2063 struct in6_addr in6_tmp;
8b61709d 2064 char ifname[16 + 1];
2a022368 2065 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2066 && !strcmp(name, ifname))
2067 {
2a022368 2068 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2069 break;
2070 }
2071 }
2072 fclose(file);
2073 }
149f577a 2074 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2075 }
149f577a 2076 *in6 = netdev_dev->in6;
8b61709d
BP
2077 return 0;
2078}
2079
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Start from a clean destination, then overlay the IPv4 form. */
    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    memcpy(sa, &in4, sizeof in4);
}
2092
2093static int
2094do_set_addr(struct netdev *netdev,
2095 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2096{
2097 struct ifreq ifr;
71d7c22f 2098 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2099 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2100
2101 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2102 ioctl_name);
8b61709d
BP
2103}
2104
2105/* Adds 'router' as a default IP gateway. */
2106static int
67a4917b 2107netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2108{
2109 struct in_addr any = { INADDR_ANY };
2110 struct rtentry rt;
2111 int error;
2112
2113 memset(&rt, 0, sizeof rt);
2114 make_in4_sockaddr(&rt.rt_dst, any);
2115 make_in4_sockaddr(&rt.rt_gateway, router);
2116 make_in4_sockaddr(&rt.rt_genmask, any);
2117 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2118 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2119 if (error) {
2120 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2121 }
2122 return error;
2123}
2124
f1acd62b
BP
2125static int
2126netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2127 char **netdev_name)
2128{
2129 static const char fn[] = "/proc/net/route";
2130 FILE *stream;
2131 char line[256];
2132 int ln;
2133
2134 *netdev_name = NULL;
2135 stream = fopen(fn, "r");
2136 if (stream == NULL) {
2137 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2138 return errno;
2139 }
2140
2141 ln = 0;
2142 while (fgets(line, sizeof line, stream)) {
2143 if (++ln >= 2) {
2144 char iface[17];
dbba996b 2145 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2146 int refcnt, metric, mtu;
2147 unsigned int flags, use, window, irtt;
2148
2149 if (sscanf(line,
2150 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2151 " %d %u %u\n",
2152 iface, &dest, &gateway, &flags, &refcnt,
2153 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2154
d295e8e9 2155 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2156 fn, ln, line);
2157 continue;
2158 }
2159 if (!(flags & RTF_UP)) {
2160 /* Skip routes that aren't up. */
2161 continue;
2162 }
2163
2164 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2165 * network byte order, so we don't need need any endian
f1acd62b
BP
2166 * conversions here. */
2167 if ((dest & mask) == (host->s_addr & mask)) {
2168 if (!gateway) {
2169 /* The host is directly reachable. */
2170 next_hop->s_addr = 0;
2171 } else {
2172 /* To reach the host, we must go through a gateway. */
2173 next_hop->s_addr = gateway;
2174 }
2175 *netdev_name = xstrdup(iface);
2176 fclose(stream);
2177 return 0;
2178 }
2179 }
2180 }
2181
2182 fclose(stream);
2183 return ENXIO;
2184}
2185
e210037e
AE
/* Queries driver information for 'netdev' with ETHTOOL_GDRVINFO and, on
 * success, adds "driver_name", "driver_version" and "firmware_version"
 * entries (each a fresh xstrdup() copy) to 'sh'.
 *
 * Returns 0 on success, otherwise the error from netdev_linux_do_ethtool(),
 * in which case 'sh' is left untouched. */
static int
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
{
    struct ethtool_drvinfo info;
    int err;

    memset(&info, 0, sizeof info);
    err = netdev_linux_do_ethtool(netdev_get_name(netdev),
                                  (struct ethtool_cmd *)&info,
                                  ETHTOOL_GDRVINFO,
                                  "ETHTOOL_GDRVINFO");
    if (err) {
        return err;
    }

    shash_add(sh, "driver_name", xstrdup(info.driver));
    shash_add(sh, "driver_version", xstrdup(info.version));
    shash_add(sh, "firmware_version", xstrdup(info.fw_version));
    return 0;
}
2205
8b61709d
BP
2206/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2207 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2208 * returns 0. Otherwise, it returns a positive errno value; in particular,
2209 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2210static int
2211netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2212 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2213{
2214 struct arpreq r;
c100e025 2215 struct sockaddr_in sin;
8b61709d
BP
2216 int retval;
2217
2218 memset(&r, 0, sizeof r);
f2cc621b 2219 memset(&sin, 0, sizeof sin);
c100e025
BP
2220 sin.sin_family = AF_INET;
2221 sin.sin_addr.s_addr = ip;
2222 sin.sin_port = 0;
2223 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2224 r.arp_ha.sa_family = ARPHRD_ETHER;
2225 r.arp_flags = 0;
71d7c22f 2226 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2227 COVERAGE_INC(netdev_arp_lookup);
2228 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2229 if (!retval) {
2230 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2231 } else if (retval != ENXIO) {
2232 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2233 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2234 }
2235 return retval;
2236}
2237
2238static int
2239nd_to_iff_flags(enum netdev_flags nd)
2240{
2241 int iff = 0;
2242 if (nd & NETDEV_UP) {
2243 iff |= IFF_UP;
2244 }
2245 if (nd & NETDEV_PROMISC) {
2246 iff |= IFF_PROMISC;
2247 }
2248 return iff;
2249}
2250
2251static int
2252iff_to_nd_flags(int iff)
2253{
2254 enum netdev_flags nd = 0;
2255 if (iff & IFF_UP) {
2256 nd |= NETDEV_UP;
2257 }
2258 if (iff & IFF_PROMISC) {
2259 nd |= NETDEV_PROMISC;
2260 }
2261 return nd;
2262}
2263
2264static int
2265netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2266 enum netdev_flags on, enum netdev_flags *old_flagsp)
2267{
2268 int old_flags, new_flags;
2269 int error;
2270
2271 error = get_flags(netdev, &old_flags);
2272 if (!error) {
2273 *old_flagsp = iff_to_nd_flags(old_flags);
2274 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2275 if (new_flags != old_flags) {
2276 error = set_flags(netdev, new_flags);
2277 }
2278 }
2279 return error;
2280}
2281
ac4d3bcb
EJ
2282static unsigned int
2283netdev_linux_change_seq(const struct netdev *netdev)
2284{
2285 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2286}
2287
/* Expands to a brace-enclosed initializer used below for the
 * 'struct netdev_class' instances defined in this file.
 *
 * Only four slots vary between those instances: NAME (the class name string),
 * CREATE (the create function), and GET_STATS / SET_STATS (the statistics
 * functions).  Every other slot is fixed to the netdev_linux_*
 * implementation listed here, or to NULL where the operation is not
 * provided.  The entries are positional, so their order must match the member
 * order of the structure being initialized. */
ee9bed06 2288#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
c3827f61
BP
2289{ \
2290 NAME, \
2291 \
2292 netdev_linux_init, \
2293 netdev_linux_run, \
2294 netdev_linux_wait, \
2295 \
2296 CREATE, \
2297 netdev_linux_destroy, \
de5cdb90 2298 NULL, /* get_config */ \
6d9e6eb4 2299 NULL, /* set_config */ \
c3827f61
BP
2300 \
2301 netdev_linux_open, \
2302 netdev_linux_close, \
2303 \
7b6b0ef4 2304 netdev_linux_listen, \
c3827f61
BP
2305 netdev_linux_recv, \
2306 netdev_linux_recv_wait, \
2307 netdev_linux_drain, \
2308 \
2309 netdev_linux_send, \
2310 netdev_linux_send_wait, \
2311 \
2312 netdev_linux_set_etheraddr, \
2313 netdev_linux_get_etheraddr, \
2314 netdev_linux_get_mtu, \
9b020780 2315 netdev_linux_set_mtu, \
c3827f61
BP
2316 netdev_linux_get_ifindex, \
2317 netdev_linux_get_carrier, \
1670c579 2318 netdev_linux_set_miimon_interval, \
f613a0d7 2319 GET_STATS, \
c3827f61
BP
2320 SET_STATS, \
2321 \
2322 netdev_linux_get_features, \
2323 netdev_linux_set_advertisements, \
2324 netdev_linux_get_vlan_vid, \
2325 \
2326 netdev_linux_set_policing, \
2327 netdev_linux_get_qos_types, \
2328 netdev_linux_get_qos_capabilities, \
2329 netdev_linux_get_qos, \
2330 netdev_linux_set_qos, \
2331 netdev_linux_get_queue, \
2332 netdev_linux_set_queue, \
2333 netdev_linux_delete_queue, \
2334 netdev_linux_get_queue_stats, \
2335 netdev_linux_dump_queues, \
2336 netdev_linux_dump_queue_stats, \
2337 \
2338 netdev_linux_get_in4, \
2339 netdev_linux_set_in4, \
2340 netdev_linux_get_in6, \
2341 netdev_linux_add_router, \
2342 netdev_linux_get_next_hop, \
e210037e 2343 netdev_linux_get_status, \
c3827f61
BP
2344 netdev_linux_arp_lookup, \
2345 \
2346 netdev_linux_update_flags, \
2347 \
ac4d3bcb 2348 netdev_linux_change_seq \
c3827f61
BP
2349}
2350
/* The "system" netdev class: created with netdev_linux_create(), statistics
 * read with netdev_linux_get_stats(), and no way to set statistics. */
2351const struct netdev_class netdev_linux_class =
2352 NETDEV_LINUX_CLASS(
2353 "system",
2354 netdev_linux_create,
f613a0d7 2355 netdev_linux_get_stats,
98563392 2356 NULL); /* set_stats */
c3827f61
BP
2357
/* The "tap" netdev class: created with netdev_linux_create_tap(), statistics
 * read with netdev_pseudo_get_stats(), and no way to set statistics. */
2358const struct netdev_class netdev_tap_class =
2359 NETDEV_LINUX_CLASS(
2360 "tap",
2361 netdev_linux_create_tap,
f613a0d7 2362 netdev_pseudo_get_stats,
c3827f61
BP
2363 NULL); /* set_stats */
2364
/* The "internal" netdev class: created with netdev_linux_create(), statistics
 * read with netdev_pseudo_get_stats() and set with
 * netdev_vport_set_stats(). */
2365const struct netdev_class netdev_internal_class =
2366 NETDEV_LINUX_CLASS(
2367 "internal",
2368 netdev_linux_create,
f613a0d7 2369 netdev_pseudo_get_stats,
c3827f61 2370 netdev_vport_set_stats);
8b61709d 2371\f
c1c9c9c4 2372/* HTB traffic control class. */
559843ed 2373
/* Upper bound on HTB class minor numbers accepted as queues: a class 1:<minor>
 * with 0 < minor <= HTB_N_QUEUES maps to queue ID minor - 1. */
c1c9c9c4 2374#define HTB_N_QUEUES 0xf000
8b61709d 2375
c1c9c9c4
BP
/* Per-device HTB qdisc state.  The generic 'struct tc' is embedded so that
 * htb_get__() can recover this structure from a 'struct tc *' with
 * CONTAINER_OF. */
2376struct htb {
2377 struct tc tc;
2378 unsigned int max_rate; /* In bytes/s. */
2379};
8b61709d 2380
/* Parameters of one HTB class (queue).  The generic 'struct tc_queue' is
 * embedded so that htb_class_cast__() can recover this structure from a
 * 'struct tc_queue *' with CONTAINER_OF. */
c1c9c9c4 2381struct htb_class {
93b13be8 2382 struct tc_queue tc_queue;
c1c9c9c4
BP
2383 unsigned int min_rate; /* In bytes/s. */
2384 unsigned int max_rate; /* In bytes/s. */
2385 unsigned int burst; /* In bytes. */
2386 unsigned int priority; /* Lower values are higher priorities. */
2387};
8b61709d 2388
c1c9c9c4
BP
2389static struct htb *
2390htb_get__(const struct netdev *netdev)
2391{
2392 struct netdev_dev_linux *netdev_dev =
2393 netdev_dev_linux_cast(netdev_get_dev(netdev));
2394 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2395}
2396
24045e35 2397static void
c1c9c9c4
BP
2398htb_install__(struct netdev *netdev, uint64_t max_rate)
2399{
2400 struct netdev_dev_linux *netdev_dev =
2401 netdev_dev_linux_cast(netdev_get_dev(netdev));
2402 struct htb *htb;
2403
2404 htb = xmalloc(sizeof *htb);
2405 tc_init(&htb->tc, &tc_ops_htb);
2406 htb->max_rate = max_rate;
2407
2408 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2409}
2410
2411/* Create an HTB qdisc.
2412 *
a339aa81 2413 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2414static int
2415htb_setup_qdisc__(struct netdev *netdev)
2416{
2417 size_t opt_offset;
2418 struct tc_htb_glob opt;
2419 struct ofpbuf request;
2420 struct tcmsg *tcmsg;
2421
2422 tc_del_qdisc(netdev);
2423
2424 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2425 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2426 if (!tcmsg) {
2427 return ENODEV;
2428 }
c1c9c9c4
BP
2429 tcmsg->tcm_handle = tc_make_handle(1, 0);
2430 tcmsg->tcm_parent = TC_H_ROOT;
2431
2432 nl_msg_put_string(&request, TCA_KIND, "htb");
2433
2434 memset(&opt, 0, sizeof opt);
2435 opt.rate2quantum = 10;
2436 opt.version = 3;
4ecf12d5 2437 opt.defcls = 1;
c1c9c9c4
BP
2438
2439 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2440 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2441 nl_msg_end_nested(&request, opt_offset);
2442
2443 return tc_transact(&request, NULL);
2444}
2445
2446/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2447 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2448static int
2449htb_setup_class__(struct netdev *netdev, unsigned int handle,
2450 unsigned int parent, struct htb_class *class)
2451{
2452 size_t opt_offset;
2453 struct tc_htb_opt opt;
2454 struct ofpbuf request;
2455 struct tcmsg *tcmsg;
2456 int error;
2457 int mtu;
2458
9b020780
PS
2459 error = netdev_get_mtu(netdev, &mtu);
2460 if (error) {
f915f1a8
BP
2461 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2462 netdev_get_name(netdev));
9b020780 2463 return error;
f915f1a8 2464 }
c1c9c9c4
BP
2465
2466 memset(&opt, 0, sizeof opt);
2467 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2468 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2469 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2470 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2471 opt.prio = class->priority;
2472
2473 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2474 if (!tcmsg) {
2475 return ENODEV;
2476 }
c1c9c9c4
BP
2477 tcmsg->tcm_handle = handle;
2478 tcmsg->tcm_parent = parent;
2479
2480 nl_msg_put_string(&request, TCA_KIND, "htb");
2481 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2482 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2483 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2484 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2485 nl_msg_end_nested(&request, opt_offset);
2486
2487 error = tc_transact(&request, NULL);
2488 if (error) {
2489 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2490 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2491 netdev_get_name(netdev),
2492 tc_get_major(handle), tc_get_minor(handle),
2493 tc_get_major(parent), tc_get_minor(parent),
2494 class->min_rate, class->max_rate,
2495 class->burst, class->priority, strerror(error));
2496 }
2497 return error;
2498}
2499
2500/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2501 * description of them into 'details'. The description complies with the
2502 * specification given in the vswitch database documentation for linux-htb
2503 * queue details. */
2504static int
2505htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2506{
2507 static const struct nl_policy tca_htb_policy[] = {
2508 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2509 .min_len = sizeof(struct tc_htb_opt) },
2510 };
2511
2512 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2513 const struct tc_htb_opt *htb;
2514
2515 if (!nl_parse_nested(nl_options, tca_htb_policy,
2516 attrs, ARRAY_SIZE(tca_htb_policy))) {
2517 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2518 return EPROTO;
2519 }
2520
2521 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2522 class->min_rate = htb->rate.rate;
2523 class->max_rate = htb->ceil.rate;
2524 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2525 class->priority = htb->prio;
2526 return 0;
2527}
2528
2529static int
2530htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2531 struct htb_class *options,
2532 struct netdev_queue_stats *stats)
2533{
2534 struct nlattr *nl_options;
2535 unsigned int handle;
2536 int error;
2537
2538 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2539 if (!error && queue_id) {
17ee3c1f
BP
2540 unsigned int major = tc_get_major(handle);
2541 unsigned int minor = tc_get_minor(handle);
2542 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2543 *queue_id = minor - 1;
c1c9c9c4
BP
2544 } else {
2545 error = EPROTO;
2546 }
2547 }
2548 if (!error && options) {
2549 error = htb_parse_tca_options__(nl_options, options);
2550 }
2551 return error;
2552}
2553
2554static void
2555htb_parse_qdisc_details__(struct netdev *netdev,
2556 const struct shash *details, struct htb_class *hc)
2557{
2558 const char *max_rate_s;
2559
2560 max_rate_s = shash_find_data(details, "max-rate");
2561 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2562 if (!hc->max_rate) {
2563 uint32_t current;
2564
2565 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2566 hc->max_rate = netdev_features_to_bps(current) / 8;
2567 }
2568 hc->min_rate = hc->max_rate;
2569 hc->burst = 0;
2570 hc->priority = 0;
2571}
2572
2573static int
2574htb_parse_class_details__(struct netdev *netdev,
2575 const struct shash *details, struct htb_class *hc)
2576{
2577 const struct htb *htb = htb_get__(netdev);
2578 const char *min_rate_s = shash_find_data(details, "min-rate");
2579 const char *max_rate_s = shash_find_data(details, "max-rate");
2580 const char *burst_s = shash_find_data(details, "burst");
2581 const char *priority_s = shash_find_data(details, "priority");
9b020780 2582 int mtu, error;
c1c9c9c4 2583
9b020780
PS
2584 error = netdev_get_mtu(netdev, &mtu);
2585 if (error) {
f915f1a8
BP
2586 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2587 netdev_get_name(netdev));
9b020780 2588 return error;
f915f1a8
BP
2589 }
2590
4f104611
EJ
2591 /* HTB requires at least an mtu sized min-rate to send any traffic even
2592 * on uncongested links. */
c45ab5e9 2593 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2594 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2595 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2596
2597 /* max-rate */
2598 hc->max_rate = (max_rate_s
2599 ? strtoull(max_rate_s, NULL, 10) / 8
2600 : htb->max_rate);
2601 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2602 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2603
2604 /* burst
2605 *
2606 * According to hints in the documentation that I've read, it is important
2607 * that 'burst' be at least as big as the largest frame that might be
2608 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2609 * but having it a bit too small is a problem. Since netdev_get_mtu()
2610 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2611 * the MTU. We actually add 64, instead of 14, as a guard against
2612 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2613 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2614 hc->burst = MAX(hc->burst, mtu + 64);
2615
2616 /* priority */
2617 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2618
2619 return 0;
2620}
2621
2622static int
2623htb_query_class__(const struct netdev *netdev, unsigned int handle,
2624 unsigned int parent, struct htb_class *options,
2625 struct netdev_queue_stats *stats)
2626{
2627 struct ofpbuf *reply;
2628 int error;
2629
2630 error = tc_query_class(netdev, handle, parent, &reply);
2631 if (!error) {
2632 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2633 ofpbuf_delete(reply);
2634 }
2635 return error;
2636}
2637
2638static int
2639htb_tc_install(struct netdev *netdev, const struct shash *details)
2640{
2641 int error;
2642
2643 error = htb_setup_qdisc__(netdev);
2644 if (!error) {
2645 struct htb_class hc;
2646
2647 htb_parse_qdisc_details__(netdev, details, &hc);
2648 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2649 tc_make_handle(1, 0), &hc);
2650 if (!error) {
2651 htb_install__(netdev, hc.max_rate);
2652 }
2653 }
2654 return error;
2655}
2656
93b13be8
BP
2657static struct htb_class *
2658htb_class_cast__(const struct tc_queue *queue)
2659{
2660 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2661}
2662
c1c9c9c4
BP
2663static void
2664htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2665 const struct htb_class *hc)
2666{
2667 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2668 size_t hash = hash_int(queue_id, 0);
2669 struct tc_queue *queue;
c1c9c9c4
BP
2670 struct htb_class *hcp;
2671
93b13be8
BP
2672 queue = tc_find_queue__(netdev, queue_id, hash);
2673 if (queue) {
2674 hcp = htb_class_cast__(queue);
2675 } else {
c1c9c9c4 2676 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2677 queue = &hcp->tc_queue;
2678 queue->queue_id = queue_id;
2679 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2680 }
93b13be8
BP
2681
2682 hcp->min_rate = hc->min_rate;
2683 hcp->max_rate = hc->max_rate;
2684 hcp->burst = hc->burst;
2685 hcp->priority = hc->priority;
c1c9c9c4
BP
2686}
2687
2688static int
2689htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2690{
c1c9c9c4
BP
2691 struct ofpbuf msg;
2692 struct nl_dump dump;
2693 struct htb_class hc;
c1c9c9c4
BP
2694
2695 /* Get qdisc options. */
2696 hc.max_rate = 0;
2697 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2698 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2699
2700 /* Get queues. */
23a98ffe
BP
2701 if (!start_queue_dump(netdev, &dump)) {
2702 return ENODEV;
2703 }
c1c9c9c4
BP
2704 while (nl_dump_next(&dump, &msg)) {
2705 unsigned int queue_id;
2706
2707 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2708 htb_update_queue__(netdev, queue_id, &hc);
2709 }
2710 }
2711 nl_dump_done(&dump);
2712
2713 return 0;
2714}
2715
2716static void
2717htb_tc_destroy(struct tc *tc)
2718{
2719 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2720 struct htb_class *hc, *next;
c1c9c9c4 2721
4e8e4213 2722 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2723 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2724 free(hc);
2725 }
2726 tc_destroy(tc);
2727 free(htb);
2728}
2729
2730static int
2731htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2732{
2733 const struct htb *htb = htb_get__(netdev);
2734 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2735 return 0;
2736}
2737
2738static int
2739htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2740{
2741 struct htb_class hc;
2742 int error;
2743
2744 htb_parse_qdisc_details__(netdev, details, &hc);
2745 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2746 tc_make_handle(1, 0), &hc);
2747 if (!error) {
2748 htb_get__(netdev)->max_rate = hc.max_rate;
2749 }
2750 return error;
2751}
2752
2753static int
93b13be8
BP
2754htb_class_get(const struct netdev *netdev OVS_UNUSED,
2755 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2756{
93b13be8 2757 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2758
2759 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2760 if (hc->min_rate != hc->max_rate) {
2761 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2762 }
2763 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2764 if (hc->priority) {
2765 shash_add(details, "priority", xasprintf("%u", hc->priority));
2766 }
2767 return 0;
2768}
2769
2770static int
2771htb_class_set(struct netdev *netdev, unsigned int queue_id,
2772 const struct shash *details)
2773{
2774 struct htb_class hc;
2775 int error;
2776
2777 error = htb_parse_class_details__(netdev, details, &hc);
2778 if (error) {
2779 return error;
2780 }
2781
17ee3c1f 2782 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2783 tc_make_handle(1, 0xfffe), &hc);
2784 if (error) {
2785 return error;
2786 }
2787
2788 htb_update_queue__(netdev, queue_id, &hc);
2789 return 0;
2790}
2791
2792static int
93b13be8 2793htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2794{
93b13be8 2795 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2796 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2797 int error;
2798
93b13be8 2799 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2800 if (!error) {
93b13be8 2801 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2802 free(hc);
c1c9c9c4
BP
2803 }
2804 return error;
2805}
2806
2807static int
93b13be8 2808htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2809 struct netdev_queue_stats *stats)
2810{
93b13be8 2811 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2812 tc_make_handle(1, 0xfffe), NULL, stats);
2813}
2814
2815static int
2816htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2817 const struct ofpbuf *nlmsg,
2818 netdev_dump_queue_stats_cb *cb, void *aux)
2819{
2820 struct netdev_queue_stats stats;
17ee3c1f 2821 unsigned int handle, major, minor;
c1c9c9c4
BP
2822 int error;
2823
2824 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2825 if (error) {
2826 return error;
2827 }
2828
17ee3c1f
BP
2829 major = tc_get_major(handle);
2830 minor = tc_get_minor(handle);
2831 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2832 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2833 }
2834 return 0;
2835}
2836
/* Operations table for the HTB qdisc: its Linux and OVS names, the number of
 * queues it supports, and the htb_* callbacks defined above.  The entries are
 * positional, so their order must match the member order of
 * 'struct tc_ops'. */
2837static const struct tc_ops tc_ops_htb = {
2838 "htb", /* linux_name */
2839 "linux-htb", /* ovs_name */
2840 HTB_N_QUEUES, /* n_queues */
2841 htb_tc_install,
2842 htb_tc_load,
2843 htb_tc_destroy,
2844 htb_qdisc_get,
2845 htb_qdisc_set,
2846 htb_class_get,
2847 htb_class_set,
2848 htb_class_delete,
2849 htb_class_get_stats,
2850 htb_class_dump_stats
2851};
2852\f
a339aa81
EJ
2853/* "linux-hfsc" traffic control class. */
2854
/* Upper bound on HFSC class minor numbers accepted as queues: a class
 * 1:<minor> with 0 < minor <= HFSC_N_QUEUES maps to queue ID minor - 1. */
2855#define HFSC_N_QUEUES 0xf000
2856
/* Per-device HFSC qdisc state.  The generic 'struct tc' is embedded so that
 * hfsc_get__() can recover this structure from a 'struct tc *' with
 * CONTAINER_OF.  'max_rate' is presumably in bytes/s, as for HTB (the
 * "max-rate" detail is divided by 8 before being stored) -- TODO confirm. */
2857struct hfsc {
2858 struct tc tc;
2859 uint32_t max_rate;
2860};
2861
/* Parameters of one HFSC class (queue).  The generic 'struct tc_queue' is
 * embedded so that hfsc_class_cast__() can recover this structure from a
 * 'struct tc_queue *' with CONTAINER_OF.  When read back from the kernel,
 * 'min_rate' comes from the FSC curve's m2 and 'max_rate' from the USC
 * curve's m2. */
2862struct hfsc_class {
2863 struct tc_queue tc_queue;
2864 uint32_t min_rate;
2865 uint32_t max_rate;
2866};
2867
2868static struct hfsc *
2869hfsc_get__(const struct netdev *netdev)
2870{
2871 struct netdev_dev_linux *netdev_dev;
2872 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2873 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2874}
2875
2876static struct hfsc_class *
2877hfsc_class_cast__(const struct tc_queue *queue)
2878{
2879 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2880}
2881
24045e35 2882static void
a339aa81
EJ
2883hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2884{
2885 struct netdev_dev_linux * netdev_dev;
2886 struct hfsc *hfsc;
2887
2888 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2889 hfsc = xmalloc(sizeof *hfsc);
2890 tc_init(&hfsc->tc, &tc_ops_hfsc);
2891 hfsc->max_rate = max_rate;
2892 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
2893}
2894
2895static void
2896hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2897 const struct hfsc_class *hc)
2898{
2899 size_t hash;
2900 struct hfsc *hfsc;
2901 struct hfsc_class *hcp;
2902 struct tc_queue *queue;
2903
2904 hfsc = hfsc_get__(netdev);
2905 hash = hash_int(queue_id, 0);
2906
2907 queue = tc_find_queue__(netdev, queue_id, hash);
2908 if (queue) {
2909 hcp = hfsc_class_cast__(queue);
2910 } else {
2911 hcp = xmalloc(sizeof *hcp);
2912 queue = &hcp->tc_queue;
2913 queue->queue_id = queue_id;
2914 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2915 }
2916
2917 hcp->min_rate = hc->min_rate;
2918 hcp->max_rate = hc->max_rate;
2919}
2920
2921static int
2922hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2923{
2924 const struct tc_service_curve *rsc, *fsc, *usc;
2925 static const struct nl_policy tca_hfsc_policy[] = {
2926 [TCA_HFSC_RSC] = {
2927 .type = NL_A_UNSPEC,
2928 .optional = false,
2929 .min_len = sizeof(struct tc_service_curve),
2930 },
2931 [TCA_HFSC_FSC] = {
2932 .type = NL_A_UNSPEC,
2933 .optional = false,
2934 .min_len = sizeof(struct tc_service_curve),
2935 },
2936 [TCA_HFSC_USC] = {
2937 .type = NL_A_UNSPEC,
2938 .optional = false,
2939 .min_len = sizeof(struct tc_service_curve),
2940 },
2941 };
2942 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2943
2944 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2945 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2946 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2947 return EPROTO;
2948 }
2949
2950 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2951 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2952 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2953
2954 if (rsc->m1 != 0 || rsc->d != 0 ||
2955 fsc->m1 != 0 || fsc->d != 0 ||
2956 usc->m1 != 0 || usc->d != 0) {
2957 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2958 "Non-linear service curves are not supported.");
2959 return EPROTO;
2960 }
2961
2962 if (rsc->m2 != fsc->m2) {
2963 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2964 "Real-time service curves are not supported ");
2965 return EPROTO;
2966 }
2967
2968 if (rsc->m2 > usc->m2) {
2969 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2970 "Min-rate service curve is greater than "
2971 "the max-rate service curve.");
2972 return EPROTO;
2973 }
2974
2975 class->min_rate = fsc->m2;
2976 class->max_rate = usc->m2;
2977 return 0;
2978}
2979
2980static int
2981hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2982 struct hfsc_class *options,
2983 struct netdev_queue_stats *stats)
2984{
2985 int error;
2986 unsigned int handle;
2987 struct nlattr *nl_options;
2988
2989 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2990 if (error) {
2991 return error;
2992 }
2993
2994 if (queue_id) {
2995 unsigned int major, minor;
2996
2997 major = tc_get_major(handle);
2998 minor = tc_get_minor(handle);
2999 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3000 *queue_id = minor - 1;
3001 } else {
3002 return EPROTO;
3003 }
3004 }
3005
3006 if (options) {
3007 error = hfsc_parse_tca_options__(nl_options, options);
3008 }
3009
3010 return error;
3011}
3012
3013static int
3014hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
3015 unsigned int parent, struct hfsc_class *options,
3016 struct netdev_queue_stats *stats)
3017{
3018 int error;
3019 struct ofpbuf *reply;
3020
3021 error = tc_query_class(netdev, handle, parent, &reply);
3022 if (error) {
3023 return error;
3024 }
3025
3026 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
3027 ofpbuf_delete(reply);
3028 return error;
3029}
3030
3031static void
3032hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
3033 struct hfsc_class *class)
3034{
3035 uint32_t max_rate;
3036 const char *max_rate_s;
3037
3038 max_rate_s = shash_find_data(details, "max-rate");
3039 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3040
3041 if (!max_rate) {
3042 uint32_t current;
3043
3044 netdev_get_features(netdev, &current, NULL, NULL, NULL);
3045 max_rate = netdev_features_to_bps(current) / 8;
3046 }
3047
3048 class->min_rate = max_rate;
3049 class->max_rate = max_rate;
3050}
3051
3052static int
3053hfsc_parse_class_details__(struct netdev *netdev,
3054 const struct shash *details,
3055 struct hfsc_class * class)
3056{
3057 const struct hfsc *hfsc;
3058 uint32_t min_rate, max_rate;
3059 const char *min_rate_s, *max_rate_s;
3060
3061 hfsc = hfsc_get__(netdev);
3062 min_rate_s = shash_find_data(details, "min-rate");
3063 max_rate_s = shash_find_data(details, "max-rate");
3064
c45ab5e9 3065 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3066 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3067 min_rate = MIN(min_rate, hfsc->max_rate);
3068
3069 max_rate = (max_rate_s
3070 ? strtoull(max_rate_s, NULL, 10) / 8
3071 : hfsc->max_rate);
3072 max_rate = MAX(max_rate, min_rate);
3073 max_rate = MIN(max_rate, hfsc->max_rate);
3074
3075 class->min_rate = min_rate;
3076 class->max_rate = max_rate;
3077
3078 return 0;
3079}
3080
3081/* Create an HFSC qdisc.
3082 *
3083 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3084static int
3085hfsc_setup_qdisc__(struct netdev * netdev)
3086{
3087 struct tcmsg *tcmsg;
3088 struct ofpbuf request;
3089 struct tc_hfsc_qopt opt;
3090
3091 tc_del_qdisc(netdev);
3092
3093 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3094 NLM_F_EXCL | NLM_F_CREATE, &request);
3095
3096 if (!tcmsg) {
3097 return ENODEV;
3098 }
3099
3100 tcmsg->tcm_handle = tc_make_handle(1, 0);
3101 tcmsg->tcm_parent = TC_H_ROOT;
3102
3103 memset(&opt, 0, sizeof opt);
3104 opt.defcls = 1;
3105
3106 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3107 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3108
3109 return tc_transact(&request, NULL);
3110}
3111
3112/* Create an HFSC class.
3113 *
3114 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3115 * sc rate <min_rate> ul rate <max_rate>" */
3116static int
3117hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3118 unsigned int parent, struct hfsc_class *class)
3119{
3120 int error;
3121 size_t opt_offset;
3122 struct tcmsg *tcmsg;
3123 struct ofpbuf request;
3124 struct tc_service_curve min, max;
3125
3126 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3127
3128 if (!tcmsg) {
3129 return ENODEV;
3130 }
3131
3132 tcmsg->tcm_handle = handle;
3133 tcmsg->tcm_parent = parent;
3134
3135 min.m1 = 0;
3136 min.d = 0;
3137 min.m2 = class->min_rate;
3138
3139 max.m1 = 0;
3140 max.d = 0;
3141 max.m2 = class->max_rate;
3142
3143 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3144 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3145 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3146 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3147 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3148 nl_msg_end_nested(&request, opt_offset);
3149
3150 error = tc_transact(&request, NULL);
3151 if (error) {
3152 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3153 "min-rate %ubps, max-rate %ubps (%s)",
3154 netdev_get_name(netdev),
3155 tc_get_major(handle), tc_get_minor(handle),
3156 tc_get_major(parent), tc_get_minor(parent),
3157 class->min_rate, class->max_rate, strerror(error));
3158 }
3159
3160 return error;
3161}
3162
3163static int
3164hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3165{
3166 int error;
3167 struct hfsc_class class;
3168
3169 error = hfsc_setup_qdisc__(netdev);
3170
3171 if (error) {
3172 return error;
3173 }
3174
3175 hfsc_parse_qdisc_details__(netdev, details, &class);
3176 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3177 tc_make_handle(1, 0), &class);
3178
3179 if (error) {
3180 return error;
3181 }
3182
3183 hfsc_install__(netdev, class.max_rate);
3184 return 0;
3185}
3186
3187static int
3188hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3189{
3190 struct ofpbuf msg;
a339aa81
EJ
3191 struct nl_dump dump;
3192 struct hfsc_class hc;
3193
3194 hc.max_rate = 0;
3195 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3196 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3197
3198 if (!start_queue_dump(netdev, &dump)) {
3199 return ENODEV;
3200 }
3201
3202 while (nl_dump_next(&dump, &msg)) {
3203 unsigned int queue_id;
3204
3205 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3206 hfsc_update_queue__(netdev, queue_id, &hc);
3207 }
3208 }
3209
3210 nl_dump_done(&dump);
3211 return 0;
3212}
3213
3214static void
3215hfsc_tc_destroy(struct tc *tc)
3216{
3217 struct hfsc *hfsc;
3218 struct hfsc_class *hc, *next;
3219
3220 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3221
3222 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3223 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3224 free(hc);
3225 }
3226
3227 tc_destroy(tc);
3228 free(hfsc);
3229}
3230
3231static int
3232hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3233{
3234 const struct hfsc *hfsc;
3235 hfsc = hfsc_get__(netdev);
3236 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3237 return 0;
3238}
3239
3240static int
3241hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3242{
3243 int error;
3244 struct hfsc_class class;
3245
3246 hfsc_parse_qdisc_details__(netdev, details, &class);
3247 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3248 tc_make_handle(1, 0), &class);
3249
3250 if (!error) {
3251 hfsc_get__(netdev)->max_rate = class.max_rate;
3252 }
3253
3254 return error;
3255}
3256
3257static int
3258hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3259 const struct tc_queue *queue, struct shash *details)
3260{
3261 const struct hfsc_class *hc;
3262
3263 hc = hfsc_class_cast__(queue);
3264 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3265 if (hc->min_rate != hc->max_rate) {
3266 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3267 }
3268 return 0;
3269}
3270
3271static int
3272hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3273 const struct shash *details)
3274{
3275 int error;
3276 struct hfsc_class class;
3277
3278 error = hfsc_parse_class_details__(netdev, details, &class);
3279 if (error) {
3280 return error;
3281 }
3282
3283 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3284 tc_make_handle(1, 0xfffe), &class);
3285 if (error) {
3286 return error;
3287 }
3288
3289 hfsc_update_queue__(netdev, queue_id, &class);
3290 return 0;
3291}
3292
3293static int
3294hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3295{
3296 int error;
3297 struct hfsc *hfsc;
3298 struct hfsc_class *hc;
3299
3300 hc = hfsc_class_cast__(queue);
3301 hfsc = hfsc_get__(netdev);
3302
3303 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3304 if (!error) {
3305 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3306 free(hc);
3307 }
3308 return error;
3309}
3310
3311static int
3312hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3313 struct netdev_queue_stats *stats)
3314{
3315 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3316 tc_make_handle(1, 0xfffe), NULL, stats);
3317}
3318
3319static int
3320hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3321 const struct ofpbuf *nlmsg,
3322 netdev_dump_queue_stats_cb *cb, void *aux)
3323{
3324 struct netdev_queue_stats stats;
3325 unsigned int handle, major, minor;
3326 int error;
3327
3328 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3329 if (error) {
3330 return error;
3331 }
3332
3333 major = tc_get_major(handle);
3334 minor = tc_get_minor(handle);
3335 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3336 (*cb)(minor - 1, &stats, aux);
3337 }
3338 return 0;
3339}
3340
3341static const struct tc_ops tc_ops_hfsc = {
3342 "hfsc", /* linux_name */
3343 "linux-hfsc", /* ovs_name */
3344 HFSC_N_QUEUES, /* n_queues */
3345 hfsc_tc_install, /* tc_install */
3346 hfsc_tc_load, /* tc_load */
3347 hfsc_tc_destroy, /* tc_destroy */
3348 hfsc_qdisc_get, /* qdisc_get */
3349 hfsc_qdisc_set, /* qdisc_set */
3350 hfsc_class_get, /* class_get */
3351 hfsc_class_set, /* class_set */
3352 hfsc_class_delete, /* class_delete */
3353 hfsc_class_get_stats, /* class_get_stats */
3354 hfsc_class_dump_stats /* class_dump_stats */
3355};
3356\f
c1c9c9c4
BP
3357/* "linux-default" traffic control class.
3358 *
3359 * This class represents the default, unnamed Linux qdisc. It corresponds to
3360 * the "" (empty string) QoS type in the OVS database. */
3361
3362static void
3363default_install__(struct netdev *netdev)
3364{
3365 struct netdev_dev_linux *netdev_dev =
3366 netdev_dev_linux_cast(netdev_get_dev(netdev));
3367 static struct tc *tc;
3368
3369 if (!tc) {
3370 tc = xmalloc(sizeof *tc);
3371 tc_init(tc, &tc_ops_default);
3372 }
3373 netdev_dev->tc = tc;
3374}
3375
3376static int
3377default_tc_install(struct netdev *netdev,
3378 const struct shash *details OVS_UNUSED)
3379{
3380 default_install__(netdev);
3381 return 0;
3382}
3383
3384static int
3385default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3386{
3387 default_install__(netdev);
3388 return 0;
3389}
3390
3391static const struct tc_ops tc_ops_default = {
3392 NULL, /* linux_name */
3393 "", /* ovs_name */
3394 0, /* n_queues */
3395 default_tc_install,
3396 default_tc_load,
3397 NULL, /* tc_destroy */
3398 NULL, /* qdisc_get */
3399 NULL, /* qdisc_set */
3400 NULL, /* class_get */
3401 NULL, /* class_set */
3402 NULL, /* class_delete */
3403 NULL, /* class_get_stats */
3404 NULL /* class_dump_stats */
3405};
3406\f
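/* "linux-other" traffic control class.
 *
 * Stand-in used by tc_query_qdisc() when a device's qdisc cannot be queried
 * or parsed, or when its kind is not one that this module knows how to
 * manage.  It has no queues and supports loading only: it cannot be installed
 * or reconfigured. */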
3410
3411static int
3412other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3413{
3414 struct netdev_dev_linux *netdev_dev =
3415 netdev_dev_linux_cast(netdev_get_dev(netdev));
3416 static struct tc *tc;
3417
3418 if (!tc) {
3419 tc = xmalloc(sizeof *tc);
3420 tc_init(tc, &tc_ops_other);
3421 }
3422 netdev_dev->tc = tc;
3423 return 0;
3424}
3425
3426static const struct tc_ops tc_ops_other = {
3427 NULL, /* linux_name */
3428 "linux-other", /* ovs_name */
3429 0, /* n_queues */
3430 NULL, /* tc_install */
3431 other_tc_load,
3432 NULL, /* tc_destroy */
3433 NULL, /* qdisc_get */
3434 NULL, /* qdisc_set */
3435 NULL, /* class_get */
3436 NULL, /* class_set */
3437 NULL, /* class_delete */
3438 NULL, /* class_get_stats */
3439 NULL /* class_dump_stats */
3440};
3441\f
3442/* Traffic control. */
3443
3444/* Number of kernel "tc" ticks per second. */
3445static double ticks_per_s;
3446
3447/* Number of kernel "jiffies" per second. This is used for the purpose of
3448 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3449 * one jiffy's worth of data.
3450 *
3451 * There are two possibilities here:
3452 *
3453 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3454 * approximate range of 100 to 1024. That means that we really need to
3455 * make sure that the qdisc can buffer that much data.
3456 *
3457 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3458 * has finely granular timers and there's no need to fudge additional room
3459 * for buffers. (There's no extra effort needed to implement that: the
3460 * large 'buffer_hz' is used as a divisor, so practically any number will
3461 * come out as 0 in the division. Small integer results in the case of
3462 * really high dividends won't have any real effect anyhow.)
3463 */
3464static unsigned int buffer_hz;
3465
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3472
/* Extracts the major number (the upper 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3479
/* Extracts the minor number (the lower 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3486
3487static struct tcmsg *
3488tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3489 struct ofpbuf *request)
3490{
3491 struct tcmsg *tcmsg;
3492 int ifindex;
3493 int error;
3494
3495 error = get_ifindex(netdev, &ifindex);
3496 if (error) {
3497 return NULL;
3498 }
3499
3500 ofpbuf_init(request, 512);
3501 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3502 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3503 tcmsg->tcm_family = AF_UNSPEC;
3504 tcmsg->tcm_ifindex = ifindex;
3505 /* Caller should fill in tcmsg->tcm_handle. */
3506 /* Caller should fill in tcmsg->tcm_parent. */
3507
3508 return tcmsg;
3509}
3510
3511static int
3512tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3513{
3514 int error = nl_sock_transact(rtnl_sock, request, replyp);
3515 ofpbuf_uninit(request);
3516 return error;
3517}
3518
3519static void
3520read_psched(void)
3521{
3522 /* The values in psched are not individually very meaningful, but they are
3523 * important. The tables below show some values seen in the wild.
3524 *
3525 * Some notes:
3526 *
3527 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3528 * (Before that, there are hints that it was 1000000000.)
3529 *
3530 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3531 * above.
3532 *
3533 * /proc/net/psched
3534 * -----------------------------------
3535 * [1] 000c8000 000f4240 000f4240 00000064
3536 * [2] 000003e8 00000400 000f4240 3b9aca00
3537 * [3] 000003e8 00000400 000f4240 3b9aca00
3538 * [4] 000003e8 00000400 000f4240 00000064
3539 * [5] 000003e8 00000040 000f4240 3b9aca00
3540 * [6] 000003e8 00000040 000f4240 000000f9
3541 *
3542 * a b c d ticks_per_s buffer_hz
3543 * ------- --------- ---------- ------------- ----------- -------------
3544 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3545 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3546 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3547 * [4] 1,000 1,024 1,000,000 100 976,562 100
3548 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3549 * [6] 1,000 64 1,000,000 249 15,625,000 249
3550 *
3551 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3552 * [2] 2.6.26-1-686-bigmem from Debian lenny
3553 * [3] 2.6.26-2-sparc64 from Debian lenny
3554 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3555 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3556 * [6] 2.6.34 from kernel.org on KVM
3557 */
3558 static const char fn[] = "/proc/net/psched";
3559 unsigned int a, b, c, d;
3560 FILE *stream;
3561
3562 ticks_per_s = 1.0;
3563 buffer_hz = 100;
3564
3565 stream = fopen(fn, "r");
3566 if (!stream) {
3567 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3568 return;
3569 }
3570
3571 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3572 VLOG_WARN("%s: read failed", fn);
3573 fclose(stream);
3574 return;
3575 }
3576 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3577 fclose(stream);
3578
3579 if (!a || !c) {
3580 VLOG_WARN("%s: invalid scheduler parameters", fn);
3581 return;
3582 }
3583
3584 ticks_per_s = (double) a * c / b;
3585 if (c == 1000000) {
3586 buffer_hz = d;
3587 } else {
3588 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3589 fn, a, b, c, d);
3590 }
3591 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3592}
3593
3594/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3595 * rate of 'rate' bytes per second. */
3596static unsigned int
3597tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3598{
3599 if (!buffer_hz) {
3600 read_psched();
3601 }
3602 return (rate * ticks) / ticks_per_s;
3603}
3604
3605/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3606 * rate of 'rate' bytes per second. */
3607static unsigned int
3608tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3609{
3610 if (!buffer_hz) {
3611 read_psched();
3612 }
015c93a4 3613 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3614}
3615
3616/* Returns the number of bytes that need to be reserved for qdisc buffering at
3617 * a transmission rate of 'rate' bytes per second. */
3618static unsigned int
3619tc_buffer_per_jiffy(unsigned int rate)
3620{
3621 if (!buffer_hz) {
3622 read_psched();
3623 }
3624 return rate / buffer_hz;
3625}
3626
3627/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3628 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3629 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3630 * stores NULL into it if it is absent.
3631 *
3632 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3633 * 'msg'.
3634 *
3635 * Returns 0 if successful, otherwise a positive errno value. */
3636static int
3637tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3638 struct nlattr **options)
3639{
3640 static const struct nl_policy tca_policy[] = {
3641 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3642 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3643 };
3644 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3645
3646 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3647 tca_policy, ta, ARRAY_SIZE(ta))) {
3648 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3649 goto error;
3650 }
3651
3652 if (kind) {
3653 *kind = nl_attr_get_string(ta[TCA_KIND]);
3654 }
3655
3656 if (options) {
3657 *options = ta[TCA_OPTIONS];
3658 }
3659
3660 return 0;
3661
3662error:
3663 if (kind) {
3664 *kind = NULL;
3665 }
3666 if (options) {
3667 *options = NULL;
3668 }
3669 return EPROTO;
3670}
3671
3672/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3673 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3674 * into '*options', and its queue statistics into '*stats'. Any of the output
3675 * arguments may be null.
3676 *
3677 * Returns 0 if successful, otherwise a positive errno value. */
3678static int
3679tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3680 struct nlattr **options, struct netdev_queue_stats *stats)
3681{
3682 static const struct nl_policy tca_policy[] = {
3683 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3684 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3685 };
3686 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3687
3688 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3689 tca_policy, ta, ARRAY_SIZE(ta))) {
3690 VLOG_WARN_RL(&rl, "failed to parse class message");
3691 goto error;
3692 }
3693
3694 if (handlep) {
3695 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3696 *handlep = tc->tcm_handle;
3697 }
3698
3699 if (options) {
3700 *options = ta[TCA_OPTIONS];
3701 }
3702
3703 if (stats) {
3704 const struct gnet_stats_queue *gsq;
3705 struct gnet_stats_basic gsb;
3706
3707 static const struct nl_policy stats_policy[] = {
3708 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3709 .min_len = sizeof gsb },
3710 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3711 .min_len = sizeof *gsq },
3712 };
3713 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3714
3715 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3716 sa, ARRAY_SIZE(sa))) {
3717 VLOG_WARN_RL(&rl, "failed to parse class stats");
3718 goto error;
3719 }
3720
3721 /* Alignment issues screw up the length of struct gnet_stats_basic on
3722 * some arch/bitsize combinations. Newer versions of Linux have a
3723 * struct gnet_stats_basic_packed, but we can't depend on that. The
3724 * easiest thing to do is just to make a copy. */
3725 memset(&gsb, 0, sizeof gsb);
3726 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3727 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3728 stats->tx_bytes = gsb.bytes;
3729 stats->tx_packets = gsb.packets;
3730
3731 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3732 stats->tx_errors = gsq->drops;
3733 }
3734
3735 return 0;
3736
3737error:
3738 if (options) {
3739 *options = NULL;
3740 }
3741 if (stats) {
3742 memset(stats, 0, sizeof *stats);
3743 }
3744 return EPROTO;
3745}
3746
3747/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3748 * on 'netdev'. */
3749static int
3750tc_query_class(const struct netdev *netdev,
3751 unsigned int handle, unsigned int parent,
3752 struct ofpbuf **replyp)
3753{
3754 struct ofpbuf request;
3755 struct tcmsg *tcmsg;
3756 int error;
3757
3758 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3759 if (!tcmsg) {
3760 return ENODEV;
3761 }
c1c9c9c4
BP
3762 tcmsg->tcm_handle = handle;
3763 tcmsg->tcm_parent = parent;
3764
3765 error = tc_transact(&request, replyp);
3766 if (error) {
3767 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3768 netdev_get_name(netdev),
3769 tc_get_major(handle), tc_get_minor(handle),
3770 tc_get_major(parent), tc_get_minor(parent),
3771 strerror(error));
3772 }
3773 return error;
3774}
3775
3776/* Equivalent to "tc class del dev <name> handle <handle>". */
3777static int
3778tc_delete_class(const struct netdev *netdev, unsigned int handle)
3779{
3780 struct ofpbuf request;
3781 struct tcmsg *tcmsg;
3782 int error;
3783
3784 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3785 if (!tcmsg) {
3786 return ENODEV;
3787 }
c1c9c9c4
BP
3788 tcmsg->tcm_handle = handle;
3789 tcmsg->tcm_parent = 0;
3790
3791 error = tc_transact(&request, NULL);
3792 if (error) {
3793 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3794 netdev_get_name(netdev),
3795 tc_get_major(handle), tc_get_minor(handle),
3796 strerror(error));
3797 }
3798 return error;
3799}
3800
3801/* Equivalent to "tc qdisc del dev <name> root". */
3802static int
3803tc_del_qdisc(struct netdev *netdev)
3804{
3805 struct netdev_dev_linux *netdev_dev =
3806 netdev_dev_linux_cast(netdev_get_dev(netdev));
3807 struct ofpbuf request;
3808 struct tcmsg *tcmsg;
3809 int error;
3810
3811 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3812 if (!tcmsg) {
3813 return ENODEV;
3814 }
c1c9c9c4
BP
3815 tcmsg->tcm_handle = tc_make_handle(1, 0);
3816 tcmsg->tcm_parent = TC_H_ROOT;
3817
3818 error = tc_transact(&request, NULL);
3819 if (error == EINVAL) {
3820 /* EINVAL probably means that the default qdisc was in use, in which
3821 * case we've accomplished our purpose. */
3822 error = 0;
3823 }
3824 if (!error && netdev_dev->tc) {
3825 if (netdev_dev->tc->ops->tc_destroy) {
3826 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3827 }
3828 netdev_dev->tc = NULL;
3829 }
3830 return error;
3831}
3832
3833/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3834 * kernel to determine what they are. Returns 0 if successful, otherwise a
3835 * positive errno value. */
3836static int
3837tc_query_qdisc(const struct netdev *netdev)
3838{
3839 struct netdev_dev_linux *netdev_dev =
3840 netdev_dev_linux_cast(netdev_get_dev(netdev));
3841 struct ofpbuf request, *qdisc;
3842 const struct tc_ops *ops;
3843 struct tcmsg *tcmsg;
3844 int load_error;
3845 int error;
3846
3847 if (netdev_dev->tc) {
3848 return 0;
3849 }
3850
3851 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3852 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3853 * 2.6.35 without that fix backported to it.
3854 *
3855 * To avoid the OOPS, we must not make a request that would attempt to dump
3856 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3857 * few others. There are a few ways that I can see to do this, but most of
3858 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3859 * technique chosen here is to assume that any non-default qdisc that we
3860 * create will have a class with handle 1:0. The built-in qdiscs only have
3861 * a class with handle 0:0.
3862 *
3863 * We could check for Linux 2.6.35+ and use a more straightforward method
3864 * there. */
3865 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3866 if (!tcmsg) {
3867 return ENODEV;
3868 }
c1c9c9c4
BP
3869 tcmsg->tcm_handle = tc_make_handle(1, 0);
3870 tcmsg->tcm_parent = 0;
3871
3872 /* Figure out what tc class to instantiate. */
3873 error = tc_transact(&request, &qdisc);
3874 if (!error) {
3875 const char *kind;
3876
3877 error = tc_parse_qdisc(qdisc, &kind, NULL);
3878 if (error) {
3879 ops = &tc_ops_other;
3880 } else {
3881 ops = tc_lookup_linux_name(kind);
3882 if (!ops) {
3883 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3884 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3885
3886 ops = &tc_ops_other;
3887 }
3888 }
3889 } else if (error == ENOENT) {
3890 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3891 * other entity that doesn't have a handle 1:0. We will assume
3892 * that it's the system default qdisc. */
3893 ops = &tc_ops_default;
3894 error = 0;
3895 } else {
3896 /* Who knows? Maybe the device got deleted. */
3897 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3898 netdev_get_name(netdev), strerror(error));
3899 ops = &tc_ops_other;
3900 }
3901
3902 /* Instantiate it. */
3903 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3904 assert((load_error == 0) == (netdev_dev->tc != NULL));
3905 ofpbuf_delete(qdisc);
3906
3907 return error ? error : load_error;
3908}
3909
3910/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3911 approximate the time to transmit packets of various lengths. For an MTU of
3912 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3913 represents two possible packet lengths; for a MTU of 513 through 1024, four
3914 possible lengths; and so on.
3915
3916 Returns, for the specified 'mtu', the number of bits that packet lengths
3917 need to be shifted right to fit within such a 256-entry table. */
3918static int
3919tc_calc_cell_log(unsigned int mtu)
3920{
3921 int cell_log;
3922
3923 if (!mtu) {
3924 mtu = ETH_PAYLOAD_MAX;
3925 }
3926 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3927
3928 for (cell_log = 0; mtu >= 256; cell_log++) {
3929 mtu >>= 1;
3930 }
3931
3932 return cell_log;
3933}
3934
3935/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3936 * of 'mtu'. */
3937static void
3938tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3939{
3940 memset(rate, 0, sizeof *rate);
3941 rate->cell_log = tc_calc_cell_log(mtu);
3942 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3943 /* rate->cell_align = 0; */ /* distro headers. */
3944 rate->mpu = ETH_TOTAL_MIN;
3945 rate->rate = Bps;
3946}
3947
3948/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3949 * attribute of the specified "type".
3950 *
3951 * See tc_calc_cell_log() above for a description of "rtab"s. */
3952static void
3953tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3954{
3955 uint32_t *rtab;
3956 unsigned int i;
3957
3958 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3959 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3960 unsigned packet_size = (i + 1) << rate->cell_log;
3961 if (packet_size < rate->mpu) {
3962 packet_size = rate->mpu;
3963 }
3964 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3965 }
3966}
3967
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus 'mtu'; the result is that burst converted to ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (!(burst > min_burst)) {
        burst = min_burst;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 3978\f
d3980822 3979/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 3980static void
d3980822
BP
3981netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3982 const struct rtnl_link_stats *src)
3983{
f613a0d7
PS
3984 dst->rx_packets = src->rx_packets;
3985 dst->tx_packets = src->tx_packets;
3986 dst->rx_bytes = src->rx_bytes;
3987 dst->tx_bytes = src->tx_bytes;
3988 dst->rx_errors = src->rx_errors;
3989 dst->tx_errors = src->tx_errors;
3990 dst->rx_dropped = src->rx_dropped;
3991 dst->tx_dropped = src->tx_dropped;
3992 dst->multicast = src->multicast;
3993 dst->collisions = src->collisions;
3994 dst->rx_length_errors = src->rx_length_errors;
3995 dst->rx_over_errors = src->rx_over_errors;
3996 dst->rx_crc_errors = src->rx_crc_errors;
3997 dst->rx_frame_errors = src->rx_frame_errors;
3998 dst->rx_fifo_errors = src->rx_fifo_errors;
3999 dst->rx_missed_errors = src->rx_missed_errors;
4000 dst->tx_aborted_errors = src->tx_aborted_errors;
4001 dst->tx_carrier_errors = src->tx_carrier_errors;
4002 dst->tx_fifo_errors = src->tx_fifo_errors;
4003 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4004 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4005}
4006
c1c9c9c4
BP
4007\f
4008/* Utility functions. */
4009
4010static int
4011get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4012{
4013 /* Policy for RTNLGRP_LINK messages.
4014 *
4015 * There are *many* more fields in these messages, but currently we only
4016 * care about these fields. */
4017 static const struct nl_policy rtnlgrp_link_policy[] = {
4018 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4019 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4020 .min_len = sizeof(struct rtnl_link_stats) },
4021 };
4022
4023 struct ofpbuf request;
4024 struct ofpbuf *reply;
4025 struct ifinfomsg *ifi;
c1c9c9c4
BP
4026 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4027 int error;
4028
4029 ofpbuf_init(&request, 0);
4030 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4031 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4032 ifi->ifi_family = PF_UNSPEC;
4033 ifi->ifi_index = ifindex;
4034 error = nl_sock_transact(rtnl_sock, &request, &reply);
4035 ofpbuf_uninit(&request);
4036 if (error) {
4037 return error;
4038 }
4039
4040 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4041 rtnlgrp_link_policy,
4042 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4043 ofpbuf_delete(reply);
4044 return EPROTO;
4045 }
4046
4047 if (!attrs[IFLA_STATS]) {
4048 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4049 ofpbuf_delete(reply);
4050 return EPROTO;
4051 }
8b61709d 4052
d3980822 4053 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4054
576e26d7
BP
4055 ofpbuf_delete(reply);
4056
8b61709d
BP
4057 return 0;
4058}
4059
4060static int
4061get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4062{
4063 static const char fn[] = "/proc/net/dev";
4064 char line[1024];
4065 FILE *stream;
4066 int ln;
4067
4068 stream = fopen(fn, "r");
4069 if (!stream) {
4070 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4071 return errno;
4072 }
4073
4074 ln = 0;
4075 while (fgets(line, sizeof line, stream)) {
4076 if (++ln >= 3) {
4077 char devname[16];
4078#define X64 "%"SCNu64
4079 if (sscanf(line,
4080 " %15[^:]:"
4081 X64 X64 X64 X64 X64 X64 X64 "%*u"
4082 X64 X64 X64 X64 X64 X64 X64 "%*u",
4083 devname,
4084 &stats->rx_bytes,
4085 &stats->rx_packets,
4086 &stats->rx_errors,
4087 &stats->rx_dropped,
4088 &stats->rx_fifo_errors,
4089 &stats->rx_frame_errors,
4090 &stats->multicast,
4091 &stats->tx_bytes,
4092 &stats->tx_packets,
4093 &stats->tx_errors,
4094 &stats->tx_dropped,
4095 &stats->tx_fifo_errors,
4096 &stats->collisions,
4097 &stats->tx_carrier_errors) != 15) {
4098 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4099 } else if (!strcmp(devname, netdev_name)) {
4100 stats->rx_length_errors = UINT64_MAX;
4101 stats->rx_over_errors = UINT64_MAX;
4102 stats->rx_crc_errors = UINT64_MAX;
4103 stats->rx_missed_errors = UINT64_MAX;
4104 stats->tx_aborted_errors = UINT64_MAX;
4105 stats->tx_heartbeat_errors = UINT64_MAX;
4106 stats->tx_window_errors = UINT64_MAX;
4107 fclose(stream);
4108 return 0;
4109 }
4110 }
4111 }
4112 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4113 fclose(stream);
4114 return ENODEV;
4115}
c1c9c9c4 4116
3a183124
EJ
4117static int
4118get_carrier_via_sysfs(const char *name, bool *carrier)
4119{
4120 char line[8];
4121 int retval;
4122
4123 int error = 0;
4124 char *fn = NULL;
4125 int fd = -1;
4126
4127 *carrier = false;
4128
4129 fn = xasprintf("/sys/class/net/%s/carrier", name);
4130 fd = open(fn, O_RDONLY);
4131 if (fd < 0) {
4132 error = errno;
4133 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4134 goto exit;
4135 }
4136
4137 retval = read(fd, line, sizeof line);
4138 if (retval < 0) {
4139 error = errno;
4140 if (error == EINVAL) {
4141 /* This is the normal return value when we try to check carrier if
4142 * the network device is not up. */
4143 } else {
4144 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4145 }
4146 goto exit;
4147 } else if (retval == 0) {
4148 error = EPROTO;
4149 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
4150 goto exit;
4151 }
4152
4153 if (line[0] != '0' && line[0] != '1') {
4154 error = EPROTO;
4155 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4156 goto exit;
4157 }
4158 *carrier = line[0] != '0';
4159 error = 0;
4160
4161exit:
4162 if (fd >= 0) {
4163 close(fd);
4164 }
4165 free(fn);
4166 return error;
4167}
4168
8b61709d
BP
4169static int
4170get_flags(const struct netdev *netdev, int *flags)
4171{
4172 struct ifreq ifr;
4173 int error;
4174
149f577a
JG
4175 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4176 "SIOCGIFFLAGS");
8b61709d
BP
4177 *flags = ifr.ifr_flags;
4178 return error;
4179}
4180
4181static int
4182set_flags(struct netdev *netdev, int flags)
4183{
4184 struct ifreq ifr;
4185
4186 ifr.ifr_flags = flags;
149f577a
JG
4187 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4188 "SIOCSIFFLAGS");
8b61709d
BP
4189}
4190
4191static int
4192do_get_ifindex(const char *netdev_name)
4193{
4194 struct ifreq ifr;
4195
71d7c22f 4196 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4197 COVERAGE_INC(netdev_get_ifindex);
4198 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4199 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4200 netdev_name, strerror(errno));
4201 return -errno;
4202 }
4203 return ifr.ifr_ifindex;
4204}
4205
4206static int
4207get_ifindex(const struct netdev *netdev_, int *ifindexp)
4208{
149f577a
JG
4209 struct netdev_dev_linux *netdev_dev =
4210 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4211 *ifindexp = 0;
149f577a 4212 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4213 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4214 if (ifindex < 0) {
4215 return -ifindex;
4216 }
149f577a
JG
4217 netdev_dev->cache_valid |= VALID_IFINDEX;
4218 netdev_dev->ifindex = ifindex;
8b61709d 4219 }
149f577a 4220 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4221 return 0;
4222}
4223
4224static int
4225get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4226{
4227 struct ifreq ifr;
4228 int hwaddr_family;
4229
4230 memset(&ifr, 0, sizeof ifr);
71d7c22f 4231 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4232 COVERAGE_INC(netdev_get_hwaddr);
4233 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4234 /* ENODEV probably means that a vif disappeared asynchronously and
4235 * hasn't been removed from the database yet, so reduce the log level
4236 * to INFO for that case. */
4237 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4238 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4239 netdev_name, strerror(errno));
8b61709d
BP
4240 return errno;
4241 }
4242 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4243 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4244 VLOG_WARN("%s device has unknown hardware address family %d",
4245 netdev_name, hwaddr_family);
4246 }
4247 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4248 return 0;
4249}
4250
4251static int
4252set_etheraddr(const char *netdev_name, int hwaddr_family,
4253 const uint8_t mac[ETH_ADDR_LEN])
4254{
4255 struct ifreq ifr;
4256
4257 memset(&ifr, 0, sizeof ifr);
71d7c22f 4258 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4259 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4260 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4261 COVERAGE_INC(netdev_set_hwaddr);
4262 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4263 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4264 netdev_name, strerror(errno));
4265 return errno;
4266 }
4267 return 0;
4268}
4269
4270static int
0b0544d7 4271netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4272 int cmd, const char *cmd_name)
4273{
4274 struct ifreq ifr;
4275
4276 memset(&ifr, 0, sizeof ifr);
71d7c22f 4277 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4278 ifr.ifr_data = (caddr_t) ecmd;
4279
4280 ecmd->cmd = cmd;
4281 COVERAGE_INC(netdev_ethtool);
4282 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4283 return 0;
4284 } else {
4285 if (errno != EOPNOTSUPP) {
4286 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4287 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4288 } else {
4289 /* The device doesn't support this operation. That's pretty
4290 * common, so there's no point in logging anything. */
4291 }
4292 return errno;
4293 }
4294}
4295
e47bd51a
JP
4296/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4297 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4298int
4299netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4300 const char *flag_name, bool enable)
4301{
4302 const char *netdev_name = netdev_get_name(netdev);
4303 struct ethtool_value evalue;
4304 uint32_t new_flags;
4305 int error;
4306
4307 memset(&evalue, 0, sizeof evalue);
4308 error = netdev_linux_do_ethtool(netdev_name,
4309 (struct ethtool_cmd *)&evalue,
4310 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4311 if (error) {
4312 return error;
4313 }
4314
4315 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4316 error = netdev_linux_do_ethtool(netdev_name,
4317 (struct ethtool_cmd *)&evalue,
4318 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4319 if (error) {
4320 return error;
4321 }
4322
4323 memset(&evalue, 0, sizeof evalue);
4324 error = netdev_linux_do_ethtool(netdev_name,
4325 (struct ethtool_cmd *)&evalue,
4326 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4327 if (error) {
4328 return error;
4329 }
4330
4331 if (new_flags != evalue.data) {
4332 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4333 "device %s failed", enable ? "enable" : "disable",
4334 flag_name, netdev_name);
4335 return EOPNOTSUPP;
4336 }
4337
4338 return 0;
4339}
4340
8b61709d 4341static int
149f577a
JG
4342netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4343 const char *cmd_name)
8b61709d 4344{
71d7c22f 4345 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4346 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4347 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4348 strerror(errno));
8b61709d
BP
4349 return errno;
4350 }
4351 return 0;
4352}
f1acd62b
BP
4353
4354static int
4355netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4356 int cmd, const char *cmd_name)
4357{
4358 struct ifreq ifr;
4359 int error;
4360
4361 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4362 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4363 if (!error) {
4364 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4365 *ip = sin->sin_addr;
4366 }
4367 return error;
4368}
488d734d
BP
4369
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created on the first call and the result, whether a socket
 * or an error, is kept in a static variable and returned by every later
 * call.  A successfully created socket is passed to set_nonblocking(). */
static int
af_packet_sock(void)
{
    /* INT_MIN marks "not yet attempted". */
    static int sock = INT_MIN;

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(sock);
    }
    return sock;
}