]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
netdev-linux: Reorganize slightly.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
782e6111 2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
8b61709d 21#include <assert.h>
e9e28be3 22#include <errno.h>
8b61709d
BP
23#include <fcntl.h>
24#include <arpa/inet.h>
25#include <inttypes.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d 28#include <linux/if_tun.h>
a740f0de 29#include <linux/ip.h>
8b61709d
BP
30#include <linux/types.h>
31#include <linux/ethtool.h>
63331829 32#include <linux/mii.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d
BP
35#include <linux/sockios.h>
36#include <linux/version.h>
37#include <sys/types.h>
38#include <sys/ioctl.h>
39#include <sys/socket.h>
40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
43#include <net/if_packet.h>
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
9fe3b9a2 52#include "dpif-linux.h"
8b61709d
BP
53#include "dynamic-string.h"
54#include "fatal-signal.h"
93b13be8
BP
55#include "hash.h"
56#include "hmap.h"
8b61709d 57#include "netdev-provider.h"
7fbef77a 58#include "netdev-vport.h"
e9e28be3 59#include "netlink.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
e9e28be3 62#include "ofpbuf.h"
8b61709d
BP
63#include "openflow/openflow.h"
64#include "packets.h"
65#include "poll-loop.h"
21d6e22e 66#include "rtnetlink-link.h"
8b61709d
BP
67#include "socket-util.h"
68#include "shash.h"
19993ef3 69#include "sset.h"
1670c579 70#include "timer.h"
e9e28be3 71#include "vlog.h"
5136ce49 72
d98e6007 73VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 74
d76f09ea
BP
75COVERAGE_DEFINE(netdev_set_policing);
76COVERAGE_DEFINE(netdev_arp_lookup);
77COVERAGE_DEFINE(netdev_get_ifindex);
78COVERAGE_DEFINE(netdev_get_hwaddr);
79COVERAGE_DEFINE(netdev_set_hwaddr);
80COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
81\f
82/* These were introduced in Linux 2.6.14, so they might be missing if we have
83 * old headers. */
84#ifndef ADVERTISED_Pause
85#define ADVERTISED_Pause (1 << 13)
86#endif
87#ifndef ADVERTISED_Asym_Pause
88#define ADVERTISED_Asym_Pause (1 << 14)
89#endif
90
e47bd51a
JP
91/* These were introduced in Linux 2.6.24, so they might be missing if we
92 * have old headers. */
93#ifndef ETHTOOL_GFLAGS
94#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
95#endif
96#ifndef ETHTOOL_SFLAGS
97#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
98#endif
99
c1c9c9c4
BP
100/* This was introduced in Linux 2.6.25, so it might be missing if we have old
101 * headers. */
102#ifndef TC_RTAB_SIZE
103#define TC_RTAB_SIZE 1024
104#endif
105
/* Link-change notifier whose callback is netdev_linux_cache_cb().  Created by
 * netdev_linux_create() when the first device is created and destroyed by
 * netdev_linux_destroy() when the last one goes away; NULL in between. */
static struct nln_notifier *netdev_linux_cache_notifier = NULL;

/* Number of existing devices that hold a reference on
 * 'netdev_linux_cache_notifier'. */
static int cache_notifier_refcount = 0;
8b61709d
BP
108
/* Bits for the 'cache_valid' member of struct netdev_dev_linux.  A set bit
 * means that the corresponding "on demand" member of the device currently
 * holds valid data. */
enum {
    VALID_IFINDEX          = 1 << 0,
    VALID_ETHERADDR        = 1 << 1,
    VALID_IN4              = 1 << 2,
    VALID_IN6              = 1 << 3,
    VALID_MTU              = 1 << 4,
    VALID_POLICING         = 1 << 5,
    VALID_HAVE_VPORT_STATS = 1 << 6
};
118
149f577a
JG
/* State kept only for tap devices. */
struct tap_state {
    int fd;         /* Descriptor obtained by opening the tap control device
                     * in netdev_linux_create_tap(). */
    bool opened;    /* True once netdev_linux_open() has handed 'fd' to a
                     * netdev. */
};
c1c9c9c4
BP
123\f
124/* Traffic control. */
125
126/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
127 * network device.
128 *
129 * Each TC implementation subclasses this with whatever additional data it
130 * needs. */
c1c9c9c4
BP
131struct tc {
132 const struct tc_ops *ops;
93b13be8
BP
133 struct hmap queues; /* Contains "struct tc_queue"s.
134 * Read by generic TC layer.
135 * Written only by TC implementation. */
136};
c1c9c9c4 137
93b13be8
BP
138/* One traffic control queue.
139 *
140 * Each TC implementation subclasses this with whatever additional data it
141 * needs. */
142struct tc_queue {
143 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
144 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
145};
146
147/* A particular kind of traffic control. Each implementation generally maps to
148 * one particular Linux qdisc class.
149 *
150 * The functions below return 0 if successful or a positive errno value on
151 * failure, except where otherwise noted. All of them must be provided, except
152 * where otherwise noted. */
153struct tc_ops {
154 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155 * This is null for tc_ops_default and tc_ops_other, for which there are no
156 * appropriate values. */
157 const char *linux_name;
158
159 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160 const char *ovs_name;
161
162 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
163 * queues. The queues are numbered 0 through n_queues - 1. */
164 unsigned int n_queues;
165
166 /* Called to install this TC class on 'netdev'. The implementation should
167 * make the Netlink calls required to set up 'netdev' with the right qdisc
168 * and configure it according to 'details'. The implementation may assume
169 * that the current qdisc is the default; that is, there is no need for it
170 * to delete the current qdisc before installing itself.
171 *
172 * The contents of 'details' should be documented as valid for 'ovs_name'
173 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174 * (which is built as ovs-vswitchd.conf.db(8)).
175 *
176 * This function must return 0 if and only if it sets 'netdev->tc' to an
177 * initialized 'struct tc'.
178 *
179 * (This function is null for tc_ops_other, which cannot be installed. For
180 * other TC classes it should always be nonnull.) */
181 int (*tc_install)(struct netdev *netdev, const struct shash *details);
182
183 /* Called when the netdev code determines (through a Netlink query) that
184 * this TC class's qdisc is installed on 'netdev', but we didn't install
185 * it ourselves and so don't know any of the details.
186 *
187 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189 * implementation should parse the other attributes of 'nlmsg' as
190 * necessary to determine its configuration. If necessary it should also
191 * use Netlink queries to determine the configuration of queues on
192 * 'netdev'.
193 *
194 * This function must return 0 if and only if it sets 'netdev->tc' to an
195 * initialized 'struct tc'. */
196 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
197
198 /* Destroys the data structures allocated by the implementation as part of
199 * 'tc'. (This includes destroying 'tc->queues' by calling
200 * tc_destroy(tc).
201 *
202 * The implementation should not need to perform any Netlink calls. If
203 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
204 * (But it may not be desirable.)
205 *
206 * This function may be null if 'tc' is trivial. */
207 void (*tc_destroy)(struct tc *tc);
208
209 /* Retrieves details of 'netdev->tc' configuration into 'details'.
210 *
211 * The implementation should not need to perform any Netlink calls, because
212 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213 * cached the configuration.
214 *
215 * The contents of 'details' should be documented as valid for 'ovs_name'
216 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217 * (which is built as ovs-vswitchd.conf.db(8)).
218 *
219 * This function may be null if 'tc' is not configurable.
220 */
221 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
222
223 /* Reconfigures 'netdev->tc' according to 'details', performing any
224 * required Netlink calls to complete the reconfiguration.
225 *
226 * The contents of 'details' should be documented as valid for 'ovs_name'
227 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228 * (which is built as ovs-vswitchd.conf.db(8)).
229 *
230 * This function may be null if 'tc' is not configurable.
231 */
232 int (*qdisc_set)(struct netdev *, const struct shash *details);
233
93b13be8
BP
234 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
236 *
237 * The contents of 'details' should be documented as valid for 'ovs_name'
238 * in the "other_config" column in the "Queue" table in
239 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
240 *
241 * The implementation should not need to perform any Netlink calls, because
242 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243 * cached the queue configuration.
244 *
245 * This function may be null if 'tc' does not have queues ('n_queues' is
246 * 0). */
93b13be8 247 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
248 struct shash *details);
249
250 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251 * 'details', perfoming any required Netlink calls to complete the
252 * reconfiguration. The caller ensures that 'queue_id' is less than
253 * 'n_queues'.
254 *
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "Queue" table in
257 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
258 *
259 * This function may be null if 'tc' does not have queues or its queues are
260 * not configurable. */
261 int (*class_set)(struct netdev *, unsigned int queue_id,
262 const struct shash *details);
263
93b13be8
BP
264 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
266 *
267 * This function may be null if 'tc' does not have queues or its queues
268 * cannot be deleted. */
93b13be8 269 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 270
93b13be8
BP
271 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
273 *
274 * On success, initializes '*stats'.
275 *
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
93b13be8
BP
278 int (*class_get_stats)(const struct netdev *netdev,
279 const struct tc_queue *queue,
c1c9c9c4
BP
280 struct netdev_queue_stats *stats);
281
282 /* Extracts queue stats from 'nlmsg', which is a response to a
283 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
284 *
285 * This function may be null if 'tc' does not have queues or if it cannot
286 * report queue statistics. */
287 int (*class_dump_stats)(const struct netdev *netdev,
288 const struct ofpbuf *nlmsg,
289 netdev_dump_queue_stats_cb *cb, void *aux);
290};
291
292static void
293tc_init(struct tc *tc, const struct tc_ops *ops)
294{
295 tc->ops = ops;
93b13be8 296 hmap_init(&tc->queues);
c1c9c9c4
BP
297}
298
299static void
300tc_destroy(struct tc *tc)
301{
93b13be8 302 hmap_destroy(&tc->queues);
c1c9c9c4
BP
303}
304
305static const struct tc_ops tc_ops_htb;
a339aa81 306static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
307static const struct tc_ops tc_ops_default;
308static const struct tc_ops tc_ops_other;
309
310static const struct tc_ops *tcs[] = {
311 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 312 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
313 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314 &tc_ops_other, /* Some other qdisc. */
315 NULL
316};
149f577a 317
c1c9c9c4
BP
318static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319static unsigned int tc_get_major(unsigned int handle);
320static unsigned int tc_get_minor(unsigned int handle);
321
322static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324static unsigned int tc_buffer_per_jiffy(unsigned int rate);
325
326static struct tcmsg *tc_make_request(const struct netdev *, int type,
327 unsigned int flags, struct ofpbuf *);
328static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
329
330static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
331 struct nlattr **options);
332static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
333 struct nlattr **options,
334 struct netdev_queue_stats *);
335static int tc_query_class(const struct netdev *,
336 unsigned int handle, unsigned int parent,
337 struct ofpbuf **replyp);
338static int tc_delete_class(const struct netdev *, unsigned int handle);
339
340static int tc_del_qdisc(struct netdev *netdev);
341static int tc_query_qdisc(const struct netdev *netdev);
342
343static int tc_calc_cell_log(unsigned int mtu);
344static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
345static void tc_put_rtab(struct ofpbuf *, uint16_t type,
346 const struct tc_ratespec *rate);
347static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
348\f
149f577a
JG
349struct netdev_dev_linux {
350 struct netdev_dev netdev_dev;
351
8b61709d 352 struct shash_node *shash_node;
149f577a 353 unsigned int cache_valid;
ac4d3bcb 354 unsigned int change_seq;
8b61709d 355
1670c579
EJ
356 bool miimon; /* Link status of last poll. */
357 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
358 struct timer miimon_timer;
359
8722022c
BP
360 /* The following are figured out "on demand" only. They are only valid
361 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
362 int ifindex;
363 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 364 struct in_addr address, netmask;
8b61709d
BP
365 struct in6_addr in6;
366 int mtu;
3a183124 367 bool carrier;
65c3058c 368 long long int carrier_resets;
80a86fbe
BP
369 uint32_t kbits_rate; /* Policing data. */
370 uint32_t kbits_burst;
7fbef77a 371 bool have_vport_stats;
c1c9c9c4 372 struct tc *tc;
149f577a
JG
373
374 union {
375 struct tap_state tap;
376 } state;
8b61709d
BP
377};
378
149f577a
JG
379struct netdev_linux {
380 struct netdev netdev;
5b7448ed 381 int fd;
149f577a 382};
8b61709d 383
76c308b5
BP
384/* Sockets used for ioctl operations. */
385static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 386
ff4ed3c9
BP
387/* A Netlink routing socket that is not subscribed to any multicast groups. */
388static struct nl_sock *rtnl_sock;
389
8b61709d
BP
390/* This is set pretty low because we probably won't learn anything from the
391 * additional log messages. */
392static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
393
15b3596a 394static int netdev_linux_init(void);
6f643e49 395
0b0544d7 396static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 397 int cmd, const char *cmd_name);
149f577a
JG
398static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399 const char *cmd_name);
f1acd62b
BP
400static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401 int cmd, const char *cmd_name);
8b61709d
BP
402static int get_flags(const struct netdev *, int *flagsp);
403static int set_flags(struct netdev *, int flags);
404static int do_get_ifindex(const char *netdev_name);
405static int get_ifindex(const struct netdev *, int *ifindexp);
406static int do_set_addr(struct netdev *netdev,
407 int ioctl_nr, const char *ioctl_name,
408 struct in_addr addr);
409static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411 const uint8_t[ETH_ADDR_LEN]);
412static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
3a183124 414static int get_carrier_via_sysfs(const char *name, bool *carrier);
488d734d 415static int af_packet_sock(void);
1670c579
EJ
416static void netdev_linux_miimon_run(void);
417static void netdev_linux_miimon_wait(void);
8b61709d 418
15b3596a
JG
419static bool
420is_netdev_linux_class(const struct netdev_class *netdev_class)
421{
422 return netdev_class->init == netdev_linux_init;
423}
424
149f577a
JG
425static struct netdev_dev_linux *
426netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 427{
15b3596a
JG
428 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429 assert(is_netdev_linux_class(netdev_class));
430
149f577a 431 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
432}
433
8b61709d
BP
434static struct netdev_linux *
435netdev_linux_cast(const struct netdev *netdev)
436{
15b3596a
JG
437 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439 assert(is_netdev_linux_class(netdev_class));
440
8b61709d
BP
441 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
442}
ff4ed3c9 443\f
8b61709d
BP
444static int
445netdev_linux_init(void)
446{
447 static int status = -1;
448 if (status < 0) {
ff4ed3c9 449 /* Create AF_INET socket. */
8b61709d
BP
450 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451 status = af_inet_sock >= 0 ? 0 : errno;
452 if (status) {
453 VLOG_ERR("failed to create inet socket: %s", strerror(status));
454 }
ff4ed3c9
BP
455
456 /* Create rtnetlink socket. */
457 if (!status) {
cceb11f5 458 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
459 if (status) {
460 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
461 strerror(status));
462 }
463 }
8b61709d
BP
464 }
465 return status;
466}
467
/* Periodic work for the Linux netdev classes: runs the rtnetlink link
 * machinery first, then the MII monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
474
/* Wait counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link machinery and of the MII monitor, in that order. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
481
ac4d3bcb
EJ
482static void
483netdev_dev_linux_changed(struct netdev_dev_linux *dev)
484{
485 dev->change_seq++;
486 if (!dev->change_seq) {
487 dev->change_seq++;
488 }
489 dev->cache_valid = 0;
490}
491
8b61709d 492static void
21d6e22e 493netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 494 void *aux OVS_UNUSED)
8b61709d 495{
149f577a 496 struct netdev_dev_linux *dev;
8b61709d 497 if (change) {
46415c90
JG
498 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
499 if (base_dev) {
15b3596a
JG
500 const struct netdev_class *netdev_class =
501 netdev_dev_get_class(base_dev);
502
503 if (is_netdev_linux_class(netdev_class)) {
504 dev = netdev_dev_linux_cast(base_dev);
3a183124
EJ
505
506 if (dev->carrier != change->running) {
507 dev->carrier = change->running;
65c3058c 508 dev->carrier_resets++;
3a183124
EJ
509 }
510
ac4d3bcb 511 netdev_dev_linux_changed(dev);
15b3596a 512 }
8b61709d
BP
513 }
514 } else {
46415c90 515 struct shash device_shash;
8b61709d 516 struct shash_node *node;
46415c90
JG
517
518 shash_init(&device_shash);
519 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
520 SHASH_FOR_EACH (node, &device_shash) {
3a183124
EJ
521 bool carrier;
522
149f577a 523 dev = node->data;
3a183124
EJ
524
525 get_carrier_via_sysfs(node->name, &carrier);
526 if (dev->carrier != carrier) {
527 dev->carrier = carrier;
65c3058c 528 dev->carrier_resets++;
3a183124
EJ
529 }
530
ac4d3bcb 531 netdev_dev_linux_changed(dev);
8b61709d 532 }
46415c90 533 shash_destroy(&device_shash);
8b61709d
BP
534 }
535}
536
c3827f61 537/* Creates system and internal devices. */
8b61709d 538static int
de5cdb90
BP
539netdev_linux_create(const struct netdev_class *class, const char *name,
540 struct netdev_dev **netdev_devp)
6c88d577 541{
149f577a 542 struct netdev_dev_linux *netdev_dev;
6c88d577 543
46415c90 544 if (!cache_notifier_refcount) {
2ee6545f
EJ
545 assert(!netdev_linux_cache_notifier);
546
547 netdev_linux_cache_notifier =
548 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
549
550 if (!netdev_linux_cache_notifier) {
551 return EINVAL;
149f577a
JG
552 }
553 }
46415c90 554 cache_notifier_refcount++;
6c88d577 555
149f577a 556 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 557 netdev_dev->change_seq = 1;
de5cdb90 558 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
3a183124 559 get_carrier_via_sysfs(name, &netdev_dev->carrier);
46415c90 560
149f577a 561 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
562 return 0;
563}
564
5b7448ed
JG
565/* For most types of netdevs we open the device for each call of
566 * netdev_open(). However, this is not the case with tap devices,
567 * since it is only possible to open the device once. In this
568 * situation we share a single file descriptor, and consequently
569 * buffers, across all readers. Therefore once data is read it will
570 * be unavailable to other reads for tap devices. */
a740f0de 571static int
b8dcf5e9 572netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
de5cdb90 573 const char *name, struct netdev_dev **netdev_devp)
a740f0de 574{
149f577a 575 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
576 struct tap_state *state;
577 static const char tap_dev[] = "/dev/net/tun";
578 struct ifreq ifr;
579 int error;
580
149f577a
JG
581 netdev_dev = xzalloc(sizeof *netdev_dev);
582 state = &netdev_dev->state.tap;
a740f0de 583
6c88d577 584 /* Open tap device. */
149f577a
JG
585 state->fd = open(tap_dev, O_RDWR);
586 if (state->fd < 0) {
6c88d577
JP
587 error = errno;
588 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
589 goto error;
590 }
591
592 /* Create tap device. */
593 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 594 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 595 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
596 VLOG_WARN("%s: creating tap device failed: %s", name,
597 strerror(errno));
598 error = errno;
599 goto error;
600 }
601
602 /* Make non-blocking. */
149f577a 603 error = set_nonblocking(state->fd);
a740f0de
JG
604 if (error) {
605 goto error;
606 }
607
de5cdb90 608 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
149f577a 609 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
610 return 0;
611
612error:
149f577a 613 free(netdev_dev);
a740f0de
JG
614 return error;
615}
616
a740f0de 617static void
149f577a 618destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 619{
149f577a
JG
620 struct tap_state *state = &netdev_dev->state.tap;
621
622 if (state->fd >= 0) {
623 close(state->fd);
a740f0de
JG
624 }
625}
626
149f577a 627/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 628static void
149f577a 629netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 630{
149f577a 631 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 632 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 633
c1c9c9c4
BP
634 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
635 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
636 }
637
d2bb2799 638 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 639 cache_notifier_refcount--;
149f577a 640
46415c90 641 if (!cache_notifier_refcount) {
2ee6545f
EJ
642 assert(netdev_linux_cache_notifier);
643 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
644 netdev_linux_cache_notifier = NULL;
149f577a 645 }
d2bb2799 646 } else if (class == &netdev_tap_class) {
149f577a 647 destroy_tap(netdev_dev);
d2bb2799
BP
648 } else {
649 NOT_REACHED();
6c88d577 650 }
149f577a 651
658797c8 652 free(netdev_dev);
6c88d577
JP
653}
654
8b61709d 655static int
7b6b0ef4 656netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d 657{
5b7448ed 658 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
659 struct netdev_linux *netdev;
660 enum netdev_flags flags;
661 int error;
662
663 /* Allocate network device. */
ec6fde61 664 netdev = xzalloc(sizeof *netdev);
49a6a163 665 netdev->fd = -1;
5b7448ed 666 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 667
c3827f61
BP
668 /* Verify that the device really exists, by attempting to read its flags.
669 * (The flags might be cached, in which case this won't actually do an
670 * ioctl.)
671 *
672 * Don't do this for "internal" netdevs, though, because those have to be
673 * created as netdev objects before they exist in the kernel, because
674 * creating them in the kernel happens by passing a netdev object to
675 * dpif_port_add(). */
676 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
677 error = netdev_get_flags(&netdev->netdev, &flags);
678 if (error == ENODEV) {
679 goto error;
680 }
8b61709d
BP
681 }
682
61b999dd
JG
683 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
684 !netdev_dev->state.tap.opened) {
685
686 /* We assume that the first user of the tap device is the primary user
687 * and give them the tap FD. Subsequent users probably just expect
688 * this to be a system device so open it normally to avoid send/receive
689 * directions appearing to be reversed. */
5b7448ed 690 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 691 netdev_dev->state.tap.opened = true;
8b61709d
BP
692 }
693
694 *netdevp = &netdev->netdev;
695 return 0;
696
697error:
149f577a 698 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
699 return error;
700}
701
702/* Closes and destroys 'netdev'. */
703static void
704netdev_linux_close(struct netdev *netdev_)
705{
706 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
707
49a6a163 708 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 709 close(netdev->fd);
8b61709d
BP
710 }
711 free(netdev);
712}
e9e28be3 713
7b6b0ef4
BP
714static int
715netdev_linux_listen(struct netdev *netdev_)
716{
717 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
718 struct sockaddr_ll sll;
719 int ifindex;
720 int error;
721 int fd;
722
723 if (netdev->fd >= 0) {
724 return 0;
725 }
726
727 /* Create file descriptor. */
728 fd = socket(PF_PACKET, SOCK_RAW, 0);
729 if (fd < 0) {
730 error = errno;
731 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
732 goto error;
733 }
734
735 /* Set non-blocking mode. */
736 error = set_nonblocking(fd);
737 if (error) {
738 goto error;
739 }
740
741 /* Get ethernet device index. */
742 error = get_ifindex(&netdev->netdev, &ifindex);
743 if (error) {
744 goto error;
745 }
746
747 /* Bind to specific ethernet device. */
748 memset(&sll, 0, sizeof sll);
749 sll.sll_family = AF_PACKET;
750 sll.sll_ifindex = ifindex;
751 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
752 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
753 error = errno;
754 VLOG_ERR("%s: failed to bind raw socket (%s)",
755 netdev_get_name(netdev_), strerror(error));
756 goto error;
757 }
758
759 netdev->fd = fd;
760 return 0;
761
762error:
763 if (fd >= 0) {
764 close(fd);
765 }
766 return error;
767}
768
8b61709d
BP
769static int
770netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
771{
772 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
773
5b7448ed 774 if (netdev->fd < 0) {
7b6b0ef4 775 /* Device is not listening. */
c0e5f6ca 776 return -EAGAIN;
8b61709d
BP
777 }
778
779 for (;;) {
5b7448ed 780 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
781 if (retval >= 0) {
782 return retval;
783 } else if (errno != EINTR) {
784 if (errno != EAGAIN) {
785 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
786 strerror(errno), netdev_get_name(netdev_));
787 }
c0e5f6ca 788 return -errno;
8b61709d
BP
789 }
790 }
791}
792
793/* Registers with the poll loop to wake up from the next call to poll_block()
794 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
795static void
796netdev_linux_recv_wait(struct netdev *netdev_)
797{
798 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
799 if (netdev->fd >= 0) {
800 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
801 }
802}
803
804/* Discards all packets waiting to be received from 'netdev'. */
805static int
806netdev_linux_drain(struct netdev *netdev_)
807{
808 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 809 if (netdev->fd < 0) {
8b61709d 810 return 0;
5b7448ed 811 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 812 struct ifreq ifr;
149f577a 813 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
814 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
815 if (error) {
816 return error;
817 }
5b7448ed 818 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
819 return 0;
820 } else {
5b7448ed 821 return drain_rcvbuf(netdev->fd);
8b61709d
BP
822 }
823}
824
825/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
826 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
827 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
828 * the packet is too big or too small to transmit on the device.
829 *
830 * The caller retains ownership of 'buffer' in all cases.
831 *
832 * The kernel maintains a packet transmission queue, so the caller is not
833 * expected to do additional queuing of packets. */
834static int
835netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
836{
f23347ea
BP
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
838 for (;;) {
839 ssize_t retval;
8b61709d 840
f23347ea
BP
841 if (netdev->fd < 0) {
842 /* Use our AF_PACKET socket to send to this device. */
843 struct sockaddr_ll sll;
844 struct msghdr msg;
845 struct iovec iov;
846 int ifindex;
847 int error;
488d734d
BP
848 int sock;
849
850 sock = af_packet_sock();
851 if (sock < 0) {
852 return sock;
853 }
f23347ea
BP
854
855 error = get_ifindex(netdev_, &ifindex);
856 if (error) {
857 return error;
858 }
8b61709d 859
f23347ea
BP
860 /* We don't bother setting most fields in sockaddr_ll because the
861 * kernel ignores them for SOCK_RAW. */
862 memset(&sll, 0, sizeof sll);
863 sll.sll_family = AF_PACKET;
864 sll.sll_ifindex = ifindex;
76c308b5 865
f23347ea
BP
866 iov.iov_base = (void *) data;
867 iov.iov_len = size;
76c308b5 868
f23347ea
BP
869 msg.msg_name = &sll;
870 msg.msg_namelen = sizeof sll;
871 msg.msg_iov = &iov;
872 msg.msg_iovlen = 1;
873 msg.msg_control = NULL;
874 msg.msg_controllen = 0;
875 msg.msg_flags = 0;
876
488d734d 877 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
878 } else {
879 /* Use the netdev's own fd to send to this device. This is
880 * essential for tap devices, because packets sent to a tap device
881 * with an AF_PACKET socket will loop back to be *received* again
882 * on the tap device. */
883 retval = write(netdev->fd, data, size);
884 }
76c308b5 885
8b61709d
BP
886 if (retval < 0) {
887 /* The Linux AF_PACKET implementation never blocks waiting for room
888 * for packets, instead returning ENOBUFS. Translate this into
889 * EAGAIN for the caller. */
890 if (errno == ENOBUFS) {
891 return EAGAIN;
892 } else if (errno == EINTR) {
893 continue;
894 } else if (errno != EAGAIN) {
895 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
896 netdev_get_name(netdev_), strerror(errno));
897 }
898 return errno;
899 } else if (retval != size) {
900 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
901 "%zu) on %s", retval, size, netdev_get_name(netdev_));
902 return EMSGSIZE;
903 } else {
904 return 0;
905 }
906 }
907}
908
909/* Registers with the poll loop to wake up from the next call to poll_block()
910 * when the packet transmission queue has sufficient room to transmit a packet
911 * with netdev_send().
912 *
913 * The kernel maintains a packet transmission queue, so the client is not
914 * expected to do additional queuing of packets. Thus, this function is
915 * unlikely to ever be used. It is included for completeness. */
916static void
917netdev_linux_send_wait(struct netdev *netdev_)
918{
919 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 920 if (netdev->fd < 0) {
8b61709d 921 /* Nothing to do. */
5b7448ed
JG
922 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
923 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
924 } else {
925 /* TAP device always accepts packets.*/
926 poll_immediate_wake();
927 }
928}
929
930/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
931 * otherwise a positive errno value. */
932static int
933netdev_linux_set_etheraddr(struct netdev *netdev_,
934 const uint8_t mac[ETH_ADDR_LEN])
935{
149f577a
JG
936 struct netdev_dev_linux *netdev_dev =
937 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
938 int error;
939
149f577a
JG
940 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
941 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
942 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
943 if (!error) {
149f577a
JG
944 netdev_dev->cache_valid |= VALID_ETHERADDR;
945 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
946 }
947 } else {
948 error = 0;
8b61709d
BP
949 }
950 return error;
951}
952
953/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
954 * free the returned buffer. */
955static int
956netdev_linux_get_etheraddr(const struct netdev *netdev_,
957 uint8_t mac[ETH_ADDR_LEN])
958{
149f577a
JG
959 struct netdev_dev_linux *netdev_dev =
960 netdev_dev_linux_cast(netdev_get_dev(netdev_));
961 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 962 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 963 netdev_dev->etheraddr);
8b61709d
BP
964 if (error) {
965 return error;
966 }
149f577a 967 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 968 }
149f577a 969 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
970 return 0;
971}
972
973/* Returns the maximum size of transmitted (and received) packets on 'netdev',
974 * in bytes, not including the hardware header; thus, this is typically 1500
975 * bytes for Ethernet devices. */
976static int
977netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
978{
149f577a
JG
979 struct netdev_dev_linux *netdev_dev =
980 netdev_dev_linux_cast(netdev_get_dev(netdev_));
981 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
982 struct ifreq ifr;
983 int error;
984
149f577a
JG
985 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
986 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
987 if (error) {
988 return error;
989 }
149f577a
JG
990 netdev_dev->mtu = ifr.ifr_mtu;
991 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 992 }
149f577a 993 *mtup = netdev_dev->mtu;
8b61709d
BP
994 return 0;
995}
996
9b020780
PS
997/* Sets the maximum size of transmitted (MTU) for given device using linux
998 * networking ioctl interface.
999 */
1000static int
1001netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1002{
1003 struct netdev_dev_linux *netdev_dev =
1004 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1005 struct ifreq ifr;
1006 int error;
1007
1008 ifr.ifr_mtu = mtu;
1009 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1010 SIOCSIFMTU, "SIOCSIFMTU");
1011 if (error) {
1012 return error;
1013 }
1014
1015 netdev_dev->mtu = ifr.ifr_mtu;
1016 netdev_dev->cache_valid |= VALID_MTU;
1017 return 0;
1018}
1019
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev'.
 *
 * Returns the ifindex as obtained from get_ifindex() on success.  On failure,
 * returns the negation of the error reported by get_ifindex(). */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int retval = get_ifindex(netdev, &ifindex);

    if (retval) {
        return -retval;
    }
    return ifindex;
}
1030
8b61709d
BP
1031static int
1032netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1033{
149f577a
JG
1034 struct netdev_dev_linux *netdev_dev =
1035 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1036
1670c579
EJ
1037 if (netdev_dev->miimon_interval > 0) {
1038 *carrier = netdev_dev->miimon;
3a183124
EJ
1039 } else {
1040 *carrier = netdev_dev->carrier;
8b61709d 1041 }
8b61709d 1042
3a183124 1043 return 0;
8b61709d
BP
1044}
1045
65c3058c
EJ
1046static long long int
1047netdev_linux_get_carrier_resets(const struct netdev *netdev)
1048{
1049 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1050}
1051
63331829 1052static int
1670c579
EJ
1053netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1054 struct mii_ioctl_data *data)
63331829 1055{
63331829 1056 struct ifreq ifr;
782e6111 1057 int error;
63331829 1058
63331829 1059 memset(&ifr, 0, sizeof ifr);
782e6111 1060 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1061 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1062 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1063
782e6111
EJ
1064 return error;
1065}
1066
1067static int
1670c579 1068netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1069{
782e6111
EJ
1070 struct mii_ioctl_data data;
1071 int error;
63331829 1072
782e6111
EJ
1073 *miimon = false;
1074
1075 memset(&data, 0, sizeof data);
1670c579 1076 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1077 if (!error) {
1078 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1079 data.reg_num = MII_BMSR;
1670c579 1080 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1081 &data);
63331829
EJ
1082
1083 if (!error) {
782e6111 1084 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1085 } else {
1086 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1087 }
1088 } else {
1089 struct ethtool_cmd ecmd;
63331829
EJ
1090
1091 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1092 name);
1093
1094 memset(&ecmd, 0, sizeof ecmd);
1095 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1096 "ETHTOOL_GLINK");
1097 if (!error) {
782e6111
EJ
1098 struct ethtool_value eval;
1099
1100 memcpy(&eval, &ecmd, sizeof eval);
1101 *miimon = !!eval.data;
63331829
EJ
1102 } else {
1103 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1104 }
1105 }
1106
1107 return error;
1108}
1109
1670c579
EJ
1110static int
1111netdev_linux_set_miimon_interval(struct netdev *netdev_,
1112 long long int interval)
1113{
1114 struct netdev_dev_linux *netdev_dev;
1115
1116 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1117
1118 interval = interval > 0 ? MAX(interval, 100) : 0;
1119 if (netdev_dev->miimon_interval != interval) {
1120 netdev_dev->miimon_interval = interval;
1121 timer_set_expired(&netdev_dev->miimon_timer);
1122 }
1123
1124 return 0;
1125}
1126
1127static void
1128netdev_linux_miimon_run(void)
1129{
1130 struct shash device_shash;
1131 struct shash_node *node;
1132
1133 shash_init(&device_shash);
1134 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1135 SHASH_FOR_EACH (node, &device_shash) {
1136 struct netdev_dev_linux *dev = node->data;
1137 bool miimon;
1138
1139 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1140 continue;
1141 }
1142
1143 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1144 if (miimon != dev->miimon) {
1670c579 1145 dev->miimon = miimon;
ac4d3bcb 1146 netdev_dev_linux_changed(dev);
1670c579
EJ
1147 }
1148
1149 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1150 }
1151
1152 shash_destroy(&device_shash);
1153}
1154
1155static void
1156netdev_linux_miimon_wait(void)
1157{
1158 struct shash device_shash;
1159 struct shash_node *node;
1160
1161 shash_init(&device_shash);
1162 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1163 SHASH_FOR_EACH (node, &device_shash) {
1164 struct netdev_dev_linux *dev = node->data;
1165
1166 if (dev->miimon_interval > 0) {
1167 timer_wait(&dev->miimon_timer);
1168 }
1169 }
1170 shash_destroy(&device_shash);
1171}
1172
8b61709d
BP
1173/* Check whether we can we use RTM_GETLINK to get network device statistics.
1174 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1175 * enabled. */
1176static bool
1177check_for_working_netlink_stats(void)
1178{
1179 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1180 * preferable, so if that works, we'll use it. */
1181 int ifindex = do_get_ifindex("lo");
1182 if (ifindex < 0) {
1183 VLOG_WARN("failed to get ifindex for lo, "
1184 "obtaining netdev stats from proc");
1185 return false;
1186 } else {
1187 struct netdev_stats stats;
1188 int error = get_stats_via_netlink(ifindex, &stats);
1189 if (!error) {
1190 VLOG_DBG("obtaining netdev stats via rtnetlink");
1191 return true;
1192 } else {
1193 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1194 "via proc (you are probably running a pre-2.6.19 "
1195 "kernel)", strerror(error));
1196 return false;
1197 }
1198 }
1199}
1200
92df599c
JG
/* Exchanges the values stored at '*a' and '*b'.  Safe when 'a' == 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1208
f613a0d7
PS
1209static void
1210get_stats_via_vport(const struct netdev *netdev_,
1211 struct netdev_stats *stats)
8b61709d 1212{
149f577a
JG
1213 struct netdev_dev_linux *netdev_dev =
1214 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1215
7fbef77a
JG
1216 if (netdev_dev->have_vport_stats ||
1217 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
f613a0d7 1218 int error;
7fbef77a
JG
1219
1220 error = netdev_vport_get_stats(netdev_, stats);
f613a0d7
PS
1221 if (error) {
1222 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1223 netdev_get_name(netdev_), error);
1224 }
7fbef77a
JG
1225 netdev_dev->have_vport_stats = !error;
1226 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1227 }
f613a0d7 1228}
8b61709d 1229
f613a0d7
PS
1230static int
1231netdev_linux_sys_get_stats(const struct netdev *netdev_,
1232 struct netdev_stats *stats)
1233{
1234 static int use_netlink_stats = -1;
1235 int error;
1236
1237 if (use_netlink_stats < 0) {
1238 use_netlink_stats = check_for_working_netlink_stats();
1239 }
1240
1241 if (use_netlink_stats) {
1242 int ifindex;
1243
1244 error = get_ifindex(netdev_, &ifindex);
1245 if (!error) {
1246 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1247 }
f613a0d7
PS
1248 } else {
1249 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1250 }
7fbef77a 1251
f613a0d7
PS
1252 if (error) {
1253 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1254 netdev_get_name(netdev_), error);
1255 }
1256 return error;
1257
1258}
1259
1260/* Retrieves current device stats for 'netdev-linux'. */
1261static int
1262netdev_linux_get_stats(const struct netdev *netdev_,
1263 struct netdev_stats *stats)
1264{
1265 struct netdev_dev_linux *netdev_dev =
1266 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1267 struct netdev_stats dev_stats;
1268 int error;
1269
1270 get_stats_via_vport(netdev_, stats);
1271
1272 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1273
1274 if (error) {
1275 if (!netdev_dev->have_vport_stats) {
1276 return error;
7fbef77a 1277 } else {
f613a0d7
PS
1278 return 0;
1279 }
1280 }
1281
1282 if (!netdev_dev->have_vport_stats) {
1283 /* stats not available from OVS then use ioctl stats. */
1284 *stats = dev_stats;
1285 } else {
1286 stats->rx_errors += dev_stats.rx_errors;
1287 stats->tx_errors += dev_stats.tx_errors;
1288 stats->rx_dropped += dev_stats.rx_dropped;
1289 stats->tx_dropped += dev_stats.tx_dropped;
1290 stats->multicast += dev_stats.multicast;
1291 stats->collisions += dev_stats.collisions;
1292 stats->rx_length_errors += dev_stats.rx_length_errors;
1293 stats->rx_over_errors += dev_stats.rx_over_errors;
1294 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1295 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1296 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1297 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1298 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1299 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1300 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1301 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1302 stats->tx_window_errors += dev_stats.tx_window_errors;
1303 }
1304 return 0;
1305}
1306
1307/* Retrieves current device stats for 'netdev-tap' netdev or
1308 * netdev-internal. */
1309static int
1310netdev_pseudo_get_stats(const struct netdev *netdev_,
1311 struct netdev_stats *stats)
1312{
1313 struct netdev_dev_linux *netdev_dev =
1314 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1315 struct netdev_stats dev_stats;
1316 int error;
1317
1318 get_stats_via_vport(netdev_, stats);
1319
1320 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1321 if (error) {
1322 if (!netdev_dev->have_vport_stats) {
1323 return error;
1324 } else {
1325 return 0;
8b61709d 1326 }
8b61709d 1327 }
fe6b0e03
JG
1328
1329 /* If this port is an internal port then the transmit and receive stats
1330 * will appear to be swapped relative to the other ports since we are the
1331 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1332 * them back here. This does not apply if we are getting stats from the
1333 * vport layer because it always tracks stats from the perspective of the
1334 * switch. */
f613a0d7
PS
1335 if (!netdev_dev->have_vport_stats) {
1336 *stats = dev_stats;
92df599c
JG
1337 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1338 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1339 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1340 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1341 stats->rx_length_errors = 0;
1342 stats->rx_over_errors = 0;
1343 stats->rx_crc_errors = 0;
1344 stats->rx_frame_errors = 0;
1345 stats->rx_fifo_errors = 0;
1346 stats->rx_missed_errors = 0;
1347 stats->tx_aborted_errors = 0;
1348 stats->tx_carrier_errors = 0;
1349 stats->tx_fifo_errors = 0;
1350 stats->tx_heartbeat_errors = 0;
1351 stats->tx_window_errors = 0;
f613a0d7
PS
1352 } else {
1353 stats->rx_dropped += dev_stats.tx_dropped;
1354 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1355
f613a0d7
PS
1356 stats->rx_errors += dev_stats.tx_errors;
1357 stats->tx_errors += dev_stats.rx_errors;
1358
1359 stats->multicast += dev_stats.multicast;
1360 stats->collisions += dev_stats.collisions;
1361 }
1362 return 0;
8b61709d
BP
1363}
1364
1365/* Stores the features supported by 'netdev' into each of '*current',
1366 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1367 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1368 * successful, otherwise a positive errno value. */
8b61709d 1369static int
6f2f5cce 1370netdev_linux_get_features(const struct netdev *netdev,
8b61709d
BP
1371 uint32_t *current, uint32_t *advertised,
1372 uint32_t *supported, uint32_t *peer)
1373{
1374 struct ethtool_cmd ecmd;
1375 int error;
1376
1377 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1378 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1379 ETHTOOL_GSET, "ETHTOOL_GSET");
1380 if (error) {
1381 return error;
1382 }
1383
1384 /* Supported features. */
1385 *supported = 0;
1386 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1387 *supported |= OFPPF_10MB_HD;
1388 }
1389 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1390 *supported |= OFPPF_10MB_FD;
1391 }
1392 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1393 *supported |= OFPPF_100MB_HD;
1394 }
1395 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1396 *supported |= OFPPF_100MB_FD;
1397 }
1398 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1399 *supported |= OFPPF_1GB_HD;
1400 }
1401 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1402 *supported |= OFPPF_1GB_FD;
1403 }
1404 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1405 *supported |= OFPPF_10GB_FD;
1406 }
1407 if (ecmd.supported & SUPPORTED_TP) {
1408 *supported |= OFPPF_COPPER;
1409 }
1410 if (ecmd.supported & SUPPORTED_FIBRE) {
1411 *supported |= OFPPF_FIBER;
1412 }
1413 if (ecmd.supported & SUPPORTED_Autoneg) {
1414 *supported |= OFPPF_AUTONEG;
1415 }
1416 if (ecmd.supported & SUPPORTED_Pause) {
1417 *supported |= OFPPF_PAUSE;
1418 }
1419 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1420 *supported |= OFPPF_PAUSE_ASYM;
1421 }
1422
1423 /* Advertised features. */
1424 *advertised = 0;
1425 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1426 *advertised |= OFPPF_10MB_HD;
1427 }
1428 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1429 *advertised |= OFPPF_10MB_FD;
1430 }
1431 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1432 *advertised |= OFPPF_100MB_HD;
1433 }
1434 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1435 *advertised |= OFPPF_100MB_FD;
1436 }
1437 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1438 *advertised |= OFPPF_1GB_HD;
1439 }
1440 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1441 *advertised |= OFPPF_1GB_FD;
1442 }
1443 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1444 *advertised |= OFPPF_10GB_FD;
1445 }
1446 if (ecmd.advertising & ADVERTISED_TP) {
1447 *advertised |= OFPPF_COPPER;
1448 }
1449 if (ecmd.advertising & ADVERTISED_FIBRE) {
1450 *advertised |= OFPPF_FIBER;
1451 }
1452 if (ecmd.advertising & ADVERTISED_Autoneg) {
1453 *advertised |= OFPPF_AUTONEG;
1454 }
1455 if (ecmd.advertising & ADVERTISED_Pause) {
1456 *advertised |= OFPPF_PAUSE;
1457 }
1458 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1459 *advertised |= OFPPF_PAUSE_ASYM;
1460 }
1461
1462 /* Current settings. */
1463 if (ecmd.speed == SPEED_10) {
1464 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1465 } else if (ecmd.speed == SPEED_100) {
1466 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1467 } else if (ecmd.speed == SPEED_1000) {
1468 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1469 } else if (ecmd.speed == SPEED_10000) {
1470 *current = OFPPF_10GB_FD;
1471 } else {
1472 *current = 0;
1473 }
1474
1475 if (ecmd.port == PORT_TP) {
1476 *current |= OFPPF_COPPER;
1477 } else if (ecmd.port == PORT_FIBRE) {
1478 *current |= OFPPF_FIBER;
1479 }
1480
1481 if (ecmd.autoneg) {
1482 *current |= OFPPF_AUTONEG;
1483 }
1484
1485 /* Peer advertisements. */
1486 *peer = 0; /* XXX */
1487
1488 return 0;
1489}
1490
1491/* Set the features advertised by 'netdev' to 'advertise'. */
1492static int
1493netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1494{
1495 struct ethtool_cmd ecmd;
1496 int error;
1497
1498 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1499 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1500 ETHTOOL_GSET, "ETHTOOL_GSET");
1501 if (error) {
1502 return error;
1503 }
1504
1505 ecmd.advertising = 0;
1506 if (advertise & OFPPF_10MB_HD) {
1507 ecmd.advertising |= ADVERTISED_10baseT_Half;
1508 }
1509 if (advertise & OFPPF_10MB_FD) {
1510 ecmd.advertising |= ADVERTISED_10baseT_Full;
1511 }
1512 if (advertise & OFPPF_100MB_HD) {
1513 ecmd.advertising |= ADVERTISED_100baseT_Half;
1514 }
1515 if (advertise & OFPPF_100MB_FD) {
1516 ecmd.advertising |= ADVERTISED_100baseT_Full;
1517 }
1518 if (advertise & OFPPF_1GB_HD) {
1519 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1520 }
1521 if (advertise & OFPPF_1GB_FD) {
1522 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1523 }
1524 if (advertise & OFPPF_10GB_FD) {
1525 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1526 }
1527 if (advertise & OFPPF_COPPER) {
1528 ecmd.advertising |= ADVERTISED_TP;
1529 }
1530 if (advertise & OFPPF_FIBER) {
1531 ecmd.advertising |= ADVERTISED_FIBRE;
1532 }
1533 if (advertise & OFPPF_AUTONEG) {
1534 ecmd.advertising |= ADVERTISED_Autoneg;
1535 }
1536 if (advertise & OFPPF_PAUSE) {
1537 ecmd.advertising |= ADVERTISED_Pause;
1538 }
1539 if (advertise & OFPPF_PAUSE_ASYM) {
1540 ecmd.advertising |= ADVERTISED_Asym_Pause;
1541 }
0b0544d7 1542 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1543 ETHTOOL_SSET, "ETHTOOL_SSET");
1544}
1545
8b61709d
BP
1546#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1547#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
8b61709d 1548
8e460221 1549/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1550 * positive errno value.
1551 *
1552 * This function is equivalent to running
1553 * /sbin/tc qdisc del dev %s handle ffff: ingress
1554 * but it is much, much faster.
1555 */
8e460221
BP
1556static int
1557netdev_linux_remove_policing(struct netdev *netdev)
1558{
80a86fbe
BP
1559 struct netdev_dev_linux *netdev_dev =
1560 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1561 const char *netdev_name = netdev_get_name(netdev);
8e460221 1562
6f42c8ea 1563 struct ofpbuf request;
6f42c8ea 1564 struct tcmsg *tcmsg;
6f42c8ea
BP
1565 int error;
1566
c1c9c9c4 1567 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1568 if (!tcmsg) {
1569 return ENODEV;
1570 }
c1c9c9c4 1571 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1572 tcmsg->tcm_parent = TC_H_INGRESS;
1573 nl_msg_put_string(&request, TCA_KIND, "ingress");
1574 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1575
1576 error = tc_transact(&request, NULL);
4d10512c 1577 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1578 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1579 netdev_name, strerror(error));
1580 return error;
1581 }
1582
80a86fbe
BP
1583 netdev_dev->kbits_rate = 0;
1584 netdev_dev->kbits_burst = 0;
1585 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1586 return 0;
1587}
1588
8b61709d
BP
1589/* Attempts to set input rate limiting (policing) policy. */
1590static int
1591netdev_linux_set_policing(struct netdev *netdev,
1592 uint32_t kbits_rate, uint32_t kbits_burst)
1593{
80a86fbe
BP
1594 struct netdev_dev_linux *netdev_dev =
1595 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1596 const char *netdev_name = netdev_get_name(netdev);
1597 char command[1024];
1598
1599 COVERAGE_INC(netdev_set_policing);
8e460221 1600
80a86fbe
BP
1601 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1602 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1603 : kbits_burst); /* Stick with user-specified value. */
1604
1605 if (netdev_dev->cache_valid & VALID_POLICING
1606 && netdev_dev->kbits_rate == kbits_rate
1607 && netdev_dev->kbits_burst == kbits_burst) {
1608 /* Assume that settings haven't changed since we last set them. */
1609 return 0;
1610 }
1611
8e460221 1612 netdev_linux_remove_policing(netdev);
8b61709d 1613 if (kbits_rate) {
8b61709d
BP
1614 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1615 if (system(command) != 0) {
1616 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1617 return -1;
1618 }
1619
1620 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1621 kbits_rate, kbits_burst);
1622 if (system(command) != 0) {
1623 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1624 netdev_name);
1625 return -1;
1626 }
80a86fbe
BP
1627
1628 netdev_dev->kbits_rate = kbits_rate;
1629 netdev_dev->kbits_burst = kbits_burst;
1630 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1631 }
1632
1633 return 0;
1634}
1635
c1c9c9c4
BP
1636static int
1637netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1638 struct sset *types)
c1c9c9c4
BP
1639{
1640 const struct tc_ops **opsp;
1641
1642 for (opsp = tcs; *opsp != NULL; opsp++) {
1643 const struct tc_ops *ops = *opsp;
1644 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1645 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1646 }
1647 }
1648 return 0;
1649}
1650
1651static const struct tc_ops *
1652tc_lookup_ovs_name(const char *name)
1653{
1654 const struct tc_ops **opsp;
1655
1656 for (opsp = tcs; *opsp != NULL; opsp++) {
1657 const struct tc_ops *ops = *opsp;
1658 if (!strcmp(name, ops->ovs_name)) {
1659 return ops;
1660 }
1661 }
1662 return NULL;
1663}
1664
1665static const struct tc_ops *
1666tc_lookup_linux_name(const char *name)
1667{
1668 const struct tc_ops **opsp;
1669
1670 for (opsp = tcs; *opsp != NULL; opsp++) {
1671 const struct tc_ops *ops = *opsp;
1672 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1673 return ops;
1674 }
1675 }
1676 return NULL;
1677}
1678
93b13be8
BP
1679static struct tc_queue *
1680tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1681 size_t hash)
1682{
1683 struct netdev_dev_linux *netdev_dev =
1684 netdev_dev_linux_cast(netdev_get_dev(netdev));
1685 struct tc_queue *queue;
1686
4e8e4213 1687 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1688 if (queue->queue_id == queue_id) {
1689 return queue;
1690 }
1691 }
1692 return NULL;
1693}
1694
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is no
 * such queue.  Queues are hashed by hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    const size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1700
c1c9c9c4
BP
1701static int
1702netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1703 const char *type,
1704 struct netdev_qos_capabilities *caps)
1705{
1706 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1707 if (!ops) {
1708 return EOPNOTSUPP;
1709 }
1710 caps->n_queues = ops->n_queues;
1711 return 0;
1712}
1713
1714static int
1715netdev_linux_get_qos(const struct netdev *netdev,
1716 const char **typep, struct shash *details)
1717{
1718 struct netdev_dev_linux *netdev_dev =
1719 netdev_dev_linux_cast(netdev_get_dev(netdev));
1720 int error;
1721
1722 error = tc_query_qdisc(netdev);
1723 if (error) {
1724 return error;
1725 }
1726
1727 *typep = netdev_dev->tc->ops->ovs_name;
1728 return (netdev_dev->tc->ops->qdisc_get
1729 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1730 : 0);
1731}
1732
1733static int
1734netdev_linux_set_qos(struct netdev *netdev,
1735 const char *type, const struct shash *details)
1736{
1737 struct netdev_dev_linux *netdev_dev =
1738 netdev_dev_linux_cast(netdev_get_dev(netdev));
1739 const struct tc_ops *new_ops;
1740 int error;
1741
1742 new_ops = tc_lookup_ovs_name(type);
1743 if (!new_ops || !new_ops->tc_install) {
1744 return EOPNOTSUPP;
1745 }
1746
1747 error = tc_query_qdisc(netdev);
1748 if (error) {
1749 return error;
1750 }
1751
1752 if (new_ops == netdev_dev->tc->ops) {
1753 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1754 } else {
1755 /* Delete existing qdisc. */
1756 error = tc_del_qdisc(netdev);
1757 if (error) {
1758 return error;
1759 }
1760 assert(netdev_dev->tc == NULL);
1761
1762 /* Install new qdisc. */
1763 error = new_ops->tc_install(netdev, details);
1764 assert((error == 0) == (netdev_dev->tc != NULL));
1765
1766 return error;
1767 }
1768}
1769
1770static int
1771netdev_linux_get_queue(const struct netdev *netdev,
1772 unsigned int queue_id, struct shash *details)
1773{
1774 struct netdev_dev_linux *netdev_dev =
1775 netdev_dev_linux_cast(netdev_get_dev(netdev));
1776 int error;
1777
1778 error = tc_query_qdisc(netdev);
1779 if (error) {
1780 return error;
93b13be8
BP
1781 } else {
1782 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1783 return (queue
1784 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1785 : ENOENT);
c1c9c9c4 1786 }
c1c9c9c4
BP
1787}
1788
1789static int
1790netdev_linux_set_queue(struct netdev *netdev,
1791 unsigned int queue_id, const struct shash *details)
1792{
1793 struct netdev_dev_linux *netdev_dev =
1794 netdev_dev_linux_cast(netdev_get_dev(netdev));
1795 int error;
1796
1797 error = tc_query_qdisc(netdev);
1798 if (error) {
1799 return error;
1800 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1801 || !netdev_dev->tc->ops->class_set) {
1802 return EINVAL;
1803 }
1804
1805 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1806}
1807
1808static int
1809netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1810{
1811 struct netdev_dev_linux *netdev_dev =
1812 netdev_dev_linux_cast(netdev_get_dev(netdev));
1813 int error;
1814
1815 error = tc_query_qdisc(netdev);
1816 if (error) {
1817 return error;
1818 } else if (!netdev_dev->tc->ops->class_delete) {
1819 return EINVAL;
93b13be8
BP
1820 } else {
1821 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1822 return (queue
1823 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1824 : ENOENT);
c1c9c9c4 1825 }
c1c9c9c4
BP
1826}
1827
1828static int
1829netdev_linux_get_queue_stats(const struct netdev *netdev,
1830 unsigned int queue_id,
1831 struct netdev_queue_stats *stats)
1832{
1833 struct netdev_dev_linux *netdev_dev =
1834 netdev_dev_linux_cast(netdev_get_dev(netdev));
1835 int error;
1836
1837 error = tc_query_qdisc(netdev);
1838 if (error) {
1839 return error;
c1c9c9c4
BP
1840 } else if (!netdev_dev->tc->ops->class_get_stats) {
1841 return EOPNOTSUPP;
93b13be8
BP
1842 } else {
1843 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1844 return (queue
1845 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1846 : ENOENT);
c1c9c9c4 1847 }
c1c9c9c4
BP
1848}
1849
23a98ffe 1850static bool
c1c9c9c4
BP
1851start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1852{
1853 struct ofpbuf request;
1854 struct tcmsg *tcmsg;
1855
1856 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1857 if (!tcmsg) {
1858 return false;
1859 }
3c4de644 1860 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1861 nl_dump_start(dump, rtnl_sock, &request);
1862 ofpbuf_uninit(&request);
23a98ffe 1863 return true;
c1c9c9c4
BP
1864}
1865
1866static int
1867netdev_linux_dump_queues(const struct netdev *netdev,
1868 netdev_dump_queues_cb *cb, void *aux)
1869{
1870 struct netdev_dev_linux *netdev_dev =
1871 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1872 struct tc_queue *queue;
c1c9c9c4
BP
1873 struct shash details;
1874 int last_error;
c1c9c9c4
BP
1875 int error;
1876
1877 error = tc_query_qdisc(netdev);
1878 if (error) {
1879 return error;
1880 } else if (!netdev_dev->tc->ops->class_get) {
1881 return EOPNOTSUPP;
1882 }
1883
1884 last_error = 0;
1885 shash_init(&details);
4e8e4213 1886 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1887 shash_clear(&details);
1888
93b13be8 1889 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1890 if (!error) {
93b13be8 1891 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1892 } else {
1893 last_error = error;
1894 }
1895 }
1896 shash_destroy(&details);
1897
1898 return last_error;
1899}
1900
1901static int
1902netdev_linux_dump_queue_stats(const struct netdev *netdev,
1903 netdev_dump_queue_stats_cb *cb, void *aux)
1904{
1905 struct netdev_dev_linux *netdev_dev =
1906 netdev_dev_linux_cast(netdev_get_dev(netdev));
1907 struct nl_dump dump;
1908 struct ofpbuf msg;
1909 int last_error;
1910 int error;
1911
1912 error = tc_query_qdisc(netdev);
1913 if (error) {
1914 return error;
1915 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1916 return EOPNOTSUPP;
1917 }
1918
1919 last_error = 0;
23a98ffe
BP
1920 if (!start_queue_dump(netdev, &dump)) {
1921 return ENODEV;
1922 }
c1c9c9c4
BP
1923 while (nl_dump_next(&dump, &msg)) {
1924 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1925 if (error) {
1926 last_error = error;
1927 }
1928 }
1929
1930 error = nl_dump_done(&dump);
1931 return error ? error : last_error;
1932}
1933
8b61709d 1934static int
f1acd62b
BP
1935netdev_linux_get_in4(const struct netdev *netdev_,
1936 struct in_addr *address, struct in_addr *netmask)
8b61709d 1937{
149f577a
JG
1938 struct netdev_dev_linux *netdev_dev =
1939 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1940
1941 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1942 int error;
1943
149f577a 1944 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1945 SIOCGIFADDR, "SIOCGIFADDR");
1946 if (error) {
1947 return error;
1948 }
1949
149f577a 1950 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1951 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1952 if (error) {
1953 return error;
1954 }
1955
149f577a 1956 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1957 }
149f577a
JG
1958 *address = netdev_dev->address;
1959 *netmask = netdev_dev->netmask;
f1acd62b 1960 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1961}
1962
8b61709d 1963static int
f1acd62b
BP
1964netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1965 struct in_addr netmask)
8b61709d 1966{
149f577a
JG
1967 struct netdev_dev_linux *netdev_dev =
1968 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1969 int error;
1970
f1acd62b 1971 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1972 if (!error) {
149f577a
JG
1973 netdev_dev->cache_valid |= VALID_IN4;
1974 netdev_dev->address = address;
1975 netdev_dev->netmask = netmask;
f1acd62b 1976 if (address.s_addr != INADDR_ANY) {
8b61709d 1977 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1978 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1979 }
1980 }
1981 return error;
1982}
1983
/* Parses 'line', which should have the layout of a /proc/net/if_inet6 entry:
 * 32 hex digits of IPv6 address, four hex fields that are skipped, and an
 * interface name of at most 16 characters.
 *
 * On success stores the 16 address bytes in 'in6' and the name in 'ifname'
 * and returns true.  Returns false if any of the 17 stored fields is missing,
 * in which case 'in6' and 'ifname' may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *bytes = in6->s6_addr;
    int n_fields;

    /* One conversion per address byte: exactly two hex digits each. */
#define X8 "%2"SCNx8
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &bytes[0], &bytes[1], &bytes[2], &bytes[3],
                      &bytes[4], &bytes[5], &bytes[6], &bytes[7],
                      &bytes[8], &bytes[9], &bytes[10], &bytes[11],
                      &bytes[12], &bytes[13], &bytes[14], &bytes[15],
                      ifname);
    return n_fields == 17;
}
1999
2000/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2001 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2002static int
2003netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2004{
149f577a
JG
2005 struct netdev_dev_linux *netdev_dev =
2006 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2007 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2008 FILE *file;
2009 char line[128];
2010
149f577a 2011 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2012
2013 file = fopen("/proc/net/if_inet6", "r");
2014 if (file != NULL) {
2015 const char *name = netdev_get_name(netdev_);
2016 while (fgets(line, sizeof line, file)) {
2a022368 2017 struct in6_addr in6_tmp;
8b61709d 2018 char ifname[16 + 1];
2a022368 2019 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2020 && !strcmp(name, ifname))
2021 {
2a022368 2022 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2023 break;
2024 }
2025 }
2026 fclose(file);
2027 }
149f577a 2028 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2029 }
149f577a 2030 *in6 = netdev_dev->in6;
8b61709d
BP
2031 return 0;
2032}
2033
/* Fills '*sa' with an AF_INET socket address whose IPv4 address is 'addr' and
 * whose port is 0.  '*sa' is zeroed first, so any bytes not covered by the
 * copied sockaddr_in are 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the typed address in a local, then copy its bytes into the
     * generic 'sa' rather than casting the pointer. */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2046
2047static int
2048do_set_addr(struct netdev *netdev,
2049 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2050{
2051 struct ifreq ifr;
71d7c22f 2052 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2053 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2054
2055 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2056 ioctl_name);
8b61709d
BP
2057}
2058
2059/* Adds 'router' as a default IP gateway. */
2060static int
67a4917b 2061netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2062{
2063 struct in_addr any = { INADDR_ANY };
2064 struct rtentry rt;
2065 int error;
2066
2067 memset(&rt, 0, sizeof rt);
2068 make_in4_sockaddr(&rt.rt_dst, any);
2069 make_in4_sockaddr(&rt.rt_gateway, router);
2070 make_in4_sockaddr(&rt.rt_genmask, any);
2071 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2072 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2073 if (error) {
2074 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2075 }
2076 return error;
2077}
2078
f1acd62b
BP
2079static int
2080netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2081 char **netdev_name)
2082{
2083 static const char fn[] = "/proc/net/route";
2084 FILE *stream;
2085 char line[256];
2086 int ln;
2087
2088 *netdev_name = NULL;
2089 stream = fopen(fn, "r");
2090 if (stream == NULL) {
2091 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2092 return errno;
2093 }
2094
2095 ln = 0;
2096 while (fgets(line, sizeof line, stream)) {
2097 if (++ln >= 2) {
2098 char iface[17];
dbba996b 2099 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2100 int refcnt, metric, mtu;
2101 unsigned int flags, use, window, irtt;
2102
2103 if (sscanf(line,
2104 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2105 " %d %u %u\n",
2106 iface, &dest, &gateway, &flags, &refcnt,
2107 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2108
d295e8e9 2109 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2110 fn, ln, line);
2111 continue;
2112 }
2113 if (!(flags & RTF_UP)) {
2114 /* Skip routes that aren't up. */
2115 continue;
2116 }
2117
2118 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2119 * network byte order, so we don't need need any endian
f1acd62b
BP
2120 * conversions here. */
2121 if ((dest & mask) == (host->s_addr & mask)) {
2122 if (!gateway) {
2123 /* The host is directly reachable. */
2124 next_hop->s_addr = 0;
2125 } else {
2126 /* To reach the host, we must go through a gateway. */
2127 next_hop->s_addr = gateway;
2128 }
2129 *netdev_name = xstrdup(iface);
2130 fclose(stream);
2131 return 0;
2132 }
2133 }
2134 }
2135
2136 fclose(stream);
2137 return ENXIO;
2138}
2139
e210037e
AE
/* Queries ETHTOOL_GDRVINFO for 'netdev' and, on success, adds xstrdup()'d
 * copies of the driver name, driver version, and firmware version to 'sh'
 * under the keys "driver_name", "driver_version", and "firmware_version".
 *
 * Returns 0 if successful; otherwise returns the error from
 * netdev_linux_do_ethtool() and adds nothing to 'sh'. */
static int
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
{
    struct ethtool_drvinfo info;
    int error;

    memset(&info, 0, sizeof info);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev),
                                    (struct ethtool_cmd *) &info,
                                    ETHTOOL_GDRVINFO,
                                    "ETHTOOL_GDRVINFO");
    if (error) {
        return error;
    }

    shash_add(sh, "driver_name", xstrdup(info.driver));
    shash_add(sh, "driver_version", xstrdup(info.version));
    shash_add(sh, "firmware_version", xstrdup(info.fw_version));
    return 0;
}
2159
8b61709d
BP
2160/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2161 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2162 * returns 0. Otherwise, it returns a positive errno value; in particular,
2163 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2164static int
2165netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2166 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2167{
2168 struct arpreq r;
c100e025 2169 struct sockaddr_in sin;
8b61709d
BP
2170 int retval;
2171
2172 memset(&r, 0, sizeof r);
f2cc621b 2173 memset(&sin, 0, sizeof sin);
c100e025
BP
2174 sin.sin_family = AF_INET;
2175 sin.sin_addr.s_addr = ip;
2176 sin.sin_port = 0;
2177 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2178 r.arp_ha.sa_family = ARPHRD_ETHER;
2179 r.arp_flags = 0;
71d7c22f 2180 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2181 COVERAGE_INC(netdev_arp_lookup);
2182 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2183 if (!retval) {
2184 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2185 } else if (retval != ENXIO) {
2186 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2187 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2188 }
2189 return retval;
2190}
2191
2192static int
2193nd_to_iff_flags(enum netdev_flags nd)
2194{
2195 int iff = 0;
2196 if (nd & NETDEV_UP) {
2197 iff |= IFF_UP;
2198 }
2199 if (nd & NETDEV_PROMISC) {
2200 iff |= IFF_PROMISC;
2201 }
2202 return iff;
2203}
2204
2205static int
2206iff_to_nd_flags(int iff)
2207{
2208 enum netdev_flags nd = 0;
2209 if (iff & IFF_UP) {
2210 nd |= NETDEV_UP;
2211 }
2212 if (iff & IFF_PROMISC) {
2213 nd |= NETDEV_PROMISC;
2214 }
2215 return nd;
2216}
2217
2218static int
2219netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2220 enum netdev_flags on, enum netdev_flags *old_flagsp)
2221{
2222 int old_flags, new_flags;
2223 int error;
2224
2225 error = get_flags(netdev, &old_flags);
2226 if (!error) {
2227 *old_flagsp = iff_to_nd_flags(old_flags);
2228 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2229 if (new_flags != old_flags) {
2230 error = set_flags(netdev, new_flags);
2231 }
2232 }
2233 return error;
2234}
2235
ac4d3bcb
EJ
2236static unsigned int
2237netdev_linux_change_seq(const struct netdev *netdev)
2238{
2239 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2240}
2241
/* Expands to a brace-enclosed, positional initializer for a netdev class
 * backed by the netdev_linux_*() functions in this file.  Only four slots
 * vary between the classes built from it:
 *
 *   NAME       the class's name string.
 *   CREATE     the function placed in the slot before netdev_linux_destroy.
 *   GET_STATS  the function placed in the get-statistics slot.
 *   SET_STATS  the function placed in the set-statistics slot, or NULL.
 *
 * The initializer is positional, so the order of the entries must not be
 * changed. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_get_status,                                    \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2304
2305const struct netdev_class netdev_linux_class =
2306 NETDEV_LINUX_CLASS(
2307 "system",
2308 netdev_linux_create,
f613a0d7 2309 netdev_linux_get_stats,
98563392 2310 NULL); /* set_stats */
c3827f61
BP
2311
2312const struct netdev_class netdev_tap_class =
2313 NETDEV_LINUX_CLASS(
2314 "tap",
2315 netdev_linux_create_tap,
f613a0d7 2316 netdev_pseudo_get_stats,
c3827f61
BP
2317 NULL); /* set_stats */
2318
2319const struct netdev_class netdev_internal_class =
2320 NETDEV_LINUX_CLASS(
2321 "internal",
2322 netdev_linux_create,
f613a0d7 2323 netdev_pseudo_get_stats,
c3827f61 2324 netdev_vport_set_stats);
8b61709d 2325\f
c1c9c9c4 2326/* HTB traffic control class. */
559843ed 2327
c1c9c9c4 2328#define HTB_N_QUEUES 0xf000
8b61709d 2329
c1c9c9c4
BP
2330struct htb {
2331 struct tc tc;
2332 unsigned int max_rate; /* In bytes/s. */
2333};
8b61709d 2334
c1c9c9c4 2335struct htb_class {
93b13be8 2336 struct tc_queue tc_queue;
c1c9c9c4
BP
2337 unsigned int min_rate; /* In bytes/s. */
2338 unsigned int max_rate; /* In bytes/s. */
2339 unsigned int burst; /* In bytes. */
2340 unsigned int priority; /* Lower values are higher priorities. */
2341};
8b61709d 2342
c1c9c9c4
BP
2343static struct htb *
2344htb_get__(const struct netdev *netdev)
2345{
2346 struct netdev_dev_linux *netdev_dev =
2347 netdev_dev_linux_cast(netdev_get_dev(netdev));
2348 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2349}
2350
24045e35 2351static void
c1c9c9c4
BP
2352htb_install__(struct netdev *netdev, uint64_t max_rate)
2353{
2354 struct netdev_dev_linux *netdev_dev =
2355 netdev_dev_linux_cast(netdev_get_dev(netdev));
2356 struct htb *htb;
2357
2358 htb = xmalloc(sizeof *htb);
2359 tc_init(&htb->tc, &tc_ops_htb);
2360 htb->max_rate = max_rate;
2361
2362 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2363}
2364
2365/* Create an HTB qdisc.
2366 *
a339aa81 2367 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2368static int
2369htb_setup_qdisc__(struct netdev *netdev)
2370{
2371 size_t opt_offset;
2372 struct tc_htb_glob opt;
2373 struct ofpbuf request;
2374 struct tcmsg *tcmsg;
2375
2376 tc_del_qdisc(netdev);
2377
2378 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2379 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2380 if (!tcmsg) {
2381 return ENODEV;
2382 }
c1c9c9c4
BP
2383 tcmsg->tcm_handle = tc_make_handle(1, 0);
2384 tcmsg->tcm_parent = TC_H_ROOT;
2385
2386 nl_msg_put_string(&request, TCA_KIND, "htb");
2387
2388 memset(&opt, 0, sizeof opt);
2389 opt.rate2quantum = 10;
2390 opt.version = 3;
4ecf12d5 2391 opt.defcls = 1;
c1c9c9c4
BP
2392
2393 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2394 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2395 nl_msg_end_nested(&request, opt_offset);
2396
2397 return tc_transact(&request, NULL);
2398}
2399
2400/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2401 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2402static int
2403htb_setup_class__(struct netdev *netdev, unsigned int handle,
2404 unsigned int parent, struct htb_class *class)
2405{
2406 size_t opt_offset;
2407 struct tc_htb_opt opt;
2408 struct ofpbuf request;
2409 struct tcmsg *tcmsg;
2410 int error;
2411 int mtu;
2412
9b020780
PS
2413 error = netdev_get_mtu(netdev, &mtu);
2414 if (error) {
f915f1a8
BP
2415 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2416 netdev_get_name(netdev));
9b020780 2417 return error;
f915f1a8 2418 }
c1c9c9c4
BP
2419
2420 memset(&opt, 0, sizeof opt);
2421 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2422 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2423 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2424 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2425 opt.prio = class->priority;
2426
2427 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2428 if (!tcmsg) {
2429 return ENODEV;
2430 }
c1c9c9c4
BP
2431 tcmsg->tcm_handle = handle;
2432 tcmsg->tcm_parent = parent;
2433
2434 nl_msg_put_string(&request, TCA_KIND, "htb");
2435 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2436 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2437 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2438 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2439 nl_msg_end_nested(&request, opt_offset);
2440
2441 error = tc_transact(&request, NULL);
2442 if (error) {
2443 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2444 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2445 netdev_get_name(netdev),
2446 tc_get_major(handle), tc_get_minor(handle),
2447 tc_get_major(parent), tc_get_minor(parent),
2448 class->min_rate, class->max_rate,
2449 class->burst, class->priority, strerror(error));
2450 }
2451 return error;
2452}
2453
2454/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2455 * description of them into 'details'. The description complies with the
2456 * specification given in the vswitch database documentation for linux-htb
2457 * queue details. */
2458static int
2459htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2460{
2461 static const struct nl_policy tca_htb_policy[] = {
2462 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2463 .min_len = sizeof(struct tc_htb_opt) },
2464 };
2465
2466 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2467 const struct tc_htb_opt *htb;
2468
2469 if (!nl_parse_nested(nl_options, tca_htb_policy,
2470 attrs, ARRAY_SIZE(tca_htb_policy))) {
2471 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2472 return EPROTO;
2473 }
2474
2475 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2476 class->min_rate = htb->rate.rate;
2477 class->max_rate = htb->ceil.rate;
2478 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2479 class->priority = htb->prio;
2480 return 0;
2481}
2482
2483static int
2484htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2485 struct htb_class *options,
2486 struct netdev_queue_stats *stats)
2487{
2488 struct nlattr *nl_options;
2489 unsigned int handle;
2490 int error;
2491
2492 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2493 if (!error && queue_id) {
17ee3c1f
BP
2494 unsigned int major = tc_get_major(handle);
2495 unsigned int minor = tc_get_minor(handle);
2496 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2497 *queue_id = minor - 1;
c1c9c9c4
BP
2498 } else {
2499 error = EPROTO;
2500 }
2501 }
2502 if (!error && options) {
2503 error = htb_parse_tca_options__(nl_options, options);
2504 }
2505 return error;
2506}
2507
2508static void
2509htb_parse_qdisc_details__(struct netdev *netdev,
2510 const struct shash *details, struct htb_class *hc)
2511{
2512 const char *max_rate_s;
2513
2514 max_rate_s = shash_find_data(details, "max-rate");
2515 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2516 if (!hc->max_rate) {
2517 uint32_t current;
2518
2519 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2520 hc->max_rate = netdev_features_to_bps(current) / 8;
2521 }
2522 hc->min_rate = hc->max_rate;
2523 hc->burst = 0;
2524 hc->priority = 0;
2525}
2526
2527static int
2528htb_parse_class_details__(struct netdev *netdev,
2529 const struct shash *details, struct htb_class *hc)
2530{
2531 const struct htb *htb = htb_get__(netdev);
2532 const char *min_rate_s = shash_find_data(details, "min-rate");
2533 const char *max_rate_s = shash_find_data(details, "max-rate");
2534 const char *burst_s = shash_find_data(details, "burst");
2535 const char *priority_s = shash_find_data(details, "priority");
9b020780 2536 int mtu, error;
c1c9c9c4 2537
9b020780
PS
2538 error = netdev_get_mtu(netdev, &mtu);
2539 if (error) {
f915f1a8
BP
2540 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2541 netdev_get_name(netdev));
9b020780 2542 return error;
f915f1a8
BP
2543 }
2544
4f104611
EJ
2545 /* HTB requires at least an mtu sized min-rate to send any traffic even
2546 * on uncongested links. */
c45ab5e9 2547 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2548 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2549 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2550
2551 /* max-rate */
2552 hc->max_rate = (max_rate_s
2553 ? strtoull(max_rate_s, NULL, 10) / 8
2554 : htb->max_rate);
2555 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2556 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2557
2558 /* burst
2559 *
2560 * According to hints in the documentation that I've read, it is important
2561 * that 'burst' be at least as big as the largest frame that might be
2562 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2563 * but having it a bit too small is a problem. Since netdev_get_mtu()
2564 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2565 * the MTU. We actually add 64, instead of 14, as a guard against
2566 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2567 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2568 hc->burst = MAX(hc->burst, mtu + 64);
2569
2570 /* priority */
2571 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2572
2573 return 0;
2574}
2575
2576static int
2577htb_query_class__(const struct netdev *netdev, unsigned int handle,
2578 unsigned int parent, struct htb_class *options,
2579 struct netdev_queue_stats *stats)
2580{
2581 struct ofpbuf *reply;
2582 int error;
2583
2584 error = tc_query_class(netdev, handle, parent, &reply);
2585 if (!error) {
2586 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2587 ofpbuf_delete(reply);
2588 }
2589 return error;
2590}
2591
2592static int
2593htb_tc_install(struct netdev *netdev, const struct shash *details)
2594{
2595 int error;
2596
2597 error = htb_setup_qdisc__(netdev);
2598 if (!error) {
2599 struct htb_class hc;
2600
2601 htb_parse_qdisc_details__(netdev, details, &hc);
2602 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2603 tc_make_handle(1, 0), &hc);
2604 if (!error) {
2605 htb_install__(netdev, hc.max_rate);
2606 }
2607 }
2608 return error;
2609}
2610
93b13be8
BP
2611static struct htb_class *
2612htb_class_cast__(const struct tc_queue *queue)
2613{
2614 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2615}
2616
c1c9c9c4
BP
2617static void
2618htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2619 const struct htb_class *hc)
2620{
2621 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2622 size_t hash = hash_int(queue_id, 0);
2623 struct tc_queue *queue;
c1c9c9c4
BP
2624 struct htb_class *hcp;
2625
93b13be8
BP
2626 queue = tc_find_queue__(netdev, queue_id, hash);
2627 if (queue) {
2628 hcp = htb_class_cast__(queue);
2629 } else {
c1c9c9c4 2630 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2631 queue = &hcp->tc_queue;
2632 queue->queue_id = queue_id;
2633 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2634 }
93b13be8
BP
2635
2636 hcp->min_rate = hc->min_rate;
2637 hcp->max_rate = hc->max_rate;
2638 hcp->burst = hc->burst;
2639 hcp->priority = hc->priority;
c1c9c9c4
BP
2640}
2641
2642static int
2643htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2644{
c1c9c9c4
BP
2645 struct ofpbuf msg;
2646 struct nl_dump dump;
2647 struct htb_class hc;
c1c9c9c4
BP
2648
2649 /* Get qdisc options. */
2650 hc.max_rate = 0;
2651 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2652 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2653
2654 /* Get queues. */
23a98ffe
BP
2655 if (!start_queue_dump(netdev, &dump)) {
2656 return ENODEV;
2657 }
c1c9c9c4
BP
2658 while (nl_dump_next(&dump, &msg)) {
2659 unsigned int queue_id;
2660
2661 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2662 htb_update_queue__(netdev, queue_id, &hc);
2663 }
2664 }
2665 nl_dump_done(&dump);
2666
2667 return 0;
2668}
2669
2670static void
2671htb_tc_destroy(struct tc *tc)
2672{
2673 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2674 struct htb_class *hc, *next;
c1c9c9c4 2675
4e8e4213 2676 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2677 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2678 free(hc);
2679 }
2680 tc_destroy(tc);
2681 free(htb);
2682}
2683
2684static int
2685htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2686{
2687 const struct htb *htb = htb_get__(netdev);
2688 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2689 return 0;
2690}
2691
2692static int
2693htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2694{
2695 struct htb_class hc;
2696 int error;
2697
2698 htb_parse_qdisc_details__(netdev, details, &hc);
2699 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2700 tc_make_handle(1, 0), &hc);
2701 if (!error) {
2702 htb_get__(netdev)->max_rate = hc.max_rate;
2703 }
2704 return error;
2705}
2706
2707static int
93b13be8
BP
2708htb_class_get(const struct netdev *netdev OVS_UNUSED,
2709 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2710{
93b13be8 2711 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2712
2713 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2714 if (hc->min_rate != hc->max_rate) {
2715 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2716 }
2717 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2718 if (hc->priority) {
2719 shash_add(details, "priority", xasprintf("%u", hc->priority));
2720 }
2721 return 0;
2722}
2723
2724static int
2725htb_class_set(struct netdev *netdev, unsigned int queue_id,
2726 const struct shash *details)
2727{
2728 struct htb_class hc;
2729 int error;
2730
2731 error = htb_parse_class_details__(netdev, details, &hc);
2732 if (error) {
2733 return error;
2734 }
2735
17ee3c1f 2736 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2737 tc_make_handle(1, 0xfffe), &hc);
2738 if (error) {
2739 return error;
2740 }
2741
2742 htb_update_queue__(netdev, queue_id, &hc);
2743 return 0;
2744}
2745
2746static int
93b13be8 2747htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2748{
93b13be8 2749 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2750 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2751 int error;
2752
93b13be8 2753 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2754 if (!error) {
93b13be8 2755 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2756 free(hc);
c1c9c9c4
BP
2757 }
2758 return error;
2759}
2760
2761static int
93b13be8 2762htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2763 struct netdev_queue_stats *stats)
2764{
93b13be8 2765 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2766 tc_make_handle(1, 0xfffe), NULL, stats);
2767}
2768
2769static int
2770htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2771 const struct ofpbuf *nlmsg,
2772 netdev_dump_queue_stats_cb *cb, void *aux)
2773{
2774 struct netdev_queue_stats stats;
17ee3c1f 2775 unsigned int handle, major, minor;
c1c9c9c4
BP
2776 int error;
2777
2778 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2779 if (error) {
2780 return error;
2781 }
2782
17ee3c1f
BP
2783 major = tc_get_major(handle);
2784 minor = tc_get_minor(handle);
2785 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2786 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2787 }
2788 return 0;
2789}
2790
2791static const struct tc_ops tc_ops_htb = {
2792 "htb", /* linux_name */
2793 "linux-htb", /* ovs_name */
2794 HTB_N_QUEUES, /* n_queues */
2795 htb_tc_install,
2796 htb_tc_load,
2797 htb_tc_destroy,
2798 htb_qdisc_get,
2799 htb_qdisc_set,
2800 htb_class_get,
2801 htb_class_set,
2802 htb_class_delete,
2803 htb_class_get_stats,
2804 htb_class_dump_stats
2805};
2806\f
a339aa81
EJ
2807/* "linux-hfsc" traffic control class. */
2808
2809#define HFSC_N_QUEUES 0xf000
2810
2811struct hfsc {
2812 struct tc tc;
2813 uint32_t max_rate;
2814};
2815
2816struct hfsc_class {
2817 struct tc_queue tc_queue;
2818 uint32_t min_rate;
2819 uint32_t max_rate;
2820};
2821
2822static struct hfsc *
2823hfsc_get__(const struct netdev *netdev)
2824{
2825 struct netdev_dev_linux *netdev_dev;
2826 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2827 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2828}
2829
2830static struct hfsc_class *
2831hfsc_class_cast__(const struct tc_queue *queue)
2832{
2833 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2834}
2835
24045e35 2836static void
a339aa81
EJ
2837hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2838{
2839 struct netdev_dev_linux * netdev_dev;
2840 struct hfsc *hfsc;
2841
2842 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2843 hfsc = xmalloc(sizeof *hfsc);
2844 tc_init(&hfsc->tc, &tc_ops_hfsc);
2845 hfsc->max_rate = max_rate;
2846 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
2847}
2848
2849static void
2850hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2851 const struct hfsc_class *hc)
2852{
2853 size_t hash;
2854 struct hfsc *hfsc;
2855 struct hfsc_class *hcp;
2856 struct tc_queue *queue;
2857
2858 hfsc = hfsc_get__(netdev);
2859 hash = hash_int(queue_id, 0);
2860
2861 queue = tc_find_queue__(netdev, queue_id, hash);
2862 if (queue) {
2863 hcp = hfsc_class_cast__(queue);
2864 } else {
2865 hcp = xmalloc(sizeof *hcp);
2866 queue = &hcp->tc_queue;
2867 queue->queue_id = queue_id;
2868 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2869 }
2870
2871 hcp->min_rate = hc->min_rate;
2872 hcp->max_rate = hc->max_rate;
2873}
2874
2875static int
2876hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2877{
2878 const struct tc_service_curve *rsc, *fsc, *usc;
2879 static const struct nl_policy tca_hfsc_policy[] = {
2880 [TCA_HFSC_RSC] = {
2881 .type = NL_A_UNSPEC,
2882 .optional = false,
2883 .min_len = sizeof(struct tc_service_curve),
2884 },
2885 [TCA_HFSC_FSC] = {
2886 .type = NL_A_UNSPEC,
2887 .optional = false,
2888 .min_len = sizeof(struct tc_service_curve),
2889 },
2890 [TCA_HFSC_USC] = {
2891 .type = NL_A_UNSPEC,
2892 .optional = false,
2893 .min_len = sizeof(struct tc_service_curve),
2894 },
2895 };
2896 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2897
2898 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2899 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2900 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2901 return EPROTO;
2902 }
2903
2904 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2905 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2906 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2907
2908 if (rsc->m1 != 0 || rsc->d != 0 ||
2909 fsc->m1 != 0 || fsc->d != 0 ||
2910 usc->m1 != 0 || usc->d != 0) {
2911 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2912 "Non-linear service curves are not supported.");
2913 return EPROTO;
2914 }
2915
2916 if (rsc->m2 != fsc->m2) {
2917 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2918 "Real-time service curves are not supported ");
2919 return EPROTO;
2920 }
2921
2922 if (rsc->m2 > usc->m2) {
2923 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2924 "Min-rate service curve is greater than "
2925 "the max-rate service curve.");
2926 return EPROTO;
2927 }
2928
2929 class->min_rate = fsc->m2;
2930 class->max_rate = usc->m2;
2931 return 0;
2932}
2933
2934static int
2935hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2936 struct hfsc_class *options,
2937 struct netdev_queue_stats *stats)
2938{
2939 int error;
2940 unsigned int handle;
2941 struct nlattr *nl_options;
2942
2943 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2944 if (error) {
2945 return error;
2946 }
2947
2948 if (queue_id) {
2949 unsigned int major, minor;
2950
2951 major = tc_get_major(handle);
2952 minor = tc_get_minor(handle);
2953 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2954 *queue_id = minor - 1;
2955 } else {
2956 return EPROTO;
2957 }
2958 }
2959
2960 if (options) {
2961 error = hfsc_parse_tca_options__(nl_options, options);
2962 }
2963
2964 return error;
2965}
2966
/* Asks the kernel about the class 'handle' (child of 'parent') on 'netdev'
 * and decodes the reply into '*options' and '*stats', either of which may be
 * null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2984
2985static void
2986hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2987 struct hfsc_class *class)
2988{
2989 uint32_t max_rate;
2990 const char *max_rate_s;
2991
2992 max_rate_s = shash_find_data(details, "max-rate");
2993 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2994
2995 if (!max_rate) {
2996 uint32_t current;
2997
2998 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2999 max_rate = netdev_features_to_bps(current) / 8;
3000 }
3001
3002 class->min_rate = max_rate;
3003 class->max_rate = max_rate;
3004}
3005
3006static int
3007hfsc_parse_class_details__(struct netdev *netdev,
3008 const struct shash *details,
3009 struct hfsc_class * class)
3010{
3011 const struct hfsc *hfsc;
3012 uint32_t min_rate, max_rate;
3013 const char *min_rate_s, *max_rate_s;
3014
3015 hfsc = hfsc_get__(netdev);
3016 min_rate_s = shash_find_data(details, "min-rate");
3017 max_rate_s = shash_find_data(details, "max-rate");
3018
c45ab5e9 3019 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3020 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3021 min_rate = MIN(min_rate, hfsc->max_rate);
3022
3023 max_rate = (max_rate_s
3024 ? strtoull(max_rate_s, NULL, 10) / 8
3025 : hfsc->max_rate);
3026 max_rate = MAX(max_rate, min_rate);
3027 max_rate = MIN(max_rate, hfsc->max_rate);
3028
3029 class->min_rate = min_rate;
3030 class->max_rate = max_rate;
3031
3032 return 0;
3033}
3034
3035/* Create an HFSC qdisc.
3036 *
3037 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3038static int
3039hfsc_setup_qdisc__(struct netdev * netdev)
3040{
3041 struct tcmsg *tcmsg;
3042 struct ofpbuf request;
3043 struct tc_hfsc_qopt opt;
3044
3045 tc_del_qdisc(netdev);
3046
3047 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3048 NLM_F_EXCL | NLM_F_CREATE, &request);
3049
3050 if (!tcmsg) {
3051 return ENODEV;
3052 }
3053
3054 tcmsg->tcm_handle = tc_make_handle(1, 0);
3055 tcmsg->tcm_parent = TC_H_ROOT;
3056
3057 memset(&opt, 0, sizeof opt);
3058 opt.defcls = 1;
3059
3060 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3061 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3062
3063 return tc_transact(&request, NULL);
3064}
3065
3066/* Create an HFSC class.
3067 *
3068 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3069 * sc rate <min_rate> ul rate <max_rate>" */
3070static int
3071hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3072 unsigned int parent, struct hfsc_class *class)
3073{
3074 int error;
3075 size_t opt_offset;
3076 struct tcmsg *tcmsg;
3077 struct ofpbuf request;
3078 struct tc_service_curve min, max;
3079
3080 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3081
3082 if (!tcmsg) {
3083 return ENODEV;
3084 }
3085
3086 tcmsg->tcm_handle = handle;
3087 tcmsg->tcm_parent = parent;
3088
3089 min.m1 = 0;
3090 min.d = 0;
3091 min.m2 = class->min_rate;
3092
3093 max.m1 = 0;
3094 max.d = 0;
3095 max.m2 = class->max_rate;
3096
3097 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3098 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3099 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3100 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3101 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3102 nl_msg_end_nested(&request, opt_offset);
3103
3104 error = tc_transact(&request, NULL);
3105 if (error) {
3106 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3107 "min-rate %ubps, max-rate %ubps (%s)",
3108 netdev_get_name(netdev),
3109 tc_get_major(handle), tc_get_minor(handle),
3110 tc_get_major(parent), tc_get_minor(parent),
3111 class->min_rate, class->max_rate, strerror(error));
3112 }
3113
3114 return error;
3115}
3116
3117static int
3118hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3119{
3120 int error;
3121 struct hfsc_class class;
3122
3123 error = hfsc_setup_qdisc__(netdev);
3124
3125 if (error) {
3126 return error;
3127 }
3128
3129 hfsc_parse_qdisc_details__(netdev, details, &class);
3130 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3131 tc_make_handle(1, 0), &class);
3132
3133 if (error) {
3134 return error;
3135 }
3136
3137 hfsc_install__(netdev, class.max_rate);
3138 return 0;
3139}
3140
3141static int
3142hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3143{
3144 struct ofpbuf msg;
a339aa81
EJ
3145 struct nl_dump dump;
3146 struct hfsc_class hc;
3147
3148 hc.max_rate = 0;
3149 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3150 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3151
3152 if (!start_queue_dump(netdev, &dump)) {
3153 return ENODEV;
3154 }
3155
3156 while (nl_dump_next(&dump, &msg)) {
3157 unsigned int queue_id;
3158
3159 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3160 hfsc_update_queue__(netdev, queue_id, &hc);
3161 }
3162 }
3163
3164 nl_dump_done(&dump);
3165 return 0;
3166}
3167
3168static void
3169hfsc_tc_destroy(struct tc *tc)
3170{
3171 struct hfsc *hfsc;
3172 struct hfsc_class *hc, *next;
3173
3174 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3175
3176 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3177 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3178 free(hc);
3179 }
3180
3181 tc_destroy(tc);
3182 free(hfsc);
3183}
3184
3185static int
3186hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3187{
3188 const struct hfsc *hfsc;
3189 hfsc = hfsc_get__(netdev);
3190 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3191 return 0;
3192}
3193
3194static int
3195hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3196{
3197 int error;
3198 struct hfsc_class class;
3199
3200 hfsc_parse_qdisc_details__(netdev, details, &class);
3201 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3202 tc_make_handle(1, 0), &class);
3203
3204 if (!error) {
3205 hfsc_get__(netdev)->max_rate = class.max_rate;
3206 }
3207
3208 return error;
3209}
3210
3211static int
3212hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3213 const struct tc_queue *queue, struct shash *details)
3214{
3215 const struct hfsc_class *hc;
3216
3217 hc = hfsc_class_cast__(queue);
3218 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3219 if (hc->min_rate != hc->max_rate) {
3220 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3221 }
3222 return 0;
3223}
3224
3225static int
3226hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3227 const struct shash *details)
3228{
3229 int error;
3230 struct hfsc_class class;
3231
3232 error = hfsc_parse_class_details__(netdev, details, &class);
3233 if (error) {
3234 return error;
3235 }
3236
3237 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3238 tc_make_handle(1, 0xfffe), &class);
3239 if (error) {
3240 return error;
3241 }
3242
3243 hfsc_update_queue__(netdev, queue_id, &class);
3244 return 0;
3245}
3246
3247static int
3248hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3249{
3250 int error;
3251 struct hfsc *hfsc;
3252 struct hfsc_class *hc;
3253
3254 hc = hfsc_class_cast__(queue);
3255 hfsc = hfsc_get__(netdev);
3256
3257 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3258 if (!error) {
3259 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3260 free(hc);
3261 }
3262 return error;
3263}
3264
3265static int
3266hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3267 struct netdev_queue_stats *stats)
3268{
3269 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3270 tc_make_handle(1, 0xfffe), NULL, stats);
3271}
3272
3273static int
3274hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3275 const struct ofpbuf *nlmsg,
3276 netdev_dump_queue_stats_cb *cb, void *aux)
3277{
3278 struct netdev_queue_stats stats;
3279 unsigned int handle, major, minor;
3280 int error;
3281
3282 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3283 if (error) {
3284 return error;
3285 }
3286
3287 major = tc_get_major(handle);
3288 minor = tc_get_minor(handle);
3289 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3290 (*cb)(minor - 1, &stats, aux);
3291 }
3292 return 0;
3293}
3294
3295static const struct tc_ops tc_ops_hfsc = {
3296 "hfsc", /* linux_name */
3297 "linux-hfsc", /* ovs_name */
3298 HFSC_N_QUEUES, /* n_queues */
3299 hfsc_tc_install, /* tc_install */
3300 hfsc_tc_load, /* tc_load */
3301 hfsc_tc_destroy, /* tc_destroy */
3302 hfsc_qdisc_get, /* qdisc_get */
3303 hfsc_qdisc_set, /* qdisc_set */
3304 hfsc_class_get, /* class_get */
3305 hfsc_class_set, /* class_set */
3306 hfsc_class_delete, /* class_delete */
3307 hfsc_class_get_stats, /* class_get_stats */
3308 hfsc_class_dump_stats /* class_dump_stats */
3309};
3310\f
c1c9c9c4
BP
3311/* "linux-default" traffic control class.
3312 *
3313 * This class represents the default, unnamed Linux qdisc. It corresponds to
3314 * the "" (empty string) QoS type in the OVS database. */
3315
3316static void
3317default_install__(struct netdev *netdev)
3318{
3319 struct netdev_dev_linux *netdev_dev =
3320 netdev_dev_linux_cast(netdev_get_dev(netdev));
3321 static struct tc *tc;
3322
3323 if (!tc) {
3324 tc = xmalloc(sizeof *tc);
3325 tc_init(tc, &tc_ops_default);
3326 }
3327 netdev_dev->tc = tc;
3328}
3329
3330static int
3331default_tc_install(struct netdev *netdev,
3332 const struct shash *details OVS_UNUSED)
3333{
3334 default_install__(netdev);
3335 return 0;
3336}
3337
3338static int
3339default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3340{
3341 default_install__(netdev);
3342 return 0;
3343}
3344
3345static const struct tc_ops tc_ops_default = {
3346 NULL, /* linux_name */
3347 "", /* ovs_name */
3348 0, /* n_queues */
3349 default_tc_install,
3350 default_tc_load,
3351 NULL, /* tc_destroy */
3352 NULL, /* qdisc_get */
3353 NULL, /* qdisc_set */
3354 NULL, /* class_get */
3355 NULL, /* class_set */
3356 NULL, /* class_delete */
3357 NULL, /* class_get_stats */
3358 NULL /* class_dump_stats */
3359};
3360\f
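/* "linux-other" traffic control class.
 *
 * This class stands in for a qdisc that OVS cannot manage: one whose kind is
 * not recognized, whose description could not be parsed, or whose query
 * failed.  It can be loaded but not installed or configured. */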
3364
3365static int
3366other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3367{
3368 struct netdev_dev_linux *netdev_dev =
3369 netdev_dev_linux_cast(netdev_get_dev(netdev));
3370 static struct tc *tc;
3371
3372 if (!tc) {
3373 tc = xmalloc(sizeof *tc);
3374 tc_init(tc, &tc_ops_other);
3375 }
3376 netdev_dev->tc = tc;
3377 return 0;
3378}
3379
3380static const struct tc_ops tc_ops_other = {
3381 NULL, /* linux_name */
3382 "linux-other", /* ovs_name */
3383 0, /* n_queues */
3384 NULL, /* tc_install */
3385 other_tc_load,
3386 NULL, /* tc_destroy */
3387 NULL, /* qdisc_get */
3388 NULL, /* qdisc_set */
3389 NULL, /* class_get */
3390 NULL, /* class_set */
3391 NULL, /* class_delete */
3392 NULL, /* class_get_stats */
3393 NULL /* class_dump_stats */
3394};
3395\f
3396/* Traffic control. */
3397
3398/* Number of kernel "tc" ticks per second. */
3399static double ticks_per_s;
3400
3401/* Number of kernel "jiffies" per second. This is used for the purpose of
3402 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3403 * one jiffy's worth of data.
3404 *
3405 * There are two possibilities here:
3406 *
3407 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3408 * approximate range of 100 to 1024. That means that we really need to
3409 * make sure that the qdisc can buffer that much data.
3410 *
3411 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3412 * has finely granular timers and there's no need to fudge additional room
3413 * for buffers. (There's no extra effort needed to implement that: the
3414 * large 'buffer_hz' is used as a divisor, so practically any number will
3415 * come out as 0 in the division. Small integer results in the case of
3416 * really high dividends won't have any real effect anyhow.)
3417 */
3418static unsigned int buffer_hz;
3419
/* Builds the 32-bit tc handle 'major':'minor', with 'major' in the upper 16
 * bits and 'minor' in the lower 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3426
/* Extracts the major number (the upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3433
/* Extracts the minor number (the lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor_bits = TC_H_MIN(handle);

    return minor_bits;
}
3440
3441static struct tcmsg *
3442tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3443 struct ofpbuf *request)
3444{
3445 struct tcmsg *tcmsg;
3446 int ifindex;
3447 int error;
3448
3449 error = get_ifindex(netdev, &ifindex);
3450 if (error) {
3451 return NULL;
3452 }
3453
3454 ofpbuf_init(request, 512);
3455 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3456 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3457 tcmsg->tcm_family = AF_UNSPEC;
3458 tcmsg->tcm_ifindex = ifindex;
3459 /* Caller should fill in tcmsg->tcm_handle. */
3460 /* Caller should fill in tcmsg->tcm_parent. */
3461
3462 return tcmsg;
3463}
3464
3465static int
3466tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3467{
3468 int error = nl_sock_transact(rtnl_sock, request, replyp);
3469 ofpbuf_uninit(request);
3470 return error;
3471}
3472
3473static void
3474read_psched(void)
3475{
3476 /* The values in psched are not individually very meaningful, but they are
3477 * important. The tables below show some values seen in the wild.
3478 *
3479 * Some notes:
3480 *
3481 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3482 * (Before that, there are hints that it was 1000000000.)
3483 *
3484 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3485 * above.
3486 *
3487 * /proc/net/psched
3488 * -----------------------------------
3489 * [1] 000c8000 000f4240 000f4240 00000064
3490 * [2] 000003e8 00000400 000f4240 3b9aca00
3491 * [3] 000003e8 00000400 000f4240 3b9aca00
3492 * [4] 000003e8 00000400 000f4240 00000064
3493 * [5] 000003e8 00000040 000f4240 3b9aca00
3494 * [6] 000003e8 00000040 000f4240 000000f9
3495 *
3496 * a b c d ticks_per_s buffer_hz
3497 * ------- --------- ---------- ------------- ----------- -------------
3498 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3499 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3500 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3501 * [4] 1,000 1,024 1,000,000 100 976,562 100
3502 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3503 * [6] 1,000 64 1,000,000 249 15,625,000 249
3504 *
3505 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3506 * [2] 2.6.26-1-686-bigmem from Debian lenny
3507 * [3] 2.6.26-2-sparc64 from Debian lenny
3508 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3509 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3510 * [6] 2.6.34 from kernel.org on KVM
3511 */
3512 static const char fn[] = "/proc/net/psched";
3513 unsigned int a, b, c, d;
3514 FILE *stream;
3515
3516 ticks_per_s = 1.0;
3517 buffer_hz = 100;
3518
3519 stream = fopen(fn, "r");
3520 if (!stream) {
3521 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3522 return;
3523 }
3524
3525 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3526 VLOG_WARN("%s: read failed", fn);
3527 fclose(stream);
3528 return;
3529 }
3530 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3531 fclose(stream);
3532
3533 if (!a || !c) {
3534 VLOG_WARN("%s: invalid scheduler parameters", fn);
3535 return;
3536 }
3537
3538 ticks_per_s = (double) a * c / b;
3539 if (c == 1000000) {
3540 buffer_hz = d;
3541 } else {
3542 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3543 fn, a, b, c, d);
3544 }
3545 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3546}
3547
3548/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3549 * rate of 'rate' bytes per second. */
3550static unsigned int
3551tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3552{
3553 if (!buffer_hz) {
3554 read_psched();
3555 }
3556 return (rate * ticks) / ticks_per_s;
3557}
3558
3559/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3560 * rate of 'rate' bytes per second. */
3561static unsigned int
3562tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3563{
3564 if (!buffer_hz) {
3565 read_psched();
3566 }
015c93a4 3567 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3568}
3569
3570/* Returns the number of bytes that need to be reserved for qdisc buffering at
3571 * a transmission rate of 'rate' bytes per second. */
3572static unsigned int
3573tc_buffer_per_jiffy(unsigned int rate)
3574{
3575 if (!buffer_hz) {
3576 read_psched();
3577 }
3578 return rate / buffer_hz;
3579}
3580
3581/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3582 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3583 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3584 * stores NULL into it if it is absent.
3585 *
3586 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3587 * 'msg'.
3588 *
3589 * Returns 0 if successful, otherwise a positive errno value. */
3590static int
3591tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3592 struct nlattr **options)
3593{
3594 static const struct nl_policy tca_policy[] = {
3595 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3596 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3597 };
3598 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3599
3600 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3601 tca_policy, ta, ARRAY_SIZE(ta))) {
3602 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3603 goto error;
3604 }
3605
3606 if (kind) {
3607 *kind = nl_attr_get_string(ta[TCA_KIND]);
3608 }
3609
3610 if (options) {
3611 *options = ta[TCA_OPTIONS];
3612 }
3613
3614 return 0;
3615
3616error:
3617 if (kind) {
3618 *kind = NULL;
3619 }
3620 if (options) {
3621 *options = NULL;
3622 }
3623 return EPROTO;
3624}
3625
3626/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3627 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3628 * into '*options', and its queue statistics into '*stats'. Any of the output
3629 * arguments may be null.
3630 *
3631 * Returns 0 if successful, otherwise a positive errno value. */
3632static int
3633tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3634 struct nlattr **options, struct netdev_queue_stats *stats)
3635{
3636 static const struct nl_policy tca_policy[] = {
3637 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3638 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3639 };
3640 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3641
3642 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3643 tca_policy, ta, ARRAY_SIZE(ta))) {
3644 VLOG_WARN_RL(&rl, "failed to parse class message");
3645 goto error;
3646 }
3647
3648 if (handlep) {
3649 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3650 *handlep = tc->tcm_handle;
3651 }
3652
3653 if (options) {
3654 *options = ta[TCA_OPTIONS];
3655 }
3656
3657 if (stats) {
3658 const struct gnet_stats_queue *gsq;
3659 struct gnet_stats_basic gsb;
3660
3661 static const struct nl_policy stats_policy[] = {
3662 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3663 .min_len = sizeof gsb },
3664 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3665 .min_len = sizeof *gsq },
3666 };
3667 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3668
3669 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3670 sa, ARRAY_SIZE(sa))) {
3671 VLOG_WARN_RL(&rl, "failed to parse class stats");
3672 goto error;
3673 }
3674
3675 /* Alignment issues screw up the length of struct gnet_stats_basic on
3676 * some arch/bitsize combinations. Newer versions of Linux have a
3677 * struct gnet_stats_basic_packed, but we can't depend on that. The
3678 * easiest thing to do is just to make a copy. */
3679 memset(&gsb, 0, sizeof gsb);
3680 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3681 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3682 stats->tx_bytes = gsb.bytes;
3683 stats->tx_packets = gsb.packets;
3684
3685 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3686 stats->tx_errors = gsq->drops;
3687 }
3688
3689 return 0;
3690
3691error:
3692 if (options) {
3693 *options = NULL;
3694 }
3695 if (stats) {
3696 memset(stats, 0, sizeof *stats);
3697 }
3698 return EPROTO;
3699}
3700
3701/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3702 * on 'netdev'. */
3703static int
3704tc_query_class(const struct netdev *netdev,
3705 unsigned int handle, unsigned int parent,
3706 struct ofpbuf **replyp)
3707{
3708 struct ofpbuf request;
3709 struct tcmsg *tcmsg;
3710 int error;
3711
3712 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3713 if (!tcmsg) {
3714 return ENODEV;
3715 }
c1c9c9c4
BP
3716 tcmsg->tcm_handle = handle;
3717 tcmsg->tcm_parent = parent;
3718
3719 error = tc_transact(&request, replyp);
3720 if (error) {
3721 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3722 netdev_get_name(netdev),
3723 tc_get_major(handle), tc_get_minor(handle),
3724 tc_get_major(parent), tc_get_minor(parent),
3725 strerror(error));
3726 }
3727 return error;
3728}
3729
3730/* Equivalent to "tc class del dev <name> handle <handle>". */
3731static int
3732tc_delete_class(const struct netdev *netdev, unsigned int handle)
3733{
3734 struct ofpbuf request;
3735 struct tcmsg *tcmsg;
3736 int error;
3737
3738 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3739 if (!tcmsg) {
3740 return ENODEV;
3741 }
c1c9c9c4
BP
3742 tcmsg->tcm_handle = handle;
3743 tcmsg->tcm_parent = 0;
3744
3745 error = tc_transact(&request, NULL);
3746 if (error) {
3747 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3748 netdev_get_name(netdev),
3749 tc_get_major(handle), tc_get_minor(handle),
3750 strerror(error));
3751 }
3752 return error;
3753}
3754
3755/* Equivalent to "tc qdisc del dev <name> root". */
3756static int
3757tc_del_qdisc(struct netdev *netdev)
3758{
3759 struct netdev_dev_linux *netdev_dev =
3760 netdev_dev_linux_cast(netdev_get_dev(netdev));
3761 struct ofpbuf request;
3762 struct tcmsg *tcmsg;
3763 int error;
3764
3765 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3766 if (!tcmsg) {
3767 return ENODEV;
3768 }
c1c9c9c4
BP
3769 tcmsg->tcm_handle = tc_make_handle(1, 0);
3770 tcmsg->tcm_parent = TC_H_ROOT;
3771
3772 error = tc_transact(&request, NULL);
3773 if (error == EINVAL) {
3774 /* EINVAL probably means that the default qdisc was in use, in which
3775 * case we've accomplished our purpose. */
3776 error = 0;
3777 }
3778 if (!error && netdev_dev->tc) {
3779 if (netdev_dev->tc->ops->tc_destroy) {
3780 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3781 }
3782 netdev_dev->tc = NULL;
3783 }
3784 return error;
3785}
3786
3787/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3788 * kernel to determine what they are. Returns 0 if successful, otherwise a
3789 * positive errno value. */
3790static int
3791tc_query_qdisc(const struct netdev *netdev)
3792{
3793 struct netdev_dev_linux *netdev_dev =
3794 netdev_dev_linux_cast(netdev_get_dev(netdev));
3795 struct ofpbuf request, *qdisc;
3796 const struct tc_ops *ops;
3797 struct tcmsg *tcmsg;
3798 int load_error;
3799 int error;
3800
3801 if (netdev_dev->tc) {
3802 return 0;
3803 }
3804
3805 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3806 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3807 * 2.6.35 without that fix backported to it.
3808 *
3809 * To avoid the OOPS, we must not make a request that would attempt to dump
3810 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3811 * few others. There are a few ways that I can see to do this, but most of
3812 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3813 * technique chosen here is to assume that any non-default qdisc that we
3814 * create will have a class with handle 1:0. The built-in qdiscs only have
3815 * a class with handle 0:0.
3816 *
3817 * We could check for Linux 2.6.35+ and use a more straightforward method
3818 * there. */
3819 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3820 if (!tcmsg) {
3821 return ENODEV;
3822 }
c1c9c9c4
BP
3823 tcmsg->tcm_handle = tc_make_handle(1, 0);
3824 tcmsg->tcm_parent = 0;
3825
3826 /* Figure out what tc class to instantiate. */
3827 error = tc_transact(&request, &qdisc);
3828 if (!error) {
3829 const char *kind;
3830
3831 error = tc_parse_qdisc(qdisc, &kind, NULL);
3832 if (error) {
3833 ops = &tc_ops_other;
3834 } else {
3835 ops = tc_lookup_linux_name(kind);
3836 if (!ops) {
3837 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3838 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3839
3840 ops = &tc_ops_other;
3841 }
3842 }
3843 } else if (error == ENOENT) {
3844 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3845 * other entity that doesn't have a handle 1:0. We will assume
3846 * that it's the system default qdisc. */
3847 ops = &tc_ops_default;
3848 error = 0;
3849 } else {
3850 /* Who knows? Maybe the device got deleted. */
3851 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3852 netdev_get_name(netdev), strerror(error));
3853 ops = &tc_ops_other;
3854 }
3855
3856 /* Instantiate it. */
3857 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3858 assert((load_error == 0) == (netdev_dev->tc != NULL));
3859 ofpbuf_delete(qdisc);
3860
3861 return error ? error : load_error;
3862}
3863
3864/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3865 approximate the time to transmit packets of various lengths. For an MTU of
3866 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3867 represents two possible packet lengths; for a MTU of 513 through 1024, four
3868 possible lengths; and so on.
3869
3870 Returns, for the specified 'mtu', the number of bits that packet lengths
3871 need to be shifted right to fit within such a 256-entry table. */
3872static int
3873tc_calc_cell_log(unsigned int mtu)
3874{
3875 int cell_log;
3876
3877 if (!mtu) {
3878 mtu = ETH_PAYLOAD_MAX;
3879 }
3880 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3881
3882 for (cell_log = 0; mtu >= 256; cell_log++) {
3883 mtu >>= 1;
3884 }
3885
3886 return cell_log;
3887}
3888
3889/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3890 * of 'mtu'. */
3891static void
3892tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3893{
3894 memset(rate, 0, sizeof *rate);
3895 rate->cell_log = tc_calc_cell_log(mtu);
3896 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3897 /* rate->cell_align = 0; */ /* distro headers. */
3898 rate->mpu = ETH_TOTAL_MIN;
3899 rate->rate = Bps;
3900}
3901
3902/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3903 * attribute of the specified "type".
3904 *
3905 * See tc_calc_cell_log() above for a description of "rtab"s. */
3906static void
3907tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3908{
3909 uint32_t *rtab;
3910 unsigned int i;
3911
3912 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3913 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3914 unsigned packet_size = (i + 1) << rate->cell_log;
3915 if (packet_size < rate->mpu) {
3916 packet_size = rate->mpu;
3917 }
3918 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3919 }
3920}
3921
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.)
 *
 * The burst actually used is never less than one jiffy's worth of data plus
 * one MTU. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, floor_bytes));
}
d3980822 3932\f
aaf2fb1a
BP
3933/* Linux-only functions declared in netdev-linux.h */
3934
3935/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
3936 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
3937int
3938netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
3939 const char *flag_name, bool enable)
3940{
3941 const char *netdev_name = netdev_get_name(netdev);
3942 struct ethtool_value evalue;
3943 uint32_t new_flags;
3944 int error;
3945
3946 memset(&evalue, 0, sizeof evalue);
3947 error = netdev_linux_do_ethtool(netdev_name,
3948 (struct ethtool_cmd *)&evalue,
3949 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3950 if (error) {
3951 return error;
3952 }
3953
3954 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
3955 error = netdev_linux_do_ethtool(netdev_name,
3956 (struct ethtool_cmd *)&evalue,
3957 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
3958 if (error) {
3959 return error;
3960 }
3961
3962 memset(&evalue, 0, sizeof evalue);
3963 error = netdev_linux_do_ethtool(netdev_name,
3964 (struct ethtool_cmd *)&evalue,
3965 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3966 if (error) {
3967 return error;
3968 }
3969
3970 if (new_flags != evalue.data) {
3971 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
3972 "device %s failed", enable ? "enable" : "disable",
3973 flag_name, netdev_name);
3974 return EOPNOTSUPP;
3975 }
3976
3977 return 0;
3978}
3979\f
3980/* Utility functions. */
3981
d3980822 3982/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 3983static void
d3980822
BP
3984netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3985 const struct rtnl_link_stats *src)
3986{
f613a0d7
PS
3987 dst->rx_packets = src->rx_packets;
3988 dst->tx_packets = src->tx_packets;
3989 dst->rx_bytes = src->rx_bytes;
3990 dst->tx_bytes = src->tx_bytes;
3991 dst->rx_errors = src->rx_errors;
3992 dst->tx_errors = src->tx_errors;
3993 dst->rx_dropped = src->rx_dropped;
3994 dst->tx_dropped = src->tx_dropped;
3995 dst->multicast = src->multicast;
3996 dst->collisions = src->collisions;
3997 dst->rx_length_errors = src->rx_length_errors;
3998 dst->rx_over_errors = src->rx_over_errors;
3999 dst->rx_crc_errors = src->rx_crc_errors;
4000 dst->rx_frame_errors = src->rx_frame_errors;
4001 dst->rx_fifo_errors = src->rx_fifo_errors;
4002 dst->rx_missed_errors = src->rx_missed_errors;
4003 dst->tx_aborted_errors = src->tx_aborted_errors;
4004 dst->tx_carrier_errors = src->tx_carrier_errors;
4005 dst->tx_fifo_errors = src->tx_fifo_errors;
4006 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4007 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4008}
4009
c1c9c9c4
BP
4010static int
4011get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4012{
4013 /* Policy for RTNLGRP_LINK messages.
4014 *
4015 * There are *many* more fields in these messages, but currently we only
4016 * care about these fields. */
4017 static const struct nl_policy rtnlgrp_link_policy[] = {
4018 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4019 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4020 .min_len = sizeof(struct rtnl_link_stats) },
4021 };
4022
4023 struct ofpbuf request;
4024 struct ofpbuf *reply;
4025 struct ifinfomsg *ifi;
c1c9c9c4
BP
4026 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4027 int error;
4028
4029 ofpbuf_init(&request, 0);
4030 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4031 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4032 ifi->ifi_family = PF_UNSPEC;
4033 ifi->ifi_index = ifindex;
4034 error = nl_sock_transact(rtnl_sock, &request, &reply);
4035 ofpbuf_uninit(&request);
4036 if (error) {
4037 return error;
4038 }
4039
4040 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4041 rtnlgrp_link_policy,
4042 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4043 ofpbuf_delete(reply);
4044 return EPROTO;
4045 }
4046
4047 if (!attrs[IFLA_STATS]) {
4048 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4049 ofpbuf_delete(reply);
4050 return EPROTO;
4051 }
8b61709d 4052
d3980822 4053 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4054
576e26d7
BP
4055 ofpbuf_delete(reply);
4056
8b61709d
BP
4057 return 0;
4058}
4059
4060static int
4061get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4062{
4063 static const char fn[] = "/proc/net/dev";
4064 char line[1024];
4065 FILE *stream;
4066 int ln;
4067
4068 stream = fopen(fn, "r");
4069 if (!stream) {
4070 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4071 return errno;
4072 }
4073
4074 ln = 0;
4075 while (fgets(line, sizeof line, stream)) {
4076 if (++ln >= 3) {
4077 char devname[16];
4078#define X64 "%"SCNu64
4079 if (sscanf(line,
4080 " %15[^:]:"
4081 X64 X64 X64 X64 X64 X64 X64 "%*u"
4082 X64 X64 X64 X64 X64 X64 X64 "%*u",
4083 devname,
4084 &stats->rx_bytes,
4085 &stats->rx_packets,
4086 &stats->rx_errors,
4087 &stats->rx_dropped,
4088 &stats->rx_fifo_errors,
4089 &stats->rx_frame_errors,
4090 &stats->multicast,
4091 &stats->tx_bytes,
4092 &stats->tx_packets,
4093 &stats->tx_errors,
4094 &stats->tx_dropped,
4095 &stats->tx_fifo_errors,
4096 &stats->collisions,
4097 &stats->tx_carrier_errors) != 15) {
4098 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4099 } else if (!strcmp(devname, netdev_name)) {
4100 stats->rx_length_errors = UINT64_MAX;
4101 stats->rx_over_errors = UINT64_MAX;
4102 stats->rx_crc_errors = UINT64_MAX;
4103 stats->rx_missed_errors = UINT64_MAX;
4104 stats->tx_aborted_errors = UINT64_MAX;
4105 stats->tx_heartbeat_errors = UINT64_MAX;
4106 stats->tx_window_errors = UINT64_MAX;
4107 fclose(stream);
4108 return 0;
4109 }
4110 }
4111 }
4112 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4113 fclose(stream);
4114 return ENODEV;
4115}
c1c9c9c4 4116
3a183124
EJ
4117static int
4118get_carrier_via_sysfs(const char *name, bool *carrier)
4119{
4120 char line[8];
4121 int retval;
4122
4123 int error = 0;
4124 char *fn = NULL;
4125 int fd = -1;
4126
4127 *carrier = false;
4128
4129 fn = xasprintf("/sys/class/net/%s/carrier", name);
4130 fd = open(fn, O_RDONLY);
4131 if (fd < 0) {
4132 error = errno;
4133 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4134 goto exit;
4135 }
4136
4137 retval = read(fd, line, sizeof line);
4138 if (retval < 0) {
4139 error = errno;
4140 if (error == EINVAL) {
4141 /* This is the normal return value when we try to check carrier if
4142 * the network device is not up. */
4143 } else {
4144 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4145 }
4146 goto exit;
4147 } else if (retval == 0) {
4148 error = EPROTO;
4149 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
4150 goto exit;
4151 }
4152
4153 if (line[0] != '0' && line[0] != '1') {
4154 error = EPROTO;
4155 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4156 goto exit;
4157 }
4158 *carrier = line[0] != '0';
4159 error = 0;
4160
4161exit:
4162 if (fd >= 0) {
4163 close(fd);
4164 }
4165 free(fn);
4166 return error;
4167}
4168
8b61709d
BP
4169static int
4170get_flags(const struct netdev *netdev, int *flags)
4171{
4172 struct ifreq ifr;
4173 int error;
4174
149f577a
JG
4175 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4176 "SIOCGIFFLAGS");
8b61709d
BP
4177 *flags = ifr.ifr_flags;
4178 return error;
4179}
4180
4181static int
4182set_flags(struct netdev *netdev, int flags)
4183{
4184 struct ifreq ifr;
4185
4186 ifr.ifr_flags = flags;
149f577a
JG
4187 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4188 "SIOCSIFFLAGS");
8b61709d
BP
4189}
4190
4191static int
4192do_get_ifindex(const char *netdev_name)
4193{
4194 struct ifreq ifr;
4195
71d7c22f 4196 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4197 COVERAGE_INC(netdev_get_ifindex);
4198 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4199 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4200 netdev_name, strerror(errno));
4201 return -errno;
4202 }
4203 return ifr.ifr_ifindex;
4204}
4205
4206static int
4207get_ifindex(const struct netdev *netdev_, int *ifindexp)
4208{
149f577a
JG
4209 struct netdev_dev_linux *netdev_dev =
4210 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4211 *ifindexp = 0;
149f577a 4212 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4213 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4214 if (ifindex < 0) {
4215 return -ifindex;
4216 }
149f577a
JG
4217 netdev_dev->cache_valid |= VALID_IFINDEX;
4218 netdev_dev->ifindex = ifindex;
8b61709d 4219 }
149f577a 4220 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4221 return 0;
4222}
4223
4224static int
4225get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4226{
4227 struct ifreq ifr;
4228 int hwaddr_family;
4229
4230 memset(&ifr, 0, sizeof ifr);
71d7c22f 4231 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4232 COVERAGE_INC(netdev_get_hwaddr);
4233 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4234 /* ENODEV probably means that a vif disappeared asynchronously and
4235 * hasn't been removed from the database yet, so reduce the log level
4236 * to INFO for that case. */
4237 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4238 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4239 netdev_name, strerror(errno));
8b61709d
BP
4240 return errno;
4241 }
4242 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4243 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4244 VLOG_WARN("%s device has unknown hardware address family %d",
4245 netdev_name, hwaddr_family);
4246 }
4247 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4248 return 0;
4249}
4250
4251static int
4252set_etheraddr(const char *netdev_name, int hwaddr_family,
4253 const uint8_t mac[ETH_ADDR_LEN])
4254{
4255 struct ifreq ifr;
4256
4257 memset(&ifr, 0, sizeof ifr);
71d7c22f 4258 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4259 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4260 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4261 COVERAGE_INC(netdev_set_hwaddr);
4262 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4263 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4264 netdev_name, strerror(errno));
4265 return errno;
4266 }
4267 return 0;
4268}
4269
4270static int
0b0544d7 4271netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4272 int cmd, const char *cmd_name)
4273{
4274 struct ifreq ifr;
4275
4276 memset(&ifr, 0, sizeof ifr);
71d7c22f 4277 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4278 ifr.ifr_data = (caddr_t) ecmd;
4279
4280 ecmd->cmd = cmd;
4281 COVERAGE_INC(netdev_ethtool);
4282 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4283 return 0;
4284 } else {
4285 if (errno != EOPNOTSUPP) {
4286 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4287 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4288 } else {
4289 /* The device doesn't support this operation. That's pretty
4290 * common, so there's no point in logging anything. */
4291 }
4292 return errno;
4293 }
4294}
4295
4296static int
149f577a
JG
4297netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4298 const char *cmd_name)
8b61709d 4299{
71d7c22f 4300 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4301 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4302 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4303 strerror(errno));
8b61709d
BP
4304 return errno;
4305 }
4306 return 0;
4307}
f1acd62b
BP
4308
4309static int
4310netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4311 int cmd, const char *cmd_name)
4312{
4313 struct ifreq ifr;
4314 int error;
4315
4316 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4317 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4318 if (!error) {
4319 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4320 *ip = sin->sin_addr;
4321 }
4322 return error;
4323}
488d734d
BP
4324
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created on the first call and set nonblocking; the result,
 * whether a descriptor or a negative errno value, is kept in a function-local
 * static and handed back unchanged by every later call. */
static int
af_packet_sock(void)
{
    /* INT_MIN marks "creation not attempted yet". */
    static int sock = INT_MIN;

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
        return sock;
    }

    set_nonblocking(sock);
    return sock;
}