]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
reconnect.py: Fix Python 2.4 compatibility break.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
782e6111 2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
8b61709d 18#include <assert.h>
e9e28be3 19#include <errno.h>
8b61709d
BP
20#include <fcntl.h>
21#include <arpa/inet.h>
22#include <inttypes.h>
c1c9c9c4 23#include <linux/gen_stats.h>
8b61709d 24#include <linux/if_tun.h>
a740f0de 25#include <linux/ip.h>
8b61709d
BP
26#include <linux/types.h>
27#include <linux/ethtool.h>
63331829 28#include <linux/mii.h>
6f42c8ea 29#include <linux/pkt_sched.h>
e9e28be3 30#include <linux/rtnetlink.h>
8b61709d
BP
31#include <linux/sockios.h>
32#include <linux/version.h>
33#include <sys/types.h>
34#include <sys/ioctl.h>
35#include <sys/socket.h>
36#include <netpacket/packet.h>
37#include <net/ethernet.h>
38#include <net/if.h>
a740f0de 39#include <linux/if_tunnel.h>
8b61709d
BP
40#include <net/if_arp.h>
41#include <net/if_packet.h>
42#include <net/route.h>
43#include <netinet/in.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
9fe3b9a2 50#include "dpif-linux.h"
8b61709d
BP
51#include "dynamic-string.h"
52#include "fatal-signal.h"
93b13be8
BP
53#include "hash.h"
54#include "hmap.h"
8b61709d 55#include "netdev-provider.h"
7fbef77a 56#include "netdev-vport.h"
e9e28be3 57#include "netlink.h"
2fe27d5a 58#include "netlink-socket.h"
e9e28be3 59#include "ofpbuf.h"
8b61709d
BP
60#include "openflow/openflow.h"
61#include "packets.h"
62#include "poll-loop.h"
559843ed 63#include "rtnetlink.h"
21d6e22e 64#include "rtnetlink-link.h"
8b61709d
BP
65#include "socket-util.h"
66#include "shash.h"
67#include "svec.h"
e9e28be3 68#include "vlog.h"
5136ce49 69
d98e6007 70VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
71
72COVERAGE_DEFINE(netdev_get_vlan_vid);
73COVERAGE_DEFINE(netdev_set_policing);
74COVERAGE_DEFINE(netdev_arp_lookup);
75COVERAGE_DEFINE(netdev_get_ifindex);
76COVERAGE_DEFINE(netdev_get_hwaddr);
77COVERAGE_DEFINE(netdev_set_hwaddr);
78COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
79\f
80/* These were introduced in Linux 2.6.14, so they might be missing if we have
81 * old headers. */
82#ifndef ADVERTISED_Pause
83#define ADVERTISED_Pause (1 << 13)
84#endif
85#ifndef ADVERTISED_Asym_Pause
86#define ADVERTISED_Asym_Pause (1 << 14)
87#endif
88
c1c9c9c4
BP
89/* This was introduced in Linux 2.6.25, so it might be missing if we have old
90 * headers. */
91#ifndef TC_RTAB_SIZE
92#define TC_RTAB_SIZE 1024
93#endif
94
/* rtnetlink notifier used to invalidate cached device state (see
 * netdev_linux_cache_cb()).  It is registered when the first system or
 * internal device is created and unregistered when the last one is destroyed;
 * 'cache_notifier_refcount' counts the devices that currently rely on it. */
static struct rtnetlink_notifier netdev_linux_cache_notifier;
static int cache_notifier_refcount;
8b61709d
BP
97
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A cached member of that
 * structure is only meaningful while its corresponding bit is set. */
enum {
    VALID_IFINDEX = 1 << 0,
    VALID_ETHERADDR = 1 << 1,
    VALID_IN4 = 1 << 2,
    VALID_IN6 = 1 << 3,
    VALID_MTU = 1 << 4,
    VALID_CARRIER = 1 << 5,
    VALID_IS_PSEUDO = 1 << 6,   /* Represents is_internal and is_tap. */
    VALID_POLICING = 1 << 7,
    VALID_HAVE_VPORT_STATS = 1 << 8
};
109
149f577a
JG
/* State kept for a tap device. */
struct tap_state {
    int fd;                     /* Descriptor obtained by opening the tun/tap
                                 * control device in
                                 * netdev_linux_create_tap(). */
    bool opened;                /* Set by netdev_linux_open() once 'fd' has
                                 * been handed to the first opener. */
};
c1c9c9c4
BP
114\f
115/* Traffic control. */
116
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Implementation of this TC class. */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};
c1c9c9c4 128
93b13be8
BP
/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
};
137
/* A particular kind of traffic control.  Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct shash *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct shash *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct shash *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct shash *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct shash *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
282
283static void
284tc_init(struct tc *tc, const struct tc_ops *ops)
285{
286 tc->ops = ops;
93b13be8 287 hmap_init(&tc->queues);
c1c9c9c4
BP
288}
289
/* Releases the generic part of 'tc' by destroying its 'queues' hmap.  Does
 * not free 'tc' itself. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
295
/* Forward declarations of the TC implementations listed in 'tcs' below. */
static const struct tc_ops tc_ops_htb;
static const struct tc_ops tc_ops_hfsc;
static const struct tc_ops tc_ops_default;
static const struct tc_ops tc_ops_other;

/* Null-terminated table of the known TC implementations. */
static const struct tc_ops *tcs[] = {
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
    &tc_ops_other,              /* Some other qdisc. */
    NULL
};
149f577a 308
c1c9c9c4
BP
/* Conversions between a TC handle and its major/minor parts. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Rate, tick, and buffer arithmetic. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Building and sending TC Netlink requests. */
static struct tcmsg *tc_make_request(const struct netdev *, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);

/* Parsing, querying, and deleting qdiscs and classes. */
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate specification and rate table helpers. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
339\f
149f577a
JG
/* Linux-specific device state.  Embeds the generic 'struct netdev_dev' so
 * that netdev_dev_linux_cast() can recover it with CONTAINER_OF. */
struct netdev_dev_linux {
    struct netdev_dev netdev_dev;

    struct shash_node *shash_node;
    unsigned int cache_valid;   /* Bitmap of VALID_* bits; reset to 0 by
                                 * netdev_linux_cache_cb(). */

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    uint8_t etheraddr[ETH_ADDR_LEN];
    struct in_addr address, netmask;
    struct in6_addr in6;
    int mtu;
    int carrier;
    bool is_internal;           /* Is this an openvswitch internal device? */
    bool is_tap;                /* Is this a tuntap device? */
    uint32_t kbits_rate;        /* Policing data. */
    uint32_t kbits_burst;
    bool have_vport_stats;
    struct tc *tc;              /* Traffic control state; may be null (see
                                 * netdev_linux_destroy()). */

    /* Per-device-type state. */
    union {
        struct tap_state tap;
    } state;
};
365
149f577a
JG
/* One open handle on a Linux network device. */
struct netdev_linux {
    struct netdev netdev;
    int fd;                     /* Descriptor used for packet I/O, or -1 if
                                 * this handle has none. */
};
8b61709d 370
8b61709d
BP
/* An AF_INET socket (used for ioctl operations).  Created by
 * netdev_linux_init(). */
static int af_inet_sock = -1;

/* A Netlink routing socket that is not subscribed to any multicast groups.
 * Created by netdev_linux_init(). */
static struct nl_sock *rtnl_sock;

struct netdev_linux_notifier {
    struct netdev_notifier notifier;
    struct list node;
};

static struct shash netdev_linux_notifiers =
    SHASH_INITIALIZER(&netdev_linux_notifiers);
static struct rtnetlink_notifier netdev_linux_poll_notifier;

/* This is set pretty low because we probably won't learn anything from the
 * additional log messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
389
/* Declared early because is_netdev_linux_class() compares a class's 'init'
 * member against it. */
static int netdev_linux_init(void);

/* Helpers defined later in this file. */
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
                                   int cmd, const char *cmd_name);
static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
                                 const char *cmd_name);
static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
                                 int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, int *flagsp);
static int set_flags(struct netdev *, int flags);
static int do_get_ifindex(const char *netdev_name);
static int get_ifindex(const struct netdev *, int *ifindexp);
static int do_set_addr(struct netdev *netdev,
                       int ioctl_nr, const char *ioctl_name,
                       struct in_addr addr);
static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
static int set_etheraddr(const char *netdev_name, int hwaddr_family,
                         const uint8_t[ETH_ADDR_LEN]);
static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
410
15b3596a
JG
411static bool
412is_netdev_linux_class(const struct netdev_class *netdev_class)
413{
414 return netdev_class->init == netdev_linux_init;
415}
416
149f577a
JG
417static struct netdev_dev_linux *
418netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 419{
15b3596a
JG
420 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
421 assert(is_netdev_linux_class(netdev_class));
422
149f577a 423 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
424}
425
8b61709d
BP
426static struct netdev_linux *
427netdev_linux_cast(const struct netdev *netdev)
428{
15b3596a
JG
429 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
430 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
431 assert(is_netdev_linux_class(netdev_class));
432
8b61709d
BP
433 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
434}
ff4ed3c9 435\f
8b61709d
BP
436static int
437netdev_linux_init(void)
438{
439 static int status = -1;
440 if (status < 0) {
ff4ed3c9 441 /* Create AF_INET socket. */
8b61709d
BP
442 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
443 status = af_inet_sock >= 0 ? 0 : errno;
444 if (status) {
445 VLOG_ERR("failed to create inet socket: %s", strerror(status));
446 }
ff4ed3c9
BP
447
448 /* Create rtnetlink socket. */
449 if (!status) {
cceb11f5 450 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
451 if (status) {
452 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
453 strerror(status));
454 }
455 }
8b61709d
BP
456 }
457 return status;
458}
459
/* Periodic processing for this netdev provider: lets the rtnetlink link
 * notifier do its work. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_notifier_run();
}
465
/* Counterpart of netdev_linux_run(): delegates to the rtnetlink link
 * notifier's wait function. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_notifier_wait();
}
471
472static void
21d6e22e 473netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 474 void *aux OVS_UNUSED)
8b61709d 475{
149f577a 476 struct netdev_dev_linux *dev;
8b61709d 477 if (change) {
46415c90
JG
478 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
479 if (base_dev) {
15b3596a
JG
480 const struct netdev_class *netdev_class =
481 netdev_dev_get_class(base_dev);
482
483 if (is_netdev_linux_class(netdev_class)) {
484 dev = netdev_dev_linux_cast(base_dev);
485 dev->cache_valid = 0;
486 }
8b61709d
BP
487 }
488 } else {
46415c90 489 struct shash device_shash;
8b61709d 490 struct shash_node *node;
46415c90
JG
491
492 shash_init(&device_shash);
493 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
494 SHASH_FOR_EACH (node, &device_shash) {
149f577a
JG
495 dev = node->data;
496 dev->cache_valid = 0;
8b61709d 497 }
46415c90 498 shash_destroy(&device_shash);
8b61709d
BP
499 }
500}
501
c3827f61 502/* Creates system and internal devices. */
8b61709d 503static int
c3827f61 504netdev_linux_create(const struct netdev_class *class,
b8dcf5e9
BP
505 const char *name, const struct shash *args,
506 struct netdev_dev **netdev_devp)
6c88d577 507{
149f577a
JG
508 struct netdev_dev_linux *netdev_dev;
509 int error;
6c88d577
JP
510
511 if (!shash_is_empty(args)) {
c3827f61
BP
512 VLOG_WARN("%s: arguments for %s devices should be empty",
513 name, class->type);
6c88d577
JP
514 }
515
46415c90 516 if (!cache_notifier_refcount) {
21d6e22e
EJ
517 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
518 netdev_linux_cache_cb, NULL);
149f577a
JG
519 if (error) {
520 return error;
521 }
522 }
46415c90 523 cache_notifier_refcount++;
6c88d577 524
149f577a 525 netdev_dev = xzalloc(sizeof *netdev_dev);
6d9e6eb4 526 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
46415c90 527
149f577a 528 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
529 return 0;
530}
531
5b7448ed
JG
532/* For most types of netdevs we open the device for each call of
533 * netdev_open(). However, this is not the case with tap devices,
534 * since it is only possible to open the device once. In this
535 * situation we share a single file descriptor, and consequently
536 * buffers, across all readers. Therefore once data is read it will
537 * be unavailable to other reads for tap devices. */
a740f0de 538static int
b8dcf5e9
BP
539netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
540 const char *name, const struct shash *args,
541 struct netdev_dev **netdev_devp)
a740f0de 542{
149f577a 543 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
544 struct tap_state *state;
545 static const char tap_dev[] = "/dev/net/tun";
546 struct ifreq ifr;
547 int error;
548
549 if (!shash_is_empty(args)) {
149f577a 550 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
6c88d577
JP
551 }
552
149f577a
JG
553 netdev_dev = xzalloc(sizeof *netdev_dev);
554 state = &netdev_dev->state.tap;
a740f0de 555
6c88d577 556 /* Open tap device. */
149f577a
JG
557 state->fd = open(tap_dev, O_RDWR);
558 if (state->fd < 0) {
6c88d577
JP
559 error = errno;
560 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
561 goto error;
562 }
563
564 /* Create tap device. */
565 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 566 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 567 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
568 VLOG_WARN("%s: creating tap device failed: %s", name,
569 strerror(errno));
570 error = errno;
571 goto error;
572 }
573
574 /* Make non-blocking. */
149f577a 575 error = set_nonblocking(state->fd);
a740f0de
JG
576 if (error) {
577 goto error;
578 }
579
6d9e6eb4 580 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
149f577a 581 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
582 return 0;
583
584error:
149f577a 585 free(netdev_dev);
a740f0de
JG
586 return error;
587}
588
a740f0de 589static void
149f577a 590destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 591{
149f577a
JG
592 struct tap_state *state = &netdev_dev->state.tap;
593
594 if (state->fd >= 0) {
595 close(state->fd);
a740f0de
JG
596 }
597}
598
149f577a 599/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 600static void
149f577a 601netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 602{
149f577a 603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 604 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 605
c1c9c9c4
BP
606 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
607 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
608 }
609
d2bb2799 610 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 611 cache_notifier_refcount--;
149f577a 612
46415c90 613 if (!cache_notifier_refcount) {
21d6e22e 614 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
149f577a 615 }
d2bb2799 616 } else if (class == &netdev_tap_class) {
149f577a 617 destroy_tap(netdev_dev);
d2bb2799
BP
618 } else {
619 NOT_REACHED();
6c88d577 620 }
149f577a 621
658797c8 622 free(netdev_dev);
6c88d577
JP
623}
624
/* Opens a new handle on 'netdev_dev_'.
 *
 * 'ethertype' selects what the handle can receive.  With
 * NETDEV_ETH_TYPE_NONE no packet socket is created and the handle's 'fd'
 * stays -1, except for the first opener of a tap device, which is given the
 * tap fd regardless.
 *
 * On success, stores the new handle in '*netdevp' and returns 0.  On failure,
 * returns a nonzero error value after calling netdev_uninit() on the
 * partially constructed handle.
 * NOTE(review): presumably netdev_uninit(..., true) ends up closing
 * 'netdev->fd' and freeing 'netdev' -- confirm against its definition. */
static int
netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
                  struct netdev **netdevp)
{
    struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
    struct netdev_linux *netdev;
    enum netdev_flags flags;
    int error;

    /* Allocate network device. */
    netdev = xzalloc(sizeof *netdev);
    netdev->fd = -1;
    netdev_init(&netdev->netdev, netdev_dev_);

    /* Verify that the device really exists, by attempting to read its flags.
     * (The flags might be cached, in which case this won't actually do an
     * ioctl.)
     *
     * Don't do this for "internal" netdevs, though, because those have to be
     * created as netdev objects before they exist in the kernel, because
     * creating them in the kernel happens by passing a netdev object to
     * dpif_port_add(). */
    if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
        error = netdev_get_flags(&netdev->netdev, &flags);
        /* Only ENODEV aborts the open; any other error from reading the flags
         * is ignored here. */
        if (error == ENODEV) {
            goto error;
        }
    }

    if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
        !netdev_dev->state.tap.opened) {

        /* We assume that the first user of the tap device is the primary user
         * and give them the tap FD.  Subsequent users probably just expect
         * this to be a system device so open it normally to avoid send/receive
         * directions appearing to be reversed. */
        netdev->fd = netdev_dev->state.tap.fd;
        netdev_dev->state.tap.opened = true;
    } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
        struct sockaddr_ll sll;
        int protocol;
        int ifindex;

        /* Create file descriptor. */
        protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
                    : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
                    : ethertype);
        netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
        if (netdev->fd < 0) {
            error = errno;
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(netdev->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->netdev, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        if (bind(netdev->fd,
                 (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
                     strerror(error));
            goto error;
        }

        /* Between the socket() and bind() calls above, the socket receives all
         * packets of the requested type on all system interfaces.  We do not
         * want to receive that data, but there is no way to avoid it.  So we
         * must now drain out the receive queue. */
        error = drain_rcvbuf(netdev->fd);
        if (error) {
            goto error;
        }
    }

    *netdevp = &netdev->netdev;
    return 0;

error:
    netdev_uninit(&netdev->netdev, true);
    return error;
}
719
720/* Closes and destroys 'netdev'. */
721static void
722netdev_linux_close(struct netdev *netdev_)
723{
724 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
725
49a6a163 726 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 727 close(netdev->fd);
8b61709d
BP
728 }
729 free(netdev);
730}
e9e28be3 731
8b61709d
BP
/* Adds the names of all known network devices to 'svec'.
 *
 * Returns 0 if successful, otherwise the errno value left by the failing
 * if_nameindex() call. */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names;

    names = if_nameindex();
    if (names) {
        size_t i;

        for (i = 0; names[i].if_name != NULL; i++) {
            svec_add(svec, names[i].if_name);
        }
        if_freenameindex(names);
        return 0;
    } else {
        /* Save errno before logging, since logging may overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }
}
753
754static int
755netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
756{
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758
5b7448ed 759 if (netdev->fd < 0) {
8b61709d 760 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
c0e5f6ca 761 return -EAGAIN;
8b61709d
BP
762 }
763
764 for (;;) {
5b7448ed 765 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
766 if (retval >= 0) {
767 return retval;
768 } else if (errno != EINTR) {
769 if (errno != EAGAIN) {
770 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
771 strerror(errno), netdev_get_name(netdev_));
772 }
c0e5f6ca 773 return -errno;
8b61709d
BP
774 }
775 }
776}
777
778/* Registers with the poll loop to wake up from the next call to poll_block()
779 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
780static void
781netdev_linux_recv_wait(struct netdev *netdev_)
782{
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
784 if (netdev->fd >= 0) {
785 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
786 }
787}
788
789/* Discards all packets waiting to be received from 'netdev'. */
790static int
791netdev_linux_drain(struct netdev *netdev_)
792{
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 794 if (netdev->fd < 0) {
8b61709d 795 return 0;
5b7448ed 796 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 797 struct ifreq ifr;
149f577a 798 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
799 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
800 if (error) {
801 return error;
802 }
5b7448ed 803 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
804 return 0;
805 } else {
5b7448ed 806 return drain_rcvbuf(netdev->fd);
8b61709d
BP
807 }
808}
809
810/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
811 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
812 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
813 * the packet is too big or too small to transmit on the device.
814 *
815 * The caller retains ownership of 'buffer' in all cases.
816 *
817 * The kernel maintains a packet transmission queue, so the caller is not
818 * expected to do additional queuing of packets. */
819static int
820netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
821{
822 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823
824 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
825 */
5b7448ed 826 if (netdev->fd < 0) {
8b61709d
BP
827 return EPIPE;
828 }
829
830 for (;;) {
5b7448ed 831 ssize_t retval = write(netdev->fd, data, size);
8b61709d
BP
832 if (retval < 0) {
833 /* The Linux AF_PACKET implementation never blocks waiting for room
834 * for packets, instead returning ENOBUFS. Translate this into
835 * EAGAIN for the caller. */
836 if (errno == ENOBUFS) {
837 return EAGAIN;
838 } else if (errno == EINTR) {
839 continue;
840 } else if (errno != EAGAIN) {
841 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
842 netdev_get_name(netdev_), strerror(errno));
843 }
844 return errno;
845 } else if (retval != size) {
846 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
847 "%zu) on %s", retval, size, netdev_get_name(netdev_));
848 return EMSGSIZE;
849 } else {
850 return 0;
851 }
852 }
853}
854
855/* Registers with the poll loop to wake up from the next call to poll_block()
856 * when the packet transmission queue has sufficient room to transmit a packet
857 * with netdev_send().
858 *
859 * The kernel maintains a packet transmission queue, so the client is not
860 * expected to do additional queuing of packets. Thus, this function is
861 * unlikely to ever be used. It is included for completeness. */
862static void
863netdev_linux_send_wait(struct netdev *netdev_)
864{
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 866 if (netdev->fd < 0) {
8b61709d 867 /* Nothing to do. */
5b7448ed
JG
868 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
869 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
870 } else {
871 /* TAP device always accepts packets.*/
872 poll_immediate_wake();
873 }
874}
875
876/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
877 * otherwise a positive errno value. */
878static int
879netdev_linux_set_etheraddr(struct netdev *netdev_,
880 const uint8_t mac[ETH_ADDR_LEN])
881{
149f577a
JG
882 struct netdev_dev_linux *netdev_dev =
883 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
884 int error;
885
149f577a
JG
886 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
887 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
888 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
889 if (!error) {
149f577a
JG
890 netdev_dev->cache_valid |= VALID_ETHERADDR;
891 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
892 }
893 } else {
894 error = 0;
8b61709d
BP
895 }
896 return error;
897}
898
899/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
900 * free the returned buffer. */
901static int
902netdev_linux_get_etheraddr(const struct netdev *netdev_,
903 uint8_t mac[ETH_ADDR_LEN])
904{
149f577a
JG
905 struct netdev_dev_linux *netdev_dev =
906 netdev_dev_linux_cast(netdev_get_dev(netdev_));
907 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 908 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 909 netdev_dev->etheraddr);
8b61709d
BP
910 if (error) {
911 return error;
912 }
149f577a 913 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 914 }
149f577a 915 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
916 return 0;
917}
918
919/* Returns the maximum size of transmitted (and received) packets on 'netdev',
920 * in bytes, not including the hardware header; thus, this is typically 1500
921 * bytes for Ethernet devices. */
922static int
923netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
924{
149f577a
JG
925 struct netdev_dev_linux *netdev_dev =
926 netdev_dev_linux_cast(netdev_get_dev(netdev_));
927 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
928 struct ifreq ifr;
929 int error;
930
149f577a
JG
931 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
932 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
933 if (error) {
934 return error;
935 }
149f577a
JG
936 netdev_dev->mtu = ifr.ifr_mtu;
937 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 938 }
149f577a 939 *mtup = netdev_dev->mtu;
8b61709d
BP
940 return 0;
941}
942
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev' with get_ifindex().  On success, returns
 * the ifindex.  On failure, returns the negated error value from
 * get_ifindex(). */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
953
8b61709d
BP
954static int
955netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
956{
149f577a
JG
957 struct netdev_dev_linux *netdev_dev =
958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
959 int error = 0;
960 char *fn = NULL;
961 int fd = -1;
962
149f577a 963 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
964 char line[8];
965 int retval;
966
149f577a
JG
967 fn = xasprintf("/sys/class/net/%s/carrier",
968 netdev_get_name(netdev_));
8b61709d
BP
969 fd = open(fn, O_RDONLY);
970 if (fd < 0) {
971 error = errno;
972 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
973 goto exit;
974 }
975
976 retval = read(fd, line, sizeof line);
977 if (retval < 0) {
978 error = errno;
979 if (error == EINVAL) {
980 /* This is the normal return value when we try to check carrier
981 * if the network device is not up. */
982 } else {
983 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
984 }
985 goto exit;
986 } else if (retval == 0) {
987 error = EPROTO;
988 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
989 goto exit;
990 }
991
992 if (line[0] != '0' && line[0] != '1') {
993 error = EPROTO;
994 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
995 fn, line[0]);
996 goto exit;
997 }
149f577a
JG
998 netdev_dev->carrier = line[0] != '0';
999 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 1000 }
149f577a 1001 *carrier = netdev_dev->carrier;
8b61709d
BP
1002 error = 0;
1003
1004exit:
1005 if (fd >= 0) {
1006 close(fd);
1007 }
1008 free(fn);
1009 return error;
1010}
1011
63331829 1012static int
782e6111
EJ
1013netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1014 const char *cmd_name, struct mii_ioctl_data *data)
63331829 1015{
63331829 1016 struct ifreq ifr;
782e6111 1017 int error;
63331829 1018
63331829 1019 memset(&ifr, 0, sizeof ifr);
782e6111
EJ
1020 memcpy(&ifr.ifr_data, data, sizeof *data);
1021 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1022 &ifr, cmd, cmd_name);
1023 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1024
782e6111
EJ
1025 return error;
1026}
1027
1028static int
1029netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1030{
1031 const char *name = netdev_get_name(netdev);
1032 struct mii_ioctl_data data;
1033 int error;
63331829 1034
782e6111
EJ
1035 *miimon = false;
1036
1037 memset(&data, 0, sizeof data);
1038 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1039 if (!error) {
1040 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1041 data.reg_num = MII_BMSR;
1042 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1043 &data);
63331829
EJ
1044
1045 if (!error) {
782e6111 1046 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1047 } else {
1048 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1049 }
1050 } else {
1051 struct ethtool_cmd ecmd;
63331829
EJ
1052
1053 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1054 name);
1055
1056 memset(&ecmd, 0, sizeof ecmd);
1057 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1058 "ETHTOOL_GLINK");
1059 if (!error) {
782e6111
EJ
1060 struct ethtool_value eval;
1061
1062 memcpy(&eval, &ecmd, sizeof eval);
1063 *miimon = !!eval.data;
63331829
EJ
1064 } else {
1065 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1066 }
1067 }
1068
1069 return error;
1070}
1071
8b61709d
BP
1072/* Check whether we can we use RTM_GETLINK to get network device statistics.
1073 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1074 * enabled. */
1075static bool
1076check_for_working_netlink_stats(void)
1077{
1078 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1079 * preferable, so if that works, we'll use it. */
1080 int ifindex = do_get_ifindex("lo");
1081 if (ifindex < 0) {
1082 VLOG_WARN("failed to get ifindex for lo, "
1083 "obtaining netdev stats from proc");
1084 return false;
1085 } else {
1086 struct netdev_stats stats;
1087 int error = get_stats_via_netlink(ifindex, &stats);
1088 if (!error) {
1089 VLOG_DBG("obtaining netdev stats via rtnetlink");
1090 return true;
1091 } else {
1092 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1093 "via proc (you are probably running a pre-2.6.19 "
1094 "kernel)", strerror(error));
1095 return false;
1096 }
1097 }
1098}
1099
8722022c
BP
1100/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1101static void
1102netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1103{
1104 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1105 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1106 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1107
8722022c 1108 netdev_dev->is_tap = !strcmp(type, "tap");
9fe3b9a2
BP
1109 netdev_dev->is_internal = (!netdev_dev->is_tap
1110 && dpif_linux_is_internal_device(name));
8722022c
BP
1111 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1112 }
1113}
1114
92df599c
JG
/* Exchanges the values of '*a' and '*b'.
 *
 * A temporary is used rather than the XOR trick so that the function is also
 * correct when 'a' and 'b' point to the same object (an XOR swap would zero
 * the value in that case). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1122
7fbef77a 1123/* Retrieves current device stats for 'netdev'. */
8b61709d 1124static int
149f577a
JG
1125netdev_linux_get_stats(const struct netdev *netdev_,
1126 struct netdev_stats *stats)
8b61709d 1127{
149f577a
JG
1128 struct netdev_dev_linux *netdev_dev =
1129 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1130 static int use_netlink_stats = -1;
1131 int error;
1132
7fbef77a
JG
1133 if (netdev_dev->have_vport_stats ||
1134 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1135
1136 error = netdev_vport_get_stats(netdev_, stats);
1137 netdev_dev->have_vport_stats = !error;
1138 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1139 }
8b61709d 1140
7fbef77a
JG
1141 if (!netdev_dev->have_vport_stats) {
1142 if (use_netlink_stats < 0) {
1143 use_netlink_stats = check_for_working_netlink_stats();
1144 }
1145 if (use_netlink_stats) {
1146 int ifindex;
1147
1148 error = get_ifindex(netdev_, &ifindex);
1149 if (!error) {
1150 error = get_stats_via_netlink(ifindex, stats);
1151 }
1152 } else {
1153 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1154 }
8b61709d 1155 }
fe6b0e03
JG
1156
1157 /* If this port is an internal port then the transmit and receive stats
1158 * will appear to be swapped relative to the other ports since we are the
1159 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1160 * them back here. This does not apply if we are getting stats from the
1161 * vport layer because it always tracks stats from the perspective of the
1162 * switch. */
92df599c 1163 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1164 if (!error && !netdev_dev->have_vport_stats &&
1165 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1166 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1167 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1168 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1169 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1170 stats->rx_length_errors = 0;
1171 stats->rx_over_errors = 0;
1172 stats->rx_crc_errors = 0;
1173 stats->rx_frame_errors = 0;
1174 stats->rx_fifo_errors = 0;
1175 stats->rx_missed_errors = 0;
1176 stats->tx_aborted_errors = 0;
1177 stats->tx_carrier_errors = 0;
1178 stats->tx_fifo_errors = 0;
1179 stats->tx_heartbeat_errors = 0;
1180 stats->tx_window_errors = 0;
1181 }
1182
8b61709d
BP
1183 return error;
1184}
1185
1186/* Stores the features supported by 'netdev' into each of '*current',
1187 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1188 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1189 * successful, otherwise a positive errno value. */
8b61709d 1190static int
6f2f5cce 1191netdev_linux_get_features(const struct netdev *netdev,
8b61709d
BP
1192 uint32_t *current, uint32_t *advertised,
1193 uint32_t *supported, uint32_t *peer)
1194{
1195 struct ethtool_cmd ecmd;
1196 int error;
1197
1198 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1199 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1200 ETHTOOL_GSET, "ETHTOOL_GSET");
1201 if (error) {
1202 return error;
1203 }
1204
1205 /* Supported features. */
1206 *supported = 0;
1207 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1208 *supported |= OFPPF_10MB_HD;
1209 }
1210 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1211 *supported |= OFPPF_10MB_FD;
1212 }
1213 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1214 *supported |= OFPPF_100MB_HD;
1215 }
1216 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1217 *supported |= OFPPF_100MB_FD;
1218 }
1219 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1220 *supported |= OFPPF_1GB_HD;
1221 }
1222 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1223 *supported |= OFPPF_1GB_FD;
1224 }
1225 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1226 *supported |= OFPPF_10GB_FD;
1227 }
1228 if (ecmd.supported & SUPPORTED_TP) {
1229 *supported |= OFPPF_COPPER;
1230 }
1231 if (ecmd.supported & SUPPORTED_FIBRE) {
1232 *supported |= OFPPF_FIBER;
1233 }
1234 if (ecmd.supported & SUPPORTED_Autoneg) {
1235 *supported |= OFPPF_AUTONEG;
1236 }
1237 if (ecmd.supported & SUPPORTED_Pause) {
1238 *supported |= OFPPF_PAUSE;
1239 }
1240 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1241 *supported |= OFPPF_PAUSE_ASYM;
1242 }
1243
1244 /* Advertised features. */
1245 *advertised = 0;
1246 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1247 *advertised |= OFPPF_10MB_HD;
1248 }
1249 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1250 *advertised |= OFPPF_10MB_FD;
1251 }
1252 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1253 *advertised |= OFPPF_100MB_HD;
1254 }
1255 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1256 *advertised |= OFPPF_100MB_FD;
1257 }
1258 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1259 *advertised |= OFPPF_1GB_HD;
1260 }
1261 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1262 *advertised |= OFPPF_1GB_FD;
1263 }
1264 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1265 *advertised |= OFPPF_10GB_FD;
1266 }
1267 if (ecmd.advertising & ADVERTISED_TP) {
1268 *advertised |= OFPPF_COPPER;
1269 }
1270 if (ecmd.advertising & ADVERTISED_FIBRE) {
1271 *advertised |= OFPPF_FIBER;
1272 }
1273 if (ecmd.advertising & ADVERTISED_Autoneg) {
1274 *advertised |= OFPPF_AUTONEG;
1275 }
1276 if (ecmd.advertising & ADVERTISED_Pause) {
1277 *advertised |= OFPPF_PAUSE;
1278 }
1279 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1280 *advertised |= OFPPF_PAUSE_ASYM;
1281 }
1282
1283 /* Current settings. */
1284 if (ecmd.speed == SPEED_10) {
1285 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1286 } else if (ecmd.speed == SPEED_100) {
1287 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1288 } else if (ecmd.speed == SPEED_1000) {
1289 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1290 } else if (ecmd.speed == SPEED_10000) {
1291 *current = OFPPF_10GB_FD;
1292 } else {
1293 *current = 0;
1294 }
1295
1296 if (ecmd.port == PORT_TP) {
1297 *current |= OFPPF_COPPER;
1298 } else if (ecmd.port == PORT_FIBRE) {
1299 *current |= OFPPF_FIBER;
1300 }
1301
1302 if (ecmd.autoneg) {
1303 *current |= OFPPF_AUTONEG;
1304 }
1305
1306 /* Peer advertisements. */
1307 *peer = 0; /* XXX */
1308
1309 return 0;
1310}
1311
1312/* Set the features advertised by 'netdev' to 'advertise'. */
1313static int
1314netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1315{
1316 struct ethtool_cmd ecmd;
1317 int error;
1318
1319 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1320 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1321 ETHTOOL_GSET, "ETHTOOL_GSET");
1322 if (error) {
1323 return error;
1324 }
1325
1326 ecmd.advertising = 0;
1327 if (advertise & OFPPF_10MB_HD) {
1328 ecmd.advertising |= ADVERTISED_10baseT_Half;
1329 }
1330 if (advertise & OFPPF_10MB_FD) {
1331 ecmd.advertising |= ADVERTISED_10baseT_Full;
1332 }
1333 if (advertise & OFPPF_100MB_HD) {
1334 ecmd.advertising |= ADVERTISED_100baseT_Half;
1335 }
1336 if (advertise & OFPPF_100MB_FD) {
1337 ecmd.advertising |= ADVERTISED_100baseT_Full;
1338 }
1339 if (advertise & OFPPF_1GB_HD) {
1340 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1341 }
1342 if (advertise & OFPPF_1GB_FD) {
1343 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1344 }
1345 if (advertise & OFPPF_10GB_FD) {
1346 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1347 }
1348 if (advertise & OFPPF_COPPER) {
1349 ecmd.advertising |= ADVERTISED_TP;
1350 }
1351 if (advertise & OFPPF_FIBER) {
1352 ecmd.advertising |= ADVERTISED_FIBRE;
1353 }
1354 if (advertise & OFPPF_AUTONEG) {
1355 ecmd.advertising |= ADVERTISED_Autoneg;
1356 }
1357 if (advertise & OFPPF_PAUSE) {
1358 ecmd.advertising |= ADVERTISED_Pause;
1359 }
1360 if (advertise & OFPPF_PAUSE_ASYM) {
1361 ecmd.advertising |= ADVERTISED_Asym_Pause;
1362 }
0b0544d7 1363 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1364 ETHTOOL_SSET, "ETHTOOL_SSET");
1365}
1366
1367/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1368 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1369 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1370 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1371 * sets '*vlan_vid' to -1. */
1372static int
1373netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1374{
1375 const char *netdev_name = netdev_get_name(netdev);
1376 struct ds line = DS_EMPTY_INITIALIZER;
1377 FILE *stream = NULL;
1378 int error;
1379 char *fn;
1380
1381 COVERAGE_INC(netdev_get_vlan_vid);
1382 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1383 stream = fopen(fn, "r");
1384 if (!stream) {
1385 error = errno;
1386 goto done;
1387 }
1388
1389 if (ds_get_line(&line, stream)) {
1390 if (ferror(stream)) {
1391 error = errno;
1392 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1393 } else {
1394 error = EPROTO;
1395 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1396 }
1397 goto done;
1398 }
1399
1400 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1401 error = EPROTO;
1402 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1403 fn, ds_cstr(&line));
1404 goto done;
1405 }
1406
1407 error = 0;
1408
1409done:
1410 free(fn);
1411 if (stream) {
1412 fclose(stream);
1413 }
1414 ds_destroy(&line);
1415 if (error) {
1416 *vlan_vid = -1;
1417 }
1418 return error;
1419}
1420
/* Shell command templates used to install ingress policing.
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name followed by the rate and the burst, both as uint32_t; PRIu32 is used so
 * that values of 2^31 or more are not formatted as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD                                               \
    "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 "     \
    "u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit "            \
    "burst %"PRIu32"k mtu 65535 drop flowid :1"
8b61709d 1423
8e460221 1424/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1425 * positive errno value.
1426 *
1427 * This function is equivalent to running
1428 * /sbin/tc qdisc del dev %s handle ffff: ingress
1429 * but it is much, much faster.
1430 */
8e460221
BP
1431static int
1432netdev_linux_remove_policing(struct netdev *netdev)
1433{
80a86fbe
BP
1434 struct netdev_dev_linux *netdev_dev =
1435 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1436 const char *netdev_name = netdev_get_name(netdev);
8e460221 1437
6f42c8ea 1438 struct ofpbuf request;
6f42c8ea 1439 struct tcmsg *tcmsg;
6f42c8ea
BP
1440 int error;
1441
c1c9c9c4 1442 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1443 if (!tcmsg) {
1444 return ENODEV;
1445 }
c1c9c9c4 1446 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1447 tcmsg->tcm_parent = TC_H_INGRESS;
1448 nl_msg_put_string(&request, TCA_KIND, "ingress");
1449 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1450
1451 error = tc_transact(&request, NULL);
4d10512c 1452 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1453 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1454 netdev_name, strerror(error));
1455 return error;
1456 }
1457
80a86fbe
BP
1458 netdev_dev->kbits_rate = 0;
1459 netdev_dev->kbits_burst = 0;
1460 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1461 return 0;
1462}
1463
8b61709d
BP
1464/* Attempts to set input rate limiting (policing) policy. */
1465static int
1466netdev_linux_set_policing(struct netdev *netdev,
1467 uint32_t kbits_rate, uint32_t kbits_burst)
1468{
80a86fbe
BP
1469 struct netdev_dev_linux *netdev_dev =
1470 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1471 const char *netdev_name = netdev_get_name(netdev);
1472 char command[1024];
1473
1474 COVERAGE_INC(netdev_set_policing);
8e460221 1475
80a86fbe
BP
1476 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1477 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1478 : kbits_burst); /* Stick with user-specified value. */
1479
1480 if (netdev_dev->cache_valid & VALID_POLICING
1481 && netdev_dev->kbits_rate == kbits_rate
1482 && netdev_dev->kbits_burst == kbits_burst) {
1483 /* Assume that settings haven't changed since we last set them. */
1484 return 0;
1485 }
1486
8e460221 1487 netdev_linux_remove_policing(netdev);
8b61709d 1488 if (kbits_rate) {
8b61709d
BP
1489 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1490 if (system(command) != 0) {
1491 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1492 return -1;
1493 }
1494
1495 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1496 kbits_rate, kbits_burst);
1497 if (system(command) != 0) {
1498 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1499 netdev_name);
1500 return -1;
1501 }
80a86fbe
BP
1502
1503 netdev_dev->kbits_rate = kbits_rate;
1504 netdev_dev->kbits_burst = kbits_burst;
1505 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1506 }
1507
1508 return 0;
1509}
1510
c1c9c9c4
BP
1511static int
1512netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1513 struct svec *types)
1514{
1515 const struct tc_ops **opsp;
1516
1517 for (opsp = tcs; *opsp != NULL; opsp++) {
1518 const struct tc_ops *ops = *opsp;
1519 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1520 svec_add(types, ops->ovs_name);
1521 }
1522 }
1523 return 0;
1524}
1525
1526static const struct tc_ops *
1527tc_lookup_ovs_name(const char *name)
1528{
1529 const struct tc_ops **opsp;
1530
1531 for (opsp = tcs; *opsp != NULL; opsp++) {
1532 const struct tc_ops *ops = *opsp;
1533 if (!strcmp(name, ops->ovs_name)) {
1534 return ops;
1535 }
1536 }
1537 return NULL;
1538}
1539
1540static const struct tc_ops *
1541tc_lookup_linux_name(const char *name)
1542{
1543 const struct tc_ops **opsp;
1544
1545 for (opsp = tcs; *opsp != NULL; opsp++) {
1546 const struct tc_ops *ops = *opsp;
1547 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1548 return ops;
1549 }
1550 }
1551 return NULL;
1552}
1553
93b13be8
BP
1554static struct tc_queue *
1555tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1556 size_t hash)
1557{
1558 struct netdev_dev_linux *netdev_dev =
1559 netdev_dev_linux_cast(netdev_get_dev(netdev));
1560 struct tc_queue *queue;
1561
4e8e4213 1562 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1563 if (queue->queue_id == queue_id) {
1564 return queue;
1565 }
1566 }
1567 return NULL;
1568}
1569
/* Returns the queue of 'netdev' whose ID is 'queue_id', or NULL if there is
 * none.  The hash bucket is derived from 'queue_id' with hash_int(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1575
c1c9c9c4
BP
1576static int
1577netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1578 const char *type,
1579 struct netdev_qos_capabilities *caps)
1580{
1581 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1582 if (!ops) {
1583 return EOPNOTSUPP;
1584 }
1585 caps->n_queues = ops->n_queues;
1586 return 0;
1587}
1588
1589static int
1590netdev_linux_get_qos(const struct netdev *netdev,
1591 const char **typep, struct shash *details)
1592{
1593 struct netdev_dev_linux *netdev_dev =
1594 netdev_dev_linux_cast(netdev_get_dev(netdev));
1595 int error;
1596
1597 error = tc_query_qdisc(netdev);
1598 if (error) {
1599 return error;
1600 }
1601
1602 *typep = netdev_dev->tc->ops->ovs_name;
1603 return (netdev_dev->tc->ops->qdisc_get
1604 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1605 : 0);
1606}
1607
1608static int
1609netdev_linux_set_qos(struct netdev *netdev,
1610 const char *type, const struct shash *details)
1611{
1612 struct netdev_dev_linux *netdev_dev =
1613 netdev_dev_linux_cast(netdev_get_dev(netdev));
1614 const struct tc_ops *new_ops;
1615 int error;
1616
1617 new_ops = tc_lookup_ovs_name(type);
1618 if (!new_ops || !new_ops->tc_install) {
1619 return EOPNOTSUPP;
1620 }
1621
1622 error = tc_query_qdisc(netdev);
1623 if (error) {
1624 return error;
1625 }
1626
1627 if (new_ops == netdev_dev->tc->ops) {
1628 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1629 } else {
1630 /* Delete existing qdisc. */
1631 error = tc_del_qdisc(netdev);
1632 if (error) {
1633 return error;
1634 }
1635 assert(netdev_dev->tc == NULL);
1636
1637 /* Install new qdisc. */
1638 error = new_ops->tc_install(netdev, details);
1639 assert((error == 0) == (netdev_dev->tc != NULL));
1640
1641 return error;
1642 }
1643}
1644
1645static int
1646netdev_linux_get_queue(const struct netdev *netdev,
1647 unsigned int queue_id, struct shash *details)
1648{
1649 struct netdev_dev_linux *netdev_dev =
1650 netdev_dev_linux_cast(netdev_get_dev(netdev));
1651 int error;
1652
1653 error = tc_query_qdisc(netdev);
1654 if (error) {
1655 return error;
93b13be8
BP
1656 } else {
1657 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1658 return (queue
1659 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1660 : ENOENT);
c1c9c9c4 1661 }
c1c9c9c4
BP
1662}
1663
1664static int
1665netdev_linux_set_queue(struct netdev *netdev,
1666 unsigned int queue_id, const struct shash *details)
1667{
1668 struct netdev_dev_linux *netdev_dev =
1669 netdev_dev_linux_cast(netdev_get_dev(netdev));
1670 int error;
1671
1672 error = tc_query_qdisc(netdev);
1673 if (error) {
1674 return error;
1675 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1676 || !netdev_dev->tc->ops->class_set) {
1677 return EINVAL;
1678 }
1679
1680 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1681}
1682
1683static int
1684netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1685{
1686 struct netdev_dev_linux *netdev_dev =
1687 netdev_dev_linux_cast(netdev_get_dev(netdev));
1688 int error;
1689
1690 error = tc_query_qdisc(netdev);
1691 if (error) {
1692 return error;
1693 } else if (!netdev_dev->tc->ops->class_delete) {
1694 return EINVAL;
93b13be8
BP
1695 } else {
1696 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1697 return (queue
1698 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1699 : ENOENT);
c1c9c9c4 1700 }
c1c9c9c4
BP
1701}
1702
1703static int
1704netdev_linux_get_queue_stats(const struct netdev *netdev,
1705 unsigned int queue_id,
1706 struct netdev_queue_stats *stats)
1707{
1708 struct netdev_dev_linux *netdev_dev =
1709 netdev_dev_linux_cast(netdev_get_dev(netdev));
1710 int error;
1711
1712 error = tc_query_qdisc(netdev);
1713 if (error) {
1714 return error;
c1c9c9c4
BP
1715 } else if (!netdev_dev->tc->ops->class_get_stats) {
1716 return EOPNOTSUPP;
93b13be8
BP
1717 } else {
1718 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1719 return (queue
1720 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1721 : ENOENT);
c1c9c9c4 1722 }
c1c9c9c4
BP
1723}
1724
23a98ffe 1725static bool
c1c9c9c4
BP
1726start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1727{
1728 struct ofpbuf request;
1729 struct tcmsg *tcmsg;
1730
1731 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1732 if (!tcmsg) {
1733 return false;
1734 }
3c4de644 1735 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1736 nl_dump_start(dump, rtnl_sock, &request);
1737 ofpbuf_uninit(&request);
23a98ffe 1738 return true;
c1c9c9c4
BP
1739}
1740
1741static int
1742netdev_linux_dump_queues(const struct netdev *netdev,
1743 netdev_dump_queues_cb *cb, void *aux)
1744{
1745 struct netdev_dev_linux *netdev_dev =
1746 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1747 struct tc_queue *queue;
c1c9c9c4
BP
1748 struct shash details;
1749 int last_error;
c1c9c9c4
BP
1750 int error;
1751
1752 error = tc_query_qdisc(netdev);
1753 if (error) {
1754 return error;
1755 } else if (!netdev_dev->tc->ops->class_get) {
1756 return EOPNOTSUPP;
1757 }
1758
1759 last_error = 0;
1760 shash_init(&details);
4e8e4213 1761 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1762 shash_clear(&details);
1763
93b13be8 1764 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1765 if (!error) {
93b13be8 1766 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1767 } else {
1768 last_error = error;
1769 }
1770 }
1771 shash_destroy(&details);
1772
1773 return last_error;
1774}
1775
1776static int
1777netdev_linux_dump_queue_stats(const struct netdev *netdev,
1778 netdev_dump_queue_stats_cb *cb, void *aux)
1779{
1780 struct netdev_dev_linux *netdev_dev =
1781 netdev_dev_linux_cast(netdev_get_dev(netdev));
1782 struct nl_dump dump;
1783 struct ofpbuf msg;
1784 int last_error;
1785 int error;
1786
1787 error = tc_query_qdisc(netdev);
1788 if (error) {
1789 return error;
1790 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1791 return EOPNOTSUPP;
1792 }
1793
1794 last_error = 0;
23a98ffe
BP
1795 if (!start_queue_dump(netdev, &dump)) {
1796 return ENODEV;
1797 }
c1c9c9c4
BP
1798 while (nl_dump_next(&dump, &msg)) {
1799 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1800 if (error) {
1801 last_error = error;
1802 }
1803 }
1804
1805 error = nl_dump_done(&dump);
1806 return error ? error : last_error;
1807}
1808
8b61709d 1809static int
f1acd62b
BP
1810netdev_linux_get_in4(const struct netdev *netdev_,
1811 struct in_addr *address, struct in_addr *netmask)
8b61709d 1812{
149f577a
JG
1813 struct netdev_dev_linux *netdev_dev =
1814 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1815
1816 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1817 int error;
1818
149f577a 1819 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1820 SIOCGIFADDR, "SIOCGIFADDR");
1821 if (error) {
1822 return error;
1823 }
1824
149f577a 1825 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1826 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1827 if (error) {
1828 return error;
1829 }
1830
149f577a 1831 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1832 }
149f577a
JG
1833 *address = netdev_dev->address;
1834 *netmask = netdev_dev->netmask;
f1acd62b 1835 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1836}
1837
8b61709d 1838static int
f1acd62b
BP
1839netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1840 struct in_addr netmask)
8b61709d 1841{
149f577a
JG
1842 struct netdev_dev_linux *netdev_dev =
1843 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1844 int error;
1845
f1acd62b 1846 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1847 if (!error) {
149f577a
JG
1848 netdev_dev->cache_valid |= VALID_IN4;
1849 netdev_dev->address = address;
1850 netdev_dev->netmask = netmask;
f1acd62b 1851 if (address.s_addr != INADDR_ANY) {
8b61709d 1852 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1853 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1854 }
1855 }
1856 return error;
1857}
1858
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are ignored, and an interface name of
 * at most 16 characters.  On success stores the address in '*in6' and the
 * name in 'ifname' and returns true.  Returns false if 'line' does not match
 * that format, in which case '*in6' and 'ifname' may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *b = in6->s6_addr;
    int n_fields;

#define X8 "%2"SCNx8
    /* 16 address bytes plus the interface name make 17 conversions. */
    n_fields = sscanf(line,
                      " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                      "%*x %*x %*x %*x %16s\n",
                      &b[0], &b[1], &b[2], &b[3],
                      &b[4], &b[5], &b[6], &b[7],
                      &b[8], &b[9], &b[10], &b[11],
                      &b[12], &b[13], &b[14], &b[15],
                      ifname);
    return n_fields == 17;
}
1874
1875/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1876 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1877static int
1878netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1879{
149f577a
JG
1880 struct netdev_dev_linux *netdev_dev =
1881 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1882 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
1883 FILE *file;
1884 char line[128];
1885
149f577a 1886 netdev_dev->in6 = in6addr_any;
8b61709d
BP
1887
1888 file = fopen("/proc/net/if_inet6", "r");
1889 if (file != NULL) {
1890 const char *name = netdev_get_name(netdev_);
1891 while (fgets(line, sizeof line, file)) {
2a022368 1892 struct in6_addr in6_tmp;
8b61709d 1893 char ifname[16 + 1];
2a022368 1894 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
1895 && !strcmp(name, ifname))
1896 {
2a022368 1897 netdev_dev->in6 = in6_tmp;
8b61709d
BP
1898 break;
1899 }
1900 }
1901 fclose(file);
1902 }
149f577a 1903 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 1904 }
149f577a 1905 *in6 = netdev_dev->in6;
8b61709d
BP
1906 return 0;
1907}
1908
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  All
 * other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the IPv4 form in a zeroed local, then overlay it on the generic
     * sockaddr. */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1921
1922static int
1923do_set_addr(struct netdev *netdev,
1924 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1925{
1926 struct ifreq ifr;
71d7c22f 1927 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 1928 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
1929
1930 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1931 ioctl_name);
8b61709d
BP
1932}
1933
1934/* Adds 'router' as a default IP gateway. */
1935static int
67a4917b 1936netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
1937{
1938 struct in_addr any = { INADDR_ANY };
1939 struct rtentry rt;
1940 int error;
1941
1942 memset(&rt, 0, sizeof rt);
1943 make_in4_sockaddr(&rt.rt_dst, any);
1944 make_in4_sockaddr(&rt.rt_gateway, router);
1945 make_in4_sockaddr(&rt.rt_genmask, any);
1946 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
1947 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1948 if (error) {
1949 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1950 }
1951 return error;
1952}
1953
f1acd62b
BP
1954static int
1955netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1956 char **netdev_name)
1957{
1958 static const char fn[] = "/proc/net/route";
1959 FILE *stream;
1960 char line[256];
1961 int ln;
1962
1963 *netdev_name = NULL;
1964 stream = fopen(fn, "r");
1965 if (stream == NULL) {
1966 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1967 return errno;
1968 }
1969
1970 ln = 0;
1971 while (fgets(line, sizeof line, stream)) {
1972 if (++ln >= 2) {
1973 char iface[17];
1974 uint32_t dest, gateway, mask;
1975 int refcnt, metric, mtu;
1976 unsigned int flags, use, window, irtt;
1977
1978 if (sscanf(line,
1979 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1980 " %d %u %u\n",
1981 iface, &dest, &gateway, &flags, &refcnt,
1982 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1983
d295e8e9 1984 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
1985 fn, ln, line);
1986 continue;
1987 }
1988 if (!(flags & RTF_UP)) {
1989 /* Skip routes that aren't up. */
1990 continue;
1991 }
1992
1993 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 1994 * network byte order, so we don't need need any endian
f1acd62b
BP
1995 * conversions here. */
1996 if ((dest & mask) == (host->s_addr & mask)) {
1997 if (!gateway) {
1998 /* The host is directly reachable. */
1999 next_hop->s_addr = 0;
2000 } else {
2001 /* To reach the host, we must go through a gateway. */
2002 next_hop->s_addr = gateway;
2003 }
2004 *netdev_name = xstrdup(iface);
2005 fclose(stream);
2006 return 0;
2007 }
2008 }
2009 }
2010
2011 fclose(stream);
2012 return ENXIO;
2013}
2014
e210037e
AE
/* Queries driver information for 'netdev' with ETHTOOL_GDRVINFO.  On success,
 * adds "driver_name", "driver_version" and "firmware_version" entries (each an
 * xstrdup()'d string) to 'sh' and returns 0.  On failure, leaves 'sh' alone
 * and returns the error from netdev_linux_do_ethtool(). */
static int
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
{
    struct ethtool_drvinfo info;
    int error;

    memset(&info, 0, sizeof info);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev),
                                    (struct ethtool_cmd *)&info,
                                    ETHTOOL_GDRVINFO,
                                    "ETHTOOL_GDRVINFO");
    if (error) {
        return error;
    }

    shash_add(sh, "driver_name", xstrdup(info.driver));
    shash_add(sh, "driver_version", xstrdup(info.version));
    shash_add(sh, "firmware_version", xstrdup(info.fw_version));
    return 0;
}
2034
8b61709d
BP
2035/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2036 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2037 * returns 0. Otherwise, it returns a positive errno value; in particular,
2038 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2039static int
2040netdev_linux_arp_lookup(const struct netdev *netdev,
2041 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2042{
2043 struct arpreq r;
c100e025 2044 struct sockaddr_in sin;
8b61709d
BP
2045 int retval;
2046
2047 memset(&r, 0, sizeof r);
f2cc621b 2048 memset(&sin, 0, sizeof sin);
c100e025
BP
2049 sin.sin_family = AF_INET;
2050 sin.sin_addr.s_addr = ip;
2051 sin.sin_port = 0;
2052 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2053 r.arp_ha.sa_family = ARPHRD_ETHER;
2054 r.arp_flags = 0;
71d7c22f 2055 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2056 COVERAGE_INC(netdev_arp_lookup);
2057 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2058 if (!retval) {
2059 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2060 } else if (retval != ENXIO) {
2061 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2062 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2063 }
2064 return retval;
2065}
2066
2067static int
2068nd_to_iff_flags(enum netdev_flags nd)
2069{
2070 int iff = 0;
2071 if (nd & NETDEV_UP) {
2072 iff |= IFF_UP;
2073 }
2074 if (nd & NETDEV_PROMISC) {
2075 iff |= IFF_PROMISC;
2076 }
2077 return iff;
2078}
2079
2080static int
2081iff_to_nd_flags(int iff)
2082{
2083 enum netdev_flags nd = 0;
2084 if (iff & IFF_UP) {
2085 nd |= NETDEV_UP;
2086 }
2087 if (iff & IFF_PROMISC) {
2088 nd |= NETDEV_PROMISC;
2089 }
2090 return nd;
2091}
2092
2093static int
2094netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2095 enum netdev_flags on, enum netdev_flags *old_flagsp)
2096{
2097 int old_flags, new_flags;
2098 int error;
2099
2100 error = get_flags(netdev, &old_flags);
2101 if (!error) {
2102 *old_flagsp = iff_to_nd_flags(old_flags);
2103 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2104 if (new_flags != old_flags) {
2105 error = set_flags(netdev, new_flags);
2106 }
2107 }
2108 return error;
2109}
2110
2111static void
2112poll_notify(struct list *list)
2113{
2114 struct netdev_linux_notifier *notifier;
4e8e4213 2115 LIST_FOR_EACH (notifier, node, list) {
8b61709d
BP
2116 struct netdev_notifier *n = &notifier->notifier;
2117 n->cb(n);
2118 }
2119}
2120
2121static void
21d6e22e 2122netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
67a4917b 2123 void *aux OVS_UNUSED)
8b61709d
BP
2124{
2125 if (change) {
2126 struct list *list = shash_find_data(&netdev_linux_notifiers,
2127 change->ifname);
2128 if (list) {
2129 poll_notify(list);
2130 }
2131 } else {
2132 struct shash_node *node;
2133 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2134 poll_notify(node->data);
2135 }
2136 }
2137}
2138
2139static int
2140netdev_linux_poll_add(struct netdev *netdev,
2141 void (*cb)(struct netdev_notifier *), void *aux,
2142 struct netdev_notifier **notifierp)
2143{
2144 const char *netdev_name = netdev_get_name(netdev);
2145 struct netdev_linux_notifier *notifier;
2146 struct list *list;
2147
2148 if (shash_is_empty(&netdev_linux_notifiers)) {
21d6e22e
EJ
2149 int error;
2150 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2151 netdev_linux_poll_cb, NULL);
8b61709d
BP
2152 if (error) {
2153 return error;
2154 }
2155 }
2156
2157 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2158 if (!list) {
2159 list = xmalloc(sizeof *list);
2160 list_init(list);
2161 shash_add(&netdev_linux_notifiers, netdev_name, list);
2162 }
2163
2164 notifier = xmalloc(sizeof *notifier);
2165 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2166 list_push_back(list, &notifier->node);
2167 *notifierp = &notifier->notifier;
2168 return 0;
2169}
2170
2171static void
2172netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2173{
2174 struct netdev_linux_notifier *notifier =
2175 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2176 struct list *list;
2177
2178 /* Remove 'notifier' from its list. */
2179 list = list_remove(&notifier->node);
2180 if (list_is_empty(list)) {
2181 /* The list is now empty. Remove it from the hash and free it. */
2182 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2183 shash_delete(&netdev_linux_notifiers,
2184 shash_find(&netdev_linux_notifiers, netdev_name));
2185 free(list);
2186 }
2187 free(notifier);
2188
2189 /* If that was the last notifier, unregister. */
2190 if (shash_is_empty(&netdev_linux_notifiers)) {
21d6e22e 2191 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
8b61709d
BP
2192 }
2193}
2194
c3827f61
BP
/* Expands to a positional initializer shared by the netdev classes defined
 * right after this macro.  Only four slots vary between those classes:
 * NAME (the class name string), CREATE, ENUMERATE and SET_STATS; every other
 * slot is filled with the common netdev_linux_*() implementation.
 *
 * The entries are positional, so their order must not be changed. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_miimon,                                    \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_get_status,                                    \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_poll_add,                                      \
    netdev_linux_poll_remove                                    \
}
2257
2258const struct netdev_class netdev_linux_class =
2259 NETDEV_LINUX_CLASS(
2260 "system",
2261 netdev_linux_create,
2262 netdev_linux_enumerate,
98563392 2263 NULL); /* set_stats */
c3827f61
BP
2264
2265const struct netdev_class netdev_tap_class =
2266 NETDEV_LINUX_CLASS(
2267 "tap",
2268 netdev_linux_create_tap,
2269 NULL, /* enumerate */
2270 NULL); /* set_stats */
2271
2272const struct netdev_class netdev_internal_class =
2273 NETDEV_LINUX_CLASS(
2274 "internal",
2275 netdev_linux_create,
2276 NULL, /* enumerate */
2277 netdev_vport_set_stats);
8b61709d 2278\f
c1c9c9c4 2279/* HTB traffic control class. */
559843ed 2280
c1c9c9c4 2281#define HTB_N_QUEUES 0xf000
8b61709d 2282
c1c9c9c4
BP
2283struct htb {
2284 struct tc tc;
2285 unsigned int max_rate; /* In bytes/s. */
2286};
8b61709d 2287
c1c9c9c4 2288struct htb_class {
93b13be8 2289 struct tc_queue tc_queue;
c1c9c9c4
BP
2290 unsigned int min_rate; /* In bytes/s. */
2291 unsigned int max_rate; /* In bytes/s. */
2292 unsigned int burst; /* In bytes. */
2293 unsigned int priority; /* Lower values are higher priorities. */
2294};
8b61709d 2295
c1c9c9c4
BP
2296static struct htb *
2297htb_get__(const struct netdev *netdev)
2298{
2299 struct netdev_dev_linux *netdev_dev =
2300 netdev_dev_linux_cast(netdev_get_dev(netdev));
2301 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2302}
2303
2304static struct htb *
2305htb_install__(struct netdev *netdev, uint64_t max_rate)
2306{
2307 struct netdev_dev_linux *netdev_dev =
2308 netdev_dev_linux_cast(netdev_get_dev(netdev));
2309 struct htb *htb;
2310
2311 htb = xmalloc(sizeof *htb);
2312 tc_init(&htb->tc, &tc_ops_htb);
2313 htb->max_rate = max_rate;
2314
2315 netdev_dev->tc = &htb->tc;
2316
2317 return htb;
2318}
2319
2320/* Create an HTB qdisc.
2321 *
a339aa81 2322 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2323static int
2324htb_setup_qdisc__(struct netdev *netdev)
2325{
2326 size_t opt_offset;
2327 struct tc_htb_glob opt;
2328 struct ofpbuf request;
2329 struct tcmsg *tcmsg;
2330
2331 tc_del_qdisc(netdev);
2332
2333 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2334 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2335 if (!tcmsg) {
2336 return ENODEV;
2337 }
c1c9c9c4
BP
2338 tcmsg->tcm_handle = tc_make_handle(1, 0);
2339 tcmsg->tcm_parent = TC_H_ROOT;
2340
2341 nl_msg_put_string(&request, TCA_KIND, "htb");
2342
2343 memset(&opt, 0, sizeof opt);
2344 opt.rate2quantum = 10;
2345 opt.version = 3;
4ecf12d5 2346 opt.defcls = 1;
c1c9c9c4
BP
2347
2348 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2349 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2350 nl_msg_end_nested(&request, opt_offset);
2351
2352 return tc_transact(&request, NULL);
2353}
2354
2355/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2356 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2357static int
2358htb_setup_class__(struct netdev *netdev, unsigned int handle,
2359 unsigned int parent, struct htb_class *class)
2360{
2361 size_t opt_offset;
2362 struct tc_htb_opt opt;
2363 struct ofpbuf request;
2364 struct tcmsg *tcmsg;
2365 int error;
2366 int mtu;
2367
2368 netdev_get_mtu(netdev, &mtu);
f915f1a8
BP
2369 if (mtu == INT_MAX) {
2370 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2371 netdev_get_name(netdev));
2372 return EINVAL;
2373 }
c1c9c9c4
BP
2374
2375 memset(&opt, 0, sizeof opt);
2376 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2377 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2378 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2379 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2380 opt.prio = class->priority;
2381
2382 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2383 if (!tcmsg) {
2384 return ENODEV;
2385 }
c1c9c9c4
BP
2386 tcmsg->tcm_handle = handle;
2387 tcmsg->tcm_parent = parent;
2388
2389 nl_msg_put_string(&request, TCA_KIND, "htb");
2390 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2391 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2392 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2393 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2394 nl_msg_end_nested(&request, opt_offset);
2395
2396 error = tc_transact(&request, NULL);
2397 if (error) {
2398 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2399 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2400 netdev_get_name(netdev),
2401 tc_get_major(handle), tc_get_minor(handle),
2402 tc_get_major(parent), tc_get_minor(parent),
2403 class->min_rate, class->max_rate,
2404 class->burst, class->priority, strerror(error));
2405 }
2406 return error;
2407}
2408
2409/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2410 * description of them into 'details'. The description complies with the
2411 * specification given in the vswitch database documentation for linux-htb
2412 * queue details. */
2413static int
2414htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2415{
2416 static const struct nl_policy tca_htb_policy[] = {
2417 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2418 .min_len = sizeof(struct tc_htb_opt) },
2419 };
2420
2421 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2422 const struct tc_htb_opt *htb;
2423
2424 if (!nl_parse_nested(nl_options, tca_htb_policy,
2425 attrs, ARRAY_SIZE(tca_htb_policy))) {
2426 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2427 return EPROTO;
2428 }
2429
2430 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2431 class->min_rate = htb->rate.rate;
2432 class->max_rate = htb->ceil.rate;
2433 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2434 class->priority = htb->prio;
2435 return 0;
2436}
2437
2438static int
2439htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2440 struct htb_class *options,
2441 struct netdev_queue_stats *stats)
2442{
2443 struct nlattr *nl_options;
2444 unsigned int handle;
2445 int error;
2446
2447 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2448 if (!error && queue_id) {
17ee3c1f
BP
2449 unsigned int major = tc_get_major(handle);
2450 unsigned int minor = tc_get_minor(handle);
2451 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2452 *queue_id = minor - 1;
c1c9c9c4
BP
2453 } else {
2454 error = EPROTO;
2455 }
2456 }
2457 if (!error && options) {
2458 error = htb_parse_tca_options__(nl_options, options);
2459 }
2460 return error;
2461}
2462
2463static void
2464htb_parse_qdisc_details__(struct netdev *netdev,
2465 const struct shash *details, struct htb_class *hc)
2466{
2467 const char *max_rate_s;
2468
2469 max_rate_s = shash_find_data(details, "max-rate");
2470 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2471 if (!hc->max_rate) {
2472 uint32_t current;
2473
2474 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2475 hc->max_rate = netdev_features_to_bps(current) / 8;
2476 }
2477 hc->min_rate = hc->max_rate;
2478 hc->burst = 0;
2479 hc->priority = 0;
2480}
2481
2482static int
2483htb_parse_class_details__(struct netdev *netdev,
2484 const struct shash *details, struct htb_class *hc)
2485{
2486 const struct htb *htb = htb_get__(netdev);
2487 const char *min_rate_s = shash_find_data(details, "min-rate");
2488 const char *max_rate_s = shash_find_data(details, "max-rate");
2489 const char *burst_s = shash_find_data(details, "burst");
2490 const char *priority_s = shash_find_data(details, "priority");
2491 int mtu;
2492
f915f1a8
BP
2493 netdev_get_mtu(netdev, &mtu);
2494 if (mtu == INT_MAX) {
2495 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2496 netdev_get_name(netdev));
2497 return EINVAL;
2498 }
2499
da3827b5 2500 /* min-rate. Don't allow a min-rate below 1500 bytes/s. */
c1c9c9c4
BP
2501 if (!min_rate_s) {
2502 /* min-rate is required. */
2503 return EINVAL;
2504 }
2505 hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
da3827b5 2506 hc->min_rate = MAX(hc->min_rate, 1500);
c1c9c9c4
BP
2507 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2508
2509 /* max-rate */
2510 hc->max_rate = (max_rate_s
2511 ? strtoull(max_rate_s, NULL, 10) / 8
2512 : htb->max_rate);
2513 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2514 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2515
2516 /* burst
2517 *
2518 * According to hints in the documentation that I've read, it is important
2519 * that 'burst' be at least as big as the largest frame that might be
2520 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2521 * but having it a bit too small is a problem. Since netdev_get_mtu()
2522 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2523 * the MTU. We actually add 64, instead of 14, as a guard against
2524 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2525 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2526 hc->burst = MAX(hc->burst, mtu + 64);
2527
2528 /* priority */
2529 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2530
2531 return 0;
2532}
2533
2534static int
2535htb_query_class__(const struct netdev *netdev, unsigned int handle,
2536 unsigned int parent, struct htb_class *options,
2537 struct netdev_queue_stats *stats)
2538{
2539 struct ofpbuf *reply;
2540 int error;
2541
2542 error = tc_query_class(netdev, handle, parent, &reply);
2543 if (!error) {
2544 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2545 ofpbuf_delete(reply);
2546 }
2547 return error;
2548}
2549
2550static int
2551htb_tc_install(struct netdev *netdev, const struct shash *details)
2552{
2553 int error;
2554
2555 error = htb_setup_qdisc__(netdev);
2556 if (!error) {
2557 struct htb_class hc;
2558
2559 htb_parse_qdisc_details__(netdev, details, &hc);
2560 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2561 tc_make_handle(1, 0), &hc);
2562 if (!error) {
2563 htb_install__(netdev, hc.max_rate);
2564 }
2565 }
2566 return error;
2567}
2568
93b13be8
BP
2569static struct htb_class *
2570htb_class_cast__(const struct tc_queue *queue)
2571{
2572 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2573}
2574
c1c9c9c4
BP
2575static void
2576htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2577 const struct htb_class *hc)
2578{
2579 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2580 size_t hash = hash_int(queue_id, 0);
2581 struct tc_queue *queue;
c1c9c9c4
BP
2582 struct htb_class *hcp;
2583
93b13be8
BP
2584 queue = tc_find_queue__(netdev, queue_id, hash);
2585 if (queue) {
2586 hcp = htb_class_cast__(queue);
2587 } else {
c1c9c9c4 2588 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2589 queue = &hcp->tc_queue;
2590 queue->queue_id = queue_id;
2591 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2592 }
93b13be8
BP
2593
2594 hcp->min_rate = hc->min_rate;
2595 hcp->max_rate = hc->max_rate;
2596 hcp->burst = hc->burst;
2597 hcp->priority = hc->priority;
c1c9c9c4
BP
2598}
2599
2600static int
2601htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2602{
c1c9c9c4
BP
2603 struct ofpbuf msg;
2604 struct nl_dump dump;
2605 struct htb_class hc;
2606 struct htb *htb;
2607
2608 /* Get qdisc options. */
2609 hc.max_rate = 0;
2610 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2611 htb = htb_install__(netdev, hc.max_rate);
2612
2613 /* Get queues. */
23a98ffe
BP
2614 if (!start_queue_dump(netdev, &dump)) {
2615 return ENODEV;
2616 }
c1c9c9c4
BP
2617 while (nl_dump_next(&dump, &msg)) {
2618 unsigned int queue_id;
2619
2620 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2621 htb_update_queue__(netdev, queue_id, &hc);
2622 }
2623 }
2624 nl_dump_done(&dump);
2625
2626 return 0;
2627}
2628
2629static void
2630htb_tc_destroy(struct tc *tc)
2631{
2632 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2633 struct htb_class *hc, *next;
c1c9c9c4 2634
4e8e4213 2635 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2636 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2637 free(hc);
2638 }
2639 tc_destroy(tc);
2640 free(htb);
2641}
2642
2643static int
2644htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2645{
2646 const struct htb *htb = htb_get__(netdev);
2647 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2648 return 0;
2649}
2650
2651static int
2652htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2653{
2654 struct htb_class hc;
2655 int error;
2656
2657 htb_parse_qdisc_details__(netdev, details, &hc);
2658 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2659 tc_make_handle(1, 0), &hc);
2660 if (!error) {
2661 htb_get__(netdev)->max_rate = hc.max_rate;
2662 }
2663 return error;
2664}
2665
2666static int
93b13be8
BP
2667htb_class_get(const struct netdev *netdev OVS_UNUSED,
2668 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2669{
93b13be8 2670 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2671
2672 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2673 if (hc->min_rate != hc->max_rate) {
2674 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2675 }
2676 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2677 if (hc->priority) {
2678 shash_add(details, "priority", xasprintf("%u", hc->priority));
2679 }
2680 return 0;
2681}
2682
2683static int
2684htb_class_set(struct netdev *netdev, unsigned int queue_id,
2685 const struct shash *details)
2686{
2687 struct htb_class hc;
2688 int error;
2689
2690 error = htb_parse_class_details__(netdev, details, &hc);
2691 if (error) {
2692 return error;
2693 }
2694
17ee3c1f 2695 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2696 tc_make_handle(1, 0xfffe), &hc);
2697 if (error) {
2698 return error;
2699 }
2700
2701 htb_update_queue__(netdev, queue_id, &hc);
2702 return 0;
2703}
2704
2705static int
93b13be8 2706htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2707{
93b13be8 2708 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2709 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2710 int error;
2711
93b13be8 2712 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2713 if (!error) {
93b13be8 2714 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2715 free(hc);
c1c9c9c4
BP
2716 }
2717 return error;
2718}
2719
2720static int
93b13be8 2721htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2722 struct netdev_queue_stats *stats)
2723{
93b13be8 2724 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2725 tc_make_handle(1, 0xfffe), NULL, stats);
2726}
2727
2728static int
2729htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2730 const struct ofpbuf *nlmsg,
2731 netdev_dump_queue_stats_cb *cb, void *aux)
2732{
2733 struct netdev_queue_stats stats;
17ee3c1f 2734 unsigned int handle, major, minor;
c1c9c9c4
BP
2735 int error;
2736
2737 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2738 if (error) {
2739 return error;
2740 }
2741
17ee3c1f
BP
2742 major = tc_get_major(handle);
2743 minor = tc_get_minor(handle);
2744 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2745 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2746 }
2747 return 0;
2748}
2749
2750static const struct tc_ops tc_ops_htb = {
2751 "htb", /* linux_name */
2752 "linux-htb", /* ovs_name */
2753 HTB_N_QUEUES, /* n_queues */
2754 htb_tc_install,
2755 htb_tc_load,
2756 htb_tc_destroy,
2757 htb_qdisc_get,
2758 htb_qdisc_set,
2759 htb_class_get,
2760 htb_class_set,
2761 htb_class_delete,
2762 htb_class_get_stats,
2763 htb_class_dump_stats
2764};
2765\f
a339aa81
EJ
2766/* "linux-hfsc" traffic control class. */
2767
2768#define HFSC_N_QUEUES 0xf000
2769
2770struct hfsc {
2771 struct tc tc;
2772 uint32_t max_rate;
2773};
2774
2775struct hfsc_class {
2776 struct tc_queue tc_queue;
2777 uint32_t min_rate;
2778 uint32_t max_rate;
2779};
2780
2781static struct hfsc *
2782hfsc_get__(const struct netdev *netdev)
2783{
2784 struct netdev_dev_linux *netdev_dev;
2785 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2786 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2787}
2788
2789static struct hfsc_class *
2790hfsc_class_cast__(const struct tc_queue *queue)
2791{
2792 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2793}
2794
2795static struct hfsc *
2796hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2797{
2798 struct netdev_dev_linux * netdev_dev;
2799 struct hfsc *hfsc;
2800
2801 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2802 hfsc = xmalloc(sizeof *hfsc);
2803 tc_init(&hfsc->tc, &tc_ops_hfsc);
2804 hfsc->max_rate = max_rate;
2805 netdev_dev->tc = &hfsc->tc;
2806
2807 return hfsc;
2808}
2809
2810static void
2811hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2812 const struct hfsc_class *hc)
2813{
2814 size_t hash;
2815 struct hfsc *hfsc;
2816 struct hfsc_class *hcp;
2817 struct tc_queue *queue;
2818
2819 hfsc = hfsc_get__(netdev);
2820 hash = hash_int(queue_id, 0);
2821
2822 queue = tc_find_queue__(netdev, queue_id, hash);
2823 if (queue) {
2824 hcp = hfsc_class_cast__(queue);
2825 } else {
2826 hcp = xmalloc(sizeof *hcp);
2827 queue = &hcp->tc_queue;
2828 queue->queue_id = queue_id;
2829 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2830 }
2831
2832 hcp->min_rate = hc->min_rate;
2833 hcp->max_rate = hc->max_rate;
2834}
2835
2836static int
2837hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2838{
2839 const struct tc_service_curve *rsc, *fsc, *usc;
2840 static const struct nl_policy tca_hfsc_policy[] = {
2841 [TCA_HFSC_RSC] = {
2842 .type = NL_A_UNSPEC,
2843 .optional = false,
2844 .min_len = sizeof(struct tc_service_curve),
2845 },
2846 [TCA_HFSC_FSC] = {
2847 .type = NL_A_UNSPEC,
2848 .optional = false,
2849 .min_len = sizeof(struct tc_service_curve),
2850 },
2851 [TCA_HFSC_USC] = {
2852 .type = NL_A_UNSPEC,
2853 .optional = false,
2854 .min_len = sizeof(struct tc_service_curve),
2855 },
2856 };
2857 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2858
2859 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2860 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2861 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2862 return EPROTO;
2863 }
2864
2865 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2866 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2867 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2868
2869 if (rsc->m1 != 0 || rsc->d != 0 ||
2870 fsc->m1 != 0 || fsc->d != 0 ||
2871 usc->m1 != 0 || usc->d != 0) {
2872 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2873 "Non-linear service curves are not supported.");
2874 return EPROTO;
2875 }
2876
2877 if (rsc->m2 != fsc->m2) {
2878 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2879 "Real-time service curves are not supported ");
2880 return EPROTO;
2881 }
2882
2883 if (rsc->m2 > usc->m2) {
2884 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2885 "Min-rate service curve is greater than "
2886 "the max-rate service curve.");
2887 return EPROTO;
2888 }
2889
2890 class->min_rate = fsc->m2;
2891 class->max_rate = usc->m2;
2892 return 0;
2893}
2894
2895static int
2896hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2897 struct hfsc_class *options,
2898 struct netdev_queue_stats *stats)
2899{
2900 int error;
2901 unsigned int handle;
2902 struct nlattr *nl_options;
2903
2904 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2905 if (error) {
2906 return error;
2907 }
2908
2909 if (queue_id) {
2910 unsigned int major, minor;
2911
2912 major = tc_get_major(handle);
2913 minor = tc_get_minor(handle);
2914 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2915 *queue_id = minor - 1;
2916 } else {
2917 return EPROTO;
2918 }
2919 }
2920
2921 if (options) {
2922 error = hfsc_parse_tca_options__(nl_options, options);
2923 }
2924
2925 return error;
2926}
2927
2928static int
2929hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2930 unsigned int parent, struct hfsc_class *options,
2931 struct netdev_queue_stats *stats)
2932{
2933 int error;
2934 struct ofpbuf *reply;
2935
2936 error = tc_query_class(netdev, handle, parent, &reply);
2937 if (error) {
2938 return error;
2939 }
2940
2941 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2942 ofpbuf_delete(reply);
2943 return error;
2944}
2945
2946static void
2947hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2948 struct hfsc_class *class)
2949{
2950 uint32_t max_rate;
2951 const char *max_rate_s;
2952
2953 max_rate_s = shash_find_data(details, "max-rate");
2954 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2955
2956 if (!max_rate) {
2957 uint32_t current;
2958
2959 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2960 max_rate = netdev_features_to_bps(current) / 8;
2961 }
2962
2963 class->min_rate = max_rate;
2964 class->max_rate = max_rate;
2965}
2966
2967static int
2968hfsc_parse_class_details__(struct netdev *netdev,
2969 const struct shash *details,
2970 struct hfsc_class * class)
2971{
2972 const struct hfsc *hfsc;
2973 uint32_t min_rate, max_rate;
2974 const char *min_rate_s, *max_rate_s;
2975
2976 hfsc = hfsc_get__(netdev);
2977 min_rate_s = shash_find_data(details, "min-rate");
2978 max_rate_s = shash_find_data(details, "max-rate");
2979
2980 if (!min_rate_s) {
2981 return EINVAL;
2982 }
2983
2984 min_rate = strtoull(min_rate_s, NULL, 10) / 8;
2985 min_rate = MAX(min_rate, 1500);
2986 min_rate = MIN(min_rate, hfsc->max_rate);
2987
2988 max_rate = (max_rate_s
2989 ? strtoull(max_rate_s, NULL, 10) / 8
2990 : hfsc->max_rate);
2991 max_rate = MAX(max_rate, min_rate);
2992 max_rate = MIN(max_rate, hfsc->max_rate);
2993
2994 class->min_rate = min_rate;
2995 class->max_rate = max_rate;
2996
2997 return 0;
2998}
2999
3000/* Create an HFSC qdisc.
3001 *
3002 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3003static int
3004hfsc_setup_qdisc__(struct netdev * netdev)
3005{
3006 struct tcmsg *tcmsg;
3007 struct ofpbuf request;
3008 struct tc_hfsc_qopt opt;
3009
3010 tc_del_qdisc(netdev);
3011
3012 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3013 NLM_F_EXCL | NLM_F_CREATE, &request);
3014
3015 if (!tcmsg) {
3016 return ENODEV;
3017 }
3018
3019 tcmsg->tcm_handle = tc_make_handle(1, 0);
3020 tcmsg->tcm_parent = TC_H_ROOT;
3021
3022 memset(&opt, 0, sizeof opt);
3023 opt.defcls = 1;
3024
3025 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3026 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3027
3028 return tc_transact(&request, NULL);
3029}
3030
3031/* Create an HFSC class.
3032 *
3033 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3034 * sc rate <min_rate> ul rate <max_rate>" */
3035static int
3036hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3037 unsigned int parent, struct hfsc_class *class)
3038{
3039 int error;
3040 size_t opt_offset;
3041 struct tcmsg *tcmsg;
3042 struct ofpbuf request;
3043 struct tc_service_curve min, max;
3044
3045 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3046
3047 if (!tcmsg) {
3048 return ENODEV;
3049 }
3050
3051 tcmsg->tcm_handle = handle;
3052 tcmsg->tcm_parent = parent;
3053
3054 min.m1 = 0;
3055 min.d = 0;
3056 min.m2 = class->min_rate;
3057
3058 max.m1 = 0;
3059 max.d = 0;
3060 max.m2 = class->max_rate;
3061
3062 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3063 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3064 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3065 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3066 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3067 nl_msg_end_nested(&request, opt_offset);
3068
3069 error = tc_transact(&request, NULL);
3070 if (error) {
3071 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3072 "min-rate %ubps, max-rate %ubps (%s)",
3073 netdev_get_name(netdev),
3074 tc_get_major(handle), tc_get_minor(handle),
3075 tc_get_major(parent), tc_get_minor(parent),
3076 class->min_rate, class->max_rate, strerror(error));
3077 }
3078
3079 return error;
3080}
3081
3082static int
3083hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3084{
3085 int error;
3086 struct hfsc_class class;
3087
3088 error = hfsc_setup_qdisc__(netdev);
3089
3090 if (error) {
3091 return error;
3092 }
3093
3094 hfsc_parse_qdisc_details__(netdev, details, &class);
3095 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3096 tc_make_handle(1, 0), &class);
3097
3098 if (error) {
3099 return error;
3100 }
3101
3102 hfsc_install__(netdev, class.max_rate);
3103 return 0;
3104}
3105
3106static int
3107hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3108{
3109 struct ofpbuf msg;
3110 struct hfsc *hfsc;
3111 struct nl_dump dump;
3112 struct hfsc_class hc;
3113
3114 hc.max_rate = 0;
3115 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3116 hfsc = hfsc_install__(netdev, hc.max_rate);
3117
3118 if (!start_queue_dump(netdev, &dump)) {
3119 return ENODEV;
3120 }
3121
3122 while (nl_dump_next(&dump, &msg)) {
3123 unsigned int queue_id;
3124
3125 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3126 hfsc_update_queue__(netdev, queue_id, &hc);
3127 }
3128 }
3129
3130 nl_dump_done(&dump);
3131 return 0;
3132}
3133
3134static void
3135hfsc_tc_destroy(struct tc *tc)
3136{
3137 struct hfsc *hfsc;
3138 struct hfsc_class *hc, *next;
3139
3140 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3141
3142 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3143 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3144 free(hc);
3145 }
3146
3147 tc_destroy(tc);
3148 free(hfsc);
3149}
3150
3151static int
3152hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3153{
3154 const struct hfsc *hfsc;
3155 hfsc = hfsc_get__(netdev);
3156 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3157 return 0;
3158}
3159
3160static int
3161hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3162{
3163 int error;
3164 struct hfsc_class class;
3165
3166 hfsc_parse_qdisc_details__(netdev, details, &class);
3167 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3168 tc_make_handle(1, 0), &class);
3169
3170 if (!error) {
3171 hfsc_get__(netdev)->max_rate = class.max_rate;
3172 }
3173
3174 return error;
3175}
3176
3177static int
3178hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3179 const struct tc_queue *queue, struct shash *details)
3180{
3181 const struct hfsc_class *hc;
3182
3183 hc = hfsc_class_cast__(queue);
3184 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3185 if (hc->min_rate != hc->max_rate) {
3186 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3187 }
3188 return 0;
3189}
3190
3191static int
3192hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3193 const struct shash *details)
3194{
3195 int error;
3196 struct hfsc_class class;
3197
3198 error = hfsc_parse_class_details__(netdev, details, &class);
3199 if (error) {
3200 return error;
3201 }
3202
3203 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3204 tc_make_handle(1, 0xfffe), &class);
3205 if (error) {
3206 return error;
3207 }
3208
3209 hfsc_update_queue__(netdev, queue_id, &class);
3210 return 0;
3211}
3212
3213static int
3214hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3215{
3216 int error;
3217 struct hfsc *hfsc;
3218 struct hfsc_class *hc;
3219
3220 hc = hfsc_class_cast__(queue);
3221 hfsc = hfsc_get__(netdev);
3222
3223 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3224 if (!error) {
3225 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3226 free(hc);
3227 }
3228 return error;
3229}
3230
3231static int
3232hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3233 struct netdev_queue_stats *stats)
3234{
3235 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3236 tc_make_handle(1, 0xfffe), NULL, stats);
3237}
3238
3239static int
3240hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3241 const struct ofpbuf *nlmsg,
3242 netdev_dump_queue_stats_cb *cb, void *aux)
3243{
3244 struct netdev_queue_stats stats;
3245 unsigned int handle, major, minor;
3246 int error;
3247
3248 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3249 if (error) {
3250 return error;
3251 }
3252
3253 major = tc_get_major(handle);
3254 minor = tc_get_minor(handle);
3255 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3256 (*cb)(minor - 1, &stats, aux);
3257 }
3258 return 0;
3259}
3260
3261static const struct tc_ops tc_ops_hfsc = {
3262 "hfsc", /* linux_name */
3263 "linux-hfsc", /* ovs_name */
3264 HFSC_N_QUEUES, /* n_queues */
3265 hfsc_tc_install, /* tc_install */
3266 hfsc_tc_load, /* tc_load */
3267 hfsc_tc_destroy, /* tc_destroy */
3268 hfsc_qdisc_get, /* qdisc_get */
3269 hfsc_qdisc_set, /* qdisc_set */
3270 hfsc_class_get, /* class_get */
3271 hfsc_class_set, /* class_set */
3272 hfsc_class_delete, /* class_delete */
3273 hfsc_class_get_stats, /* class_get_stats */
3274 hfsc_class_dump_stats /* class_dump_stats */
3275};
3276\f
c1c9c9c4
BP
3277/* "linux-default" traffic control class.
3278 *
3279 * This class represents the default, unnamed Linux qdisc. It corresponds to
3280 * the "" (empty string) QoS type in the OVS database. */
3281
3282static void
3283default_install__(struct netdev *netdev)
3284{
3285 struct netdev_dev_linux *netdev_dev =
3286 netdev_dev_linux_cast(netdev_get_dev(netdev));
3287 static struct tc *tc;
3288
3289 if (!tc) {
3290 tc = xmalloc(sizeof *tc);
3291 tc_init(tc, &tc_ops_default);
3292 }
3293 netdev_dev->tc = tc;
3294}
3295
3296static int
3297default_tc_install(struct netdev *netdev,
3298 const struct shash *details OVS_UNUSED)
3299{
3300 default_install__(netdev);
3301 return 0;
3302}
3303
3304static int
3305default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3306{
3307 default_install__(netdev);
3308 return 0;
3309}
3310
3311static const struct tc_ops tc_ops_default = {
3312 NULL, /* linux_name */
3313 "", /* ovs_name */
3314 0, /* n_queues */
3315 default_tc_install,
3316 default_tc_load,
3317 NULL, /* tc_destroy */
3318 NULL, /* qdisc_get */
3319 NULL, /* qdisc_set */
3320 NULL, /* class_get */
3321 NULL, /* class_set */
3322 NULL, /* class_delete */
3323 NULL, /* class_get_stats */
3324 NULL /* class_dump_stats */
3325};
3326\f
3327/* "linux-other" traffic control class.
3328 *
3329 * */
3330
3331static int
3332other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3333{
3334 struct netdev_dev_linux *netdev_dev =
3335 netdev_dev_linux_cast(netdev_get_dev(netdev));
3336 static struct tc *tc;
3337
3338 if (!tc) {
3339 tc = xmalloc(sizeof *tc);
3340 tc_init(tc, &tc_ops_other);
3341 }
3342 netdev_dev->tc = tc;
3343 return 0;
3344}
3345
3346static const struct tc_ops tc_ops_other = {
3347 NULL, /* linux_name */
3348 "linux-other", /* ovs_name */
3349 0, /* n_queues */
3350 NULL, /* tc_install */
3351 other_tc_load,
3352 NULL, /* tc_destroy */
3353 NULL, /* qdisc_get */
3354 NULL, /* qdisc_set */
3355 NULL, /* class_get */
3356 NULL, /* class_set */
3357 NULL, /* class_delete */
3358 NULL, /* class_get_stats */
3359 NULL /* class_dump_stats */
3360};
3361\f
/* Traffic control. */

/* Number of kernel "tc" ticks per second, as computed by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, as computed by read_psched().  It is
 * used only to size buffers: in general a kernel qdisc must be able to buffer
 * one jiffy's worth of data.
 *
 * The value falls into one of two categories:
 *
 *    - A small number, roughly 100 to 1024, that is the kernel's real timer
 *      tick rate.  The qdisc then really does need room for that much data.
 *
 *    - An absurdly large number, which means that the kernel has finely
 *      granular timers and no extra buffer room is needed.  This case needs
 *      no special handling: 'buffer_hz' is only ever used as a divisor, so
 *      nearly any rate divided by it comes out as 0, and the small integer
 *      results obtained for extremely high rates make no practical
 *      difference.
 *
 * A value of 0 means that read_psched() has not run yet. */
static unsigned int buffer_hz;
3385
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with
 * 'major' placed in the upper 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3392
/* Extracts the major number from tc handle 'handle', shifted down so that it
 * is returned as a plain number. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3399
/* Extracts the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3406
3407static struct tcmsg *
3408tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3409 struct ofpbuf *request)
3410{
3411 struct tcmsg *tcmsg;
3412 int ifindex;
3413 int error;
3414
3415 error = get_ifindex(netdev, &ifindex);
3416 if (error) {
3417 return NULL;
3418 }
3419
3420 ofpbuf_init(request, 512);
3421 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3422 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3423 tcmsg->tcm_family = AF_UNSPEC;
3424 tcmsg->tcm_ifindex = ifindex;
3425 /* Caller should fill in tcmsg->tcm_handle. */
3426 /* Caller should fill in tcmsg->tcm_parent. */
3427
3428 return tcmsg;
3429}
3430
3431static int
3432tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3433{
3434 int error = nl_sock_transact(rtnl_sock, request, replyp);
3435 ofpbuf_uninit(request);
3436 return error;
3437}
3438
3439static void
3440read_psched(void)
3441{
3442 /* The values in psched are not individually very meaningful, but they are
3443 * important. The tables below show some values seen in the wild.
3444 *
3445 * Some notes:
3446 *
3447 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3448 * (Before that, there are hints that it was 1000000000.)
3449 *
3450 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3451 * above.
3452 *
3453 * /proc/net/psched
3454 * -----------------------------------
3455 * [1] 000c8000 000f4240 000f4240 00000064
3456 * [2] 000003e8 00000400 000f4240 3b9aca00
3457 * [3] 000003e8 00000400 000f4240 3b9aca00
3458 * [4] 000003e8 00000400 000f4240 00000064
3459 * [5] 000003e8 00000040 000f4240 3b9aca00
3460 * [6] 000003e8 00000040 000f4240 000000f9
3461 *
3462 * a b c d ticks_per_s buffer_hz
3463 * ------- --------- ---------- ------------- ----------- -------------
3464 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3465 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3466 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3467 * [4] 1,000 1,024 1,000,000 100 976,562 100
3468 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3469 * [6] 1,000 64 1,000,000 249 15,625,000 249
3470 *
3471 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3472 * [2] 2.6.26-1-686-bigmem from Debian lenny
3473 * [3] 2.6.26-2-sparc64 from Debian lenny
3474 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3475 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3476 * [6] 2.6.34 from kernel.org on KVM
3477 */
3478 static const char fn[] = "/proc/net/psched";
3479 unsigned int a, b, c, d;
3480 FILE *stream;
3481
3482 ticks_per_s = 1.0;
3483 buffer_hz = 100;
3484
3485 stream = fopen(fn, "r");
3486 if (!stream) {
3487 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3488 return;
3489 }
3490
3491 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3492 VLOG_WARN("%s: read failed", fn);
3493 fclose(stream);
3494 return;
3495 }
3496 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3497 fclose(stream);
3498
3499 if (!a || !c) {
3500 VLOG_WARN("%s: invalid scheduler parameters", fn);
3501 return;
3502 }
3503
3504 ticks_per_s = (double) a * c / b;
3505 if (c == 1000000) {
3506 buffer_hz = d;
3507 } else {
3508 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3509 fn, a, b, c, d);
3510 }
3511 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3512}
3513
3514/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3515 * rate of 'rate' bytes per second. */
3516static unsigned int
3517tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3518{
3519 if (!buffer_hz) {
3520 read_psched();
3521 }
3522 return (rate * ticks) / ticks_per_s;
3523}
3524
3525/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3526 * rate of 'rate' bytes per second. */
3527static unsigned int
3528tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3529{
3530 if (!buffer_hz) {
3531 read_psched();
3532 }
015c93a4 3533 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3534}
3535
3536/* Returns the number of bytes that need to be reserved for qdisc buffering at
3537 * a transmission rate of 'rate' bytes per second. */
3538static unsigned int
3539tc_buffer_per_jiffy(unsigned int rate)
3540{
3541 if (!buffer_hz) {
3542 read_psched();
3543 }
3544 return rate / buffer_hz;
3545}
3546
3547/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3548 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3549 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3550 * stores NULL into it if it is absent.
3551 *
3552 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3553 * 'msg'.
3554 *
3555 * Returns 0 if successful, otherwise a positive errno value. */
3556static int
3557tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3558 struct nlattr **options)
3559{
3560 static const struct nl_policy tca_policy[] = {
3561 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3562 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3563 };
3564 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3565
3566 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3567 tca_policy, ta, ARRAY_SIZE(ta))) {
3568 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3569 goto error;
3570 }
3571
3572 if (kind) {
3573 *kind = nl_attr_get_string(ta[TCA_KIND]);
3574 }
3575
3576 if (options) {
3577 *options = ta[TCA_OPTIONS];
3578 }
3579
3580 return 0;
3581
3582error:
3583 if (kind) {
3584 *kind = NULL;
3585 }
3586 if (options) {
3587 *options = NULL;
3588 }
3589 return EPROTO;
3590}
3591
3592/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3593 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3594 * into '*options', and its queue statistics into '*stats'. Any of the output
3595 * arguments may be null.
3596 *
3597 * Returns 0 if successful, otherwise a positive errno value. */
3598static int
3599tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3600 struct nlattr **options, struct netdev_queue_stats *stats)
3601{
3602 static const struct nl_policy tca_policy[] = {
3603 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3604 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3605 };
3606 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3607
3608 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3609 tca_policy, ta, ARRAY_SIZE(ta))) {
3610 VLOG_WARN_RL(&rl, "failed to parse class message");
3611 goto error;
3612 }
3613
3614 if (handlep) {
3615 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3616 *handlep = tc->tcm_handle;
3617 }
3618
3619 if (options) {
3620 *options = ta[TCA_OPTIONS];
3621 }
3622
3623 if (stats) {
3624 const struct gnet_stats_queue *gsq;
3625 struct gnet_stats_basic gsb;
3626
3627 static const struct nl_policy stats_policy[] = {
3628 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3629 .min_len = sizeof gsb },
3630 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3631 .min_len = sizeof *gsq },
3632 };
3633 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3634
3635 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3636 sa, ARRAY_SIZE(sa))) {
3637 VLOG_WARN_RL(&rl, "failed to parse class stats");
3638 goto error;
3639 }
3640
3641 /* Alignment issues screw up the length of struct gnet_stats_basic on
3642 * some arch/bitsize combinations. Newer versions of Linux have a
3643 * struct gnet_stats_basic_packed, but we can't depend on that. The
3644 * easiest thing to do is just to make a copy. */
3645 memset(&gsb, 0, sizeof gsb);
3646 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3647 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3648 stats->tx_bytes = gsb.bytes;
3649 stats->tx_packets = gsb.packets;
3650
3651 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3652 stats->tx_errors = gsq->drops;
3653 }
3654
3655 return 0;
3656
3657error:
3658 if (options) {
3659 *options = NULL;
3660 }
3661 if (stats) {
3662 memset(stats, 0, sizeof *stats);
3663 }
3664 return EPROTO;
3665}
3666
3667/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3668 * on 'netdev'. */
3669static int
3670tc_query_class(const struct netdev *netdev,
3671 unsigned int handle, unsigned int parent,
3672 struct ofpbuf **replyp)
3673{
3674 struct ofpbuf request;
3675 struct tcmsg *tcmsg;
3676 int error;
3677
3678 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3679 if (!tcmsg) {
3680 return ENODEV;
3681 }
c1c9c9c4
BP
3682 tcmsg->tcm_handle = handle;
3683 tcmsg->tcm_parent = parent;
3684
3685 error = tc_transact(&request, replyp);
3686 if (error) {
3687 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3688 netdev_get_name(netdev),
3689 tc_get_major(handle), tc_get_minor(handle),
3690 tc_get_major(parent), tc_get_minor(parent),
3691 strerror(error));
3692 }
3693 return error;
3694}
3695
3696/* Equivalent to "tc class del dev <name> handle <handle>". */
3697static int
3698tc_delete_class(const struct netdev *netdev, unsigned int handle)
3699{
3700 struct ofpbuf request;
3701 struct tcmsg *tcmsg;
3702 int error;
3703
3704 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3705 if (!tcmsg) {
3706 return ENODEV;
3707 }
c1c9c9c4
BP
3708 tcmsg->tcm_handle = handle;
3709 tcmsg->tcm_parent = 0;
3710
3711 error = tc_transact(&request, NULL);
3712 if (error) {
3713 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3714 netdev_get_name(netdev),
3715 tc_get_major(handle), tc_get_minor(handle),
3716 strerror(error));
3717 }
3718 return error;
3719}
3720
3721/* Equivalent to "tc qdisc del dev <name> root". */
3722static int
3723tc_del_qdisc(struct netdev *netdev)
3724{
3725 struct netdev_dev_linux *netdev_dev =
3726 netdev_dev_linux_cast(netdev_get_dev(netdev));
3727 struct ofpbuf request;
3728 struct tcmsg *tcmsg;
3729 int error;
3730
3731 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3732 if (!tcmsg) {
3733 return ENODEV;
3734 }
c1c9c9c4
BP
3735 tcmsg->tcm_handle = tc_make_handle(1, 0);
3736 tcmsg->tcm_parent = TC_H_ROOT;
3737
3738 error = tc_transact(&request, NULL);
3739 if (error == EINVAL) {
3740 /* EINVAL probably means that the default qdisc was in use, in which
3741 * case we've accomplished our purpose. */
3742 error = 0;
3743 }
3744 if (!error && netdev_dev->tc) {
3745 if (netdev_dev->tc->ops->tc_destroy) {
3746 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3747 }
3748 netdev_dev->tc = NULL;
3749 }
3750 return error;
3751}
3752
3753/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3754 * kernel to determine what they are. Returns 0 if successful, otherwise a
3755 * positive errno value. */
3756static int
3757tc_query_qdisc(const struct netdev *netdev)
3758{
3759 struct netdev_dev_linux *netdev_dev =
3760 netdev_dev_linux_cast(netdev_get_dev(netdev));
3761 struct ofpbuf request, *qdisc;
3762 const struct tc_ops *ops;
3763 struct tcmsg *tcmsg;
3764 int load_error;
3765 int error;
3766
3767 if (netdev_dev->tc) {
3768 return 0;
3769 }
3770
3771 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3772 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3773 * 2.6.35 without that fix backported to it.
3774 *
3775 * To avoid the OOPS, we must not make a request that would attempt to dump
3776 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3777 * few others. There are a few ways that I can see to do this, but most of
3778 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3779 * technique chosen here is to assume that any non-default qdisc that we
3780 * create will have a class with handle 1:0. The built-in qdiscs only have
3781 * a class with handle 0:0.
3782 *
3783 * We could check for Linux 2.6.35+ and use a more straightforward method
3784 * there. */
3785 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3786 if (!tcmsg) {
3787 return ENODEV;
3788 }
c1c9c9c4
BP
3789 tcmsg->tcm_handle = tc_make_handle(1, 0);
3790 tcmsg->tcm_parent = 0;
3791
3792 /* Figure out what tc class to instantiate. */
3793 error = tc_transact(&request, &qdisc);
3794 if (!error) {
3795 const char *kind;
3796
3797 error = tc_parse_qdisc(qdisc, &kind, NULL);
3798 if (error) {
3799 ops = &tc_ops_other;
3800 } else {
3801 ops = tc_lookup_linux_name(kind);
3802 if (!ops) {
3803 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3804 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3805
3806 ops = &tc_ops_other;
3807 }
3808 }
3809 } else if (error == ENOENT) {
3810 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3811 * other entity that doesn't have a handle 1:0. We will assume
3812 * that it's the system default qdisc. */
3813 ops = &tc_ops_default;
3814 error = 0;
3815 } else {
3816 /* Who knows? Maybe the device got deleted. */
3817 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3818 netdev_get_name(netdev), strerror(error));
3819 ops = &tc_ops_other;
3820 }
3821
3822 /* Instantiate it. */
3823 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3824 assert((load_error == 0) == (netdev_dev->tc != NULL));
3825 ofpbuf_delete(qdisc);
3826
3827 return error ? error : load_error;
3828}
3829
3830/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3831 approximate the time to transmit packets of various lengths. For an MTU of
3832 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3833 represents two possible packet lengths; for a MTU of 513 through 1024, four
3834 possible lengths; and so on.
3835
3836 Returns, for the specified 'mtu', the number of bits that packet lengths
3837 need to be shifted right to fit within such a 256-entry table. */
3838static int
3839tc_calc_cell_log(unsigned int mtu)
3840{
3841 int cell_log;
3842
3843 if (!mtu) {
3844 mtu = ETH_PAYLOAD_MAX;
3845 }
3846 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3847
3848 for (cell_log = 0; mtu >= 256; cell_log++) {
3849 mtu >>= 1;
3850 }
3851
3852 return cell_log;
3853}
3854
3855/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3856 * of 'mtu'. */
3857static void
3858tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3859{
3860 memset(rate, 0, sizeof *rate);
3861 rate->cell_log = tc_calc_cell_log(mtu);
3862 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3863 /* rate->cell_align = 0; */ /* distro headers. */
3864 rate->mpu = ETH_TOTAL_MIN;
3865 rate->rate = Bps;
3866}
3867
3868/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3869 * attribute of the specified "type".
3870 *
3871 * See tc_calc_cell_log() above for a description of "rtab"s. */
3872static void
3873tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3874{
3875 uint32_t *rtab;
3876 unsigned int i;
3877
3878 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3879 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3880 unsigned packet_size = (i + 1) << rate->cell_log;
3881 if (packet_size < rate->mpu) {
3882 packet_size = rate->mpu;
3883 }
3884 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3885 }
3886}
3887
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.)
 *
 * The burst actually used is never smaller than one jiffy's worth of data at
 * 'Bps' plus one MTU; the result is that burst expressed in ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst < floor_bytes) {
        burst = floor_bytes;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
3898
3899\f
3900/* Utility functions. */
3901
3902static int
3903get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3904{
3905 /* Policy for RTNLGRP_LINK messages.
3906 *
3907 * There are *many* more fields in these messages, but currently we only
3908 * care about these fields. */
3909 static const struct nl_policy rtnlgrp_link_policy[] = {
3910 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3911 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3912 .min_len = sizeof(struct rtnl_link_stats) },
3913 };
3914
3915 struct ofpbuf request;
3916 struct ofpbuf *reply;
3917 struct ifinfomsg *ifi;
3918 const struct rtnl_link_stats *rtnl_stats;
3919 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3920 int error;
3921
3922 ofpbuf_init(&request, 0);
3923 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3924 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3925 ifi->ifi_family = PF_UNSPEC;
3926 ifi->ifi_index = ifindex;
3927 error = nl_sock_transact(rtnl_sock, &request, &reply);
3928 ofpbuf_uninit(&request);
3929 if (error) {
3930 return error;
3931 }
3932
3933 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3934 rtnlgrp_link_policy,
3935 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3936 ofpbuf_delete(reply);
3937 return EPROTO;
3938 }
3939
3940 if (!attrs[IFLA_STATS]) {
3941 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3942 ofpbuf_delete(reply);
3943 return EPROTO;
3944 }
8b61709d
BP
3945
3946 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3947 stats->rx_packets = rtnl_stats->rx_packets;
3948 stats->tx_packets = rtnl_stats->tx_packets;
3949 stats->rx_bytes = rtnl_stats->rx_bytes;
3950 stats->tx_bytes = rtnl_stats->tx_bytes;
3951 stats->rx_errors = rtnl_stats->rx_errors;
3952 stats->tx_errors = rtnl_stats->tx_errors;
3953 stats->rx_dropped = rtnl_stats->rx_dropped;
3954 stats->tx_dropped = rtnl_stats->tx_dropped;
3955 stats->multicast = rtnl_stats->multicast;
3956 stats->collisions = rtnl_stats->collisions;
3957 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3958 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3959 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3960 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3961 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3962 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3963 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3964 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3965 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3966 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3967 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3968
576e26d7
BP
3969 ofpbuf_delete(reply);
3970
8b61709d
BP
3971 return 0;
3972}
3973
3974static int
3975get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3976{
3977 static const char fn[] = "/proc/net/dev";
3978 char line[1024];
3979 FILE *stream;
3980 int ln;
3981
3982 stream = fopen(fn, "r");
3983 if (!stream) {
3984 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3985 return errno;
3986 }
3987
3988 ln = 0;
3989 while (fgets(line, sizeof line, stream)) {
3990 if (++ln >= 3) {
3991 char devname[16];
3992#define X64 "%"SCNu64
3993 if (sscanf(line,
3994 " %15[^:]:"
3995 X64 X64 X64 X64 X64 X64 X64 "%*u"
3996 X64 X64 X64 X64 X64 X64 X64 "%*u",
3997 devname,
3998 &stats->rx_bytes,
3999 &stats->rx_packets,
4000 &stats->rx_errors,
4001 &stats->rx_dropped,
4002 &stats->rx_fifo_errors,
4003 &stats->rx_frame_errors,
4004 &stats->multicast,
4005 &stats->tx_bytes,
4006 &stats->tx_packets,
4007 &stats->tx_errors,
4008 &stats->tx_dropped,
4009 &stats->tx_fifo_errors,
4010 &stats->collisions,
4011 &stats->tx_carrier_errors) != 15) {
4012 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4013 } else if (!strcmp(devname, netdev_name)) {
4014 stats->rx_length_errors = UINT64_MAX;
4015 stats->rx_over_errors = UINT64_MAX;
4016 stats->rx_crc_errors = UINT64_MAX;
4017 stats->rx_missed_errors = UINT64_MAX;
4018 stats->tx_aborted_errors = UINT64_MAX;
4019 stats->tx_heartbeat_errors = UINT64_MAX;
4020 stats->tx_window_errors = UINT64_MAX;
4021 fclose(stream);
4022 return 0;
4023 }
4024 }
4025 }
4026 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4027 fclose(stream);
4028 return ENODEV;
4029}
c1c9c9c4 4030
8b61709d
BP
4031static int
4032get_flags(const struct netdev *netdev, int *flags)
4033{
4034 struct ifreq ifr;
4035 int error;
4036
149f577a
JG
4037 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4038 "SIOCGIFFLAGS");
8b61709d
BP
4039 *flags = ifr.ifr_flags;
4040 return error;
4041}
4042
4043static int
4044set_flags(struct netdev *netdev, int flags)
4045{
4046 struct ifreq ifr;
4047
4048 ifr.ifr_flags = flags;
149f577a
JG
4049 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4050 "SIOCSIFFLAGS");
8b61709d
BP
4051}
4052
4053static int
4054do_get_ifindex(const char *netdev_name)
4055{
4056 struct ifreq ifr;
4057
71d7c22f 4058 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4059 COVERAGE_INC(netdev_get_ifindex);
4060 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4061 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4062 netdev_name, strerror(errno));
4063 return -errno;
4064 }
4065 return ifr.ifr_ifindex;
4066}
4067
4068static int
4069get_ifindex(const struct netdev *netdev_, int *ifindexp)
4070{
149f577a
JG
4071 struct netdev_dev_linux *netdev_dev =
4072 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4073 *ifindexp = 0;
149f577a 4074 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4075 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4076 if (ifindex < 0) {
4077 return -ifindex;
4078 }
149f577a
JG
4079 netdev_dev->cache_valid |= VALID_IFINDEX;
4080 netdev_dev->ifindex = ifindex;
8b61709d 4081 }
149f577a 4082 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4083 return 0;
4084}
4085
4086static int
4087get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4088{
4089 struct ifreq ifr;
4090 int hwaddr_family;
4091
4092 memset(&ifr, 0, sizeof ifr);
71d7c22f 4093 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4094 COVERAGE_INC(netdev_get_hwaddr);
4095 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4096 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4097 netdev_name, strerror(errno));
4098 return errno;
4099 }
4100 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4101 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4102 VLOG_WARN("%s device has unknown hardware address family %d",
4103 netdev_name, hwaddr_family);
4104 }
4105 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4106 return 0;
4107}
4108
4109static int
4110set_etheraddr(const char *netdev_name, int hwaddr_family,
4111 const uint8_t mac[ETH_ADDR_LEN])
4112{
4113 struct ifreq ifr;
4114
4115 memset(&ifr, 0, sizeof ifr);
71d7c22f 4116 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4117 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4118 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4119 COVERAGE_INC(netdev_set_hwaddr);
4120 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4121 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4122 netdev_name, strerror(errno));
4123 return errno;
4124 }
4125 return 0;
4126}
4127
4128static int
0b0544d7 4129netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4130 int cmd, const char *cmd_name)
4131{
4132 struct ifreq ifr;
4133
4134 memset(&ifr, 0, sizeof ifr);
71d7c22f 4135 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4136 ifr.ifr_data = (caddr_t) ecmd;
4137
4138 ecmd->cmd = cmd;
4139 COVERAGE_INC(netdev_ethtool);
4140 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4141 return 0;
4142 } else {
4143 if (errno != EOPNOTSUPP) {
4144 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4145 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4146 } else {
4147 /* The device doesn't support this operation. That's pretty
4148 * common, so there's no point in logging anything. */
4149 }
4150 return errno;
4151 }
4152}
4153
4154static int
149f577a
JG
4155netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4156 const char *cmd_name)
8b61709d 4157{
71d7c22f 4158 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4159 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4160 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4161 strerror(errno));
8b61709d
BP
4162 return errno;
4163 }
4164 return 0;
4165}
f1acd62b
BP
4166
4167static int
4168netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4169 int cmd, const char *cmd_name)
4170{
4171 struct ifreq ifr;
4172 int error;
4173
4174 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4175 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4176 if (!error) {
4177 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4178 *ip = sin->sin_addr;
4179 }
4180 return error;
4181}