]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
qos: Remove min-rate requirement for linux-htb and linux-hfsc.
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
782e6111 2 * Copyright (c) 2009, 2010, 2011 Nicira Networks.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
8b61709d 18#include <assert.h>
e9e28be3 19#include <errno.h>
8b61709d
BP
20#include <fcntl.h>
21#include <arpa/inet.h>
22#include <inttypes.h>
c1c9c9c4 23#include <linux/gen_stats.h>
8b61709d 24#include <linux/if_tun.h>
a740f0de 25#include <linux/ip.h>
8b61709d
BP
26#include <linux/types.h>
27#include <linux/ethtool.h>
63331829 28#include <linux/mii.h>
6f42c8ea 29#include <linux/pkt_sched.h>
e9e28be3 30#include <linux/rtnetlink.h>
8b61709d
BP
31#include <linux/sockios.h>
32#include <linux/version.h>
33#include <sys/types.h>
34#include <sys/ioctl.h>
35#include <sys/socket.h>
36#include <netpacket/packet.h>
37#include <net/ethernet.h>
38#include <net/if.h>
a740f0de 39#include <linux/if_tunnel.h>
8b61709d
BP
40#include <net/if_arp.h>
41#include <net/if_packet.h>
42#include <net/route.h>
43#include <netinet/in.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
9fe3b9a2 50#include "dpif-linux.h"
8b61709d
BP
51#include "dynamic-string.h"
52#include "fatal-signal.h"
93b13be8
BP
53#include "hash.h"
54#include "hmap.h"
8b61709d 55#include "netdev-provider.h"
7fbef77a 56#include "netdev-vport.h"
e9e28be3 57#include "netlink.h"
2fe27d5a 58#include "netlink-socket.h"
e9e28be3 59#include "ofpbuf.h"
8b61709d
BP
60#include "openflow/openflow.h"
61#include "packets.h"
62#include "poll-loop.h"
559843ed 63#include "rtnetlink.h"
21d6e22e 64#include "rtnetlink-link.h"
8b61709d
BP
65#include "socket-util.h"
66#include "shash.h"
67#include "svec.h"
e9e28be3 68#include "vlog.h"
5136ce49 69
d98e6007 70VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea
BP
71
72COVERAGE_DEFINE(netdev_get_vlan_vid);
73COVERAGE_DEFINE(netdev_set_policing);
74COVERAGE_DEFINE(netdev_arp_lookup);
75COVERAGE_DEFINE(netdev_get_ifindex);
76COVERAGE_DEFINE(netdev_get_hwaddr);
77COVERAGE_DEFINE(netdev_set_hwaddr);
78COVERAGE_DEFINE(netdev_ethtool);
8b61709d
BP
79\f
80/* These were introduced in Linux 2.6.14, so they might be missing if we have
81 * old headers. */
82#ifndef ADVERTISED_Pause
83#define ADVERTISED_Pause (1 << 13)
84#endif
85#ifndef ADVERTISED_Asym_Pause
86#define ADVERTISED_Asym_Pause (1 << 14)
87#endif
88
c1c9c9c4
BP
89/* This was introduced in Linux 2.6.25, so it might be missing if we have old
90 * headers. */
91#ifndef TC_RTAB_SIZE
92#define TC_RTAB_SIZE 1024
93#endif
94
149f577a 95static struct rtnetlink_notifier netdev_linux_cache_notifier;
46415c90 96static int cache_notifier_refcount;
8b61709d
BP
97
/* Bits used in 'cache_valid' of struct netdev_dev_linux.  A set bit means
 * that the matching cached member(s) of the device may be trusted. */
enum {
    VALID_IFINDEX          = 1 << 0,  /* 'ifindex'. */
    VALID_ETHERADDR        = 1 << 1,  /* 'etheraddr'. */
    VALID_IN4              = 1 << 2,  /* Presumably 'address', 'netmask'. */
    VALID_IN6              = 1 << 3,  /* Presumably 'in6'. */
    VALID_MTU              = 1 << 4,  /* 'mtu'. */
    VALID_CARRIER          = 1 << 5,  /* Presumably 'carrier'. */
    VALID_IS_PSEUDO        = 1 << 6,  /* Represents is_internal and is_tap. */
    VALID_POLICING         = 1 << 7,  /* Policing data. */
    VALID_HAVE_VPORT_STATS = 1 << 8   /* 'have_vport_stats'. */
};
109
149f577a
JG
/* Per-device state that exists only for tap devices. */
struct tap_state {
    int fd;         /* Descriptor obtained by opening the tun/tap device. */
    bool opened;    /* Has 'fd' already been handed to a netdev opener? */
};
c1c9c9c4
BP
114\f
115/* Traffic control. */
116
117/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
118 * network device.
119 *
120 * Each TC implementation subclasses this with whatever additional data it
121 * needs. */
c1c9c9c4
BP
122struct tc {
123 const struct tc_ops *ops;
93b13be8
BP
124 struct hmap queues; /* Contains "struct tc_queue"s.
125 * Read by generic TC layer.
126 * Written only by TC implementation. */
127};
c1c9c9c4 128
93b13be8
BP
129/* One traffic control queue.
130 *
131 * Each TC implementation subclasses this with whatever additional data it
132 * needs. */
133struct tc_queue {
134 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
135 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
136};
137
138/* A particular kind of traffic control. Each implementation generally maps to
139 * one particular Linux qdisc class.
140 *
141 * The functions below return 0 if successful or a positive errno value on
142 * failure, except where otherwise noted. All of them must be provided, except
143 * where otherwise noted. */
144struct tc_ops {
145 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
146 * This is null for tc_ops_default and tc_ops_other, for which there are no
147 * appropriate values. */
148 const char *linux_name;
149
150 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
151 const char *ovs_name;
152
153 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
154 * queues. The queues are numbered 0 through n_queues - 1. */
155 unsigned int n_queues;
156
157 /* Called to install this TC class on 'netdev'. The implementation should
158 * make the Netlink calls required to set up 'netdev' with the right qdisc
159 * and configure it according to 'details'. The implementation may assume
160 * that the current qdisc is the default; that is, there is no need for it
161 * to delete the current qdisc before installing itself.
162 *
163 * The contents of 'details' should be documented as valid for 'ovs_name'
164 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
165 * (which is built as ovs-vswitchd.conf.db(8)).
166 *
167 * This function must return 0 if and only if it sets 'netdev->tc' to an
168 * initialized 'struct tc'.
169 *
170 * (This function is null for tc_ops_other, which cannot be installed. For
171 * other TC classes it should always be nonnull.) */
172 int (*tc_install)(struct netdev *netdev, const struct shash *details);
173
174 /* Called when the netdev code determines (through a Netlink query) that
175 * this TC class's qdisc is installed on 'netdev', but we didn't install
176 * it ourselves and so don't know any of the details.
177 *
178 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
179 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
180 * implementation should parse the other attributes of 'nlmsg' as
181 * necessary to determine its configuration. If necessary it should also
182 * use Netlink queries to determine the configuration of queues on
183 * 'netdev'.
184 *
185 * This function must return 0 if and only if it sets 'netdev->tc' to an
186 * initialized 'struct tc'. */
187 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
188
189 /* Destroys the data structures allocated by the implementation as part of
190 * 'tc'. (This includes destroying 'tc->queues' by calling
191 * tc_destroy(tc).
192 *
193 * The implementation should not need to perform any Netlink calls. If
194 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
195 * (But it may not be desirable.)
196 *
197 * This function may be null if 'tc' is trivial. */
198 void (*tc_destroy)(struct tc *tc);
199
200 /* Retrieves details of 'netdev->tc' configuration into 'details'.
201 *
202 * The implementation should not need to perform any Netlink calls, because
203 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
204 * cached the configuration.
205 *
206 * The contents of 'details' should be documented as valid for 'ovs_name'
207 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
208 * (which is built as ovs-vswitchd.conf.db(8)).
209 *
210 * This function may be null if 'tc' is not configurable.
211 */
212 int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
213
214 /* Reconfigures 'netdev->tc' according to 'details', performing any
215 * required Netlink calls to complete the reconfiguration.
216 *
217 * The contents of 'details' should be documented as valid for 'ovs_name'
218 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
219 * (which is built as ovs-vswitchd.conf.db(8)).
220 *
221 * This function may be null if 'tc' is not configurable.
222 */
223 int (*qdisc_set)(struct netdev *, const struct shash *details);
224
93b13be8
BP
225 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
226 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
227 *
228 * The contents of 'details' should be documented as valid for 'ovs_name'
229 * in the "other_config" column in the "Queue" table in
230 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
231 *
232 * The implementation should not need to perform any Netlink calls, because
233 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
234 * cached the queue configuration.
235 *
236 * This function may be null if 'tc' does not have queues ('n_queues' is
237 * 0). */
93b13be8 238 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
239 struct shash *details);
240
241 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
242 * 'details', perfoming any required Netlink calls to complete the
243 * reconfiguration. The caller ensures that 'queue_id' is less than
244 * 'n_queues'.
245 *
246 * The contents of 'details' should be documented as valid for 'ovs_name'
247 * in the "other_config" column in the "Queue" table in
248 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
249 *
250 * This function may be null if 'tc' does not have queues or its queues are
251 * not configurable. */
252 int (*class_set)(struct netdev *, unsigned int queue_id,
253 const struct shash *details);
254
93b13be8
BP
255 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
256 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
257 *
258 * This function may be null if 'tc' does not have queues or its queues
259 * cannot be deleted. */
93b13be8 260 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 261
93b13be8
BP
262 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
263 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
264 *
265 * On success, initializes '*stats'.
266 *
267 * This function may be null if 'tc' does not have queues or if it cannot
268 * report queue statistics. */
93b13be8
BP
269 int (*class_get_stats)(const struct netdev *netdev,
270 const struct tc_queue *queue,
c1c9c9c4
BP
271 struct netdev_queue_stats *stats);
272
273 /* Extracts queue stats from 'nlmsg', which is a response to a
274 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
275 *
276 * This function may be null if 'tc' does not have queues or if it cannot
277 * report queue statistics. */
278 int (*class_dump_stats)(const struct netdev *netdev,
279 const struct ofpbuf *nlmsg,
280 netdev_dump_queue_stats_cb *cb, void *aux);
281};
282
283static void
284tc_init(struct tc *tc, const struct tc_ops *ops)
285{
286 tc->ops = ops;
93b13be8 287 hmap_init(&tc->queues);
c1c9c9c4
BP
288}
289
290static void
291tc_destroy(struct tc *tc)
292{
93b13be8 293 hmap_destroy(&tc->queues);
c1c9c9c4
BP
294}
295
296static const struct tc_ops tc_ops_htb;
a339aa81 297static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
298static const struct tc_ops tc_ops_default;
299static const struct tc_ops tc_ops_other;
300
301static const struct tc_ops *tcs[] = {
302 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 303 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
304 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
305 &tc_ops_other, /* Some other qdisc. */
306 NULL
307};
149f577a 308
c1c9c9c4
BP
309static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
310static unsigned int tc_get_major(unsigned int handle);
311static unsigned int tc_get_minor(unsigned int handle);
312
313static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
314static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
315static unsigned int tc_buffer_per_jiffy(unsigned int rate);
316
317static struct tcmsg *tc_make_request(const struct netdev *, int type,
318 unsigned int flags, struct ofpbuf *);
319static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
320
321static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
322 struct nlattr **options);
323static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
324 struct nlattr **options,
325 struct netdev_queue_stats *);
326static int tc_query_class(const struct netdev *,
327 unsigned int handle, unsigned int parent,
328 struct ofpbuf **replyp);
329static int tc_delete_class(const struct netdev *, unsigned int handle);
330
331static int tc_del_qdisc(struct netdev *netdev);
332static int tc_query_qdisc(const struct netdev *netdev);
333
334static int tc_calc_cell_log(unsigned int mtu);
335static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
336static void tc_put_rtab(struct ofpbuf *, uint16_t type,
337 const struct tc_ratespec *rate);
338static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
339\f
149f577a
JG
340struct netdev_dev_linux {
341 struct netdev_dev netdev_dev;
342
8b61709d 343 struct shash_node *shash_node;
149f577a 344 unsigned int cache_valid;
8b61709d 345
8722022c
BP
346 /* The following are figured out "on demand" only. They are only valid
347 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
348 int ifindex;
349 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 350 struct in_addr address, netmask;
8b61709d
BP
351 struct in6_addr in6;
352 int mtu;
353 int carrier;
8722022c
BP
354 bool is_internal; /* Is this an openvswitch internal device? */
355 bool is_tap; /* Is this a tuntap device? */
80a86fbe
BP
356 uint32_t kbits_rate; /* Policing data. */
357 uint32_t kbits_burst;
7fbef77a 358 bool have_vport_stats;
c1c9c9c4 359 struct tc *tc;
149f577a
JG
360
361 union {
362 struct tap_state tap;
363 } state;
8b61709d
BP
364};
365
149f577a
JG
366struct netdev_linux {
367 struct netdev netdev;
5b7448ed 368 int fd;
149f577a 369};
8b61709d 370
8b61709d
BP
371/* An AF_INET socket (used for ioctl operations). */
372static int af_inet_sock = -1;
373
ff4ed3c9
BP
374/* A Netlink routing socket that is not subscribed to any multicast groups. */
375static struct nl_sock *rtnl_sock;
376
8b61709d
BP
377struct netdev_linux_notifier {
378 struct netdev_notifier notifier;
379 struct list node;
380};
381
382static struct shash netdev_linux_notifiers =
383 SHASH_INITIALIZER(&netdev_linux_notifiers);
46097491 384static struct rtnetlink_notifier netdev_linux_poll_notifier;
8b61709d
BP
385
386/* This is set pretty low because we probably won't learn anything from the
387 * additional log messages. */
388static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
389
15b3596a 390static int netdev_linux_init(void);
6f643e49 391
0b0544d7 392static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 393 int cmd, const char *cmd_name);
149f577a
JG
394static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
395 const char *cmd_name);
f1acd62b
BP
396static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
397 int cmd, const char *cmd_name);
8b61709d
BP
398static int get_flags(const struct netdev *, int *flagsp);
399static int set_flags(struct netdev *, int flags);
400static int do_get_ifindex(const char *netdev_name);
401static int get_ifindex(const struct netdev *, int *ifindexp);
402static int do_set_addr(struct netdev *netdev,
403 int ioctl_nr, const char *ioctl_name,
404 struct in_addr addr);
405static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
406static int set_etheraddr(const char *netdev_name, int hwaddr_family,
407 const uint8_t[ETH_ADDR_LEN]);
408static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
409static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
410
15b3596a
JG
411static bool
412is_netdev_linux_class(const struct netdev_class *netdev_class)
413{
414 return netdev_class->init == netdev_linux_init;
415}
416
149f577a
JG
417static struct netdev_dev_linux *
418netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 419{
15b3596a
JG
420 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
421 assert(is_netdev_linux_class(netdev_class));
422
149f577a 423 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
424}
425
8b61709d
BP
426static struct netdev_linux *
427netdev_linux_cast(const struct netdev *netdev)
428{
15b3596a
JG
429 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
430 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
431 assert(is_netdev_linux_class(netdev_class));
432
8b61709d
BP
433 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
434}
ff4ed3c9 435\f
8b61709d
BP
436static int
437netdev_linux_init(void)
438{
439 static int status = -1;
440 if (status < 0) {
ff4ed3c9 441 /* Create AF_INET socket. */
8b61709d
BP
442 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
443 status = af_inet_sock >= 0 ? 0 : errno;
444 if (status) {
445 VLOG_ERR("failed to create inet socket: %s", strerror(status));
446 }
ff4ed3c9
BP
447
448 /* Create rtnetlink socket. */
449 if (!status) {
cceb11f5 450 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
451 if (status) {
452 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
453 strerror(status));
454 }
455 }
8b61709d
BP
456 }
457 return status;
458}
459
/* Periodic processing for the Linux netdev classes; this just delegates to
 * the rtnetlink link notifier. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_notifier_run();
}
465
/* Wait counterpart of netdev_linux_run(); this just delegates to the
 * rtnetlink link notifier. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_notifier_wait();
}
471
472static void
21d6e22e 473netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 474 void *aux OVS_UNUSED)
8b61709d 475{
149f577a 476 struct netdev_dev_linux *dev;
8b61709d 477 if (change) {
46415c90
JG
478 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
479 if (base_dev) {
15b3596a
JG
480 const struct netdev_class *netdev_class =
481 netdev_dev_get_class(base_dev);
482
483 if (is_netdev_linux_class(netdev_class)) {
484 dev = netdev_dev_linux_cast(base_dev);
485 dev->cache_valid = 0;
486 }
8b61709d
BP
487 }
488 } else {
46415c90 489 struct shash device_shash;
8b61709d 490 struct shash_node *node;
46415c90
JG
491
492 shash_init(&device_shash);
493 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
494 SHASH_FOR_EACH (node, &device_shash) {
149f577a
JG
495 dev = node->data;
496 dev->cache_valid = 0;
8b61709d 497 }
46415c90 498 shash_destroy(&device_shash);
8b61709d
BP
499 }
500}
501
c3827f61 502/* Creates system and internal devices. */
8b61709d 503static int
c3827f61 504netdev_linux_create(const struct netdev_class *class,
b8dcf5e9
BP
505 const char *name, const struct shash *args,
506 struct netdev_dev **netdev_devp)
6c88d577 507{
149f577a
JG
508 struct netdev_dev_linux *netdev_dev;
509 int error;
6c88d577
JP
510
511 if (!shash_is_empty(args)) {
c3827f61
BP
512 VLOG_WARN("%s: arguments for %s devices should be empty",
513 name, class->type);
6c88d577
JP
514 }
515
46415c90 516 if (!cache_notifier_refcount) {
21d6e22e
EJ
517 error = rtnetlink_link_notifier_register(&netdev_linux_cache_notifier,
518 netdev_linux_cache_cb, NULL);
149f577a
JG
519 if (error) {
520 return error;
521 }
522 }
46415c90 523 cache_notifier_refcount++;
6c88d577 524
149f577a 525 netdev_dev = xzalloc(sizeof *netdev_dev);
6d9e6eb4 526 netdev_dev_init(&netdev_dev->netdev_dev, name, args, class);
46415c90 527
149f577a 528 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
529 return 0;
530}
531
5b7448ed
JG
532/* For most types of netdevs we open the device for each call of
533 * netdev_open(). However, this is not the case with tap devices,
534 * since it is only possible to open the device once. In this
535 * situation we share a single file descriptor, and consequently
536 * buffers, across all readers. Therefore once data is read it will
537 * be unavailable to other reads for tap devices. */
a740f0de 538static int
b8dcf5e9
BP
539netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
540 const char *name, const struct shash *args,
541 struct netdev_dev **netdev_devp)
a740f0de 542{
149f577a 543 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
544 struct tap_state *state;
545 static const char tap_dev[] = "/dev/net/tun";
546 struct ifreq ifr;
547 int error;
548
549 if (!shash_is_empty(args)) {
149f577a 550 VLOG_WARN("%s: arguments for TAP devices should be empty", name);
6c88d577
JP
551 }
552
149f577a
JG
553 netdev_dev = xzalloc(sizeof *netdev_dev);
554 state = &netdev_dev->state.tap;
a740f0de 555
6c88d577 556 /* Open tap device. */
149f577a
JG
557 state->fd = open(tap_dev, O_RDWR);
558 if (state->fd < 0) {
6c88d577
JP
559 error = errno;
560 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
561 goto error;
562 }
563
564 /* Create tap device. */
565 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 566 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 567 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
568 VLOG_WARN("%s: creating tap device failed: %s", name,
569 strerror(errno));
570 error = errno;
571 goto error;
572 }
573
574 /* Make non-blocking. */
149f577a 575 error = set_nonblocking(state->fd);
a740f0de
JG
576 if (error) {
577 goto error;
578 }
579
6d9e6eb4 580 netdev_dev_init(&netdev_dev->netdev_dev, name, args, &netdev_tap_class);
149f577a 581 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
582 return 0;
583
584error:
149f577a 585 free(netdev_dev);
a740f0de
JG
586 return error;
587}
588
a740f0de 589static void
149f577a 590destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 591{
149f577a
JG
592 struct tap_state *state = &netdev_dev->state.tap;
593
594 if (state->fd >= 0) {
595 close(state->fd);
a740f0de
JG
596 }
597}
598
149f577a 599/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 600static void
149f577a 601netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 602{
149f577a 603 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 604 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 605
c1c9c9c4
BP
606 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
607 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
608 }
609
d2bb2799 610 if (class == &netdev_linux_class || class == &netdev_internal_class) {
46415c90 611 cache_notifier_refcount--;
149f577a 612
46415c90 613 if (!cache_notifier_refcount) {
21d6e22e 614 rtnetlink_link_notifier_unregister(&netdev_linux_cache_notifier);
149f577a 615 }
d2bb2799 616 } else if (class == &netdev_tap_class) {
149f577a 617 destroy_tap(netdev_dev);
d2bb2799
BP
618 } else {
619 NOT_REACHED();
6c88d577 620 }
149f577a 621
658797c8 622 free(netdev_dev);
6c88d577
JP
623}
624
8b61709d 625static int
5b7448ed 626netdev_linux_open(struct netdev_dev *netdev_dev_, int ethertype,
149f577a 627 struct netdev **netdevp)
8b61709d 628{
5b7448ed 629 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
8b61709d
BP
630 struct netdev_linux *netdev;
631 enum netdev_flags flags;
632 int error;
633
634 /* Allocate network device. */
ec6fde61 635 netdev = xzalloc(sizeof *netdev);
49a6a163 636 netdev->fd = -1;
5b7448ed 637 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 638
c3827f61
BP
639 /* Verify that the device really exists, by attempting to read its flags.
640 * (The flags might be cached, in which case this won't actually do an
641 * ioctl.)
642 *
643 * Don't do this for "internal" netdevs, though, because those have to be
644 * created as netdev objects before they exist in the kernel, because
645 * creating them in the kernel happens by passing a netdev object to
646 * dpif_port_add(). */
647 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
648 error = netdev_get_flags(&netdev->netdev, &flags);
649 if (error == ENODEV) {
650 goto error;
651 }
8b61709d
BP
652 }
653
61b999dd
JG
654 if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
655 !netdev_dev->state.tap.opened) {
656
657 /* We assume that the first user of the tap device is the primary user
658 * and give them the tap FD. Subsequent users probably just expect
659 * this to be a system device so open it normally to avoid send/receive
660 * directions appearing to be reversed. */
5b7448ed 661 netdev->fd = netdev_dev->state.tap.fd;
61b999dd 662 netdev_dev->state.tap.opened = true;
5b7448ed 663 } else if (ethertype != NETDEV_ETH_TYPE_NONE) {
8b61709d
BP
664 struct sockaddr_ll sll;
665 int protocol;
666 int ifindex;
667
668 /* Create file descriptor. */
669 protocol = (ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
670 : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
671 : ethertype);
5b7448ed
JG
672 netdev->fd = socket(PF_PACKET, SOCK_RAW, htons(protocol));
673 if (netdev->fd < 0) {
8b61709d
BP
674 error = errno;
675 goto error;
676 }
8b61709d
BP
677
678 /* Set non-blocking mode. */
5b7448ed 679 error = set_nonblocking(netdev->fd);
8b61709d
BP
680 if (error) {
681 goto error;
682 }
683
684 /* Get ethernet device index. */
685 error = get_ifindex(&netdev->netdev, &ifindex);
686 if (error) {
687 goto error;
688 }
689
690 /* Bind to specific ethernet device. */
691 memset(&sll, 0, sizeof sll);
692 sll.sll_family = AF_PACKET;
693 sll.sll_ifindex = ifindex;
5b7448ed 694 if (bind(netdev->fd,
8b61709d
BP
695 (struct sockaddr *) &sll, sizeof sll) < 0) {
696 error = errno;
5b7448ed 697 VLOG_ERR("bind to %s failed: %s", netdev_dev_get_name(netdev_dev_),
149f577a 698 strerror(error));
8b61709d
BP
699 goto error;
700 }
701
702 /* Between the socket() and bind() calls above, the socket receives all
703 * packets of the requested type on all system interfaces. We do not
704 * want to receive that data, but there is no way to avoid it. So we
705 * must now drain out the receive queue. */
5b7448ed 706 error = drain_rcvbuf(netdev->fd);
8b61709d
BP
707 if (error) {
708 goto error;
709 }
710 }
711
712 *netdevp = &netdev->netdev;
713 return 0;
714
715error:
149f577a 716 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
717 return error;
718}
719
720/* Closes and destroys 'netdev'. */
721static void
722netdev_linux_close(struct netdev *netdev_)
723{
724 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
725
49a6a163 726 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 727 close(netdev->fd);
8b61709d
BP
728 }
729 free(netdev);
730}
e9e28be3 731
8b61709d
BP
/* Adds the names of all known network devices, as reported by
 * if_nameindex(), to 'svec'.
 *
 * Returns 0 if successful, otherwise a positive errno value, in which case
 * 'svec' is left unchanged. */
static int
netdev_linux_enumerate(struct svec *svec)
{
    struct if_nameindex *names;
    size_t i;

    names = if_nameindex();
    if (!names) {
        /* Capture errno before logging, which may overwrite it. */
        int error = errno;

        VLOG_WARN("could not obtain list of network device names: %s",
                  strerror(error));
        return error;
    }

    for (i = 0; names[i].if_name != NULL; i++) {
        svec_add(svec, names[i].if_name);
    }
    if_freenameindex(names);
    return 0;
}
753
754static int
755netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
756{
757 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
758
5b7448ed 759 if (netdev->fd < 0) {
8b61709d 760 /* Device was opened with NETDEV_ETH_TYPE_NONE. */
c0e5f6ca 761 return -EAGAIN;
8b61709d
BP
762 }
763
764 for (;;) {
5b7448ed 765 ssize_t retval = read(netdev->fd, data, size);
8b61709d
BP
766 if (retval >= 0) {
767 return retval;
768 } else if (errno != EINTR) {
769 if (errno != EAGAIN) {
770 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
771 strerror(errno), netdev_get_name(netdev_));
772 }
c0e5f6ca 773 return -errno;
8b61709d
BP
774 }
775 }
776}
777
778/* Registers with the poll loop to wake up from the next call to poll_block()
779 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
780static void
781netdev_linux_recv_wait(struct netdev *netdev_)
782{
783 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
784 if (netdev->fd >= 0) {
785 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
786 }
787}
788
789/* Discards all packets waiting to be received from 'netdev'. */
790static int
791netdev_linux_drain(struct netdev *netdev_)
792{
793 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 794 if (netdev->fd < 0) {
8b61709d 795 return 0;
5b7448ed 796 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 797 struct ifreq ifr;
149f577a 798 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
799 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
800 if (error) {
801 return error;
802 }
5b7448ed 803 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
804 return 0;
805 } else {
5b7448ed 806 return drain_rcvbuf(netdev->fd);
8b61709d
BP
807 }
808}
809
810/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
811 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
812 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
813 * the packet is too big or too small to transmit on the device.
814 *
815 * The caller retains ownership of 'buffer' in all cases.
816 *
817 * The kernel maintains a packet transmission queue, so the caller is not
818 * expected to do additional queuing of packets. */
819static int
820netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
821{
822 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
823
824 /* XXX should support sending even if 'ethertype' was NETDEV_ETH_TYPE_NONE.
825 */
5b7448ed 826 if (netdev->fd < 0) {
8b61709d
BP
827 return EPIPE;
828 }
829
830 for (;;) {
5b7448ed 831 ssize_t retval = write(netdev->fd, data, size);
8b61709d
BP
832 if (retval < 0) {
833 /* The Linux AF_PACKET implementation never blocks waiting for room
834 * for packets, instead returning ENOBUFS. Translate this into
835 * EAGAIN for the caller. */
836 if (errno == ENOBUFS) {
837 return EAGAIN;
838 } else if (errno == EINTR) {
839 continue;
840 } else if (errno != EAGAIN) {
841 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
842 netdev_get_name(netdev_), strerror(errno));
843 }
844 return errno;
845 } else if (retval != size) {
846 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
847 "%zu) on %s", retval, size, netdev_get_name(netdev_));
848 return EMSGSIZE;
849 } else {
850 return 0;
851 }
852 }
853}
854
855/* Registers with the poll loop to wake up from the next call to poll_block()
856 * when the packet transmission queue has sufficient room to transmit a packet
857 * with netdev_send().
858 *
859 * The kernel maintains a packet transmission queue, so the client is not
860 * expected to do additional queuing of packets. Thus, this function is
861 * unlikely to ever be used. It is included for completeness. */
862static void
863netdev_linux_send_wait(struct netdev *netdev_)
864{
865 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 866 if (netdev->fd < 0) {
8b61709d 867 /* Nothing to do. */
5b7448ed
JG
868 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
869 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
870 } else {
871 /* TAP device always accepts packets.*/
872 poll_immediate_wake();
873 }
874}
875
876/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
877 * otherwise a positive errno value. */
878static int
879netdev_linux_set_etheraddr(struct netdev *netdev_,
880 const uint8_t mac[ETH_ADDR_LEN])
881{
149f577a
JG
882 struct netdev_dev_linux *netdev_dev =
883 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e
BP
884 int error;
885
149f577a
JG
886 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
887 || !eth_addr_equals(netdev_dev->etheraddr, mac)) {
eb395f2e
BP
888 error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
889 if (!error) {
149f577a
JG
890 netdev_dev->cache_valid |= VALID_ETHERADDR;
891 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e
BP
892 }
893 } else {
894 error = 0;
8b61709d
BP
895 }
896 return error;
897}
898
899/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
900 * free the returned buffer. */
901static int
902netdev_linux_get_etheraddr(const struct netdev *netdev_,
903 uint8_t mac[ETH_ADDR_LEN])
904{
149f577a
JG
905 struct netdev_dev_linux *netdev_dev =
906 netdev_dev_linux_cast(netdev_get_dev(netdev_));
907 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 908 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 909 netdev_dev->etheraddr);
8b61709d
BP
910 if (error) {
911 return error;
912 }
149f577a 913 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 914 }
149f577a 915 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
8b61709d
BP
916 return 0;
917}
918
919/* Returns the maximum size of transmitted (and received) packets on 'netdev',
920 * in bytes, not including the hardware header; thus, this is typically 1500
921 * bytes for Ethernet devices. */
922static int
923netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
924{
149f577a
JG
925 struct netdev_dev_linux *netdev_dev =
926 netdev_dev_linux_cast(netdev_get_dev(netdev_));
927 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
928 struct ifreq ifr;
929 int error;
930
149f577a
JG
931 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
932 SIOCGIFMTU, "SIOCGIFMTU");
8b61709d
BP
933 if (error) {
934 return error;
935 }
149f577a
JG
936 netdev_dev->mtu = ifr.ifr_mtu;
937 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 938 }
149f577a 939 *mtup = netdev_dev->mtu;
8b61709d
BP
940 return 0;
941}
942
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev'.  On success returns the ifindex obtained
 * from get_ifindex(); on failure returns the negated error value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
953
8b61709d
BP
954static int
955netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
956{
149f577a
JG
957 struct netdev_dev_linux *netdev_dev =
958 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
959 int error = 0;
960 char *fn = NULL;
961 int fd = -1;
962
149f577a 963 if (!(netdev_dev->cache_valid & VALID_CARRIER)) {
8b61709d
BP
964 char line[8];
965 int retval;
966
149f577a
JG
967 fn = xasprintf("/sys/class/net/%s/carrier",
968 netdev_get_name(netdev_));
8b61709d
BP
969 fd = open(fn, O_RDONLY);
970 if (fd < 0) {
971 error = errno;
972 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
973 goto exit;
974 }
975
976 retval = read(fd, line, sizeof line);
977 if (retval < 0) {
978 error = errno;
979 if (error == EINVAL) {
980 /* This is the normal return value when we try to check carrier
981 * if the network device is not up. */
982 } else {
983 VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
984 }
985 goto exit;
986 } else if (retval == 0) {
987 error = EPROTO;
988 VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
989 goto exit;
990 }
991
992 if (line[0] != '0' && line[0] != '1') {
993 error = EPROTO;
994 VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
995 fn, line[0]);
996 goto exit;
997 }
149f577a
JG
998 netdev_dev->carrier = line[0] != '0';
999 netdev_dev->cache_valid |= VALID_CARRIER;
8b61709d 1000 }
149f577a 1001 *carrier = netdev_dev->carrier;
8b61709d
BP
1002 error = 0;
1003
1004exit:
1005 if (fd >= 0) {
1006 close(fd);
1007 }
1008 free(fn);
1009 return error;
1010}
1011
63331829 1012static int
782e6111
EJ
1013netdev_linux_do_miimon(const struct netdev *netdev, int cmd,
1014 const char *cmd_name, struct mii_ioctl_data *data)
63331829 1015{
63331829 1016 struct ifreq ifr;
782e6111 1017 int error;
63331829 1018
63331829 1019 memset(&ifr, 0, sizeof ifr);
782e6111
EJ
1020 memcpy(&ifr.ifr_data, data, sizeof *data);
1021 error = netdev_linux_do_ioctl(netdev_get_name(netdev),
1022 &ifr, cmd, cmd_name);
1023 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1024
782e6111
EJ
1025 return error;
1026}
1027
1028static int
1029netdev_linux_get_miimon(const struct netdev *netdev, bool *miimon)
1030{
1031 const char *name = netdev_get_name(netdev);
1032 struct mii_ioctl_data data;
1033 int error;
63331829 1034
782e6111
EJ
1035 *miimon = false;
1036
1037 memset(&data, 0, sizeof data);
1038 error = netdev_linux_do_miimon(netdev, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1039 if (!error) {
1040 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1041 data.reg_num = MII_BMSR;
1042 error = netdev_linux_do_miimon(netdev, SIOCGMIIREG, "SIOCGMIIREG",
1043 &data);
63331829
EJ
1044
1045 if (!error) {
782e6111 1046 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1047 } else {
1048 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1049 }
1050 } else {
1051 struct ethtool_cmd ecmd;
63331829
EJ
1052
1053 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1054 name);
1055
1056 memset(&ecmd, 0, sizeof ecmd);
1057 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1058 "ETHTOOL_GLINK");
1059 if (!error) {
782e6111
EJ
1060 struct ethtool_value eval;
1061
1062 memcpy(&eval, &ecmd, sizeof eval);
1063 *miimon = !!eval.data;
63331829
EJ
1064 } else {
1065 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1066 }
1067 }
1068
1069 return error;
1070}
1071
8b61709d
BP
1072/* Check whether we can we use RTM_GETLINK to get network device statistics.
1073 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1074 * enabled. */
1075static bool
1076check_for_working_netlink_stats(void)
1077{
1078 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1079 * preferable, so if that works, we'll use it. */
1080 int ifindex = do_get_ifindex("lo");
1081 if (ifindex < 0) {
1082 VLOG_WARN("failed to get ifindex for lo, "
1083 "obtaining netdev stats from proc");
1084 return false;
1085 } else {
1086 struct netdev_stats stats;
1087 int error = get_stats_via_netlink(ifindex, &stats);
1088 if (!error) {
1089 VLOG_DBG("obtaining netdev stats via rtnetlink");
1090 return true;
1091 } else {
1092 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1093 "via proc (you are probably running a pre-2.6.19 "
1094 "kernel)", strerror(error));
1095 return false;
1096 }
1097 }
1098}
1099
8722022c
BP
1100/* Brings the 'is_internal' and 'is_tap' members of 'netdev_dev' up-to-date. */
1101static void
1102netdev_linux_update_is_pseudo(struct netdev_dev_linux *netdev_dev)
1103{
1104 if (!(netdev_dev->cache_valid & VALID_IS_PSEUDO)) {
1105 const char *name = netdev_dev_get_name(&netdev_dev->netdev_dev);
1106 const char *type = netdev_dev_get_type(&netdev_dev->netdev_dev);
d295e8e9 1107
8722022c 1108 netdev_dev->is_tap = !strcmp(type, "tap");
9fe3b9a2
BP
1109 netdev_dev->is_internal = (!netdev_dev->is_tap
1110 && dpif_linux_is_internal_device(name));
8722022c
BP
1111 netdev_dev->cache_valid |= VALID_IS_PSEUDO;
1112 }
1113}
1114
92df599c
JG
/* Exchanges the values of '*a' and '*b'.
 *
 * A temporary is used rather than the XOR trick so that the function is also
 * correct when 'a' and 'b' point to the same object (XOR swapping an object
 * with itself would zero it). */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
1122
7fbef77a 1123/* Retrieves current device stats for 'netdev'. */
8b61709d 1124static int
149f577a
JG
1125netdev_linux_get_stats(const struct netdev *netdev_,
1126 struct netdev_stats *stats)
8b61709d 1127{
149f577a
JG
1128 struct netdev_dev_linux *netdev_dev =
1129 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1130 static int use_netlink_stats = -1;
1131 int error;
1132
7fbef77a
JG
1133 if (netdev_dev->have_vport_stats ||
1134 !(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1135
1136 error = netdev_vport_get_stats(netdev_, stats);
1137 netdev_dev->have_vport_stats = !error;
1138 netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
8b61709d 1139 }
8b61709d 1140
7fbef77a
JG
1141 if (!netdev_dev->have_vport_stats) {
1142 if (use_netlink_stats < 0) {
1143 use_netlink_stats = check_for_working_netlink_stats();
1144 }
1145 if (use_netlink_stats) {
1146 int ifindex;
1147
1148 error = get_ifindex(netdev_, &ifindex);
1149 if (!error) {
1150 error = get_stats_via_netlink(ifindex, stats);
1151 }
1152 } else {
1153 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
8b61709d 1154 }
8b61709d 1155 }
fe6b0e03
JG
1156
1157 /* If this port is an internal port then the transmit and receive stats
1158 * will appear to be swapped relative to the other ports since we are the
1159 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1160 * them back here. This does not apply if we are getting stats from the
1161 * vport layer because it always tracks stats from the perspective of the
1162 * switch. */
92df599c 1163 netdev_linux_update_is_pseudo(netdev_dev);
7fbef77a
JG
1164 if (!error && !netdev_dev->have_vport_stats &&
1165 (netdev_dev->is_internal || netdev_dev->is_tap)) {
92df599c
JG
1166 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1167 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1168 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1169 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1170 stats->rx_length_errors = 0;
1171 stats->rx_over_errors = 0;
1172 stats->rx_crc_errors = 0;
1173 stats->rx_frame_errors = 0;
1174 stats->rx_fifo_errors = 0;
1175 stats->rx_missed_errors = 0;
1176 stats->tx_aborted_errors = 0;
1177 stats->tx_carrier_errors = 0;
1178 stats->tx_fifo_errors = 0;
1179 stats->tx_heartbeat_errors = 0;
1180 stats->tx_window_errors = 0;
1181 }
1182
8b61709d
BP
1183 return error;
1184}
1185
1186/* Stores the features supported by 'netdev' into each of '*current',
1187 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1188 * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
7671589a 1189 * successful, otherwise a positive errno value. */
8b61709d 1190static int
6f2f5cce 1191netdev_linux_get_features(const struct netdev *netdev,
8b61709d
BP
1192 uint32_t *current, uint32_t *advertised,
1193 uint32_t *supported, uint32_t *peer)
1194{
1195 struct ethtool_cmd ecmd;
1196 int error;
1197
1198 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1199 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1200 ETHTOOL_GSET, "ETHTOOL_GSET");
1201 if (error) {
1202 return error;
1203 }
1204
1205 /* Supported features. */
1206 *supported = 0;
1207 if (ecmd.supported & SUPPORTED_10baseT_Half) {
1208 *supported |= OFPPF_10MB_HD;
1209 }
1210 if (ecmd.supported & SUPPORTED_10baseT_Full) {
1211 *supported |= OFPPF_10MB_FD;
1212 }
1213 if (ecmd.supported & SUPPORTED_100baseT_Half) {
1214 *supported |= OFPPF_100MB_HD;
1215 }
1216 if (ecmd.supported & SUPPORTED_100baseT_Full) {
1217 *supported |= OFPPF_100MB_FD;
1218 }
1219 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1220 *supported |= OFPPF_1GB_HD;
1221 }
1222 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1223 *supported |= OFPPF_1GB_FD;
1224 }
1225 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1226 *supported |= OFPPF_10GB_FD;
1227 }
1228 if (ecmd.supported & SUPPORTED_TP) {
1229 *supported |= OFPPF_COPPER;
1230 }
1231 if (ecmd.supported & SUPPORTED_FIBRE) {
1232 *supported |= OFPPF_FIBER;
1233 }
1234 if (ecmd.supported & SUPPORTED_Autoneg) {
1235 *supported |= OFPPF_AUTONEG;
1236 }
1237 if (ecmd.supported & SUPPORTED_Pause) {
1238 *supported |= OFPPF_PAUSE;
1239 }
1240 if (ecmd.supported & SUPPORTED_Asym_Pause) {
1241 *supported |= OFPPF_PAUSE_ASYM;
1242 }
1243
1244 /* Advertised features. */
1245 *advertised = 0;
1246 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1247 *advertised |= OFPPF_10MB_HD;
1248 }
1249 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1250 *advertised |= OFPPF_10MB_FD;
1251 }
1252 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1253 *advertised |= OFPPF_100MB_HD;
1254 }
1255 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1256 *advertised |= OFPPF_100MB_FD;
1257 }
1258 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1259 *advertised |= OFPPF_1GB_HD;
1260 }
1261 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1262 *advertised |= OFPPF_1GB_FD;
1263 }
1264 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1265 *advertised |= OFPPF_10GB_FD;
1266 }
1267 if (ecmd.advertising & ADVERTISED_TP) {
1268 *advertised |= OFPPF_COPPER;
1269 }
1270 if (ecmd.advertising & ADVERTISED_FIBRE) {
1271 *advertised |= OFPPF_FIBER;
1272 }
1273 if (ecmd.advertising & ADVERTISED_Autoneg) {
1274 *advertised |= OFPPF_AUTONEG;
1275 }
1276 if (ecmd.advertising & ADVERTISED_Pause) {
1277 *advertised |= OFPPF_PAUSE;
1278 }
1279 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1280 *advertised |= OFPPF_PAUSE_ASYM;
1281 }
1282
1283 /* Current settings. */
1284 if (ecmd.speed == SPEED_10) {
1285 *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1286 } else if (ecmd.speed == SPEED_100) {
1287 *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1288 } else if (ecmd.speed == SPEED_1000) {
1289 *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1290 } else if (ecmd.speed == SPEED_10000) {
1291 *current = OFPPF_10GB_FD;
1292 } else {
1293 *current = 0;
1294 }
1295
1296 if (ecmd.port == PORT_TP) {
1297 *current |= OFPPF_COPPER;
1298 } else if (ecmd.port == PORT_FIBRE) {
1299 *current |= OFPPF_FIBER;
1300 }
1301
1302 if (ecmd.autoneg) {
1303 *current |= OFPPF_AUTONEG;
1304 }
1305
1306 /* Peer advertisements. */
1307 *peer = 0; /* XXX */
1308
1309 return 0;
1310}
1311
1312/* Set the features advertised by 'netdev' to 'advertise'. */
1313static int
1314netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1315{
1316 struct ethtool_cmd ecmd;
1317 int error;
1318
1319 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1320 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1321 ETHTOOL_GSET, "ETHTOOL_GSET");
1322 if (error) {
1323 return error;
1324 }
1325
1326 ecmd.advertising = 0;
1327 if (advertise & OFPPF_10MB_HD) {
1328 ecmd.advertising |= ADVERTISED_10baseT_Half;
1329 }
1330 if (advertise & OFPPF_10MB_FD) {
1331 ecmd.advertising |= ADVERTISED_10baseT_Full;
1332 }
1333 if (advertise & OFPPF_100MB_HD) {
1334 ecmd.advertising |= ADVERTISED_100baseT_Half;
1335 }
1336 if (advertise & OFPPF_100MB_FD) {
1337 ecmd.advertising |= ADVERTISED_100baseT_Full;
1338 }
1339 if (advertise & OFPPF_1GB_HD) {
1340 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1341 }
1342 if (advertise & OFPPF_1GB_FD) {
1343 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1344 }
1345 if (advertise & OFPPF_10GB_FD) {
1346 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1347 }
1348 if (advertise & OFPPF_COPPER) {
1349 ecmd.advertising |= ADVERTISED_TP;
1350 }
1351 if (advertise & OFPPF_FIBER) {
1352 ecmd.advertising |= ADVERTISED_FIBRE;
1353 }
1354 if (advertise & OFPPF_AUTONEG) {
1355 ecmd.advertising |= ADVERTISED_Autoneg;
1356 }
1357 if (advertise & OFPPF_PAUSE) {
1358 ecmd.advertising |= ADVERTISED_Pause;
1359 }
1360 if (advertise & OFPPF_PAUSE_ASYM) {
1361 ecmd.advertising |= ADVERTISED_Asym_Pause;
1362 }
0b0544d7 1363 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1364 ETHTOOL_SSET, "ETHTOOL_SSET");
1365}
1366
1367/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
1368 * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
1369 * and returns 0. Otherwise returns a errno value (specifically ENOENT if
1370 * 'netdev_name' is the name of a network device that is not a VLAN device) and
1371 * sets '*vlan_vid' to -1. */
1372static int
1373netdev_linux_get_vlan_vid(const struct netdev *netdev, int *vlan_vid)
1374{
1375 const char *netdev_name = netdev_get_name(netdev);
1376 struct ds line = DS_EMPTY_INITIALIZER;
1377 FILE *stream = NULL;
1378 int error;
1379 char *fn;
1380
1381 COVERAGE_INC(netdev_get_vlan_vid);
1382 fn = xasprintf("/proc/net/vlan/%s", netdev_name);
1383 stream = fopen(fn, "r");
1384 if (!stream) {
1385 error = errno;
1386 goto done;
1387 }
1388
1389 if (ds_get_line(&line, stream)) {
1390 if (ferror(stream)) {
1391 error = errno;
1392 VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno));
1393 } else {
1394 error = EPROTO;
1395 VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
1396 }
1397 goto done;
1398 }
1399
1400 if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) {
1401 error = EPROTO;
1402 VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
1403 fn, ds_cstr(&line));
1404 goto done;
1405 }
1406
1407 error = 0;
1408
1409done:
1410 free(fn);
1411 if (stream) {
1412 fclose(stream);
1413 }
1414 ds_destroy(&line);
1415 if (error) {
1416 *vlan_vid = -1;
1417 }
1418 return error;
1419}
1420
/* Shell command templates used to configure ingress policing with tc(8).
 *
 * POLICE_ADD_CMD takes the device name.  POLICE_CONFIG_CMD takes the device
 * name followed by the rate and burst as uint32_t values, hence the PRIu32
 * conversions: "%d" would print values of 2^31 or more as negative numbers. */
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
8b61709d 1423
8e460221 1424/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
6f42c8ea
BP
1425 * positive errno value.
1426 *
1427 * This function is equivalent to running
1428 * /sbin/tc qdisc del dev %s handle ffff: ingress
1429 * but it is much, much faster.
1430 */
8e460221
BP
1431static int
1432netdev_linux_remove_policing(struct netdev *netdev)
1433{
80a86fbe
BP
1434 struct netdev_dev_linux *netdev_dev =
1435 netdev_dev_linux_cast(netdev_get_dev(netdev));
8e460221 1436 const char *netdev_name = netdev_get_name(netdev);
8e460221 1437
6f42c8ea 1438 struct ofpbuf request;
6f42c8ea 1439 struct tcmsg *tcmsg;
6f42c8ea
BP
1440 int error;
1441
c1c9c9c4 1442 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
1443 if (!tcmsg) {
1444 return ENODEV;
1445 }
c1c9c9c4 1446 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
6f42c8ea
BP
1447 tcmsg->tcm_parent = TC_H_INGRESS;
1448 nl_msg_put_string(&request, TCA_KIND, "ingress");
1449 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
c1c9c9c4
BP
1450
1451 error = tc_transact(&request, NULL);
4d10512c 1452 if (error && error != ENOENT && error != EINVAL) {
6f42c8ea
BP
1453 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1454 netdev_name, strerror(error));
1455 return error;
1456 }
1457
80a86fbe
BP
1458 netdev_dev->kbits_rate = 0;
1459 netdev_dev->kbits_burst = 0;
1460 netdev_dev->cache_valid |= VALID_POLICING;
8e460221
BP
1461 return 0;
1462}
1463
8b61709d
BP
1464/* Attempts to set input rate limiting (policing) policy. */
1465static int
1466netdev_linux_set_policing(struct netdev *netdev,
1467 uint32_t kbits_rate, uint32_t kbits_burst)
1468{
80a86fbe
BP
1469 struct netdev_dev_linux *netdev_dev =
1470 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d
BP
1471 const char *netdev_name = netdev_get_name(netdev);
1472 char command[1024];
1473
1474 COVERAGE_INC(netdev_set_policing);
8e460221 1475
80a86fbe
BP
1476 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1477 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1478 : kbits_burst); /* Stick with user-specified value. */
1479
1480 if (netdev_dev->cache_valid & VALID_POLICING
1481 && netdev_dev->kbits_rate == kbits_rate
1482 && netdev_dev->kbits_burst == kbits_burst) {
1483 /* Assume that settings haven't changed since we last set them. */
1484 return 0;
1485 }
1486
8e460221 1487 netdev_linux_remove_policing(netdev);
8b61709d 1488 if (kbits_rate) {
8b61709d
BP
1489 snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1490 if (system(command) != 0) {
1491 VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1492 return -1;
1493 }
1494
1495 snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1496 kbits_rate, kbits_burst);
1497 if (system(command) != 0) {
1498 VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1499 netdev_name);
1500 return -1;
1501 }
80a86fbe
BP
1502
1503 netdev_dev->kbits_rate = kbits_rate;
1504 netdev_dev->kbits_burst = kbits_burst;
1505 netdev_dev->cache_valid |= VALID_POLICING;
8b61709d
BP
1506 }
1507
1508 return 0;
1509}
1510
c1c9c9c4
BP
1511static int
1512netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
1513 struct svec *types)
1514{
1515 const struct tc_ops **opsp;
1516
1517 for (opsp = tcs; *opsp != NULL; opsp++) {
1518 const struct tc_ops *ops = *opsp;
1519 if (ops->tc_install && ops->ovs_name[0] != '\0') {
1520 svec_add(types, ops->ovs_name);
1521 }
1522 }
1523 return 0;
1524}
1525
1526static const struct tc_ops *
1527tc_lookup_ovs_name(const char *name)
1528{
1529 const struct tc_ops **opsp;
1530
1531 for (opsp = tcs; *opsp != NULL; opsp++) {
1532 const struct tc_ops *ops = *opsp;
1533 if (!strcmp(name, ops->ovs_name)) {
1534 return ops;
1535 }
1536 }
1537 return NULL;
1538}
1539
1540static const struct tc_ops *
1541tc_lookup_linux_name(const char *name)
1542{
1543 const struct tc_ops **opsp;
1544
1545 for (opsp = tcs; *opsp != NULL; opsp++) {
1546 const struct tc_ops *ops = *opsp;
1547 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1548 return ops;
1549 }
1550 }
1551 return NULL;
1552}
1553
93b13be8
BP
1554static struct tc_queue *
1555tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1556 size_t hash)
1557{
1558 struct netdev_dev_linux *netdev_dev =
1559 netdev_dev_linux_cast(netdev_get_dev(netdev));
1560 struct tc_queue *queue;
1561
4e8e4213 1562 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1563 if (queue->queue_id == queue_id) {
1564 return queue;
1565 }
1566 }
1567 return NULL;
1568}
1569
/* Returns the queue of 'netdev' with ID 'queue_id', or NULL if there is none.
 * The hash passed to tc_find_queue__() is hash_int() of 'queue_id' with
 * basis 0. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1575
c1c9c9c4
BP
1576static int
1577netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1578 const char *type,
1579 struct netdev_qos_capabilities *caps)
1580{
1581 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1582 if (!ops) {
1583 return EOPNOTSUPP;
1584 }
1585 caps->n_queues = ops->n_queues;
1586 return 0;
1587}
1588
1589static int
1590netdev_linux_get_qos(const struct netdev *netdev,
1591 const char **typep, struct shash *details)
1592{
1593 struct netdev_dev_linux *netdev_dev =
1594 netdev_dev_linux_cast(netdev_get_dev(netdev));
1595 int error;
1596
1597 error = tc_query_qdisc(netdev);
1598 if (error) {
1599 return error;
1600 }
1601
1602 *typep = netdev_dev->tc->ops->ovs_name;
1603 return (netdev_dev->tc->ops->qdisc_get
1604 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1605 : 0);
1606}
1607
1608static int
1609netdev_linux_set_qos(struct netdev *netdev,
1610 const char *type, const struct shash *details)
1611{
1612 struct netdev_dev_linux *netdev_dev =
1613 netdev_dev_linux_cast(netdev_get_dev(netdev));
1614 const struct tc_ops *new_ops;
1615 int error;
1616
1617 new_ops = tc_lookup_ovs_name(type);
1618 if (!new_ops || !new_ops->tc_install) {
1619 return EOPNOTSUPP;
1620 }
1621
1622 error = tc_query_qdisc(netdev);
1623 if (error) {
1624 return error;
1625 }
1626
1627 if (new_ops == netdev_dev->tc->ops) {
1628 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1629 } else {
1630 /* Delete existing qdisc. */
1631 error = tc_del_qdisc(netdev);
1632 if (error) {
1633 return error;
1634 }
1635 assert(netdev_dev->tc == NULL);
1636
1637 /* Install new qdisc. */
1638 error = new_ops->tc_install(netdev, details);
1639 assert((error == 0) == (netdev_dev->tc != NULL));
1640
1641 return error;
1642 }
1643}
1644
1645static int
1646netdev_linux_get_queue(const struct netdev *netdev,
1647 unsigned int queue_id, struct shash *details)
1648{
1649 struct netdev_dev_linux *netdev_dev =
1650 netdev_dev_linux_cast(netdev_get_dev(netdev));
1651 int error;
1652
1653 error = tc_query_qdisc(netdev);
1654 if (error) {
1655 return error;
93b13be8
BP
1656 } else {
1657 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1658 return (queue
1659 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1660 : ENOENT);
c1c9c9c4 1661 }
c1c9c9c4
BP
1662}
1663
1664static int
1665netdev_linux_set_queue(struct netdev *netdev,
1666 unsigned int queue_id, const struct shash *details)
1667{
1668 struct netdev_dev_linux *netdev_dev =
1669 netdev_dev_linux_cast(netdev_get_dev(netdev));
1670 int error;
1671
1672 error = tc_query_qdisc(netdev);
1673 if (error) {
1674 return error;
1675 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1676 || !netdev_dev->tc->ops->class_set) {
1677 return EINVAL;
1678 }
1679
1680 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1681}
1682
1683static int
1684netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1685{
1686 struct netdev_dev_linux *netdev_dev =
1687 netdev_dev_linux_cast(netdev_get_dev(netdev));
1688 int error;
1689
1690 error = tc_query_qdisc(netdev);
1691 if (error) {
1692 return error;
1693 } else if (!netdev_dev->tc->ops->class_delete) {
1694 return EINVAL;
93b13be8
BP
1695 } else {
1696 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1697 return (queue
1698 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1699 : ENOENT);
c1c9c9c4 1700 }
c1c9c9c4
BP
1701}
1702
1703static int
1704netdev_linux_get_queue_stats(const struct netdev *netdev,
1705 unsigned int queue_id,
1706 struct netdev_queue_stats *stats)
1707{
1708 struct netdev_dev_linux *netdev_dev =
1709 netdev_dev_linux_cast(netdev_get_dev(netdev));
1710 int error;
1711
1712 error = tc_query_qdisc(netdev);
1713 if (error) {
1714 return error;
c1c9c9c4
BP
1715 } else if (!netdev_dev->tc->ops->class_get_stats) {
1716 return EOPNOTSUPP;
93b13be8
BP
1717 } else {
1718 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1719 return (queue
1720 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1721 : ENOENT);
c1c9c9c4 1722 }
c1c9c9c4
BP
1723}
1724
23a98ffe 1725static bool
c1c9c9c4
BP
1726start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1727{
1728 struct ofpbuf request;
1729 struct tcmsg *tcmsg;
1730
1731 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
1732 if (!tcmsg) {
1733 return false;
1734 }
3c4de644 1735 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
1736 nl_dump_start(dump, rtnl_sock, &request);
1737 ofpbuf_uninit(&request);
23a98ffe 1738 return true;
c1c9c9c4
BP
1739}
1740
1741static int
1742netdev_linux_dump_queues(const struct netdev *netdev,
1743 netdev_dump_queues_cb *cb, void *aux)
1744{
1745 struct netdev_dev_linux *netdev_dev =
1746 netdev_dev_linux_cast(netdev_get_dev(netdev));
93b13be8 1747 struct tc_queue *queue;
c1c9c9c4
BP
1748 struct shash details;
1749 int last_error;
c1c9c9c4
BP
1750 int error;
1751
1752 error = tc_query_qdisc(netdev);
1753 if (error) {
1754 return error;
1755 } else if (!netdev_dev->tc->ops->class_get) {
1756 return EOPNOTSUPP;
1757 }
1758
1759 last_error = 0;
1760 shash_init(&details);
4e8e4213 1761 HMAP_FOR_EACH (queue, hmap_node, &netdev_dev->tc->queues) {
c1c9c9c4
BP
1762 shash_clear(&details);
1763
93b13be8 1764 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 1765 if (!error) {
93b13be8 1766 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
1767 } else {
1768 last_error = error;
1769 }
1770 }
1771 shash_destroy(&details);
1772
1773 return last_error;
1774}
1775
1776static int
1777netdev_linux_dump_queue_stats(const struct netdev *netdev,
1778 netdev_dump_queue_stats_cb *cb, void *aux)
1779{
1780 struct netdev_dev_linux *netdev_dev =
1781 netdev_dev_linux_cast(netdev_get_dev(netdev));
1782 struct nl_dump dump;
1783 struct ofpbuf msg;
1784 int last_error;
1785 int error;
1786
1787 error = tc_query_qdisc(netdev);
1788 if (error) {
1789 return error;
1790 } else if (!netdev_dev->tc->ops->class_dump_stats) {
1791 return EOPNOTSUPP;
1792 }
1793
1794 last_error = 0;
23a98ffe
BP
1795 if (!start_queue_dump(netdev, &dump)) {
1796 return ENODEV;
1797 }
c1c9c9c4
BP
1798 while (nl_dump_next(&dump, &msg)) {
1799 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1800 if (error) {
1801 last_error = error;
1802 }
1803 }
1804
1805 error = nl_dump_done(&dump);
1806 return error ? error : last_error;
1807}
1808
8b61709d 1809static int
f1acd62b
BP
1810netdev_linux_get_in4(const struct netdev *netdev_,
1811 struct in_addr *address, struct in_addr *netmask)
8b61709d 1812{
149f577a
JG
1813 struct netdev_dev_linux *netdev_dev =
1814 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1815
1816 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
1817 int error;
1818
149f577a 1819 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
1820 SIOCGIFADDR, "SIOCGIFADDR");
1821 if (error) {
1822 return error;
1823 }
1824
149f577a 1825 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
1826 SIOCGIFNETMASK, "SIOCGIFNETMASK");
1827 if (error) {
1828 return error;
1829 }
1830
149f577a 1831 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 1832 }
149f577a
JG
1833 *address = netdev_dev->address;
1834 *netmask = netdev_dev->netmask;
f1acd62b 1835 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
1836}
1837
8b61709d 1838static int
f1acd62b
BP
1839netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1840 struct in_addr netmask)
8b61709d 1841{
149f577a
JG
1842 struct netdev_dev_linux *netdev_dev =
1843 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
1844 int error;
1845
f1acd62b 1846 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 1847 if (!error) {
149f577a
JG
1848 netdev_dev->cache_valid |= VALID_IN4;
1849 netdev_dev->address = address;
1850 netdev_dev->netmask = netmask;
f1acd62b 1851 if (address.s_addr != INADDR_ANY) {
8b61709d 1852 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 1853 "SIOCSIFNETMASK", netmask);
8b61709d
BP
1854 }
1855 }
1856 return error;
1857}
1858
/* Parses 'line', in the format of one /proc/net/if_inet6 entry: 16 address
 * bytes written as two hex digits each, four further hex fields that are
 * ignored, and an interface name of at most 16 characters.
 *
 * Stores the address bytes in '*in6' and the name in 'ifname'.  Returns true
 * only if every field was parsed; on failure the outputs may have been
 * partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    const char *p = line;
    size_t i;

    /* Consume the address one byte (up to two hex digits) at a time. */
    for (i = 0; i < sizeof in6->s6_addr; i++) {
        int n_consumed = 0;

        if (sscanf(p, "%2"SCNx8"%n", &in6->s6_addr[i], &n_consumed) != 1) {
            return false;
        }
        p += n_consumed;
    }

    /* Skip the four hex fields after the address and read the name. */
    return sscanf(p, "%*x %*x %*x %*x %16s", ifname) == 1;
}
1874
1875/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
1876 * 'in6' is non-null) and returns true. Otherwise, returns false. */
1877static int
1878netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
1879{
149f577a
JG
1880 struct netdev_dev_linux *netdev_dev =
1881 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1882 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
1883 FILE *file;
1884 char line[128];
1885
149f577a 1886 netdev_dev->in6 = in6addr_any;
8b61709d
BP
1887
1888 file = fopen("/proc/net/if_inet6", "r");
1889 if (file != NULL) {
1890 const char *name = netdev_get_name(netdev_);
1891 while (fgets(line, sizeof line, file)) {
2a022368 1892 struct in6_addr in6_tmp;
8b61709d 1893 char ifname[16 + 1];
2a022368 1894 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
1895 && !strcmp(name, ifname))
1896 {
2a022368 1897 netdev_dev->in6 = in6_tmp;
8b61709d
BP
1898 break;
1899 }
1900 }
1901 fclose(file);
1902 }
149f577a 1903 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 1904 }
149f577a 1905 *in6 = netdev_dev->in6;
8b61709d
BP
1906 return 0;
1907}
1908
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 *
 * The address is built in a zeroed struct sockaddr_in and then copied over a
 * zeroed '*sa'. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
1921
1922static int
1923do_set_addr(struct netdev *netdev,
1924 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
1925{
1926 struct ifreq ifr;
71d7c22f 1927 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 1928 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
1929
1930 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
1931 ioctl_name);
8b61709d
BP
1932}
1933
1934/* Adds 'router' as a default IP gateway. */
1935static int
67a4917b 1936netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
1937{
1938 struct in_addr any = { INADDR_ANY };
1939 struct rtentry rt;
1940 int error;
1941
1942 memset(&rt, 0, sizeof rt);
1943 make_in4_sockaddr(&rt.rt_dst, any);
1944 make_in4_sockaddr(&rt.rt_gateway, router);
1945 make_in4_sockaddr(&rt.rt_genmask, any);
1946 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
1947 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
1948 if (error) {
1949 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
1950 }
1951 return error;
1952}
1953
f1acd62b
BP
1954static int
1955netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
1956 char **netdev_name)
1957{
1958 static const char fn[] = "/proc/net/route";
1959 FILE *stream;
1960 char line[256];
1961 int ln;
1962
1963 *netdev_name = NULL;
1964 stream = fopen(fn, "r");
1965 if (stream == NULL) {
1966 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
1967 return errno;
1968 }
1969
1970 ln = 0;
1971 while (fgets(line, sizeof line, stream)) {
1972 if (++ln >= 2) {
1973 char iface[17];
1974 uint32_t dest, gateway, mask;
1975 int refcnt, metric, mtu;
1976 unsigned int flags, use, window, irtt;
1977
1978 if (sscanf(line,
1979 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
1980 " %d %u %u\n",
1981 iface, &dest, &gateway, &flags, &refcnt,
1982 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
1983
d295e8e9 1984 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
1985 fn, ln, line);
1986 continue;
1987 }
1988 if (!(flags & RTF_UP)) {
1989 /* Skip routes that aren't up. */
1990 continue;
1991 }
1992
1993 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 1994 * network byte order, so we don't need need any endian
f1acd62b
BP
1995 * conversions here. */
1996 if ((dest & mask) == (host->s_addr & mask)) {
1997 if (!gateway) {
1998 /* The host is directly reachable. */
1999 next_hop->s_addr = 0;
2000 } else {
2001 /* To reach the host, we must go through a gateway. */
2002 next_hop->s_addr = gateway;
2003 }
2004 *netdev_name = xstrdup(iface);
2005 fclose(stream);
2006 return 0;
2007 }
2008 }
2009 }
2010
2011 fclose(stream);
2012 return ENXIO;
2013}
2014
e210037e
AE
/* Queries 'netdev' with ETHTOOL_GDRVINFO and, if the query succeeds, adds the
 * keys "driver_name", "driver_version", and "firmware_version" to 'sh', each
 * with an xstrdup()'d copy of the corresponding driver-info string.
 *
 * Returns the value returned by netdev_linux_do_ethtool(); 'sh' is left
 * untouched when that value is nonzero. */
static int
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
{
    struct ethtool_drvinfo info;
    int error;

    memset(&info, 0, sizeof info);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev),
                                    (struct ethtool_cmd *)&info,
                                    ETHTOOL_GDRVINFO,
                                    "ETHTOOL_GDRVINFO");
    if (error) {
        return error;
    }

    shash_add(sh, "driver_name", xstrdup(info.driver));
    shash_add(sh, "driver_version", xstrdup(info.version));
    shash_add(sh, "firmware_version", xstrdup(info.fw_version));
    return error;
}
2034
8b61709d
BP
2035/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2036 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2037 * returns 0. Otherwise, it returns a positive errno value; in particular,
2038 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2039static int
2040netdev_linux_arp_lookup(const struct netdev *netdev,
2041 uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
2042{
2043 struct arpreq r;
c100e025 2044 struct sockaddr_in sin;
8b61709d
BP
2045 int retval;
2046
2047 memset(&r, 0, sizeof r);
f2cc621b 2048 memset(&sin, 0, sizeof sin);
c100e025
BP
2049 sin.sin_family = AF_INET;
2050 sin.sin_addr.s_addr = ip;
2051 sin.sin_port = 0;
2052 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2053 r.arp_ha.sa_family = ARPHRD_ETHER;
2054 r.arp_flags = 0;
71d7c22f 2055 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2056 COVERAGE_INC(netdev_arp_lookup);
2057 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2058 if (!retval) {
2059 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2060 } else if (retval != ENXIO) {
2061 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
149f577a 2062 netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
8b61709d
BP
2063 }
2064 return retval;
2065}
2066
2067static int
2068nd_to_iff_flags(enum netdev_flags nd)
2069{
2070 int iff = 0;
2071 if (nd & NETDEV_UP) {
2072 iff |= IFF_UP;
2073 }
2074 if (nd & NETDEV_PROMISC) {
2075 iff |= IFF_PROMISC;
2076 }
2077 return iff;
2078}
2079
2080static int
2081iff_to_nd_flags(int iff)
2082{
2083 enum netdev_flags nd = 0;
2084 if (iff & IFF_UP) {
2085 nd |= NETDEV_UP;
2086 }
2087 if (iff & IFF_PROMISC) {
2088 nd |= NETDEV_PROMISC;
2089 }
2090 return nd;
2091}
2092
2093static int
2094netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2095 enum netdev_flags on, enum netdev_flags *old_flagsp)
2096{
2097 int old_flags, new_flags;
2098 int error;
2099
2100 error = get_flags(netdev, &old_flags);
2101 if (!error) {
2102 *old_flagsp = iff_to_nd_flags(old_flags);
2103 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2104 if (new_flags != old_flags) {
2105 error = set_flags(netdev, new_flags);
2106 }
2107 }
2108 return error;
2109}
2110
2111static void
2112poll_notify(struct list *list)
2113{
2114 struct netdev_linux_notifier *notifier;
4e8e4213 2115 LIST_FOR_EACH (notifier, node, list) {
8b61709d
BP
2116 struct netdev_notifier *n = &notifier->notifier;
2117 n->cb(n);
2118 }
2119}
2120
2121static void
21d6e22e 2122netdev_linux_poll_cb(const struct rtnetlink_link_change *change,
67a4917b 2123 void *aux OVS_UNUSED)
8b61709d
BP
2124{
2125 if (change) {
2126 struct list *list = shash_find_data(&netdev_linux_notifiers,
2127 change->ifname);
2128 if (list) {
2129 poll_notify(list);
2130 }
2131 } else {
2132 struct shash_node *node;
2133 SHASH_FOR_EACH (node, &netdev_linux_notifiers) {
2134 poll_notify(node->data);
2135 }
2136 }
2137}
2138
2139static int
2140netdev_linux_poll_add(struct netdev *netdev,
2141 void (*cb)(struct netdev_notifier *), void *aux,
2142 struct netdev_notifier **notifierp)
2143{
2144 const char *netdev_name = netdev_get_name(netdev);
2145 struct netdev_linux_notifier *notifier;
2146 struct list *list;
2147
2148 if (shash_is_empty(&netdev_linux_notifiers)) {
21d6e22e
EJ
2149 int error;
2150 error = rtnetlink_link_notifier_register(&netdev_linux_poll_notifier,
2151 netdev_linux_poll_cb, NULL);
8b61709d
BP
2152 if (error) {
2153 return error;
2154 }
2155 }
2156
2157 list = shash_find_data(&netdev_linux_notifiers, netdev_name);
2158 if (!list) {
2159 list = xmalloc(sizeof *list);
2160 list_init(list);
2161 shash_add(&netdev_linux_notifiers, netdev_name, list);
2162 }
2163
2164 notifier = xmalloc(sizeof *notifier);
2165 netdev_notifier_init(&notifier->notifier, netdev, cb, aux);
2166 list_push_back(list, &notifier->node);
2167 *notifierp = &notifier->notifier;
2168 return 0;
2169}
2170
2171static void
2172netdev_linux_poll_remove(struct netdev_notifier *notifier_)
2173{
2174 struct netdev_linux_notifier *notifier =
2175 CONTAINER_OF(notifier_, struct netdev_linux_notifier, notifier);
2176 struct list *list;
2177
2178 /* Remove 'notifier' from its list. */
2179 list = list_remove(&notifier->node);
2180 if (list_is_empty(list)) {
2181 /* The list is now empty. Remove it from the hash and free it. */
2182 const char *netdev_name = netdev_get_name(notifier->notifier.netdev);
2183 shash_delete(&netdev_linux_notifiers,
2184 shash_find(&netdev_linux_notifiers, netdev_name));
2185 free(list);
2186 }
2187 free(notifier);
2188
2189 /* If that was the last notifier, unregister. */
2190 if (shash_is_empty(&netdev_linux_notifiers)) {
21d6e22e 2191 rtnetlink_link_notifier_unregister(&netdev_linux_poll_notifier);
8b61709d
BP
2192 }
2193}
2194
c3827f61
BP
/* Expands to a braced initializer, used below for 'struct netdev_class'
 * objects.  Every entry is a netdev_linux_*() function or NULL except for the
 * four that the caller supplies: NAME, CREATE, ENUMERATE, and SET_STATS.
 *
 * The initializer is positional, so the order of entries must not change. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, ENUMERATE, SET_STATS)  \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* set_config */                \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    ENUMERATE,                                                  \
                                                                \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_miimon,                                    \
    netdev_linux_get_stats,                                     \
    SET_STATS,                                                  \
                                                                \
    netdev_linux_get_features,                                  \
    netdev_linux_set_advertisements,                            \
    netdev_linux_get_vlan_vid,                                  \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    netdev_linux_get_status,                                    \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_poll_add,                                      \
    netdev_linux_poll_remove                                    \
}
2257
2258const struct netdev_class netdev_linux_class =
2259 NETDEV_LINUX_CLASS(
2260 "system",
2261 netdev_linux_create,
2262 netdev_linux_enumerate,
98563392 2263 NULL); /* set_stats */
c3827f61
BP
2264
2265const struct netdev_class netdev_tap_class =
2266 NETDEV_LINUX_CLASS(
2267 "tap",
2268 netdev_linux_create_tap,
2269 NULL, /* enumerate */
2270 NULL); /* set_stats */
2271
2272const struct netdev_class netdev_internal_class =
2273 NETDEV_LINUX_CLASS(
2274 "internal",
2275 netdev_linux_create,
2276 NULL, /* enumerate */
2277 netdev_vport_set_stats);
8b61709d 2278\f
c1c9c9c4 2279/* HTB traffic control class. */
559843ed 2280
c1c9c9c4 2281#define HTB_N_QUEUES 0xf000
8b61709d 2282
c1c9c9c4
BP
2283struct htb {
2284 struct tc tc;
2285 unsigned int max_rate; /* In bytes/s. */
2286};
8b61709d 2287
c1c9c9c4 2288struct htb_class {
93b13be8 2289 struct tc_queue tc_queue;
c1c9c9c4
BP
2290 unsigned int min_rate; /* In bytes/s. */
2291 unsigned int max_rate; /* In bytes/s. */
2292 unsigned int burst; /* In bytes. */
2293 unsigned int priority; /* Lower values are higher priorities. */
2294};
8b61709d 2295
c1c9c9c4
BP
2296static struct htb *
2297htb_get__(const struct netdev *netdev)
2298{
2299 struct netdev_dev_linux *netdev_dev =
2300 netdev_dev_linux_cast(netdev_get_dev(netdev));
2301 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2302}
2303
2304static struct htb *
2305htb_install__(struct netdev *netdev, uint64_t max_rate)
2306{
2307 struct netdev_dev_linux *netdev_dev =
2308 netdev_dev_linux_cast(netdev_get_dev(netdev));
2309 struct htb *htb;
2310
2311 htb = xmalloc(sizeof *htb);
2312 tc_init(&htb->tc, &tc_ops_htb);
2313 htb->max_rate = max_rate;
2314
2315 netdev_dev->tc = &htb->tc;
2316
2317 return htb;
2318}
2319
2320/* Create an HTB qdisc.
2321 *
a339aa81 2322 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2323static int
2324htb_setup_qdisc__(struct netdev *netdev)
2325{
2326 size_t opt_offset;
2327 struct tc_htb_glob opt;
2328 struct ofpbuf request;
2329 struct tcmsg *tcmsg;
2330
2331 tc_del_qdisc(netdev);
2332
2333 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2334 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2335 if (!tcmsg) {
2336 return ENODEV;
2337 }
c1c9c9c4
BP
2338 tcmsg->tcm_handle = tc_make_handle(1, 0);
2339 tcmsg->tcm_parent = TC_H_ROOT;
2340
2341 nl_msg_put_string(&request, TCA_KIND, "htb");
2342
2343 memset(&opt, 0, sizeof opt);
2344 opt.rate2quantum = 10;
2345 opt.version = 3;
4ecf12d5 2346 opt.defcls = 1;
c1c9c9c4
BP
2347
2348 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2349 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2350 nl_msg_end_nested(&request, opt_offset);
2351
2352 return tc_transact(&request, NULL);
2353}
2354
2355/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2356 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2357static int
2358htb_setup_class__(struct netdev *netdev, unsigned int handle,
2359 unsigned int parent, struct htb_class *class)
2360{
2361 size_t opt_offset;
2362 struct tc_htb_opt opt;
2363 struct ofpbuf request;
2364 struct tcmsg *tcmsg;
2365 int error;
2366 int mtu;
2367
2368 netdev_get_mtu(netdev, &mtu);
f915f1a8
BP
2369 if (mtu == INT_MAX) {
2370 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2371 netdev_get_name(netdev));
2372 return EINVAL;
2373 }
c1c9c9c4
BP
2374
2375 memset(&opt, 0, sizeof opt);
2376 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2377 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2378 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2379 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2380 opt.prio = class->priority;
2381
2382 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2383 if (!tcmsg) {
2384 return ENODEV;
2385 }
c1c9c9c4
BP
2386 tcmsg->tcm_handle = handle;
2387 tcmsg->tcm_parent = parent;
2388
2389 nl_msg_put_string(&request, TCA_KIND, "htb");
2390 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2391 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2392 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2393 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2394 nl_msg_end_nested(&request, opt_offset);
2395
2396 error = tc_transact(&request, NULL);
2397 if (error) {
2398 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2399 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2400 netdev_get_name(netdev),
2401 tc_get_major(handle), tc_get_minor(handle),
2402 tc_get_major(parent), tc_get_minor(parent),
2403 class->min_rate, class->max_rate,
2404 class->burst, class->priority, strerror(error));
2405 }
2406 return error;
2407}
2408
2409/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2410 * description of them into 'details'. The description complies with the
2411 * specification given in the vswitch database documentation for linux-htb
2412 * queue details. */
2413static int
2414htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2415{
2416 static const struct nl_policy tca_htb_policy[] = {
2417 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2418 .min_len = sizeof(struct tc_htb_opt) },
2419 };
2420
2421 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2422 const struct tc_htb_opt *htb;
2423
2424 if (!nl_parse_nested(nl_options, tca_htb_policy,
2425 attrs, ARRAY_SIZE(tca_htb_policy))) {
2426 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2427 return EPROTO;
2428 }
2429
2430 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2431 class->min_rate = htb->rate.rate;
2432 class->max_rate = htb->ceil.rate;
2433 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2434 class->priority = htb->prio;
2435 return 0;
2436}
2437
2438static int
2439htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2440 struct htb_class *options,
2441 struct netdev_queue_stats *stats)
2442{
2443 struct nlattr *nl_options;
2444 unsigned int handle;
2445 int error;
2446
2447 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2448 if (!error && queue_id) {
17ee3c1f
BP
2449 unsigned int major = tc_get_major(handle);
2450 unsigned int minor = tc_get_minor(handle);
2451 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2452 *queue_id = minor - 1;
c1c9c9c4
BP
2453 } else {
2454 error = EPROTO;
2455 }
2456 }
2457 if (!error && options) {
2458 error = htb_parse_tca_options__(nl_options, options);
2459 }
2460 return error;
2461}
2462
2463static void
2464htb_parse_qdisc_details__(struct netdev *netdev,
2465 const struct shash *details, struct htb_class *hc)
2466{
2467 const char *max_rate_s;
2468
2469 max_rate_s = shash_find_data(details, "max-rate");
2470 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2471 if (!hc->max_rate) {
2472 uint32_t current;
2473
2474 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2475 hc->max_rate = netdev_features_to_bps(current) / 8;
2476 }
2477 hc->min_rate = hc->max_rate;
2478 hc->burst = 0;
2479 hc->priority = 0;
2480}
2481
2482static int
2483htb_parse_class_details__(struct netdev *netdev,
2484 const struct shash *details, struct htb_class *hc)
2485{
2486 const struct htb *htb = htb_get__(netdev);
2487 const char *min_rate_s = shash_find_data(details, "min-rate");
2488 const char *max_rate_s = shash_find_data(details, "max-rate");
2489 const char *burst_s = shash_find_data(details, "burst");
2490 const char *priority_s = shash_find_data(details, "priority");
2491 int mtu;
2492
f915f1a8
BP
2493 netdev_get_mtu(netdev, &mtu);
2494 if (mtu == INT_MAX) {
2495 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2496 netdev_get_name(netdev));
2497 return EINVAL;
2498 }
2499
c45ab5e9 2500 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
da3827b5 2501 hc->min_rate = MAX(hc->min_rate, 1500);
c1c9c9c4
BP
2502 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2503
2504 /* max-rate */
2505 hc->max_rate = (max_rate_s
2506 ? strtoull(max_rate_s, NULL, 10) / 8
2507 : htb->max_rate);
2508 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2509 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2510
2511 /* burst
2512 *
2513 * According to hints in the documentation that I've read, it is important
2514 * that 'burst' be at least as big as the largest frame that might be
2515 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2516 * but having it a bit too small is a problem. Since netdev_get_mtu()
2517 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2518 * the MTU. We actually add 64, instead of 14, as a guard against
2519 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2520 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2521 hc->burst = MAX(hc->burst, mtu + 64);
2522
2523 /* priority */
2524 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2525
2526 return 0;
2527}
2528
2529static int
2530htb_query_class__(const struct netdev *netdev, unsigned int handle,
2531 unsigned int parent, struct htb_class *options,
2532 struct netdev_queue_stats *stats)
2533{
2534 struct ofpbuf *reply;
2535 int error;
2536
2537 error = tc_query_class(netdev, handle, parent, &reply);
2538 if (!error) {
2539 error = htb_parse_tcmsg__(reply, NULL, options, stats);
2540 ofpbuf_delete(reply);
2541 }
2542 return error;
2543}
2544
2545static int
2546htb_tc_install(struct netdev *netdev, const struct shash *details)
2547{
2548 int error;
2549
2550 error = htb_setup_qdisc__(netdev);
2551 if (!error) {
2552 struct htb_class hc;
2553
2554 htb_parse_qdisc_details__(netdev, details, &hc);
2555 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2556 tc_make_handle(1, 0), &hc);
2557 if (!error) {
2558 htb_install__(netdev, hc.max_rate);
2559 }
2560 }
2561 return error;
2562}
2563
93b13be8
BP
2564static struct htb_class *
2565htb_class_cast__(const struct tc_queue *queue)
2566{
2567 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2568}
2569
c1c9c9c4
BP
2570static void
2571htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2572 const struct htb_class *hc)
2573{
2574 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2575 size_t hash = hash_int(queue_id, 0);
2576 struct tc_queue *queue;
c1c9c9c4
BP
2577 struct htb_class *hcp;
2578
93b13be8
BP
2579 queue = tc_find_queue__(netdev, queue_id, hash);
2580 if (queue) {
2581 hcp = htb_class_cast__(queue);
2582 } else {
c1c9c9c4 2583 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2584 queue = &hcp->tc_queue;
2585 queue->queue_id = queue_id;
2586 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2587 }
93b13be8
BP
2588
2589 hcp->min_rate = hc->min_rate;
2590 hcp->max_rate = hc->max_rate;
2591 hcp->burst = hc->burst;
2592 hcp->priority = hc->priority;
c1c9c9c4
BP
2593}
2594
2595static int
2596htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2597{
c1c9c9c4
BP
2598 struct ofpbuf msg;
2599 struct nl_dump dump;
2600 struct htb_class hc;
2601 struct htb *htb;
2602
2603 /* Get qdisc options. */
2604 hc.max_rate = 0;
2605 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2606 htb = htb_install__(netdev, hc.max_rate);
2607
2608 /* Get queues. */
23a98ffe
BP
2609 if (!start_queue_dump(netdev, &dump)) {
2610 return ENODEV;
2611 }
c1c9c9c4
BP
2612 while (nl_dump_next(&dump, &msg)) {
2613 unsigned int queue_id;
2614
2615 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2616 htb_update_queue__(netdev, queue_id, &hc);
2617 }
2618 }
2619 nl_dump_done(&dump);
2620
2621 return 0;
2622}
2623
2624static void
2625htb_tc_destroy(struct tc *tc)
2626{
2627 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2628 struct htb_class *hc, *next;
c1c9c9c4 2629
4e8e4213 2630 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2631 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2632 free(hc);
2633 }
2634 tc_destroy(tc);
2635 free(htb);
2636}
2637
2638static int
2639htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2640{
2641 const struct htb *htb = htb_get__(netdev);
2642 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2643 return 0;
2644}
2645
2646static int
2647htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2648{
2649 struct htb_class hc;
2650 int error;
2651
2652 htb_parse_qdisc_details__(netdev, details, &hc);
2653 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2654 tc_make_handle(1, 0), &hc);
2655 if (!error) {
2656 htb_get__(netdev)->max_rate = hc.max_rate;
2657 }
2658 return error;
2659}
2660
2661static int
93b13be8
BP
2662htb_class_get(const struct netdev *netdev OVS_UNUSED,
2663 const struct tc_queue *queue, struct shash *details)
c1c9c9c4 2664{
93b13be8 2665 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4
BP
2666
2667 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2668 if (hc->min_rate != hc->max_rate) {
2669 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2670 }
2671 shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2672 if (hc->priority) {
2673 shash_add(details, "priority", xasprintf("%u", hc->priority));
2674 }
2675 return 0;
2676}
2677
2678static int
2679htb_class_set(struct netdev *netdev, unsigned int queue_id,
2680 const struct shash *details)
2681{
2682 struct htb_class hc;
2683 int error;
2684
2685 error = htb_parse_class_details__(netdev, details, &hc);
2686 if (error) {
2687 return error;
2688 }
2689
17ee3c1f 2690 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2691 tc_make_handle(1, 0xfffe), &hc);
2692 if (error) {
2693 return error;
2694 }
2695
2696 htb_update_queue__(netdev, queue_id, &hc);
2697 return 0;
2698}
2699
2700static int
93b13be8 2701htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2702{
93b13be8 2703 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2704 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2705 int error;
2706
93b13be8 2707 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2708 if (!error) {
93b13be8 2709 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2710 free(hc);
c1c9c9c4
BP
2711 }
2712 return error;
2713}
2714
2715static int
93b13be8 2716htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2717 struct netdev_queue_stats *stats)
2718{
93b13be8 2719 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2720 tc_make_handle(1, 0xfffe), NULL, stats);
2721}
2722
2723static int
2724htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2725 const struct ofpbuf *nlmsg,
2726 netdev_dump_queue_stats_cb *cb, void *aux)
2727{
2728 struct netdev_queue_stats stats;
17ee3c1f 2729 unsigned int handle, major, minor;
c1c9c9c4
BP
2730 int error;
2731
2732 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2733 if (error) {
2734 return error;
2735 }
2736
17ee3c1f
BP
2737 major = tc_get_major(handle);
2738 minor = tc_get_minor(handle);
2739 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2740 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2741 }
2742 return 0;
2743}
2744
2745static const struct tc_ops tc_ops_htb = {
2746 "htb", /* linux_name */
2747 "linux-htb", /* ovs_name */
2748 HTB_N_QUEUES, /* n_queues */
2749 htb_tc_install,
2750 htb_tc_load,
2751 htb_tc_destroy,
2752 htb_qdisc_get,
2753 htb_qdisc_set,
2754 htb_class_get,
2755 htb_class_set,
2756 htb_class_delete,
2757 htb_class_get_stats,
2758 htb_class_dump_stats
2759};
2760\f
a339aa81
EJ
2761/* "linux-hfsc" traffic control class. */
2762
2763#define HFSC_N_QUEUES 0xf000
2764
2765struct hfsc {
2766 struct tc tc;
2767 uint32_t max_rate;
2768};
2769
2770struct hfsc_class {
2771 struct tc_queue tc_queue;
2772 uint32_t min_rate;
2773 uint32_t max_rate;
2774};
2775
2776static struct hfsc *
2777hfsc_get__(const struct netdev *netdev)
2778{
2779 struct netdev_dev_linux *netdev_dev;
2780 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2781 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2782}
2783
2784static struct hfsc_class *
2785hfsc_class_cast__(const struct tc_queue *queue)
2786{
2787 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2788}
2789
2790static struct hfsc *
2791hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2792{
2793 struct netdev_dev_linux * netdev_dev;
2794 struct hfsc *hfsc;
2795
2796 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2797 hfsc = xmalloc(sizeof *hfsc);
2798 tc_init(&hfsc->tc, &tc_ops_hfsc);
2799 hfsc->max_rate = max_rate;
2800 netdev_dev->tc = &hfsc->tc;
2801
2802 return hfsc;
2803}
2804
2805static void
2806hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2807 const struct hfsc_class *hc)
2808{
2809 size_t hash;
2810 struct hfsc *hfsc;
2811 struct hfsc_class *hcp;
2812 struct tc_queue *queue;
2813
2814 hfsc = hfsc_get__(netdev);
2815 hash = hash_int(queue_id, 0);
2816
2817 queue = tc_find_queue__(netdev, queue_id, hash);
2818 if (queue) {
2819 hcp = hfsc_class_cast__(queue);
2820 } else {
2821 hcp = xmalloc(sizeof *hcp);
2822 queue = &hcp->tc_queue;
2823 queue->queue_id = queue_id;
2824 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2825 }
2826
2827 hcp->min_rate = hc->min_rate;
2828 hcp->max_rate = hc->max_rate;
2829}
2830
2831static int
2832hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2833{
2834 const struct tc_service_curve *rsc, *fsc, *usc;
2835 static const struct nl_policy tca_hfsc_policy[] = {
2836 [TCA_HFSC_RSC] = {
2837 .type = NL_A_UNSPEC,
2838 .optional = false,
2839 .min_len = sizeof(struct tc_service_curve),
2840 },
2841 [TCA_HFSC_FSC] = {
2842 .type = NL_A_UNSPEC,
2843 .optional = false,
2844 .min_len = sizeof(struct tc_service_curve),
2845 },
2846 [TCA_HFSC_USC] = {
2847 .type = NL_A_UNSPEC,
2848 .optional = false,
2849 .min_len = sizeof(struct tc_service_curve),
2850 },
2851 };
2852 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2853
2854 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2855 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2856 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2857 return EPROTO;
2858 }
2859
2860 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2861 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2862 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2863
2864 if (rsc->m1 != 0 || rsc->d != 0 ||
2865 fsc->m1 != 0 || fsc->d != 0 ||
2866 usc->m1 != 0 || usc->d != 0) {
2867 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2868 "Non-linear service curves are not supported.");
2869 return EPROTO;
2870 }
2871
2872 if (rsc->m2 != fsc->m2) {
2873 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2874 "Real-time service curves are not supported ");
2875 return EPROTO;
2876 }
2877
2878 if (rsc->m2 > usc->m2) {
2879 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2880 "Min-rate service curve is greater than "
2881 "the max-rate service curve.");
2882 return EPROTO;
2883 }
2884
2885 class->min_rate = fsc->m2;
2886 class->max_rate = usc->m2;
2887 return 0;
2888}
2889
2890static int
2891hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2892 struct hfsc_class *options,
2893 struct netdev_queue_stats *stats)
2894{
2895 int error;
2896 unsigned int handle;
2897 struct nlattr *nl_options;
2898
2899 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2900 if (error) {
2901 return error;
2902 }
2903
2904 if (queue_id) {
2905 unsigned int major, minor;
2906
2907 major = tc_get_major(handle);
2908 minor = tc_get_minor(handle);
2909 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2910 *queue_id = minor - 1;
2911 } else {
2912 return EPROTO;
2913 }
2914 }
2915
2916 if (options) {
2917 error = hfsc_parse_tca_options__(nl_options, options);
2918 }
2919
2920 return error;
2921}
2922
2923static int
2924hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
2925 unsigned int parent, struct hfsc_class *options,
2926 struct netdev_queue_stats *stats)
2927{
2928 int error;
2929 struct ofpbuf *reply;
2930
2931 error = tc_query_class(netdev, handle, parent, &reply);
2932 if (error) {
2933 return error;
2934 }
2935
2936 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
2937 ofpbuf_delete(reply);
2938 return error;
2939}
2940
2941static void
2942hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2943 struct hfsc_class *class)
2944{
2945 uint32_t max_rate;
2946 const char *max_rate_s;
2947
2948 max_rate_s = shash_find_data(details, "max-rate");
2949 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2950
2951 if (!max_rate) {
2952 uint32_t current;
2953
2954 netdev_get_features(netdev, &current, NULL, NULL, NULL);
2955 max_rate = netdev_features_to_bps(current) / 8;
2956 }
2957
2958 class->min_rate = max_rate;
2959 class->max_rate = max_rate;
2960}
2961
2962static int
2963hfsc_parse_class_details__(struct netdev *netdev,
2964 const struct shash *details,
2965 struct hfsc_class * class)
2966{
2967 const struct hfsc *hfsc;
2968 uint32_t min_rate, max_rate;
2969 const char *min_rate_s, *max_rate_s;
2970
2971 hfsc = hfsc_get__(netdev);
2972 min_rate_s = shash_find_data(details, "min-rate");
2973 max_rate_s = shash_find_data(details, "max-rate");
2974
c45ab5e9 2975 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
a339aa81
EJ
2976 min_rate = MAX(min_rate, 1500);
2977 min_rate = MIN(min_rate, hfsc->max_rate);
2978
2979 max_rate = (max_rate_s
2980 ? strtoull(max_rate_s, NULL, 10) / 8
2981 : hfsc->max_rate);
2982 max_rate = MAX(max_rate, min_rate);
2983 max_rate = MIN(max_rate, hfsc->max_rate);
2984
2985 class->min_rate = min_rate;
2986 class->max_rate = max_rate;
2987
2988 return 0;
2989}
2990
2991/* Create an HFSC qdisc.
2992 *
2993 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
2994static int
2995hfsc_setup_qdisc__(struct netdev * netdev)
2996{
2997 struct tcmsg *tcmsg;
2998 struct ofpbuf request;
2999 struct tc_hfsc_qopt opt;
3000
3001 tc_del_qdisc(netdev);
3002
3003 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3004 NLM_F_EXCL | NLM_F_CREATE, &request);
3005
3006 if (!tcmsg) {
3007 return ENODEV;
3008 }
3009
3010 tcmsg->tcm_handle = tc_make_handle(1, 0);
3011 tcmsg->tcm_parent = TC_H_ROOT;
3012
3013 memset(&opt, 0, sizeof opt);
3014 opt.defcls = 1;
3015
3016 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3017 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3018
3019 return tc_transact(&request, NULL);
3020}
3021
3022/* Create an HFSC class.
3023 *
3024 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3025 * sc rate <min_rate> ul rate <max_rate>" */
3026static int
3027hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3028 unsigned int parent, struct hfsc_class *class)
3029{
3030 int error;
3031 size_t opt_offset;
3032 struct tcmsg *tcmsg;
3033 struct ofpbuf request;
3034 struct tc_service_curve min, max;
3035
3036 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3037
3038 if (!tcmsg) {
3039 return ENODEV;
3040 }
3041
3042 tcmsg->tcm_handle = handle;
3043 tcmsg->tcm_parent = parent;
3044
3045 min.m1 = 0;
3046 min.d = 0;
3047 min.m2 = class->min_rate;
3048
3049 max.m1 = 0;
3050 max.d = 0;
3051 max.m2 = class->max_rate;
3052
3053 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3054 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3055 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3056 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3057 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3058 nl_msg_end_nested(&request, opt_offset);
3059
3060 error = tc_transact(&request, NULL);
3061 if (error) {
3062 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3063 "min-rate %ubps, max-rate %ubps (%s)",
3064 netdev_get_name(netdev),
3065 tc_get_major(handle), tc_get_minor(handle),
3066 tc_get_major(parent), tc_get_minor(parent),
3067 class->min_rate, class->max_rate, strerror(error));
3068 }
3069
3070 return error;
3071}
3072
3073static int
3074hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3075{
3076 int error;
3077 struct hfsc_class class;
3078
3079 error = hfsc_setup_qdisc__(netdev);
3080
3081 if (error) {
3082 return error;
3083 }
3084
3085 hfsc_parse_qdisc_details__(netdev, details, &class);
3086 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3087 tc_make_handle(1, 0), &class);
3088
3089 if (error) {
3090 return error;
3091 }
3092
3093 hfsc_install__(netdev, class.max_rate);
3094 return 0;
3095}
3096
3097static int
3098hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3099{
3100 struct ofpbuf msg;
3101 struct hfsc *hfsc;
3102 struct nl_dump dump;
3103 struct hfsc_class hc;
3104
3105 hc.max_rate = 0;
3106 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3107 hfsc = hfsc_install__(netdev, hc.max_rate);
3108
3109 if (!start_queue_dump(netdev, &dump)) {
3110 return ENODEV;
3111 }
3112
3113 while (nl_dump_next(&dump, &msg)) {
3114 unsigned int queue_id;
3115
3116 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3117 hfsc_update_queue__(netdev, queue_id, &hc);
3118 }
3119 }
3120
3121 nl_dump_done(&dump);
3122 return 0;
3123}
3124
3125static void
3126hfsc_tc_destroy(struct tc *tc)
3127{
3128 struct hfsc *hfsc;
3129 struct hfsc_class *hc, *next;
3130
3131 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3132
3133 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3134 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3135 free(hc);
3136 }
3137
3138 tc_destroy(tc);
3139 free(hfsc);
3140}
3141
3142static int
3143hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3144{
3145 const struct hfsc *hfsc;
3146 hfsc = hfsc_get__(netdev);
3147 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3148 return 0;
3149}
3150
3151static int
3152hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3153{
3154 int error;
3155 struct hfsc_class class;
3156
3157 hfsc_parse_qdisc_details__(netdev, details, &class);
3158 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3159 tc_make_handle(1, 0), &class);
3160
3161 if (!error) {
3162 hfsc_get__(netdev)->max_rate = class.max_rate;
3163 }
3164
3165 return error;
3166}
3167
3168static int
3169hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3170 const struct tc_queue *queue, struct shash *details)
3171{
3172 const struct hfsc_class *hc;
3173
3174 hc = hfsc_class_cast__(queue);
3175 shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3176 if (hc->min_rate != hc->max_rate) {
3177 shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3178 }
3179 return 0;
3180}
3181
3182static int
3183hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3184 const struct shash *details)
3185{
3186 int error;
3187 struct hfsc_class class;
3188
3189 error = hfsc_parse_class_details__(netdev, details, &class);
3190 if (error) {
3191 return error;
3192 }
3193
3194 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3195 tc_make_handle(1, 0xfffe), &class);
3196 if (error) {
3197 return error;
3198 }
3199
3200 hfsc_update_queue__(netdev, queue_id, &class);
3201 return 0;
3202}
3203
3204static int
3205hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3206{
3207 int error;
3208 struct hfsc *hfsc;
3209 struct hfsc_class *hc;
3210
3211 hc = hfsc_class_cast__(queue);
3212 hfsc = hfsc_get__(netdev);
3213
3214 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3215 if (!error) {
3216 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3217 free(hc);
3218 }
3219 return error;
3220}
3221
3222static int
3223hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3224 struct netdev_queue_stats *stats)
3225{
3226 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3227 tc_make_handle(1, 0xfffe), NULL, stats);
3228}
3229
3230static int
3231hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3232 const struct ofpbuf *nlmsg,
3233 netdev_dump_queue_stats_cb *cb, void *aux)
3234{
3235 struct netdev_queue_stats stats;
3236 unsigned int handle, major, minor;
3237 int error;
3238
3239 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3240 if (error) {
3241 return error;
3242 }
3243
3244 major = tc_get_major(handle);
3245 minor = tc_get_minor(handle);
3246 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3247 (*cb)(minor - 1, &stats, aux);
3248 }
3249 return 0;
3250}
3251
3252static const struct tc_ops tc_ops_hfsc = {
3253 "hfsc", /* linux_name */
3254 "linux-hfsc", /* ovs_name */
3255 HFSC_N_QUEUES, /* n_queues */
3256 hfsc_tc_install, /* tc_install */
3257 hfsc_tc_load, /* tc_load */
3258 hfsc_tc_destroy, /* tc_destroy */
3259 hfsc_qdisc_get, /* qdisc_get */
3260 hfsc_qdisc_set, /* qdisc_set */
3261 hfsc_class_get, /* class_get */
3262 hfsc_class_set, /* class_set */
3263 hfsc_class_delete, /* class_delete */
3264 hfsc_class_get_stats, /* class_get_stats */
3265 hfsc_class_dump_stats /* class_dump_stats */
3266};
3267\f
c1c9c9c4
BP
3268/* "linux-default" traffic control class.
3269 *
3270 * This class represents the default, unnamed Linux qdisc. It corresponds to
3271 * the "" (empty string) QoS type in the OVS database. */
3272
3273static void
3274default_install__(struct netdev *netdev)
3275{
3276 struct netdev_dev_linux *netdev_dev =
3277 netdev_dev_linux_cast(netdev_get_dev(netdev));
3278 static struct tc *tc;
3279
3280 if (!tc) {
3281 tc = xmalloc(sizeof *tc);
3282 tc_init(tc, &tc_ops_default);
3283 }
3284 netdev_dev->tc = tc;
3285}
3286
3287static int
3288default_tc_install(struct netdev *netdev,
3289 const struct shash *details OVS_UNUSED)
3290{
3291 default_install__(netdev);
3292 return 0;
3293}
3294
3295static int
3296default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3297{
3298 default_install__(netdev);
3299 return 0;
3300}
3301
3302static const struct tc_ops tc_ops_default = {
3303 NULL, /* linux_name */
3304 "", /* ovs_name */
3305 0, /* n_queues */
3306 default_tc_install,
3307 default_tc_load,
3308 NULL, /* tc_destroy */
3309 NULL, /* qdisc_get */
3310 NULL, /* qdisc_set */
3311 NULL, /* class_get */
3312 NULL, /* class_set */
3313 NULL, /* class_delete */
3314 NULL, /* class_get_stats */
3315 NULL /* class_dump_stats */
3316};
3317\f
3318/* "linux-other" traffic control class.
3319 *
3320 * */
3321
3322static int
3323other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3324{
3325 struct netdev_dev_linux *netdev_dev =
3326 netdev_dev_linux_cast(netdev_get_dev(netdev));
3327 static struct tc *tc;
3328
3329 if (!tc) {
3330 tc = xmalloc(sizeof *tc);
3331 tc_init(tc, &tc_ops_other);
3332 }
3333 netdev_dev->tc = tc;
3334 return 0;
3335}
3336
3337static const struct tc_ops tc_ops_other = {
3338 NULL, /* linux_name */
3339 "linux-other", /* ovs_name */
3340 0, /* n_queues */
3341 NULL, /* tc_install */
3342 other_tc_load,
3343 NULL, /* tc_destroy */
3344 NULL, /* qdisc_get */
3345 NULL, /* qdisc_set */
3346 NULL, /* class_get */
3347 NULL, /* class_set */
3348 NULL, /* class_delete */
3349 NULL, /* class_get_stats */
3350 NULL /* class_dump_stats */
3351};
3352\f
/* Traffic control. */

/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used when computing buffer sizes.
 * As a rule a kernel qdisc must be able to buffer one jiffy's worth of data.
 * Set by read_psched(); a value of 0 means that it has not been read yet.
 *
 * The value falls into one of two categories:
 *
 *    - It is the kernel's real timer tick rate, a small number roughly in
 *      the range 100 to 1024.  In that case the qdisc really does need room
 *      for that much data.
 *
 *    - It is an absurdly large number, meaning that the kernel has finely
 *      granular timers and no extra buffer room is needed.  No special
 *      handling is required for this case: 'buffer_hz' is only ever used as
 *      a divisor, so practically any rate divided by it comes out as 0, and
 *      the small integer results produced by really high rates have no real
 *      effect anyhow.
 */
static unsigned int buffer_hz;
3376
/* Returns the tc handle 'major':'minor', with 'major' in the upper 16 bits
 * and 'minor' in the lower 16 bits.  Bits of either argument that do not
 * fit in its 16-bit field are discarded. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3383
/* Returns the major number (the upper 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3390
/* Returns the minor number (the lower 16 bits) of tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3397
3398static struct tcmsg *
3399tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3400 struct ofpbuf *request)
3401{
3402 struct tcmsg *tcmsg;
3403 int ifindex;
3404 int error;
3405
3406 error = get_ifindex(netdev, &ifindex);
3407 if (error) {
3408 return NULL;
3409 }
3410
3411 ofpbuf_init(request, 512);
3412 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3413 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3414 tcmsg->tcm_family = AF_UNSPEC;
3415 tcmsg->tcm_ifindex = ifindex;
3416 /* Caller should fill in tcmsg->tcm_handle. */
3417 /* Caller should fill in tcmsg->tcm_parent. */
3418
3419 return tcmsg;
3420}
3421
3422static int
3423tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3424{
3425 int error = nl_sock_transact(rtnl_sock, request, replyp);
3426 ofpbuf_uninit(request);
3427 return error;
3428}
3429
3430static void
3431read_psched(void)
3432{
3433 /* The values in psched are not individually very meaningful, but they are
3434 * important. The tables below show some values seen in the wild.
3435 *
3436 * Some notes:
3437 *
3438 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3439 * (Before that, there are hints that it was 1000000000.)
3440 *
3441 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3442 * above.
3443 *
3444 * /proc/net/psched
3445 * -----------------------------------
3446 * [1] 000c8000 000f4240 000f4240 00000064
3447 * [2] 000003e8 00000400 000f4240 3b9aca00
3448 * [3] 000003e8 00000400 000f4240 3b9aca00
3449 * [4] 000003e8 00000400 000f4240 00000064
3450 * [5] 000003e8 00000040 000f4240 3b9aca00
3451 * [6] 000003e8 00000040 000f4240 000000f9
3452 *
3453 * a b c d ticks_per_s buffer_hz
3454 * ------- --------- ---------- ------------- ----------- -------------
3455 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3456 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3457 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3458 * [4] 1,000 1,024 1,000,000 100 976,562 100
3459 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3460 * [6] 1,000 64 1,000,000 249 15,625,000 249
3461 *
3462 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3463 * [2] 2.6.26-1-686-bigmem from Debian lenny
3464 * [3] 2.6.26-2-sparc64 from Debian lenny
3465 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3466 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3467 * [6] 2.6.34 from kernel.org on KVM
3468 */
3469 static const char fn[] = "/proc/net/psched";
3470 unsigned int a, b, c, d;
3471 FILE *stream;
3472
3473 ticks_per_s = 1.0;
3474 buffer_hz = 100;
3475
3476 stream = fopen(fn, "r");
3477 if (!stream) {
3478 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3479 return;
3480 }
3481
3482 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3483 VLOG_WARN("%s: read failed", fn);
3484 fclose(stream);
3485 return;
3486 }
3487 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3488 fclose(stream);
3489
3490 if (!a || !c) {
3491 VLOG_WARN("%s: invalid scheduler parameters", fn);
3492 return;
3493 }
3494
3495 ticks_per_s = (double) a * c / b;
3496 if (c == 1000000) {
3497 buffer_hz = d;
3498 } else {
3499 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3500 fn, a, b, c, d);
3501 }
3502 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3503}
3504
3505/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3506 * rate of 'rate' bytes per second. */
3507static unsigned int
3508tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3509{
3510 if (!buffer_hz) {
3511 read_psched();
3512 }
3513 return (rate * ticks) / ticks_per_s;
3514}
3515
3516/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3517 * rate of 'rate' bytes per second. */
3518static unsigned int
3519tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3520{
3521 if (!buffer_hz) {
3522 read_psched();
3523 }
015c93a4 3524 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3525}
3526
3527/* Returns the number of bytes that need to be reserved for qdisc buffering at
3528 * a transmission rate of 'rate' bytes per second. */
3529static unsigned int
3530tc_buffer_per_jiffy(unsigned int rate)
3531{
3532 if (!buffer_hz) {
3533 read_psched();
3534 }
3535 return rate / buffer_hz;
3536}
3537
3538/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3539 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3540 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3541 * stores NULL into it if it is absent.
3542 *
3543 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3544 * 'msg'.
3545 *
3546 * Returns 0 if successful, otherwise a positive errno value. */
3547static int
3548tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3549 struct nlattr **options)
3550{
3551 static const struct nl_policy tca_policy[] = {
3552 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3553 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3554 };
3555 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3556
3557 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3558 tca_policy, ta, ARRAY_SIZE(ta))) {
3559 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3560 goto error;
3561 }
3562
3563 if (kind) {
3564 *kind = nl_attr_get_string(ta[TCA_KIND]);
3565 }
3566
3567 if (options) {
3568 *options = ta[TCA_OPTIONS];
3569 }
3570
3571 return 0;
3572
3573error:
3574 if (kind) {
3575 *kind = NULL;
3576 }
3577 if (options) {
3578 *options = NULL;
3579 }
3580 return EPROTO;
3581}
3582
3583/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3584 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3585 * into '*options', and its queue statistics into '*stats'. Any of the output
3586 * arguments may be null.
3587 *
3588 * Returns 0 if successful, otherwise a positive errno value. */
3589static int
3590tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3591 struct nlattr **options, struct netdev_queue_stats *stats)
3592{
3593 static const struct nl_policy tca_policy[] = {
3594 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3595 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3596 };
3597 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3598
3599 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3600 tca_policy, ta, ARRAY_SIZE(ta))) {
3601 VLOG_WARN_RL(&rl, "failed to parse class message");
3602 goto error;
3603 }
3604
3605 if (handlep) {
3606 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3607 *handlep = tc->tcm_handle;
3608 }
3609
3610 if (options) {
3611 *options = ta[TCA_OPTIONS];
3612 }
3613
3614 if (stats) {
3615 const struct gnet_stats_queue *gsq;
3616 struct gnet_stats_basic gsb;
3617
3618 static const struct nl_policy stats_policy[] = {
3619 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3620 .min_len = sizeof gsb },
3621 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3622 .min_len = sizeof *gsq },
3623 };
3624 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3625
3626 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3627 sa, ARRAY_SIZE(sa))) {
3628 VLOG_WARN_RL(&rl, "failed to parse class stats");
3629 goto error;
3630 }
3631
3632 /* Alignment issues screw up the length of struct gnet_stats_basic on
3633 * some arch/bitsize combinations. Newer versions of Linux have a
3634 * struct gnet_stats_basic_packed, but we can't depend on that. The
3635 * easiest thing to do is just to make a copy. */
3636 memset(&gsb, 0, sizeof gsb);
3637 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3638 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3639 stats->tx_bytes = gsb.bytes;
3640 stats->tx_packets = gsb.packets;
3641
3642 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3643 stats->tx_errors = gsq->drops;
3644 }
3645
3646 return 0;
3647
3648error:
3649 if (options) {
3650 *options = NULL;
3651 }
3652 if (stats) {
3653 memset(stats, 0, sizeof *stats);
3654 }
3655 return EPROTO;
3656}
3657
3658/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3659 * on 'netdev'. */
3660static int
3661tc_query_class(const struct netdev *netdev,
3662 unsigned int handle, unsigned int parent,
3663 struct ofpbuf **replyp)
3664{
3665 struct ofpbuf request;
3666 struct tcmsg *tcmsg;
3667 int error;
3668
3669 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
3670 if (!tcmsg) {
3671 return ENODEV;
3672 }
c1c9c9c4
BP
3673 tcmsg->tcm_handle = handle;
3674 tcmsg->tcm_parent = parent;
3675
3676 error = tc_transact(&request, replyp);
3677 if (error) {
3678 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3679 netdev_get_name(netdev),
3680 tc_get_major(handle), tc_get_minor(handle),
3681 tc_get_major(parent), tc_get_minor(parent),
3682 strerror(error));
3683 }
3684 return error;
3685}
3686
3687/* Equivalent to "tc class del dev <name> handle <handle>". */
3688static int
3689tc_delete_class(const struct netdev *netdev, unsigned int handle)
3690{
3691 struct ofpbuf request;
3692 struct tcmsg *tcmsg;
3693 int error;
3694
3695 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
3696 if (!tcmsg) {
3697 return ENODEV;
3698 }
c1c9c9c4
BP
3699 tcmsg->tcm_handle = handle;
3700 tcmsg->tcm_parent = 0;
3701
3702 error = tc_transact(&request, NULL);
3703 if (error) {
3704 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3705 netdev_get_name(netdev),
3706 tc_get_major(handle), tc_get_minor(handle),
3707 strerror(error));
3708 }
3709 return error;
3710}
3711
3712/* Equivalent to "tc qdisc del dev <name> root". */
3713static int
3714tc_del_qdisc(struct netdev *netdev)
3715{
3716 struct netdev_dev_linux *netdev_dev =
3717 netdev_dev_linux_cast(netdev_get_dev(netdev));
3718 struct ofpbuf request;
3719 struct tcmsg *tcmsg;
3720 int error;
3721
3722 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
3723 if (!tcmsg) {
3724 return ENODEV;
3725 }
c1c9c9c4
BP
3726 tcmsg->tcm_handle = tc_make_handle(1, 0);
3727 tcmsg->tcm_parent = TC_H_ROOT;
3728
3729 error = tc_transact(&request, NULL);
3730 if (error == EINVAL) {
3731 /* EINVAL probably means that the default qdisc was in use, in which
3732 * case we've accomplished our purpose. */
3733 error = 0;
3734 }
3735 if (!error && netdev_dev->tc) {
3736 if (netdev_dev->tc->ops->tc_destroy) {
3737 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3738 }
3739 netdev_dev->tc = NULL;
3740 }
3741 return error;
3742}
3743
3744/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3745 * kernel to determine what they are. Returns 0 if successful, otherwise a
3746 * positive errno value. */
3747static int
3748tc_query_qdisc(const struct netdev *netdev)
3749{
3750 struct netdev_dev_linux *netdev_dev =
3751 netdev_dev_linux_cast(netdev_get_dev(netdev));
3752 struct ofpbuf request, *qdisc;
3753 const struct tc_ops *ops;
3754 struct tcmsg *tcmsg;
3755 int load_error;
3756 int error;
3757
3758 if (netdev_dev->tc) {
3759 return 0;
3760 }
3761
3762 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3763 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3764 * 2.6.35 without that fix backported to it.
3765 *
3766 * To avoid the OOPS, we must not make a request that would attempt to dump
3767 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3768 * few others. There are a few ways that I can see to do this, but most of
3769 * them seem to be racy (and if you lose the race the kernel OOPSes). The
3770 * technique chosen here is to assume that any non-default qdisc that we
3771 * create will have a class with handle 1:0. The built-in qdiscs only have
3772 * a class with handle 0:0.
3773 *
3774 * We could check for Linux 2.6.35+ and use a more straightforward method
3775 * there. */
3776 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
3777 if (!tcmsg) {
3778 return ENODEV;
3779 }
c1c9c9c4
BP
3780 tcmsg->tcm_handle = tc_make_handle(1, 0);
3781 tcmsg->tcm_parent = 0;
3782
3783 /* Figure out what tc class to instantiate. */
3784 error = tc_transact(&request, &qdisc);
3785 if (!error) {
3786 const char *kind;
3787
3788 error = tc_parse_qdisc(qdisc, &kind, NULL);
3789 if (error) {
3790 ops = &tc_ops_other;
3791 } else {
3792 ops = tc_lookup_linux_name(kind);
3793 if (!ops) {
3794 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3795 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3796
3797 ops = &tc_ops_other;
3798 }
3799 }
3800 } else if (error == ENOENT) {
3801 /* Either it's a built-in qdisc, or it's a qdisc set up by some
3802 * other entity that doesn't have a handle 1:0. We will assume
3803 * that it's the system default qdisc. */
3804 ops = &tc_ops_default;
3805 error = 0;
3806 } else {
3807 /* Who knows? Maybe the device got deleted. */
3808 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3809 netdev_get_name(netdev), strerror(error));
3810 ops = &tc_ops_other;
3811 }
3812
3813 /* Instantiate it. */
3814 load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3815 assert((load_error == 0) == (netdev_dev->tc != NULL));
3816 ofpbuf_delete(qdisc);
3817
3818 return error ? error : load_error;
3819}
3820
3821/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3822 approximate the time to transmit packets of various lengths. For an MTU of
3823 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3824 represents two possible packet lengths; for a MTU of 513 through 1024, four
3825 possible lengths; and so on.
3826
3827 Returns, for the specified 'mtu', the number of bits that packet lengths
3828 need to be shifted right to fit within such a 256-entry table. */
3829static int
3830tc_calc_cell_log(unsigned int mtu)
3831{
3832 int cell_log;
3833
3834 if (!mtu) {
3835 mtu = ETH_PAYLOAD_MAX;
3836 }
3837 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3838
3839 for (cell_log = 0; mtu >= 256; cell_log++) {
3840 mtu >>= 1;
3841 }
3842
3843 return cell_log;
3844}
3845
3846/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3847 * of 'mtu'. */
3848static void
3849tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3850{
3851 memset(rate, 0, sizeof *rate);
3852 rate->cell_log = tc_calc_cell_log(mtu);
3853 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3854 /* rate->cell_align = 0; */ /* distro headers. */
3855 rate->mpu = ETH_TOTAL_MIN;
3856 rate->rate = Bps;
3857}
3858
3859/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3860 * attribute of the specified "type".
3861 *
3862 * See tc_calc_cell_log() above for a description of "rtab"s. */
3863static void
3864tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3865{
3866 uint32_t *rtab;
3867 unsigned int i;
3868
3869 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3870 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3871 unsigned packet_size = (i + 1) << rate->cell_log;
3872 if (packet_size < rate->mpu) {
3873 packet_size = rate->mpu;
3874 }
3875 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3876 }
3877}
3878
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 is fine.)
 *
 * The burst actually used is the larger of 'burst_bytes' and one jiffy's
 * worth of data at 'Bps' plus 'mtu'; it is returned converted to ticks. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int least_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, least_burst);

    return tc_bytes_to_ticks(Bps, burst);
}
3889
3890\f
3891/* Utility functions. */
3892
3893static int
3894get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
3895{
3896 /* Policy for RTNLGRP_LINK messages.
3897 *
3898 * There are *many* more fields in these messages, but currently we only
3899 * care about these fields. */
3900 static const struct nl_policy rtnlgrp_link_policy[] = {
3901 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
3902 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
3903 .min_len = sizeof(struct rtnl_link_stats) },
3904 };
3905
3906 struct ofpbuf request;
3907 struct ofpbuf *reply;
3908 struct ifinfomsg *ifi;
3909 const struct rtnl_link_stats *rtnl_stats;
3910 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
3911 int error;
3912
3913 ofpbuf_init(&request, 0);
3914 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
3915 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
3916 ifi->ifi_family = PF_UNSPEC;
3917 ifi->ifi_index = ifindex;
3918 error = nl_sock_transact(rtnl_sock, &request, &reply);
3919 ofpbuf_uninit(&request);
3920 if (error) {
3921 return error;
3922 }
3923
3924 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
3925 rtnlgrp_link_policy,
3926 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
3927 ofpbuf_delete(reply);
3928 return EPROTO;
3929 }
3930
3931 if (!attrs[IFLA_STATS]) {
3932 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
3933 ofpbuf_delete(reply);
3934 return EPROTO;
3935 }
8b61709d
BP
3936
3937 rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
3938 stats->rx_packets = rtnl_stats->rx_packets;
3939 stats->tx_packets = rtnl_stats->tx_packets;
3940 stats->rx_bytes = rtnl_stats->rx_bytes;
3941 stats->tx_bytes = rtnl_stats->tx_bytes;
3942 stats->rx_errors = rtnl_stats->rx_errors;
3943 stats->tx_errors = rtnl_stats->tx_errors;
3944 stats->rx_dropped = rtnl_stats->rx_dropped;
3945 stats->tx_dropped = rtnl_stats->tx_dropped;
3946 stats->multicast = rtnl_stats->multicast;
3947 stats->collisions = rtnl_stats->collisions;
3948 stats->rx_length_errors = rtnl_stats->rx_length_errors;
3949 stats->rx_over_errors = rtnl_stats->rx_over_errors;
3950 stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
3951 stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
3952 stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
3953 stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
3954 stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
3955 stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
3956 stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
3957 stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
3958 stats->tx_window_errors = rtnl_stats->tx_window_errors;
3959
576e26d7
BP
3960 ofpbuf_delete(reply);
3961
8b61709d
BP
3962 return 0;
3963}
3964
3965static int
3966get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
3967{
3968 static const char fn[] = "/proc/net/dev";
3969 char line[1024];
3970 FILE *stream;
3971 int ln;
3972
3973 stream = fopen(fn, "r");
3974 if (!stream) {
3975 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
3976 return errno;
3977 }
3978
3979 ln = 0;
3980 while (fgets(line, sizeof line, stream)) {
3981 if (++ln >= 3) {
3982 char devname[16];
3983#define X64 "%"SCNu64
3984 if (sscanf(line,
3985 " %15[^:]:"
3986 X64 X64 X64 X64 X64 X64 X64 "%*u"
3987 X64 X64 X64 X64 X64 X64 X64 "%*u",
3988 devname,
3989 &stats->rx_bytes,
3990 &stats->rx_packets,
3991 &stats->rx_errors,
3992 &stats->rx_dropped,
3993 &stats->rx_fifo_errors,
3994 &stats->rx_frame_errors,
3995 &stats->multicast,
3996 &stats->tx_bytes,
3997 &stats->tx_packets,
3998 &stats->tx_errors,
3999 &stats->tx_dropped,
4000 &stats->tx_fifo_errors,
4001 &stats->collisions,
4002 &stats->tx_carrier_errors) != 15) {
4003 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4004 } else if (!strcmp(devname, netdev_name)) {
4005 stats->rx_length_errors = UINT64_MAX;
4006 stats->rx_over_errors = UINT64_MAX;
4007 stats->rx_crc_errors = UINT64_MAX;
4008 stats->rx_missed_errors = UINT64_MAX;
4009 stats->tx_aborted_errors = UINT64_MAX;
4010 stats->tx_heartbeat_errors = UINT64_MAX;
4011 stats->tx_window_errors = UINT64_MAX;
4012 fclose(stream);
4013 return 0;
4014 }
4015 }
4016 }
4017 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4018 fclose(stream);
4019 return ENODEV;
4020}
c1c9c9c4 4021
8b61709d
BP
4022static int
4023get_flags(const struct netdev *netdev, int *flags)
4024{
4025 struct ifreq ifr;
4026 int error;
4027
149f577a
JG
4028 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4029 "SIOCGIFFLAGS");
8b61709d
BP
4030 *flags = ifr.ifr_flags;
4031 return error;
4032}
4033
4034static int
4035set_flags(struct netdev *netdev, int flags)
4036{
4037 struct ifreq ifr;
4038
4039 ifr.ifr_flags = flags;
149f577a
JG
4040 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4041 "SIOCSIFFLAGS");
8b61709d
BP
4042}
4043
4044static int
4045do_get_ifindex(const char *netdev_name)
4046{
4047 struct ifreq ifr;
4048
71d7c22f 4049 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4050 COVERAGE_INC(netdev_get_ifindex);
4051 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4052 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4053 netdev_name, strerror(errno));
4054 return -errno;
4055 }
4056 return ifr.ifr_ifindex;
4057}
4058
4059static int
4060get_ifindex(const struct netdev *netdev_, int *ifindexp)
4061{
149f577a
JG
4062 struct netdev_dev_linux *netdev_dev =
4063 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 4064 *ifindexp = 0;
149f577a 4065 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d
BP
4066 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4067 if (ifindex < 0) {
4068 return -ifindex;
4069 }
149f577a
JG
4070 netdev_dev->cache_valid |= VALID_IFINDEX;
4071 netdev_dev->ifindex = ifindex;
8b61709d 4072 }
149f577a 4073 *ifindexp = netdev_dev->ifindex;
8b61709d
BP
4074 return 0;
4075}
4076
4077static int
4078get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4079{
4080 struct ifreq ifr;
4081 int hwaddr_family;
4082
4083 memset(&ifr, 0, sizeof ifr);
71d7c22f 4084 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4085 COVERAGE_INC(netdev_get_hwaddr);
4086 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
4087 VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4088 netdev_name, strerror(errno));
4089 return errno;
4090 }
4091 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4092 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4093 VLOG_WARN("%s device has unknown hardware address family %d",
4094 netdev_name, hwaddr_family);
4095 }
4096 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4097 return 0;
4098}
4099
4100static int
4101set_etheraddr(const char *netdev_name, int hwaddr_family,
4102 const uint8_t mac[ETH_ADDR_LEN])
4103{
4104 struct ifreq ifr;
4105
4106 memset(&ifr, 0, sizeof ifr);
71d7c22f 4107 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4108 ifr.ifr_hwaddr.sa_family = hwaddr_family;
4109 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4110 COVERAGE_INC(netdev_set_hwaddr);
4111 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4112 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4113 netdev_name, strerror(errno));
4114 return errno;
4115 }
4116 return 0;
4117}
4118
4119static int
0b0544d7 4120netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4121 int cmd, const char *cmd_name)
4122{
4123 struct ifreq ifr;
4124
4125 memset(&ifr, 0, sizeof ifr);
71d7c22f 4126 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4127 ifr.ifr_data = (caddr_t) ecmd;
4128
4129 ecmd->cmd = cmd;
4130 COVERAGE_INC(netdev_ethtool);
4131 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4132 return 0;
4133 } else {
4134 if (errno != EOPNOTSUPP) {
4135 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4136 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4137 } else {
4138 /* The device doesn't support this operation. That's pretty
4139 * common, so there's no point in logging anything. */
4140 }
4141 return errno;
4142 }
4143}
4144
4145static int
149f577a
JG
4146netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4147 const char *cmd_name)
8b61709d 4148{
71d7c22f 4149 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4150 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4151 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4152 strerror(errno));
8b61709d
BP
4153 return errno;
4154 }
4155 return 0;
4156}
f1acd62b
BP
4157
4158static int
4159netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4160 int cmd, const char *cmd_name)
4161{
4162 struct ifreq ifr;
4163 int error;
4164
4165 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4166 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4167 if (!error) {
4168 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4169 *ip = sin->sin_addr;
4170 }
4171 return error;
4172}