]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
Replace most uses of assert by ovs_assert.
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
275707c3 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d
BP
22#include <fcntl.h>
23#include <arpa/inet.h>
24#include <inttypes.h>
c1c9c9c4 25#include <linux/gen_stats.h>
bb7d0e22 26#include <linux/if_ether.h>
8b61709d
BP
27#include <linux/if_tun.h>
28#include <linux/types.h>
29#include <linux/ethtool.h>
63331829 30#include <linux/mii.h>
f8500004 31#include <linux/pkt_cls.h>
6f42c8ea 32#include <linux/pkt_sched.h>
e9e28be3 33#include <linux/rtnetlink.h>
8b61709d
BP
34#include <linux/sockios.h>
35#include <linux/version.h>
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
39#include <netpacket/packet.h>
8b61709d
BP
40#include <net/if.h>
41#include <net/if_arp.h>
42#include <net/if_packet.h>
43#include <net/route.h>
44#include <netinet/in.h>
e9e28be3 45#include <poll.h>
8b61709d
BP
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
e9e28be3
BP
49
50#include "coverage.h"
9fe3b9a2 51#include "dpif-linux.h"
8b61709d
BP
52#include "dynamic-string.h"
53#include "fatal-signal.h"
93b13be8
BP
54#include "hash.h"
55#include "hmap.h"
8b61709d 56#include "netdev-provider.h"
7fbef77a 57#include "netdev-vport.h"
e9e28be3 58#include "netlink.h"
45c8d3a1 59#include "netlink-notifier.h"
2fe27d5a 60#include "netlink-socket.h"
e9e28be3 61#include "ofpbuf.h"
8b61709d
BP
62#include "openflow/openflow.h"
63#include "packets.h"
64#include "poll-loop.h"
21d6e22e 65#include "rtnetlink-link.h"
8b61709d
BP
66#include "socket-util.h"
67#include "shash.h"
19993ef3 68#include "sset.h"
1670c579 69#include "timer.h"
e9e28be3 70#include "vlog.h"
5136ce49 71
d98e6007 72VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 73
d76f09ea
BP
74COVERAGE_DEFINE(netdev_set_policing);
75COVERAGE_DEFINE(netdev_arp_lookup);
76COVERAGE_DEFINE(netdev_get_ifindex);
77COVERAGE_DEFINE(netdev_get_hwaddr);
78COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
79COVERAGE_DEFINE(netdev_get_ethtool);
80COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 81
8b61709d
BP
82\f
83/* These were introduced in Linux 2.6.14, so they might be missing if we have
84 * old headers. */
85#ifndef ADVERTISED_Pause
86#define ADVERTISED_Pause (1 << 13)
87#endif
88#ifndef ADVERTISED_Asym_Pause
89#define ADVERTISED_Asym_Pause (1 << 14)
90#endif
91
e47bd51a
JP
92/* These were introduced in Linux 2.6.24, so they might be missing if we
93 * have old headers. */
94#ifndef ETHTOOL_GFLAGS
95#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96#endif
97#ifndef ETHTOOL_SFLAGS
98#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
99#endif
100
c1c9c9c4
BP
101/* This was introduced in Linux 2.6.25, so it might be missing if we have old
102 * headers. */
103#ifndef TC_RTAB_SIZE
104#define TC_RTAB_SIZE 1024
105#endif
106
2ee6545f 107static struct nln_notifier *netdev_linux_cache_notifier = NULL;
46415c90 108static int cache_notifier_refcount;
8b61709d
BP
109
/* Bits for the 'cache_valid' member of struct netdev_dev_linux.  A set bit
 * means that the corresponding cached value in that structure is valid. */
enum {
    VALID_IFINDEX = 1 << 0,
    VALID_ETHERADDR = 1 << 1,
    VALID_IN4 = 1 << 2,
    VALID_IN6 = 1 << 3,
    VALID_MTU = 1 << 4,
    VALID_POLICING = 1 << 5,
    VALID_VPORT_STAT_ERROR = 1 << 6,
    VALID_DRVINFO = 1 << 7,
    VALID_FEATURES = 1 << 8,
};
121
149f577a
JG
/* Per-device state that is specific to tap devices. */
struct tap_state {
    int fd;                     /* Descriptor from opening /dev/net/tun. */
    bool opened;                /* True once a netdev has taken over 'fd' in
                                 * netdev_linux_listen(). */
};
c1c9c9c4
BP
126\f
127/* Traffic control. */
128
/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;   /* Assigned by tc_init(). */
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};
c1c9c9c4 140
93b13be8
BP
/* One traffic control queue, kept in the 'queues' hmap of a struct tc.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
};
149
150/* A particular kind of traffic control. Each implementation generally maps to
151 * one particular Linux qdisc class.
152 *
153 * The functions below return 0 if successful or a positive errno value on
154 * failure, except where otherwise noted. All of them must be provided, except
155 * where otherwise noted. */
156struct tc_ops {
157 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
158 * This is null for tc_ops_default and tc_ops_other, for which there are no
159 * appropriate values. */
160 const char *linux_name;
161
162 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
163 const char *ovs_name;
164
165 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
166 * queues. The queues are numbered 0 through n_queues - 1. */
167 unsigned int n_queues;
168
169 /* Called to install this TC class on 'netdev'. The implementation should
170 * make the Netlink calls required to set up 'netdev' with the right qdisc
171 * and configure it according to 'details'. The implementation may assume
172 * that the current qdisc is the default; that is, there is no need for it
173 * to delete the current qdisc before installing itself.
174 *
175 * The contents of 'details' should be documented as valid for 'ovs_name'
176 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
177 * (which is built as ovs-vswitchd.conf.db(8)).
178 *
179 * This function must return 0 if and only if it sets 'netdev->tc' to an
180 * initialized 'struct tc'.
181 *
182 * (This function is null for tc_ops_other, which cannot be installed. For
183 * other TC classes it should always be nonnull.) */
79f1cbe9 184 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
185
186 /* Called when the netdev code determines (through a Netlink query) that
187 * this TC class's qdisc is installed on 'netdev', but we didn't install
188 * it ourselves and so don't know any of the details.
189 *
190 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
191 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
192 * implementation should parse the other attributes of 'nlmsg' as
193 * necessary to determine its configuration. If necessary it should also
194 * use Netlink queries to determine the configuration of queues on
195 * 'netdev'.
196 *
197 * This function must return 0 if and only if it sets 'netdev->tc' to an
198 * initialized 'struct tc'. */
199 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
200
201 /* Destroys the data structures allocated by the implementation as part of
202 * 'tc'. (This includes destroying 'tc->queues' by calling
203 * tc_destroy(tc).)
204 *
205 * The implementation should not need to perform any Netlink calls. If
206 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
207 * (But it may not be desirable.)
208 *
209 * This function may be null if 'tc' is trivial. */
210 void (*tc_destroy)(struct tc *tc);
211
212 /* Retrieves details of 'netdev->tc' configuration into 'details'.
213 *
214 * The implementation should not need to perform any Netlink calls, because
215 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
216 * cached the configuration.
217 *
218 * The contents of 'details' should be documented as valid for 'ovs_name'
219 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
220 * (which is built as ovs-vswitchd.conf.db(8)).
221 *
222 * This function may be null if 'tc' is not configurable.
223 */
79f1cbe9 224 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
225
226 /* Reconfigures 'netdev->tc' according to 'details', performing any
227 * required Netlink calls to complete the reconfiguration.
228 *
229 * The contents of 'details' should be documented as valid for 'ovs_name'
230 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
231 * (which is built as ovs-vswitchd.conf.db(8)).
232 *
233 * This function may be null if 'tc' is not configurable.
234 */
79f1cbe9 235 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 236
93b13be8
BP
237 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
238 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
239 *
240 * The contents of 'details' should be documented as valid for 'ovs_name'
241 * in the "other_config" column in the "Queue" table in
242 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
243 *
244 * The implementation should not need to perform any Netlink calls, because
245 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
246 * cached the queue configuration.
247 *
248 * This function may be null if 'tc' does not have queues ('n_queues' is
249 * 0). */
93b13be8 250 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 251 struct smap *details);
c1c9c9c4
BP
252
253 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
254 * 'details', performing any required Netlink calls to complete the
255 * reconfiguration. The caller ensures that 'queue_id' is less than
256 * 'n_queues'.
257 *
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "Queue" table in
260 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
261 *
262 * This function may be null if 'tc' does not have queues or its queues are
263 * not configurable. */
264 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 265 const struct smap *details);
c1c9c9c4 266
93b13be8
BP
267 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
268 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
269 *
270 * This function may be null if 'tc' does not have queues or its queues
271 * cannot be deleted. */
93b13be8 272 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 273
93b13be8
BP
274 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
275 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
276 *
277 * On success, initializes '*stats'.
278 *
279 * This function may be null if 'tc' does not have queues or if it cannot
280 * report queue statistics. */
93b13be8
BP
281 int (*class_get_stats)(const struct netdev *netdev,
282 const struct tc_queue *queue,
c1c9c9c4
BP
283 struct netdev_queue_stats *stats);
284
285 /* Extracts queue stats from 'nlmsg', which is a response to a
286 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
287 *
288 * This function may be null if 'tc' does not have queues or if it cannot
289 * report queue statistics. */
290 int (*class_dump_stats)(const struct netdev *netdev,
291 const struct ofpbuf *nlmsg,
292 netdev_dump_queue_stats_cb *cb, void *aux);
293};
294
295static void
296tc_init(struct tc *tc, const struct tc_ops *ops)
297{
298 tc->ops = ops;
93b13be8 299 hmap_init(&tc->queues);
c1c9c9c4
BP
300}
301
/* Destroys the 'queues' hmap of 'tc'.  Does not free 'tc' itself. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
307
308static const struct tc_ops tc_ops_htb;
a339aa81 309static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
310static const struct tc_ops tc_ops_default;
311static const struct tc_ops tc_ops_other;
312
313static const struct tc_ops *tcs[] = {
314 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 315 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
316 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
317 &tc_ops_other, /* Some other qdisc. */
318 NULL
319};
149f577a 320
c1c9c9c4
BP
321static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
322static unsigned int tc_get_major(unsigned int handle);
323static unsigned int tc_get_minor(unsigned int handle);
324
325static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
326static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
327static unsigned int tc_buffer_per_jiffy(unsigned int rate);
328
329static struct tcmsg *tc_make_request(const struct netdev *, int type,
330 unsigned int flags, struct ofpbuf *);
331static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004
JP
332static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
333static int tc_add_policer(struct netdev *netdev, int kbits_rate,
334 int kbits_burst);
c1c9c9c4
BP
335
336static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
337 struct nlattr **options);
338static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
339 struct nlattr **options,
340 struct netdev_queue_stats *);
341static int tc_query_class(const struct netdev *,
342 unsigned int handle, unsigned int parent,
343 struct ofpbuf **replyp);
344static int tc_delete_class(const struct netdev *, unsigned int handle);
345
346static int tc_del_qdisc(struct netdev *netdev);
347static int tc_query_qdisc(const struct netdev *netdev);
348
349static int tc_calc_cell_log(unsigned int mtu);
350static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
351static void tc_put_rtab(struct ofpbuf *, uint16_t type,
352 const struct tc_ratespec *rate);
353static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
354\f
149f577a
JG
/* Device-level state for a Linux network device. */
struct netdev_dev_linux {
    struct netdev_dev netdev_dev;   /* Base object; see
                                     * netdev_dev_linux_cast(). */

    struct shash_node *shash_node;
    unsigned int cache_valid;       /* Bitmap of VALID_* bits. */
    unsigned int change_seq;        /* Bumped by netdev_dev_linux_changed(),
                                     * skipping 0. */

    bool miimon;                    /* Link status of last poll. */
    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
    struct timer miimon_timer;

    /* The following are figured out "on demand" only.  They are only valid
     * when the corresponding VALID_* bit in 'cache_valid' is set. */
    int ifindex;
    uint8_t etheraddr[ETH_ADDR_LEN];
    struct in_addr address, netmask;
    struct in6_addr in6;
    int mtu;
    unsigned int ifi_flags;         /* Flags passed to the most recent
                                     * netdev_dev_linux_changed() call. */
    long long int carrier_resets;   /* Number of times IFF_RUNNING has been
                                     * seen to toggle. */
    uint32_t kbits_rate;            /* Policing data. */
    uint32_t kbits_burst;
    int vport_stats_error;          /* Cached error code from vport_get_stats().
                                       0 or an errno value. */
    int netdev_mtu_error;           /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
    int ether_addr_error;           /* Cached error code from set/get etheraddr. */
    int netdev_policing_error;      /* Cached error code from set policing. */
    int get_features_error;         /* Cached error code from ETHTOOL_GSET. */
    int get_ifindex_error;          /* Cached error code from SIOCGIFINDEX. */

    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */
    enum netdev_features peer;       /* Cached from ETHTOOL_GSET. */

    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
    struct tc *tc;                   /* Traffic control state, or NULL. */

    /* Class-specific state. */
    union {
        struct tap_state tap;
    } state;
};
397
149f577a
JG
/* An open handle on a Linux network device. */
struct netdev_linux {
    struct netdev netdev;       /* Base object; see netdev_linux_cast(). */
    int fd;                     /* Descriptor used for receiving, or -1 until
                                 * netdev_linux_listen() sets it. */
};
8b61709d 402
76c308b5
BP
403/* Sockets used for ioctl operations. */
404static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 405
ff4ed3c9
BP
406/* A Netlink routing socket that is not subscribed to any multicast groups. */
407static struct nl_sock *rtnl_sock;
408
8b61709d
BP
409/* This is set pretty low because we probably won't learn anything from the
410 * additional log messages. */
411static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
412
15b3596a 413static int netdev_linux_init(void);
6f643e49 414
0b0544d7 415static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 416 int cmd, const char *cmd_name);
149f577a
JG
417static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
418 const char *cmd_name);
f1acd62b
BP
419static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
420 int cmd, const char *cmd_name);
059e5f4f
EJ
421static int get_flags(const struct netdev_dev *, unsigned int *flags);
422static int set_flags(struct netdev *, unsigned int flags);
8b61709d
BP
423static int do_get_ifindex(const char *netdev_name);
424static int get_ifindex(const struct netdev *, int *ifindexp);
425static int do_set_addr(struct netdev *netdev,
426 int ioctl_nr, const char *ioctl_name,
427 struct in_addr addr);
428static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 429static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
8b61709d
BP
430static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
431static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 432static int af_packet_sock(void);
1670c579
EJ
433static void netdev_linux_miimon_run(void);
434static void netdev_linux_miimon_wait(void);
8b61709d 435
15b3596a
JG
436static bool
437is_netdev_linux_class(const struct netdev_class *netdev_class)
438{
439 return netdev_class->init == netdev_linux_init;
440}
441
149f577a
JG
442static struct netdev_dev_linux *
443netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 444{
15b3596a 445 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
cb22974d 446 ovs_assert(is_netdev_linux_class(netdev_class));
15b3596a 447
149f577a 448 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
449}
450
8b61709d
BP
451static struct netdev_linux *
452netdev_linux_cast(const struct netdev *netdev)
453{
15b3596a
JG
454 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
455 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
cb22974d 456 ovs_assert(is_netdev_linux_class(netdev_class));
15b3596a 457
8b61709d
BP
458 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
459}
ff4ed3c9 460\f
8b61709d
BP
461static int
462netdev_linux_init(void)
463{
464 static int status = -1;
465 if (status < 0) {
ff4ed3c9 466 /* Create AF_INET socket. */
8b61709d
BP
467 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
468 status = af_inet_sock >= 0 ? 0 : errno;
469 if (status) {
470 VLOG_ERR("failed to create inet socket: %s", strerror(status));
471 }
ff4ed3c9
BP
472
473 /* Create rtnetlink socket. */
474 if (!status) {
cceb11f5 475 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
476 if (status) {
477 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
478 strerror(status));
479 }
480 }
8b61709d
BP
481 }
482 return status;
483}
484
/* Performs periodic work: runs rtnetlink link processing, then MII
 * monitoring. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
491
/* Counterpart of netdev_linux_run(): calls the wait function of rtnetlink
 * link processing, then that of MII monitoring. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
498
ac4d3bcb 499static void
4f925bd3
PS
500netdev_dev_linux_changed(struct netdev_dev_linux *dev,
501 unsigned int ifi_flags,
502 unsigned int mask)
ac4d3bcb
EJ
503{
504 dev->change_seq++;
505 if (!dev->change_seq) {
506 dev->change_seq++;
507 }
8aa77183
BP
508
509 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
510 dev->carrier_resets++;
511 }
512 dev->ifi_flags = ifi_flags;
513
4f925bd3
PS
514 dev->cache_valid &= mask;
515}
516
517static void
518netdev_dev_linux_update(struct netdev_dev_linux *dev,
519 const struct rtnetlink_link_change *change)
520{
521 if (change->nlmsg_type == RTM_NEWLINK) {
522 /* Keep drv-info */
523 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 524
c7b1b0a5 525 /* Update netdev from rtnl-change msg. */
90a6637d
PS
526 if (change->mtu) {
527 dev->mtu = change->mtu;
528 dev->cache_valid |= VALID_MTU;
529 dev->netdev_mtu_error = 0;
530 }
531
44445cac
PS
532 if (!eth_addr_is_zero(change->addr)) {
533 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
534 dev->cache_valid |= VALID_ETHERADDR;
535 dev->ether_addr_error = 0;
536 }
537
c7b1b0a5
PS
538 dev->ifindex = change->ifi_index;
539 dev->cache_valid |= VALID_IFINDEX;
540 dev->get_ifindex_error = 0;
541
4f925bd3
PS
542 } else {
543 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
544 }
ac4d3bcb
EJ
545}
546
8b61709d 547static void
21d6e22e 548netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 549 void *aux OVS_UNUSED)
8b61709d 550{
149f577a 551 struct netdev_dev_linux *dev;
8b61709d 552 if (change) {
46415c90
JG
553 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
554 if (base_dev) {
15b3596a
JG
555 const struct netdev_class *netdev_class =
556 netdev_dev_get_class(base_dev);
557
558 if (is_netdev_linux_class(netdev_class)) {
559 dev = netdev_dev_linux_cast(base_dev);
4f925bd3 560 netdev_dev_linux_update(dev, change);
15b3596a 561 }
8b61709d
BP
562 }
563 } else {
46415c90 564 struct shash device_shash;
8b61709d 565 struct shash_node *node;
46415c90
JG
566
567 shash_init(&device_shash);
568 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
569 SHASH_FOR_EACH (node, &device_shash) {
059e5f4f 570 unsigned int flags;
3a183124 571
149f577a 572 dev = node->data;
3a183124 573
755be9ea 574 get_flags(&dev->netdev_dev, &flags);
4f925bd3 575 netdev_dev_linux_changed(dev, flags, 0);
8b61709d 576 }
46415c90 577 shash_destroy(&device_shash);
8b61709d
BP
578 }
579}
580
581static int
1f6e0fbd 582cache_notifier_ref(void)
6c88d577 583{
46415c90 584 if (!cache_notifier_refcount) {
cb22974d 585 ovs_assert(!netdev_linux_cache_notifier);
2ee6545f
EJ
586
587 netdev_linux_cache_notifier =
588 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
589
590 if (!netdev_linux_cache_notifier) {
591 return EINVAL;
149f577a
JG
592 }
593 }
46415c90 594 cache_notifier_refcount++;
6c88d577 595
1f6e0fbd
BP
596 return 0;
597}
598
599static void
600cache_notifier_unref(void)
601{
cb22974d 602 ovs_assert(cache_notifier_refcount > 0);
1f6e0fbd 603 if (!--cache_notifier_refcount) {
cb22974d 604 ovs_assert(netdev_linux_cache_notifier);
1f6e0fbd
BP
605 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
606 netdev_linux_cache_notifier = NULL;
607 }
608}
609
610/* Creates system and internal devices. */
611static int
612netdev_linux_create(const struct netdev_class *class, const char *name,
613 struct netdev_dev **netdev_devp)
614{
615 struct netdev_dev_linux *netdev_dev;
616 int error;
617
618 error = cache_notifier_ref();
619 if (error) {
620 return error;
621 }
622
149f577a 623 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 624 netdev_dev->change_seq = 1;
de5cdb90 625 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
c37d4da4 626 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
46415c90 627
149f577a 628 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
629 return 0;
630}
631
5b7448ed
JG
632/* For most types of netdevs we open the device for each call of
633 * netdev_open(). However, this is not the case with tap devices,
634 * since it is only possible to open the device once. In this
635 * situation we share a single file descriptor, and consequently
636 * buffers, across all readers. Therefore once data is read it will
637 * be unavailable to other reads for tap devices. */
a740f0de 638static int
b8dcf5e9 639netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
de5cdb90 640 const char *name, struct netdev_dev **netdev_devp)
a740f0de 641{
149f577a 642 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
643 struct tap_state *state;
644 static const char tap_dev[] = "/dev/net/tun";
645 struct ifreq ifr;
646 int error;
647
149f577a
JG
648 netdev_dev = xzalloc(sizeof *netdev_dev);
649 state = &netdev_dev->state.tap;
a740f0de 650
1f6e0fbd
BP
651 error = cache_notifier_ref();
652 if (error) {
653 goto error;
654 }
655
6c88d577 656 /* Open tap device. */
149f577a
JG
657 state->fd = open(tap_dev, O_RDWR);
658 if (state->fd < 0) {
6c88d577
JP
659 error = errno;
660 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
1f6e0fbd 661 goto error_unref_notifier;
6c88d577
JP
662 }
663
664 /* Create tap device. */
665 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 666 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 667 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
668 VLOG_WARN("%s: creating tap device failed: %s", name,
669 strerror(errno));
670 error = errno;
1f6e0fbd 671 goto error_unref_notifier;
6c88d577
JP
672 }
673
674 /* Make non-blocking. */
149f577a 675 error = set_nonblocking(state->fd);
a740f0de 676 if (error) {
1f6e0fbd 677 goto error_unref_notifier;
a740f0de
JG
678 }
679
de5cdb90 680 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
149f577a 681 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
682 return 0;
683
1f6e0fbd
BP
684error_unref_notifier:
685 cache_notifier_unref();
a740f0de 686error:
149f577a 687 free(netdev_dev);
a740f0de
JG
688 return error;
689}
690
a740f0de 691static void
149f577a 692destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 693{
149f577a
JG
694 struct tap_state *state = &netdev_dev->state.tap;
695
696 if (state->fd >= 0) {
697 close(state->fd);
a740f0de
JG
698 }
699}
700
149f577a 701/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 702static void
149f577a 703netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 704{
149f577a 705 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 706 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 707
c1c9c9c4
BP
708 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
709 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
710 }
711
1f6e0fbd 712 if (class == &netdev_tap_class) {
149f577a 713 destroy_tap(netdev_dev);
6c88d577 714 }
658797c8 715 free(netdev_dev);
1f6e0fbd
BP
716
717 cache_notifier_unref();
6c88d577
JP
718}
719
8b61709d 720static int
7b6b0ef4 721netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d
BP
722{
723 struct netdev_linux *netdev;
724 enum netdev_flags flags;
725 int error;
726
727 /* Allocate network device. */
ec6fde61 728 netdev = xzalloc(sizeof *netdev);
49a6a163 729 netdev->fd = -1;
5b7448ed 730 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 731
c3827f61
BP
732 /* Verify that the device really exists, by attempting to read its flags.
733 * (The flags might be cached, in which case this won't actually do an
734 * ioctl.)
735 *
736 * Don't do this for "internal" netdevs, though, because those have to be
737 * created as netdev objects before they exist in the kernel, because
738 * creating them in the kernel happens by passing a netdev object to
739 * dpif_port_add(). */
740 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
741 error = netdev_get_flags(&netdev->netdev, &flags);
742 if (error == ENODEV) {
743 goto error;
744 }
8b61709d
BP
745 }
746
8b61709d
BP
747 *netdevp = &netdev->netdev;
748 return 0;
749
750error:
149f577a 751 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
752 return error;
753}
754
755/* Closes and destroys 'netdev'. */
756static void
757netdev_linux_close(struct netdev *netdev_)
758{
759 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
760
49a6a163 761 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 762 close(netdev->fd);
8b61709d
BP
763 }
764 free(netdev);
765}
e9e28be3 766
7b6b0ef4
BP
767static int
768netdev_linux_listen(struct netdev *netdev_)
769{
770 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
33d82a56
JP
771 struct netdev_dev_linux *netdev_dev =
772 netdev_dev_linux_cast(netdev_get_dev(netdev_));
7b6b0ef4
BP
773 struct sockaddr_ll sll;
774 int ifindex;
775 int error;
776 int fd;
777
778 if (netdev->fd >= 0) {
779 return 0;
780 }
781
33d82a56
JP
782 if (!strcmp(netdev_get_type(netdev_), "tap")
783 && !netdev_dev->state.tap.opened) {
784 netdev->fd = netdev_dev->state.tap.fd;
785 netdev_dev->state.tap.opened = true;
786 return 0;
787 }
788
7b6b0ef4
BP
789 /* Create file descriptor. */
790 fd = socket(PF_PACKET, SOCK_RAW, 0);
791 if (fd < 0) {
792 error = errno;
793 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
794 goto error;
795 }
796
797 /* Set non-blocking mode. */
798 error = set_nonblocking(fd);
799 if (error) {
800 goto error;
801 }
802
803 /* Get ethernet device index. */
804 error = get_ifindex(&netdev->netdev, &ifindex);
805 if (error) {
806 goto error;
807 }
808
809 /* Bind to specific ethernet device. */
810 memset(&sll, 0, sizeof sll);
811 sll.sll_family = AF_PACKET;
812 sll.sll_ifindex = ifindex;
813 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
814 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
815 error = errno;
816 VLOG_ERR("%s: failed to bind raw socket (%s)",
817 netdev_get_name(netdev_), strerror(error));
818 goto error;
819 }
820
821 netdev->fd = fd;
822 return 0;
823
824error:
825 if (fd >= 0) {
826 close(fd);
827 }
828 return error;
829}
830
8b61709d
BP
831static int
832netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
833{
834 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
835
5b7448ed 836 if (netdev->fd < 0) {
7b6b0ef4 837 /* Device is not listening. */
c0e5f6ca 838 return -EAGAIN;
8b61709d
BP
839 }
840
841 for (;;) {
8e8cddf7
BP
842 ssize_t retval;
843
844 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
845 ? read(netdev->fd, data, size)
846 : recv(netdev->fd, data, size, MSG_TRUNC));
0e15264f
BP
847 if (retval >= 0) {
848 return retval <= size ? retval : -EMSGSIZE;
8b61709d
BP
849 } else if (errno != EINTR) {
850 if (errno != EAGAIN) {
851 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
852 strerror(errno), netdev_get_name(netdev_));
853 }
c0e5f6ca 854 return -errno;
8b61709d
BP
855 }
856 }
857}
858
859/* Registers with the poll loop to wake up from the next call to poll_block()
860 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
861static void
862netdev_linux_recv_wait(struct netdev *netdev_)
863{
864 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
865 if (netdev->fd >= 0) {
866 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
867 }
868}
869
870/* Discards all packets waiting to be received from 'netdev'. */
871static int
872netdev_linux_drain(struct netdev *netdev_)
873{
874 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 875 if (netdev->fd < 0) {
8b61709d 876 return 0;
5b7448ed 877 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 878 struct ifreq ifr;
149f577a 879 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
880 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
881 if (error) {
882 return error;
883 }
5b7448ed 884 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
885 return 0;
886 } else {
5b7448ed 887 return drain_rcvbuf(netdev->fd);
8b61709d
BP
888 }
889}
890
891/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
892 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
893 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
894 * the packet is too big or too small to transmit on the device.
895 *
896 * The caller retains ownership of 'buffer' in all cases.
897 *
898 * The kernel maintains a packet transmission queue, so the caller is not
899 * expected to do additional queuing of packets. */
900static int
901netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
902{
f23347ea
BP
903 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
904 for (;;) {
905 ssize_t retval;
8b61709d 906
f23347ea
BP
907 if (netdev->fd < 0) {
908 /* Use our AF_PACKET socket to send to this device. */
909 struct sockaddr_ll sll;
910 struct msghdr msg;
911 struct iovec iov;
912 int ifindex;
913 int error;
488d734d
BP
914 int sock;
915
916 sock = af_packet_sock();
917 if (sock < 0) {
918 return sock;
919 }
f23347ea
BP
920
921 error = get_ifindex(netdev_, &ifindex);
922 if (error) {
923 return error;
924 }
8b61709d 925
f23347ea
BP
926 /* We don't bother setting most fields in sockaddr_ll because the
927 * kernel ignores them for SOCK_RAW. */
928 memset(&sll, 0, sizeof sll);
929 sll.sll_family = AF_PACKET;
930 sll.sll_ifindex = ifindex;
76c308b5 931
ebc56baa 932 iov.iov_base = CONST_CAST(void *, data);
f23347ea 933 iov.iov_len = size;
76c308b5 934
f23347ea
BP
935 msg.msg_name = &sll;
936 msg.msg_namelen = sizeof sll;
937 msg.msg_iov = &iov;
938 msg.msg_iovlen = 1;
939 msg.msg_control = NULL;
940 msg.msg_controllen = 0;
941 msg.msg_flags = 0;
942
488d734d 943 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
944 } else {
945 /* Use the netdev's own fd to send to this device. This is
946 * essential for tap devices, because packets sent to a tap device
947 * with an AF_PACKET socket will loop back to be *received* again
948 * on the tap device. */
949 retval = write(netdev->fd, data, size);
950 }
76c308b5 951
8b61709d
BP
952 if (retval < 0) {
953 /* The Linux AF_PACKET implementation never blocks waiting for room
954 * for packets, instead returning ENOBUFS. Translate this into
955 * EAGAIN for the caller. */
956 if (errno == ENOBUFS) {
957 return EAGAIN;
958 } else if (errno == EINTR) {
959 continue;
960 } else if (errno != EAGAIN) {
961 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
962 netdev_get_name(netdev_), strerror(errno));
963 }
964 return errno;
965 } else if (retval != size) {
966 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
967 "%zu) on %s", retval, size, netdev_get_name(netdev_));
968 return EMSGSIZE;
969 } else {
970 return 0;
971 }
972 }
973}
974
975/* Registers with the poll loop to wake up from the next call to poll_block()
976 * when the packet transmission queue has sufficient room to transmit a packet
977 * with netdev_send().
978 *
979 * The kernel maintains a packet transmission queue, so the client is not
980 * expected to do additional queuing of packets. Thus, this function is
981 * unlikely to ever be used. It is included for completeness. */
982static void
983netdev_linux_send_wait(struct netdev *netdev_)
984{
985 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 986 if (netdev->fd < 0) {
8b61709d 987 /* Nothing to do. */
5b7448ed
JG
988 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
989 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
990 } else {
991 /* TAP device always accepts packets.*/
992 poll_immediate_wake();
993 }
994}
995
996/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
997 * otherwise a positive errno value. */
998static int
999netdev_linux_set_etheraddr(struct netdev *netdev_,
1000 const uint8_t mac[ETH_ADDR_LEN])
1001{
149f577a
JG
1002 struct netdev_dev_linux *netdev_dev =
1003 netdev_dev_linux_cast(netdev_get_dev(netdev_));
eb395f2e 1004 int error;
7eb1bd81 1005 bool up_again = false;
eb395f2e 1006
44445cac
PS
1007 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1008 if (netdev_dev->ether_addr_error) {
1009 return netdev_dev->ether_addr_error;
1010 }
1011 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1012 return 0;
1013 }
1014 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1015 }
1016
7eb1bd81
JP
1017 /* Tap devices must be brought down before setting the address. */
1018 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1019 enum netdev_flags flags;
1020
1021 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
1022 netdev_turn_flags_off(netdev_, NETDEV_UP, false);
1023 up_again = true;
1024 }
1025 }
44445cac
PS
1026 error = set_etheraddr(netdev_get_name(netdev_), mac);
1027 if (!error || error == ENODEV) {
1028 netdev_dev->ether_addr_error = error;
1029 netdev_dev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1030 if (!error) {
149f577a 1031 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 1032 }
8b61709d 1033 }
44445cac 1034
7eb1bd81
JP
1035 if (up_again) {
1036 netdev_turn_flags_on(netdev_, NETDEV_UP, false);
1037 }
1038
8b61709d
BP
1039 return error;
1040}
1041
44445cac 1042/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1043static int
1044netdev_linux_get_etheraddr(const struct netdev *netdev_,
1045 uint8_t mac[ETH_ADDR_LEN])
1046{
149f577a
JG
1047 struct netdev_dev_linux *netdev_dev =
1048 netdev_dev_linux_cast(netdev_get_dev(netdev_));
44445cac 1049
149f577a 1050 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 1051 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 1052 netdev_dev->etheraddr);
44445cac
PS
1053
1054 netdev_dev->ether_addr_error = error;
149f577a 1055 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 1056 }
44445cac
PS
1057
1058 if (!netdev_dev->ether_addr_error) {
1059 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1060 }
1061
1062 return netdev_dev->ether_addr_error;
8b61709d
BP
1063}
1064
1065/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1066 * in bytes, not including the hardware header; thus, this is typically 1500
1067 * bytes for Ethernet devices. */
1068static int
1069netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1070{
149f577a
JG
1071 struct netdev_dev_linux *netdev_dev =
1072 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1073 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
1074 struct ifreq ifr;
1075 int error;
1076
149f577a
JG
1077 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1078 SIOCGIFMTU, "SIOCGIFMTU");
90a6637d
PS
1079
1080 netdev_dev->netdev_mtu_error = error;
149f577a
JG
1081 netdev_dev->mtu = ifr.ifr_mtu;
1082 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 1083 }
90a6637d
PS
1084
1085 if (!netdev_dev->netdev_mtu_error) {
1086 *mtup = netdev_dev->mtu;
1087 }
1088 return netdev_dev->netdev_mtu_error;
8b61709d
BP
1089}
1090
9b020780
PS
1091/* Sets the maximum size of transmitted (MTU) for given device using linux
1092 * networking ioctl interface.
1093 */
1094static int
1095netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1096{
1097 struct netdev_dev_linux *netdev_dev =
1098 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1099 struct ifreq ifr;
1100 int error;
1101
90a6637d
PS
1102 if (netdev_dev->cache_valid & VALID_MTU) {
1103 if (netdev_dev->netdev_mtu_error) {
1104 return netdev_dev->netdev_mtu_error;
1105 }
1106 if (netdev_dev->mtu == mtu) {
1107 return 0;
1108 }
1109 netdev_dev->cache_valid &= ~VALID_MTU;
153e5481 1110 }
9b020780
PS
1111 ifr.ifr_mtu = mtu;
1112 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1113 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d
PS
1114 if (!error || error == ENODEV) {
1115 netdev_dev->netdev_mtu_error = error;
1116 netdev_dev->mtu = ifr.ifr_mtu;
1117 netdev_dev->cache_valid |= VALID_MTU;
9b020780 1118 }
90a6637d 1119 return error;
9b020780
PS
1120}
1121
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev' with get_ifindex().  Returns the ifindex
 * if the lookup succeeds, otherwise the negated error value from
 * get_ifindex(). */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1132
8b61709d
BP
1133static int
1134netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1135{
149f577a
JG
1136 struct netdev_dev_linux *netdev_dev =
1137 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1138
1670c579
EJ
1139 if (netdev_dev->miimon_interval > 0) {
1140 *carrier = netdev_dev->miimon;
3a183124 1141 } else {
c37d4da4 1142 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1143 }
8b61709d 1144
3a183124 1145 return 0;
8b61709d
BP
1146}
1147
65c3058c
EJ
1148static long long int
1149netdev_linux_get_carrier_resets(const struct netdev *netdev)
1150{
1151 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1152}
1153
63331829 1154static int
1670c579
EJ
1155netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1156 struct mii_ioctl_data *data)
63331829 1157{
63331829 1158 struct ifreq ifr;
782e6111 1159 int error;
63331829 1160
63331829 1161 memset(&ifr, 0, sizeof ifr);
782e6111 1162 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1163 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1164 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1165
782e6111
EJ
1166 return error;
1167}
1168
1169static int
1670c579 1170netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1171{
782e6111
EJ
1172 struct mii_ioctl_data data;
1173 int error;
63331829 1174
782e6111
EJ
1175 *miimon = false;
1176
1177 memset(&data, 0, sizeof data);
1670c579 1178 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1179 if (!error) {
1180 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1181 data.reg_num = MII_BMSR;
1670c579 1182 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1183 &data);
63331829
EJ
1184
1185 if (!error) {
782e6111 1186 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1187 } else {
1188 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1189 }
1190 } else {
1191 struct ethtool_cmd ecmd;
63331829
EJ
1192
1193 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1194 name);
1195
ab985a77 1196 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1197 memset(&ecmd, 0, sizeof ecmd);
1198 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1199 "ETHTOOL_GLINK");
1200 if (!error) {
782e6111
EJ
1201 struct ethtool_value eval;
1202
1203 memcpy(&eval, &ecmd, sizeof eval);
1204 *miimon = !!eval.data;
63331829
EJ
1205 } else {
1206 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1207 }
1208 }
1209
1210 return error;
1211}
1212
1670c579
EJ
1213static int
1214netdev_linux_set_miimon_interval(struct netdev *netdev_,
1215 long long int interval)
1216{
1217 struct netdev_dev_linux *netdev_dev;
1218
1219 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1220
1221 interval = interval > 0 ? MAX(interval, 100) : 0;
1222 if (netdev_dev->miimon_interval != interval) {
1223 netdev_dev->miimon_interval = interval;
1224 timer_set_expired(&netdev_dev->miimon_timer);
1225 }
1226
1227 return 0;
1228}
1229
1230static void
1231netdev_linux_miimon_run(void)
1232{
1233 struct shash device_shash;
1234 struct shash_node *node;
1235
1236 shash_init(&device_shash);
1237 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1238 SHASH_FOR_EACH (node, &device_shash) {
1239 struct netdev_dev_linux *dev = node->data;
1240 bool miimon;
1241
1242 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1243 continue;
1244 }
1245
1246 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1247 if (miimon != dev->miimon) {
1670c579 1248 dev->miimon = miimon;
4f925bd3 1249 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1670c579
EJ
1250 }
1251
1252 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1253 }
1254
1255 shash_destroy(&device_shash);
1256}
1257
1258static void
1259netdev_linux_miimon_wait(void)
1260{
1261 struct shash device_shash;
1262 struct shash_node *node;
1263
1264 shash_init(&device_shash);
1265 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1266 SHASH_FOR_EACH (node, &device_shash) {
1267 struct netdev_dev_linux *dev = node->data;
1268
1269 if (dev->miimon_interval > 0) {
1270 timer_wait(&dev->miimon_timer);
1271 }
1272 }
1273 shash_destroy(&device_shash);
1274}
1275
8b61709d
BP
1276/* Check whether we can we use RTM_GETLINK to get network device statistics.
1277 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1278 * enabled. */
1279static bool
1280check_for_working_netlink_stats(void)
1281{
1282 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1283 * preferable, so if that works, we'll use it. */
1284 int ifindex = do_get_ifindex("lo");
1285 if (ifindex < 0) {
1286 VLOG_WARN("failed to get ifindex for lo, "
1287 "obtaining netdev stats from proc");
1288 return false;
1289 } else {
1290 struct netdev_stats stats;
1291 int error = get_stats_via_netlink(ifindex, &stats);
1292 if (!error) {
1293 VLOG_DBG("obtaining netdev stats via rtnetlink");
1294 return true;
1295 } else {
1296 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1297 "via proc (you are probably running a pre-2.6.19 "
1298 "kernel)", strerror(error));
1299 return false;
1300 }
1301 }
1302}
1303
92df599c
JG
/* Exchanges the values stored in '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_a = *a;
    const uint64_t old_b = *b;

    *a = old_b;
    *b = old_a;
}
1311
f613a0d7
PS
1312static void
1313get_stats_via_vport(const struct netdev *netdev_,
1314 struct netdev_stats *stats)
8b61709d 1315{
149f577a
JG
1316 struct netdev_dev_linux *netdev_dev =
1317 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1318
bba1e6f3
PS
1319 if (!netdev_dev->vport_stats_error ||
1320 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1321 int error;
7fbef77a
JG
1322
1323 error = netdev_vport_get_stats(netdev_, stats);
bcb1f5a1 1324 if (error && error != ENOENT) {
a57a8488
BP
1325 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1326 "(%s)", netdev_get_name(netdev_), strerror(error));
f613a0d7 1327 }
bba1e6f3
PS
1328 netdev_dev->vport_stats_error = error;
1329 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1330 }
f613a0d7 1331}
8b61709d 1332
f613a0d7
PS
1333static int
1334netdev_linux_sys_get_stats(const struct netdev *netdev_,
1335 struct netdev_stats *stats)
1336{
1337 static int use_netlink_stats = -1;
1338 int error;
1339
1340 if (use_netlink_stats < 0) {
1341 use_netlink_stats = check_for_working_netlink_stats();
1342 }
1343
1344 if (use_netlink_stats) {
1345 int ifindex;
1346
1347 error = get_ifindex(netdev_, &ifindex);
1348 if (!error) {
1349 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1350 }
f613a0d7
PS
1351 } else {
1352 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1353 }
7fbef77a 1354
f613a0d7
PS
1355 if (error) {
1356 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1357 netdev_get_name(netdev_), error);
1358 }
1359 return error;
1360
1361}
1362
1363/* Retrieves current device stats for 'netdev-linux'. */
1364static int
1365netdev_linux_get_stats(const struct netdev *netdev_,
1366 struct netdev_stats *stats)
1367{
1368 struct netdev_dev_linux *netdev_dev =
1369 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1370 struct netdev_stats dev_stats;
1371 int error;
1372
1373 get_stats_via_vport(netdev_, stats);
1374
1375 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1376
1377 if (error) {
bba1e6f3 1378 if (netdev_dev->vport_stats_error) {
f613a0d7 1379 return error;
7fbef77a 1380 } else {
f613a0d7
PS
1381 return 0;
1382 }
1383 }
1384
bba1e6f3 1385 if (netdev_dev->vport_stats_error) {
f613a0d7
PS
1386 /* stats not available from OVS then use ioctl stats. */
1387 *stats = dev_stats;
1388 } else {
1389 stats->rx_errors += dev_stats.rx_errors;
1390 stats->tx_errors += dev_stats.tx_errors;
1391 stats->rx_dropped += dev_stats.rx_dropped;
1392 stats->tx_dropped += dev_stats.tx_dropped;
1393 stats->multicast += dev_stats.multicast;
1394 stats->collisions += dev_stats.collisions;
1395 stats->rx_length_errors += dev_stats.rx_length_errors;
1396 stats->rx_over_errors += dev_stats.rx_over_errors;
1397 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1398 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1399 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1400 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1401 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1402 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1403 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1404 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1405 stats->tx_window_errors += dev_stats.tx_window_errors;
1406 }
1407 return 0;
1408}
1409
1410/* Retrieves current device stats for 'netdev-tap' netdev or
1411 * netdev-internal. */
1412static int
bba1e6f3 1413netdev_tap_get_stats(const struct netdev *netdev_,
f613a0d7
PS
1414 struct netdev_stats *stats)
1415{
1416 struct netdev_dev_linux *netdev_dev =
1417 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1418 struct netdev_stats dev_stats;
1419 int error;
1420
1421 get_stats_via_vport(netdev_, stats);
1422
1423 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1424 if (error) {
bba1e6f3 1425 if (netdev_dev->vport_stats_error) {
f613a0d7
PS
1426 return error;
1427 } else {
1428 return 0;
8b61709d 1429 }
8b61709d 1430 }
fe6b0e03
JG
1431
1432 /* If this port is an internal port then the transmit and receive stats
1433 * will appear to be swapped relative to the other ports since we are the
1434 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1435 * them back here. This does not apply if we are getting stats from the
1436 * vport layer because it always tracks stats from the perspective of the
1437 * switch. */
bba1e6f3 1438 if (netdev_dev->vport_stats_error) {
f613a0d7 1439 *stats = dev_stats;
92df599c
JG
1440 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1441 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1442 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1443 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1444 stats->rx_length_errors = 0;
1445 stats->rx_over_errors = 0;
1446 stats->rx_crc_errors = 0;
1447 stats->rx_frame_errors = 0;
1448 stats->rx_fifo_errors = 0;
1449 stats->rx_missed_errors = 0;
1450 stats->tx_aborted_errors = 0;
1451 stats->tx_carrier_errors = 0;
1452 stats->tx_fifo_errors = 0;
1453 stats->tx_heartbeat_errors = 0;
1454 stats->tx_window_errors = 0;
f613a0d7
PS
1455 } else {
1456 stats->rx_dropped += dev_stats.tx_dropped;
1457 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1458
f613a0d7
PS
1459 stats->rx_errors += dev_stats.tx_errors;
1460 stats->tx_errors += dev_stats.rx_errors;
1461
1462 stats->multicast += dev_stats.multicast;
1463 stats->collisions += dev_stats.collisions;
1464 }
1465 return 0;
8b61709d
BP
1466}
1467
bba1e6f3
PS
1468static int
1469netdev_internal_get_stats(const struct netdev *netdev_,
1470 struct netdev_stats *stats)
1471{
1472 struct netdev_dev_linux *netdev_dev =
1473 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1474
1475 get_stats_via_vport(netdev_, stats);
1476 return netdev_dev->vport_stats_error;
1477}
1478
2f31a822
EJ
1479static int
1480netdev_internal_set_stats(struct netdev *netdev,
1481 const struct netdev_stats *stats)
1482{
1483 struct ovs_vport_stats vport_stats;
1484 struct dpif_linux_vport vport;
1485 int err;
1486
1487 vport_stats.rx_packets = stats->rx_packets;
1488 vport_stats.tx_packets = stats->tx_packets;
1489 vport_stats.rx_bytes = stats->rx_bytes;
1490 vport_stats.tx_bytes = stats->tx_bytes;
1491 vport_stats.rx_errors = stats->rx_errors;
1492 vport_stats.tx_errors = stats->tx_errors;
1493 vport_stats.rx_dropped = stats->rx_dropped;
1494 vport_stats.tx_dropped = stats->tx_dropped;
1495
1496 dpif_linux_vport_init(&vport);
1497 vport.cmd = OVS_VPORT_CMD_SET;
1498 vport.name = netdev_get_name(netdev);
1499 vport.stats = &vport_stats;
1500
1501 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1502
1503 /* If the vport layer doesn't know about the device, that doesn't mean it
1504 * doesn't exist (after all were able to open it when netdev_open() was
1505 * called), it just means that it isn't attached and we'll be getting
1506 * stats a different way. */
1507 if (err == ENODEV) {
1508 err = EOPNOTSUPP;
1509 }
1510
1511 return err;
1512}
1513
51f87458
PS
1514static void
1515netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
8b61709d
BP
1516{
1517 struct ethtool_cmd ecmd;
6c038611 1518 uint32_t speed;
8b61709d
BP
1519 int error;
1520
51f87458
PS
1521 if (netdev_dev->cache_valid & VALID_FEATURES) {
1522 return;
1523 }
1524
ab985a77 1525 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1526 memset(&ecmd, 0, sizeof ecmd);
51f87458 1527 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
8b61709d
BP
1528 ETHTOOL_GSET, "ETHTOOL_GSET");
1529 if (error) {
51f87458 1530 goto out;
8b61709d
BP
1531 }
1532
1533 /* Supported features. */
51f87458 1534 netdev_dev->supported = 0;
8b61709d 1535 if (ecmd.supported & SUPPORTED_10baseT_Half) {
51f87458 1536 netdev_dev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1537 }
1538 if (ecmd.supported & SUPPORTED_10baseT_Full) {
51f87458 1539 netdev_dev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1540 }
1541 if (ecmd.supported & SUPPORTED_100baseT_Half) {
51f87458 1542 netdev_dev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1543 }
1544 if (ecmd.supported & SUPPORTED_100baseT_Full) {
51f87458 1545 netdev_dev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1546 }
1547 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
51f87458 1548 netdev_dev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1549 }
1550 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
51f87458 1551 netdev_dev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1552 }
1553 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
51f87458 1554 netdev_dev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1555 }
1556 if (ecmd.supported & SUPPORTED_TP) {
51f87458 1557 netdev_dev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1558 }
1559 if (ecmd.supported & SUPPORTED_FIBRE) {
51f87458 1560 netdev_dev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1561 }
1562 if (ecmd.supported & SUPPORTED_Autoneg) {
51f87458 1563 netdev_dev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1564 }
1565 if (ecmd.supported & SUPPORTED_Pause) {
51f87458 1566 netdev_dev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1567 }
1568 if (ecmd.supported & SUPPORTED_Asym_Pause) {
51f87458 1569 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1570 }
1571
1572 /* Advertised features. */
51f87458 1573 netdev_dev->advertised = 0;
8b61709d 1574 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
51f87458 1575 netdev_dev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1576 }
1577 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
51f87458 1578 netdev_dev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1579 }
1580 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
51f87458 1581 netdev_dev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1582 }
1583 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
51f87458 1584 netdev_dev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1585 }
1586 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
51f87458 1587 netdev_dev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1588 }
1589 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
51f87458 1590 netdev_dev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1591 }
1592 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
51f87458 1593 netdev_dev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1594 }
1595 if (ecmd.advertising & ADVERTISED_TP) {
51f87458 1596 netdev_dev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1597 }
1598 if (ecmd.advertising & ADVERTISED_FIBRE) {
51f87458 1599 netdev_dev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1600 }
1601 if (ecmd.advertising & ADVERTISED_Autoneg) {
51f87458 1602 netdev_dev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1603 }
1604 if (ecmd.advertising & ADVERTISED_Pause) {
51f87458 1605 netdev_dev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1606 }
1607 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
51f87458 1608 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1609 }
1610
1611 /* Current settings. */
2a529ead 1612 speed = ecmd.speed;
6c038611 1613 if (speed == SPEED_10) {
51f87458 1614 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1615 } else if (speed == SPEED_100) {
51f87458 1616 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1617 } else if (speed == SPEED_1000) {
51f87458 1618 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1619 } else if (speed == SPEED_10000) {
51f87458 1620 netdev_dev->current = NETDEV_F_10GB_FD;
6c038611 1621 } else if (speed == 40000) {
51f87458 1622 netdev_dev->current = NETDEV_F_40GB_FD;
6c038611 1623 } else if (speed == 100000) {
51f87458 1624 netdev_dev->current = NETDEV_F_100GB_FD;
6c038611 1625 } else if (speed == 1000000) {
51f87458 1626 netdev_dev->current = NETDEV_F_1TB_FD;
8b61709d 1627 } else {
51f87458 1628 netdev_dev->current = 0;
8b61709d
BP
1629 }
1630
1631 if (ecmd.port == PORT_TP) {
51f87458 1632 netdev_dev->current |= NETDEV_F_COPPER;
8b61709d 1633 } else if (ecmd.port == PORT_FIBRE) {
51f87458 1634 netdev_dev->current |= NETDEV_F_FIBER;
8b61709d
BP
1635 }
1636
1637 if (ecmd.autoneg) {
51f87458 1638 netdev_dev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1639 }
1640
1641 /* Peer advertisements. */
51f87458 1642 netdev_dev->peer = 0; /* XXX */
8b61709d 1643
51f87458
PS
1644out:
1645 netdev_dev->cache_valid |= VALID_FEATURES;
1646 netdev_dev->get_features_error = error;
1647}
1648
1649/* Stores the features supported by 'netdev' into each of '*current',
1650 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1651 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1652 * errno value. */
1653static int
1654netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1655 enum netdev_features *current,
1656 enum netdev_features *advertised,
1657 enum netdev_features *supported,
1658 enum netdev_features *peer)
51f87458
PS
1659{
1660 struct netdev_dev_linux *netdev_dev =
1661 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1662
1663 netdev_linux_read_features(netdev_dev);
1664
1665 if (!netdev_dev->get_features_error) {
1666 *current = netdev_dev->current;
1667 *advertised = netdev_dev->advertised;
1668 *supported = netdev_dev->supported;
1669 *peer = netdev_dev->peer;
1670 }
1671 return netdev_dev->get_features_error;
8b61709d
BP
1672}
1673
1674/* Set the features advertised by 'netdev' to 'advertise'. */
1675static int
6c038611
BP
1676netdev_linux_set_advertisements(struct netdev *netdev,
1677 enum netdev_features advertise)
8b61709d
BP
1678{
1679 struct ethtool_cmd ecmd;
1680 int error;
1681
ab985a77 1682 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1683 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1684 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1685 ETHTOOL_GSET, "ETHTOOL_GSET");
1686 if (error) {
1687 return error;
1688 }
1689
1690 ecmd.advertising = 0;
6c038611 1691 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1692 ecmd.advertising |= ADVERTISED_10baseT_Half;
1693 }
6c038611 1694 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1695 ecmd.advertising |= ADVERTISED_10baseT_Full;
1696 }
6c038611 1697 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1698 ecmd.advertising |= ADVERTISED_100baseT_Half;
1699 }
6c038611 1700 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1701 ecmd.advertising |= ADVERTISED_100baseT_Full;
1702 }
6c038611 1703 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1704 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1705 }
6c038611 1706 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1707 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1708 }
6c038611 1709 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1710 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1711 }
6c038611 1712 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1713 ecmd.advertising |= ADVERTISED_TP;
1714 }
6c038611 1715 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1716 ecmd.advertising |= ADVERTISED_FIBRE;
1717 }
6c038611 1718 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1719 ecmd.advertising |= ADVERTISED_Autoneg;
1720 }
6c038611 1721 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1722 ecmd.advertising |= ADVERTISED_Pause;
1723 }
6c038611 1724 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1725 ecmd.advertising |= ADVERTISED_Asym_Pause;
1726 }
ab985a77 1727 COVERAGE_INC(netdev_set_ethtool);
0b0544d7 1728 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1729 ETHTOOL_SSET, "ETHTOOL_SSET");
1730}
1731
f8500004
JP
1732/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1733 * successful, otherwise a positive errno value. */
8b61709d
BP
1734static int
1735netdev_linux_set_policing(struct netdev *netdev,
1736 uint32_t kbits_rate, uint32_t kbits_burst)
1737{
80a86fbe
BP
1738 struct netdev_dev_linux *netdev_dev =
1739 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d 1740 const char *netdev_name = netdev_get_name(netdev);
f8500004 1741 int error;
8b61709d 1742
8e460221 1743
80a86fbe
BP
1744 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1745 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1746 : kbits_burst); /* Stick with user-specified value. */
1747
c9f71668
PS
1748 if (netdev_dev->cache_valid & VALID_POLICING) {
1749 if (netdev_dev->netdev_policing_error) {
1750 return netdev_dev->netdev_policing_error;
1751 }
1752
1753 if (netdev_dev->kbits_rate == kbits_rate &&
1754 netdev_dev->kbits_burst == kbits_burst) {
1755 /* Assume that settings haven't changed since we last set them. */
1756 return 0;
1757 }
1758 netdev_dev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1759 }
1760
ac8c3412 1761 COVERAGE_INC(netdev_set_policing);
f8500004
JP
1762 /* Remove any existing ingress qdisc. */
1763 error = tc_add_del_ingress_qdisc(netdev, false);
1764 if (error) {
1765 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1766 netdev_name, strerror(error));
c9f71668 1767 goto out;
f8500004
JP
1768 }
1769
8b61709d 1770 if (kbits_rate) {
f8500004
JP
1771 error = tc_add_del_ingress_qdisc(netdev, true);
1772 if (error) {
1773 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1774 netdev_name, strerror(error));
c9f71668 1775 goto out;
8b61709d
BP
1776 }
1777
f8500004
JP
1778 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1779 if (error){
1780 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1781 netdev_name, strerror(error));
c9f71668 1782 goto out;
8b61709d 1783 }
8b61709d
BP
1784 }
1785
f8500004
JP
1786 netdev_dev->kbits_rate = kbits_rate;
1787 netdev_dev->kbits_burst = kbits_burst;
f8500004 1788
c9f71668
PS
1789out:
1790 if (!error || error == ENODEV) {
1791 netdev_dev->netdev_policing_error = error;
1792 netdev_dev->cache_valid |= VALID_POLICING;
1793 }
1794 return error;
8b61709d
BP
1795}
1796
c1c9c9c4
BP
1797static int
1798netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1799 struct sset *types)
c1c9c9c4
BP
1800{
1801 const struct tc_ops **opsp;
1802
1803 for (opsp = tcs; *opsp != NULL; opsp++) {
1804 const struct tc_ops *ops = *opsp;
1805 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1806 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1807 }
1808 }
1809 return 0;
1810}
1811
1812static const struct tc_ops *
1813tc_lookup_ovs_name(const char *name)
1814{
1815 const struct tc_ops **opsp;
1816
1817 for (opsp = tcs; *opsp != NULL; opsp++) {
1818 const struct tc_ops *ops = *opsp;
1819 if (!strcmp(name, ops->ovs_name)) {
1820 return ops;
1821 }
1822 }
1823 return NULL;
1824}
1825
1826static const struct tc_ops *
1827tc_lookup_linux_name(const char *name)
1828{
1829 const struct tc_ops **opsp;
1830
1831 for (opsp = tcs; *opsp != NULL; opsp++) {
1832 const struct tc_ops *ops = *opsp;
1833 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1834 return ops;
1835 }
1836 }
1837 return NULL;
1838}
1839
93b13be8
BP
1840static struct tc_queue *
1841tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1842 size_t hash)
1843{
1844 struct netdev_dev_linux *netdev_dev =
1845 netdev_dev_linux_cast(netdev_get_dev(netdev));
1846 struct tc_queue *queue;
1847
4e8e4213 1848 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1849 if (queue->queue_id == queue_id) {
1850 return queue;
1851 }
1852 }
1853 return NULL;
1854}
1855
/* Like tc_find_queue__(), but computes the hash of 'queue_id' itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1861
c1c9c9c4
BP
1862static int
1863netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1864 const char *type,
1865 struct netdev_qos_capabilities *caps)
1866{
1867 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1868 if (!ops) {
1869 return EOPNOTSUPP;
1870 }
1871 caps->n_queues = ops->n_queues;
1872 return 0;
1873}
1874
1875static int
1876netdev_linux_get_qos(const struct netdev *netdev,
79f1cbe9 1877 const char **typep, struct smap *details)
c1c9c9c4
BP
1878{
1879 struct netdev_dev_linux *netdev_dev =
1880 netdev_dev_linux_cast(netdev_get_dev(netdev));
1881 int error;
1882
1883 error = tc_query_qdisc(netdev);
1884 if (error) {
1885 return error;
1886 }
1887
1888 *typep = netdev_dev->tc->ops->ovs_name;
1889 return (netdev_dev->tc->ops->qdisc_get
1890 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1891 : 0);
1892}
1893
1894static int
1895netdev_linux_set_qos(struct netdev *netdev,
79f1cbe9 1896 const char *type, const struct smap *details)
c1c9c9c4
BP
1897{
1898 struct netdev_dev_linux *netdev_dev =
1899 netdev_dev_linux_cast(netdev_get_dev(netdev));
1900 const struct tc_ops *new_ops;
1901 int error;
1902
1903 new_ops = tc_lookup_ovs_name(type);
1904 if (!new_ops || !new_ops->tc_install) {
1905 return EOPNOTSUPP;
1906 }
1907
1908 error = tc_query_qdisc(netdev);
1909 if (error) {
1910 return error;
1911 }
1912
1913 if (new_ops == netdev_dev->tc->ops) {
1914 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1915 } else {
1916 /* Delete existing qdisc. */
1917 error = tc_del_qdisc(netdev);
1918 if (error) {
1919 return error;
1920 }
cb22974d 1921 ovs_assert(netdev_dev->tc == NULL);
c1c9c9c4
BP
1922
1923 /* Install new qdisc. */
1924 error = new_ops->tc_install(netdev, details);
cb22974d 1925 ovs_assert((error == 0) == (netdev_dev->tc != NULL));
c1c9c9c4
BP
1926
1927 return error;
1928 }
1929}
1930
1931static int
1932netdev_linux_get_queue(const struct netdev *netdev,
79f1cbe9 1933 unsigned int queue_id, struct smap *details)
c1c9c9c4
BP
1934{
1935 struct netdev_dev_linux *netdev_dev =
1936 netdev_dev_linux_cast(netdev_get_dev(netdev));
1937 int error;
1938
1939 error = tc_query_qdisc(netdev);
1940 if (error) {
1941 return error;
93b13be8
BP
1942 } else {
1943 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1944 return (queue
1945 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1946 : ENOENT);
c1c9c9c4 1947 }
c1c9c9c4
BP
1948}
1949
1950static int
1951netdev_linux_set_queue(struct netdev *netdev,
79f1cbe9 1952 unsigned int queue_id, const struct smap *details)
c1c9c9c4
BP
1953{
1954 struct netdev_dev_linux *netdev_dev =
1955 netdev_dev_linux_cast(netdev_get_dev(netdev));
1956 int error;
1957
1958 error = tc_query_qdisc(netdev);
1959 if (error) {
1960 return error;
1961 } else if (queue_id >= netdev_dev->tc->ops->n_queues
1962 || !netdev_dev->tc->ops->class_set) {
1963 return EINVAL;
1964 }
1965
1966 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1967}
1968
1969static int
1970netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1971{
1972 struct netdev_dev_linux *netdev_dev =
1973 netdev_dev_linux_cast(netdev_get_dev(netdev));
1974 int error;
1975
1976 error = tc_query_qdisc(netdev);
1977 if (error) {
1978 return error;
1979 } else if (!netdev_dev->tc->ops->class_delete) {
1980 return EINVAL;
93b13be8
BP
1981 } else {
1982 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1983 return (queue
1984 ? netdev_dev->tc->ops->class_delete(netdev, queue)
1985 : ENOENT);
c1c9c9c4 1986 }
c1c9c9c4
BP
1987}
1988
1989static int
1990netdev_linux_get_queue_stats(const struct netdev *netdev,
1991 unsigned int queue_id,
1992 struct netdev_queue_stats *stats)
1993{
1994 struct netdev_dev_linux *netdev_dev =
1995 netdev_dev_linux_cast(netdev_get_dev(netdev));
1996 int error;
1997
1998 error = tc_query_qdisc(netdev);
1999 if (error) {
2000 return error;
c1c9c9c4
BP
2001 } else if (!netdev_dev->tc->ops->class_get_stats) {
2002 return EOPNOTSUPP;
93b13be8
BP
2003 } else {
2004 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2005 return (queue
2006 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2007 : ENOENT);
c1c9c9c4 2008 }
c1c9c9c4
BP
2009}
2010
23a98ffe 2011static bool
c1c9c9c4
BP
2012start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2013{
2014 struct ofpbuf request;
2015 struct tcmsg *tcmsg;
2016
2017 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2018 if (!tcmsg) {
2019 return false;
2020 }
3c4de644 2021 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
2022 nl_dump_start(dump, rtnl_sock, &request);
2023 ofpbuf_uninit(&request);
23a98ffe 2024 return true;
c1c9c9c4
BP
2025}
2026
2027static int
2028netdev_linux_dump_queues(const struct netdev *netdev,
2029 netdev_dump_queues_cb *cb, void *aux)
2030{
2031 struct netdev_dev_linux *netdev_dev =
2032 netdev_dev_linux_cast(netdev_get_dev(netdev));
f486e840 2033 struct tc_queue *queue, *next_queue;
79f1cbe9 2034 struct smap details;
c1c9c9c4 2035 int last_error;
c1c9c9c4
BP
2036 int error;
2037
2038 error = tc_query_qdisc(netdev);
2039 if (error) {
2040 return error;
2041 } else if (!netdev_dev->tc->ops->class_get) {
2042 return EOPNOTSUPP;
2043 }
2044
2045 last_error = 0;
79f1cbe9 2046 smap_init(&details);
f486e840
BP
2047 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2048 &netdev_dev->tc->queues) {
79f1cbe9 2049 smap_clear(&details);
c1c9c9c4 2050
93b13be8 2051 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 2052 if (!error) {
93b13be8 2053 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
2054 } else {
2055 last_error = error;
2056 }
2057 }
79f1cbe9 2058 smap_destroy(&details);
c1c9c9c4
BP
2059
2060 return last_error;
2061}
2062
2063static int
2064netdev_linux_dump_queue_stats(const struct netdev *netdev,
2065 netdev_dump_queue_stats_cb *cb, void *aux)
2066{
2067 struct netdev_dev_linux *netdev_dev =
2068 netdev_dev_linux_cast(netdev_get_dev(netdev));
2069 struct nl_dump dump;
2070 struct ofpbuf msg;
2071 int last_error;
2072 int error;
2073
2074 error = tc_query_qdisc(netdev);
2075 if (error) {
2076 return error;
2077 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2078 return EOPNOTSUPP;
2079 }
2080
2081 last_error = 0;
23a98ffe
BP
2082 if (!start_queue_dump(netdev, &dump)) {
2083 return ENODEV;
2084 }
c1c9c9c4
BP
2085 while (nl_dump_next(&dump, &msg)) {
2086 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2087 if (error) {
2088 last_error = error;
2089 }
2090 }
2091
2092 error = nl_dump_done(&dump);
2093 return error ? error : last_error;
2094}
2095
8b61709d 2096static int
f1acd62b
BP
2097netdev_linux_get_in4(const struct netdev *netdev_,
2098 struct in_addr *address, struct in_addr *netmask)
8b61709d 2099{
149f577a
JG
2100 struct netdev_dev_linux *netdev_dev =
2101 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2102
2103 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
2104 int error;
2105
149f577a 2106 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
2107 SIOCGIFADDR, "SIOCGIFADDR");
2108 if (error) {
2109 return error;
2110 }
2111
149f577a 2112 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
2113 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2114 if (error) {
2115 return error;
2116 }
2117
149f577a 2118 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 2119 }
149f577a
JG
2120 *address = netdev_dev->address;
2121 *netmask = netdev_dev->netmask;
f1acd62b 2122 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
2123}
2124
8b61709d 2125static int
f1acd62b
BP
2126netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2127 struct in_addr netmask)
8b61709d 2128{
149f577a
JG
2129 struct netdev_dev_linux *netdev_dev =
2130 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
2131 int error;
2132
f1acd62b 2133 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2134 if (!error) {
149f577a
JG
2135 netdev_dev->cache_valid |= VALID_IN4;
2136 netdev_dev->address = address;
2137 netdev_dev->netmask = netmask;
f1acd62b 2138 if (address.s_addr != INADDR_ANY) {
8b61709d 2139 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2140 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2141 }
2142 }
2143 return error;
2144}
2145
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name of
 * at most 16 characters.
 *
 * On success stores the 16 address bytes in 'in6' and the null-terminated
 * interface name in 'ifname', and returns true.  Returns false if 'line' does
 * not match, in which case 'in6' and 'ifname' may be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    bool ok;

    /* One address byte: exactly two hex digits read into a uint8_t. */
#define X8 "%2"SCNx8
    /* 16 address bytes + 1 interface name = 17 conversions; the "%*x" fields
     * are suppressed and therefore not counted by sscanf(). */
    ok = sscanf(line,
                " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                "%*x %*x %*x %*x %16s\n",
                &s6[0], &s6[1], &s6[2], &s6[3],
                &s6[4], &s6[5], &s6[6], &s6[7],
                &s6[8], &s6[9], &s6[10], &s6[11],
                &s6[12], &s6[13], &s6[14], &s6[15],
                ifname) == 17;
    /* Keep the helper macro local to this function. */
#undef X8

    return ok;
}
2161
2162/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2163 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2164static int
2165netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2166{
149f577a
JG
2167 struct netdev_dev_linux *netdev_dev =
2168 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2169 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2170 FILE *file;
2171 char line[128];
2172
149f577a 2173 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2174
2175 file = fopen("/proc/net/if_inet6", "r");
2176 if (file != NULL) {
2177 const char *name = netdev_get_name(netdev_);
2178 while (fgets(line, sizeof line, file)) {
2a022368 2179 struct in6_addr in6_tmp;
8b61709d 2180 char ifname[16 + 1];
2a022368 2181 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2182 && !strcmp(name, ifname))
2183 {
2a022368 2184 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2185 break;
2186 }
2187 }
2188 fclose(file);
2189 }
149f577a 2190 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2191 }
149f577a 2192 *in6 = netdev_dev->in6;
8b61709d
BP
2193 return 0;
2194}
2195
/* Fills '*sa' with an AF_INET socket address for 'addr' and port 0.  All
 * other bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the address in a zeroed sockaddr_in first... */
    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    /* ...then copy it over the zeroed generic sockaddr. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2208
2209static int
2210do_set_addr(struct netdev *netdev,
2211 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2212{
2213 struct ifreq ifr;
71d7c22f 2214 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2215 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2216
2217 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2218 ioctl_name);
8b61709d
BP
2219}
2220
2221/* Adds 'router' as a default IP gateway. */
2222static int
67a4917b 2223netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2224{
2225 struct in_addr any = { INADDR_ANY };
2226 struct rtentry rt;
2227 int error;
2228
2229 memset(&rt, 0, sizeof rt);
2230 make_in4_sockaddr(&rt.rt_dst, any);
2231 make_in4_sockaddr(&rt.rt_gateway, router);
2232 make_in4_sockaddr(&rt.rt_genmask, any);
2233 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2234 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2235 if (error) {
2236 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2237 }
2238 return error;
2239}
2240
f1acd62b
BP
2241static int
2242netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2243 char **netdev_name)
2244{
2245 static const char fn[] = "/proc/net/route";
2246 FILE *stream;
2247 char line[256];
2248 int ln;
2249
2250 *netdev_name = NULL;
2251 stream = fopen(fn, "r");
2252 if (stream == NULL) {
2253 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2254 return errno;
2255 }
2256
2257 ln = 0;
2258 while (fgets(line, sizeof line, stream)) {
2259 if (++ln >= 2) {
2260 char iface[17];
dbba996b 2261 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2262 int refcnt, metric, mtu;
2263 unsigned int flags, use, window, irtt;
2264
2265 if (sscanf(line,
2266 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2267 " %d %u %u\n",
2268 iface, &dest, &gateway, &flags, &refcnt,
2269 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2270
d295e8e9 2271 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2272 fn, ln, line);
2273 continue;
2274 }
2275 if (!(flags & RTF_UP)) {
2276 /* Skip routes that aren't up. */
2277 continue;
2278 }
2279
2280 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2281 * network byte order, so we don't need need any endian
f1acd62b
BP
2282 * conversions here. */
2283 if ((dest & mask) == (host->s_addr & mask)) {
2284 if (!gateway) {
2285 /* The host is directly reachable. */
2286 next_hop->s_addr = 0;
2287 } else {
2288 /* To reach the host, we must go through a gateway. */
2289 next_hop->s_addr = gateway;
2290 }
2291 *netdev_name = xstrdup(iface);
2292 fclose(stream);
2293 return 0;
2294 }
2295 }
2296 }
2297
2298 fclose(stream);
2299 return ENXIO;
2300}
2301
e210037e 2302static int
275707c3 2303netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
e210037e 2304{
275707c3
EJ
2305 struct netdev_dev_linux *netdev_dev;
2306 int error = 0;
2307
2308 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2309 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
2310 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2311
2312 COVERAGE_INC(netdev_get_ethtool);
2313 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2314 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
2315 cmd,
2316 ETHTOOL_GDRVINFO,
2317 "ETHTOOL_GDRVINFO");
2318 if (!error) {
2319 netdev_dev->cache_valid |= VALID_DRVINFO;
2320 }
2321 }
e210037e 2322
e210037e 2323 if (!error) {
79f1cbe9
EJ
2324 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2325 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2326 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
e210037e 2327 }
e210037e
AE
2328 return error;
2329}
2330
4f925bd3 2331static int
275707c3
EJ
2332netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2333 struct smap *smap)
4f925bd3 2334{
79f1cbe9 2335 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2336 return 0;
2337}
2338
8b61709d
BP
2339/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2340 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2341 * returns 0. Otherwise, it returns a positive errno value; in particular,
2342 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2343static int
2344netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2345 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2346{
2347 struct arpreq r;
c100e025 2348 struct sockaddr_in sin;
8b61709d
BP
2349 int retval;
2350
2351 memset(&r, 0, sizeof r);
f2cc621b 2352 memset(&sin, 0, sizeof sin);
c100e025
BP
2353 sin.sin_family = AF_INET;
2354 sin.sin_addr.s_addr = ip;
2355 sin.sin_port = 0;
2356 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2357 r.arp_ha.sa_family = ARPHRD_ETHER;
2358 r.arp_flags = 0;
71d7c22f 2359 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2360 COVERAGE_INC(netdev_arp_lookup);
2361 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2362 if (!retval) {
2363 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2364 } else if (retval != ENXIO) {
2365 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
ed36537e 2366 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
8b61709d
BP
2367 }
2368 return retval;
2369}
2370
2371static int
2372nd_to_iff_flags(enum netdev_flags nd)
2373{
2374 int iff = 0;
2375 if (nd & NETDEV_UP) {
2376 iff |= IFF_UP;
2377 }
2378 if (nd & NETDEV_PROMISC) {
2379 iff |= IFF_PROMISC;
2380 }
2381 return iff;
2382}
2383
2384static int
2385iff_to_nd_flags(int iff)
2386{
2387 enum netdev_flags nd = 0;
2388 if (iff & IFF_UP) {
2389 nd |= NETDEV_UP;
2390 }
2391 if (iff & IFF_PROMISC) {
2392 nd |= NETDEV_PROMISC;
2393 }
2394 return nd;
2395}
2396
2397static int
2398netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2399 enum netdev_flags on, enum netdev_flags *old_flagsp)
2400{
c37d4da4 2401 struct netdev_dev_linux *netdev_dev;
8b61709d 2402 int old_flags, new_flags;
c37d4da4
EJ
2403 int error = 0;
2404
2405 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2406 old_flags = netdev_dev->ifi_flags;
2407 *old_flagsp = iff_to_nd_flags(old_flags);
2408 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2409 if (new_flags != old_flags) {
2410 error = set_flags(netdev, new_flags);
2411 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
8b61709d
BP
2412 }
2413 return error;
2414}
2415
ac4d3bcb
EJ
2416static unsigned int
2417netdev_linux_change_seq(const struct netdev *netdev)
2418{
2419 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2420}
2421
/* Expands to a positional initializer for a 'struct netdev_class'.
 *
 * The Linux netdev classes share every callback except the ones supplied as
 * arguments: NAME (the class name), CREATE, GET_STATS, SET_STATS,
 * GET_FEATURES and GET_STATUS.  Because the initializer is positional, the
 * order of the entries below must not change. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,  \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
                                                                \
    netdev_linux_init,                                          \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    CREATE,                                                     \
    netdev_linux_destroy,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
                                                                \
    netdev_linux_open,                                          \
    netdev_linux_close,                                         \
                                                                \
    netdev_linux_listen,                                        \
    netdev_linux_recv,                                          \
    netdev_linux_recv_wait,                                     \
    netdev_linux_drain,                                         \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
    SET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_dump_queues,                                   \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_get_in4,                                       \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_in6,                                       \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
                                                                \
    netdev_linux_change_seq                                     \
}
2486
2487const struct netdev_class netdev_linux_class =
2488 NETDEV_LINUX_CLASS(
2489 "system",
2490 netdev_linux_create,
f613a0d7 2491 netdev_linux_get_stats,
4f925bd3 2492 NULL, /* set_stats */
51f87458 2493 netdev_linux_get_features,
275707c3 2494 netdev_linux_get_status);
c3827f61
BP
2495
2496const struct netdev_class netdev_tap_class =
2497 NETDEV_LINUX_CLASS(
2498 "tap",
2499 netdev_linux_create_tap,
bba1e6f3 2500 netdev_tap_get_stats,
4f925bd3 2501 NULL, /* set_stats */
51f87458 2502 netdev_linux_get_features,
275707c3 2503 netdev_linux_get_status);
c3827f61
BP
2504
2505const struct netdev_class netdev_internal_class =
2506 NETDEV_LINUX_CLASS(
2507 "internal",
2508 netdev_linux_create,
bba1e6f3 2509 netdev_internal_get_stats,
2f31a822 2510 netdev_internal_set_stats,
51f87458 2511 NULL, /* get_features */
275707c3 2512 netdev_internal_get_status);
8b61709d 2513\f
c1c9c9c4 2514/* HTB traffic control class. */
559843ed 2515
c1c9c9c4 2516#define HTB_N_QUEUES 0xf000
8b61709d 2517
c1c9c9c4
BP
2518struct htb {
2519 struct tc tc;
2520 unsigned int max_rate; /* In bytes/s. */
2521};
8b61709d 2522
c1c9c9c4 2523struct htb_class {
93b13be8 2524 struct tc_queue tc_queue;
c1c9c9c4
BP
2525 unsigned int min_rate; /* In bytes/s. */
2526 unsigned int max_rate; /* In bytes/s. */
2527 unsigned int burst; /* In bytes. */
2528 unsigned int priority; /* Lower values are higher priorities. */
2529};
8b61709d 2530
c1c9c9c4
BP
2531static struct htb *
2532htb_get__(const struct netdev *netdev)
2533{
2534 struct netdev_dev_linux *netdev_dev =
2535 netdev_dev_linux_cast(netdev_get_dev(netdev));
2536 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2537}
2538
24045e35 2539static void
c1c9c9c4
BP
2540htb_install__(struct netdev *netdev, uint64_t max_rate)
2541{
2542 struct netdev_dev_linux *netdev_dev =
2543 netdev_dev_linux_cast(netdev_get_dev(netdev));
2544 struct htb *htb;
2545
2546 htb = xmalloc(sizeof *htb);
2547 tc_init(&htb->tc, &tc_ops_htb);
2548 htb->max_rate = max_rate;
2549
2550 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2551}
2552
2553/* Create an HTB qdisc.
2554 *
a339aa81 2555 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2556static int
2557htb_setup_qdisc__(struct netdev *netdev)
2558{
2559 size_t opt_offset;
2560 struct tc_htb_glob opt;
2561 struct ofpbuf request;
2562 struct tcmsg *tcmsg;
2563
2564 tc_del_qdisc(netdev);
2565
2566 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2567 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2568 if (!tcmsg) {
2569 return ENODEV;
2570 }
c1c9c9c4
BP
2571 tcmsg->tcm_handle = tc_make_handle(1, 0);
2572 tcmsg->tcm_parent = TC_H_ROOT;
2573
2574 nl_msg_put_string(&request, TCA_KIND, "htb");
2575
2576 memset(&opt, 0, sizeof opt);
2577 opt.rate2quantum = 10;
2578 opt.version = 3;
4ecf12d5 2579 opt.defcls = 1;
c1c9c9c4
BP
2580
2581 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2582 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2583 nl_msg_end_nested(&request, opt_offset);
2584
2585 return tc_transact(&request, NULL);
2586}
2587
2588/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2589 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2590static int
2591htb_setup_class__(struct netdev *netdev, unsigned int handle,
2592 unsigned int parent, struct htb_class *class)
2593{
2594 size_t opt_offset;
2595 struct tc_htb_opt opt;
2596 struct ofpbuf request;
2597 struct tcmsg *tcmsg;
2598 int error;
2599 int mtu;
2600
9b020780
PS
2601 error = netdev_get_mtu(netdev, &mtu);
2602 if (error) {
f915f1a8
BP
2603 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2604 netdev_get_name(netdev));
9b020780 2605 return error;
f915f1a8 2606 }
c1c9c9c4
BP
2607
2608 memset(&opt, 0, sizeof opt);
2609 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2610 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2611 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2612 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2613 opt.prio = class->priority;
2614
2615 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2616 if (!tcmsg) {
2617 return ENODEV;
2618 }
c1c9c9c4
BP
2619 tcmsg->tcm_handle = handle;
2620 tcmsg->tcm_parent = parent;
2621
2622 nl_msg_put_string(&request, TCA_KIND, "htb");
2623 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2624 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2625 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2626 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2627 nl_msg_end_nested(&request, opt_offset);
2628
2629 error = tc_transact(&request, NULL);
2630 if (error) {
2631 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2632 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2633 netdev_get_name(netdev),
2634 tc_get_major(handle), tc_get_minor(handle),
2635 tc_get_major(parent), tc_get_minor(parent),
2636 class->min_rate, class->max_rate,
2637 class->burst, class->priority, strerror(error));
2638 }
2639 return error;
2640}
2641
2642/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2643 * description of them into 'details'. The description complies with the
2644 * specification given in the vswitch database documentation for linux-htb
2645 * queue details. */
2646static int
2647htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2648{
2649 static const struct nl_policy tca_htb_policy[] = {
2650 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2651 .min_len = sizeof(struct tc_htb_opt) },
2652 };
2653
2654 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2655 const struct tc_htb_opt *htb;
2656
2657 if (!nl_parse_nested(nl_options, tca_htb_policy,
2658 attrs, ARRAY_SIZE(tca_htb_policy))) {
2659 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2660 return EPROTO;
2661 }
2662
2663 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2664 class->min_rate = htb->rate.rate;
2665 class->max_rate = htb->ceil.rate;
2666 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2667 class->priority = htb->prio;
2668 return 0;
2669}
2670
2671static int
2672htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2673 struct htb_class *options,
2674 struct netdev_queue_stats *stats)
2675{
2676 struct nlattr *nl_options;
2677 unsigned int handle;
2678 int error;
2679
2680 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2681 if (!error && queue_id) {
17ee3c1f
BP
2682 unsigned int major = tc_get_major(handle);
2683 unsigned int minor = tc_get_minor(handle);
2684 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2685 *queue_id = minor - 1;
c1c9c9c4
BP
2686 } else {
2687 error = EPROTO;
2688 }
2689 }
2690 if (!error && options) {
2691 error = htb_parse_tca_options__(nl_options, options);
2692 }
2693 return error;
2694}
2695
2696static void
2697htb_parse_qdisc_details__(struct netdev *netdev,
79f1cbe9 2698 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2699{
2700 const char *max_rate_s;
2701
79f1cbe9 2702 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
2703 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2704 if (!hc->max_rate) {
a00ca915 2705 enum netdev_features current;
c1c9c9c4
BP
2706
2707 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 2708 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
2709 }
2710 hc->min_rate = hc->max_rate;
2711 hc->burst = 0;
2712 hc->priority = 0;
2713}
2714
2715static int
2716htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 2717 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2718{
2719 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
2720 const char *min_rate_s = smap_get(details, "min-rate");
2721 const char *max_rate_s = smap_get(details, "max-rate");
2722 const char *burst_s = smap_get(details, "burst");
2723 const char *priority_s = smap_get(details, "priority");
9b020780 2724 int mtu, error;
c1c9c9c4 2725
9b020780
PS
2726 error = netdev_get_mtu(netdev, &mtu);
2727 if (error) {
f915f1a8
BP
2728 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2729 netdev_get_name(netdev));
9b020780 2730 return error;
f915f1a8
BP
2731 }
2732
4f104611
EJ
2733 /* HTB requires at least an mtu sized min-rate to send any traffic even
2734 * on uncongested links. */
c45ab5e9 2735 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2736 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2737 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2738
2739 /* max-rate */
2740 hc->max_rate = (max_rate_s
2741 ? strtoull(max_rate_s, NULL, 10) / 8
2742 : htb->max_rate);
2743 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2744 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2745
2746 /* burst
2747 *
2748 * According to hints in the documentation that I've read, it is important
2749 * that 'burst' be at least as big as the largest frame that might be
2750 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2751 * but having it a bit too small is a problem. Since netdev_get_mtu()
2752 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2753 * the MTU. We actually add 64, instead of 14, as a guard against
2754 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2755 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2756 hc->burst = MAX(hc->burst, mtu + 64);
2757
2758 /* priority */
2759 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2760
2761 return 0;
2762}
2763
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats', either of which is passed through to
 * htb_parse_tcmsg__() as given.
 *
 * Returns 0 on success, otherwise the error from tc_query_class() or
 * htb_parse_tcmsg__(). */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2779
2780static int
79f1cbe9 2781htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2782{
2783 int error;
2784
2785 error = htb_setup_qdisc__(netdev);
2786 if (!error) {
2787 struct htb_class hc;
2788
2789 htb_parse_qdisc_details__(netdev, details, &hc);
2790 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2791 tc_make_handle(1, 0), &hc);
2792 if (!error) {
2793 htb_install__(netdev, hc.max_rate);
2794 }
2795 }
2796 return error;
2797}
2798
93b13be8
BP
2799static struct htb_class *
2800htb_class_cast__(const struct tc_queue *queue)
2801{
2802 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2803}
2804
c1c9c9c4
BP
2805static void
2806htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2807 const struct htb_class *hc)
2808{
2809 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2810 size_t hash = hash_int(queue_id, 0);
2811 struct tc_queue *queue;
c1c9c9c4
BP
2812 struct htb_class *hcp;
2813
93b13be8
BP
2814 queue = tc_find_queue__(netdev, queue_id, hash);
2815 if (queue) {
2816 hcp = htb_class_cast__(queue);
2817 } else {
c1c9c9c4 2818 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2819 queue = &hcp->tc_queue;
2820 queue->queue_id = queue_id;
2821 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2822 }
93b13be8
BP
2823
2824 hcp->min_rate = hc->min_rate;
2825 hcp->max_rate = hc->max_rate;
2826 hcp->burst = hc->burst;
2827 hcp->priority = hc->priority;
c1c9c9c4
BP
2828}
2829
2830static int
2831htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2832{
c1c9c9c4
BP
2833 struct ofpbuf msg;
2834 struct nl_dump dump;
2835 struct htb_class hc;
c1c9c9c4
BP
2836
2837 /* Get qdisc options. */
2838 hc.max_rate = 0;
2839 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2840 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2841
2842 /* Get queues. */
23a98ffe
BP
2843 if (!start_queue_dump(netdev, &dump)) {
2844 return ENODEV;
2845 }
c1c9c9c4
BP
2846 while (nl_dump_next(&dump, &msg)) {
2847 unsigned int queue_id;
2848
2849 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2850 htb_update_queue__(netdev, queue_id, &hc);
2851 }
2852 }
2853 nl_dump_done(&dump);
2854
2855 return 0;
2856}
2857
2858static void
2859htb_tc_destroy(struct tc *tc)
2860{
2861 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2862 struct htb_class *hc, *next;
c1c9c9c4 2863
4e8e4213 2864 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2865 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2866 free(hc);
2867 }
2868 tc_destroy(tc);
2869 free(htb);
2870}
2871
2872static int
79f1cbe9 2873htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
2874{
2875 const struct htb *htb = htb_get__(netdev);
79f1cbe9 2876 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
2877 return 0;
2878}
2879
2880static int
79f1cbe9 2881htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2882{
2883 struct htb_class hc;
2884 int error;
2885
2886 htb_parse_qdisc_details__(netdev, details, &hc);
2887 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2888 tc_make_handle(1, 0), &hc);
2889 if (!error) {
2890 htb_get__(netdev)->max_rate = hc.max_rate;
2891 }
2892 return error;
2893}
2894
2895static int
93b13be8 2896htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 2897 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 2898{
93b13be8 2899 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2900
79f1cbe9 2901 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 2902 if (hc->min_rate != hc->max_rate) {
79f1cbe9 2903 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 2904 }
79f1cbe9 2905 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 2906 if (hc->priority) {
79f1cbe9 2907 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
2908 }
2909 return 0;
2910}
2911
2912static int
2913htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 2914 const struct smap *details)
c1c9c9c4
BP
2915{
2916 struct htb_class hc;
2917 int error;
2918
2919 error = htb_parse_class_details__(netdev, details, &hc);
2920 if (error) {
2921 return error;
2922 }
2923
17ee3c1f 2924 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2925 tc_make_handle(1, 0xfffe), &hc);
2926 if (error) {
2927 return error;
2928 }
2929
2930 htb_update_queue__(netdev, queue_id, &hc);
2931 return 0;
2932}
2933
2934static int
93b13be8 2935htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2936{
93b13be8 2937 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2938 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2939 int error;
2940
93b13be8 2941 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2942 if (!error) {
93b13be8 2943 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2944 free(hc);
c1c9c9c4
BP
2945 }
2946 return error;
2947}
2948
2949static int
93b13be8 2950htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
2951 struct netdev_queue_stats *stats)
2952{
93b13be8 2953 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
2954 tc_make_handle(1, 0xfffe), NULL, stats);
2955}
2956
2957static int
2958htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2959 const struct ofpbuf *nlmsg,
2960 netdev_dump_queue_stats_cb *cb, void *aux)
2961{
2962 struct netdev_queue_stats stats;
17ee3c1f 2963 unsigned int handle, major, minor;
c1c9c9c4
BP
2964 int error;
2965
2966 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2967 if (error) {
2968 return error;
2969 }
2970
17ee3c1f
BP
2971 major = tc_get_major(handle);
2972 minor = tc_get_minor(handle);
2973 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 2974 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
2975 }
2976 return 0;
2977}
2978
2979static const struct tc_ops tc_ops_htb = {
2980 "htb", /* linux_name */
2981 "linux-htb", /* ovs_name */
2982 HTB_N_QUEUES, /* n_queues */
2983 htb_tc_install,
2984 htb_tc_load,
2985 htb_tc_destroy,
2986 htb_qdisc_get,
2987 htb_qdisc_set,
2988 htb_class_get,
2989 htb_class_set,
2990 htb_class_delete,
2991 htb_class_get_stats,
2992 htb_class_dump_stats
2993};
2994\f
a339aa81
EJ
2995/* "linux-hfsc" traffic control class. */
2996
2997#define HFSC_N_QUEUES 0xf000
2998
2999struct hfsc {
3000 struct tc tc;
3001 uint32_t max_rate;
3002};
3003
3004struct hfsc_class {
3005 struct tc_queue tc_queue;
3006 uint32_t min_rate;
3007 uint32_t max_rate;
3008};
3009
3010static struct hfsc *
3011hfsc_get__(const struct netdev *netdev)
3012{
3013 struct netdev_dev_linux *netdev_dev;
3014 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3015 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
3016}
3017
3018static struct hfsc_class *
3019hfsc_class_cast__(const struct tc_queue *queue)
3020{
3021 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3022}
3023
24045e35 3024static void
a339aa81
EJ
3025hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3026{
3027 struct netdev_dev_linux * netdev_dev;
3028 struct hfsc *hfsc;
3029
3030 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3031 hfsc = xmalloc(sizeof *hfsc);
3032 tc_init(&hfsc->tc, &tc_ops_hfsc);
3033 hfsc->max_rate = max_rate;
3034 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
3035}
3036
3037static void
3038hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3039 const struct hfsc_class *hc)
3040{
3041 size_t hash;
3042 struct hfsc *hfsc;
3043 struct hfsc_class *hcp;
3044 struct tc_queue *queue;
3045
3046 hfsc = hfsc_get__(netdev);
3047 hash = hash_int(queue_id, 0);
3048
3049 queue = tc_find_queue__(netdev, queue_id, hash);
3050 if (queue) {
3051 hcp = hfsc_class_cast__(queue);
3052 } else {
3053 hcp = xmalloc(sizeof *hcp);
3054 queue = &hcp->tc_queue;
3055 queue->queue_id = queue_id;
3056 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3057 }
3058
3059 hcp->min_rate = hc->min_rate;
3060 hcp->max_rate = hc->max_rate;
3061}
3062
3063static int
3064hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3065{
3066 const struct tc_service_curve *rsc, *fsc, *usc;
3067 static const struct nl_policy tca_hfsc_policy[] = {
3068 [TCA_HFSC_RSC] = {
3069 .type = NL_A_UNSPEC,
3070 .optional = false,
3071 .min_len = sizeof(struct tc_service_curve),
3072 },
3073 [TCA_HFSC_FSC] = {
3074 .type = NL_A_UNSPEC,
3075 .optional = false,
3076 .min_len = sizeof(struct tc_service_curve),
3077 },
3078 [TCA_HFSC_USC] = {
3079 .type = NL_A_UNSPEC,
3080 .optional = false,
3081 .min_len = sizeof(struct tc_service_curve),
3082 },
3083 };
3084 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3085
3086 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3087 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3088 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3089 return EPROTO;
3090 }
3091
3092 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3093 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3094 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3095
3096 if (rsc->m1 != 0 || rsc->d != 0 ||
3097 fsc->m1 != 0 || fsc->d != 0 ||
3098 usc->m1 != 0 || usc->d != 0) {
3099 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3100 "Non-linear service curves are not supported.");
3101 return EPROTO;
3102 }
3103
3104 if (rsc->m2 != fsc->m2) {
3105 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3106 "Real-time service curves are not supported ");
3107 return EPROTO;
3108 }
3109
3110 if (rsc->m2 > usc->m2) {
3111 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3112 "Min-rate service curve is greater than "
3113 "the max-rate service curve.");
3114 return EPROTO;
3115 }
3116
3117 class->min_rate = fsc->m2;
3118 class->max_rate = usc->m2;
3119 return 0;
3120}
3121
3122static int
3123hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3124 struct hfsc_class *options,
3125 struct netdev_queue_stats *stats)
3126{
3127 int error;
3128 unsigned int handle;
3129 struct nlattr *nl_options;
3130
3131 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3132 if (error) {
3133 return error;
3134 }
3135
3136 if (queue_id) {
3137 unsigned int major, minor;
3138
3139 major = tc_get_major(handle);
3140 minor = tc_get_minor(handle);
3141 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3142 *queue_id = minor - 1;
3143 } else {
3144 return EPROTO;
3145 }
3146 }
3147
3148 if (options) {
3149 error = hfsc_parse_tca_options__(nl_options, options);
3150 }
3151
3152 return error;
3153}
3154
/* Asks the kernel about class 'handle' (with parent 'parent') on 'netdev' and
 * parses the reply into '*options' and '*stats', either of which may be null.
 * The reply buffer is always released before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
3172
3173static void
79f1cbe9 3174hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
a339aa81
EJ
3175 struct hfsc_class *class)
3176{
3177 uint32_t max_rate;
3178 const char *max_rate_s;
3179
79f1cbe9 3180 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
3181 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3182
3183 if (!max_rate) {
a00ca915 3184 enum netdev_features current;
a339aa81
EJ
3185
3186 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 3187 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
3188 }
3189
3190 class->min_rate = max_rate;
3191 class->max_rate = max_rate;
3192}
3193
3194static int
3195hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 3196 const struct smap *details,
a339aa81
EJ
3197 struct hfsc_class * class)
3198{
3199 const struct hfsc *hfsc;
3200 uint32_t min_rate, max_rate;
3201 const char *min_rate_s, *max_rate_s;
3202
3203 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
3204 min_rate_s = smap_get(details, "min-rate");
3205 max_rate_s = smap_get(details, "max-rate");
a339aa81 3206
c45ab5e9 3207 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3208 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3209 min_rate = MIN(min_rate, hfsc->max_rate);
3210
3211 max_rate = (max_rate_s
3212 ? strtoull(max_rate_s, NULL, 10) / 8
3213 : hfsc->max_rate);
3214 max_rate = MAX(max_rate, min_rate);
3215 max_rate = MIN(max_rate, hfsc->max_rate);
3216
3217 class->min_rate = min_rate;
3218 class->max_rate = max_rate;
3219
3220 return 0;
3221}
3222
3223/* Create an HFSC qdisc.
3224 *
3225 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3226static int
3227hfsc_setup_qdisc__(struct netdev * netdev)
3228{
3229 struct tcmsg *tcmsg;
3230 struct ofpbuf request;
3231 struct tc_hfsc_qopt opt;
3232
3233 tc_del_qdisc(netdev);
3234
3235 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3236 NLM_F_EXCL | NLM_F_CREATE, &request);
3237
3238 if (!tcmsg) {
3239 return ENODEV;
3240 }
3241
3242 tcmsg->tcm_handle = tc_make_handle(1, 0);
3243 tcmsg->tcm_parent = TC_H_ROOT;
3244
3245 memset(&opt, 0, sizeof opt);
3246 opt.defcls = 1;
3247
3248 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3249 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3250
3251 return tc_transact(&request, NULL);
3252}
3253
3254/* Create an HFSC class.
3255 *
3256 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3257 * sc rate <min_rate> ul rate <max_rate>" */
3258static int
3259hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3260 unsigned int parent, struct hfsc_class *class)
3261{
3262 int error;
3263 size_t opt_offset;
3264 struct tcmsg *tcmsg;
3265 struct ofpbuf request;
3266 struct tc_service_curve min, max;
3267
3268 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3269
3270 if (!tcmsg) {
3271 return ENODEV;
3272 }
3273
3274 tcmsg->tcm_handle = handle;
3275 tcmsg->tcm_parent = parent;
3276
3277 min.m1 = 0;
3278 min.d = 0;
3279 min.m2 = class->min_rate;
3280
3281 max.m1 = 0;
3282 max.d = 0;
3283 max.m2 = class->max_rate;
3284
3285 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3286 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3287 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3288 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3289 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3290 nl_msg_end_nested(&request, opt_offset);
3291
3292 error = tc_transact(&request, NULL);
3293 if (error) {
3294 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3295 "min-rate %ubps, max-rate %ubps (%s)",
3296 netdev_get_name(netdev),
3297 tc_get_major(handle), tc_get_minor(handle),
3298 tc_get_major(parent), tc_get_minor(parent),
3299 class->min_rate, class->max_rate, strerror(error));
3300 }
3301
3302 return error;
3303}
3304
3305static int
79f1cbe9 3306hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3307{
3308 int error;
3309 struct hfsc_class class;
3310
3311 error = hfsc_setup_qdisc__(netdev);
3312
3313 if (error) {
3314 return error;
3315 }
3316
3317 hfsc_parse_qdisc_details__(netdev, details, &class);
3318 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3319 tc_make_handle(1, 0), &class);
3320
3321 if (error) {
3322 return error;
3323 }
3324
3325 hfsc_install__(netdev, class.max_rate);
3326 return 0;
3327}
3328
3329static int
3330hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3331{
3332 struct ofpbuf msg;
a339aa81
EJ
3333 struct nl_dump dump;
3334 struct hfsc_class hc;
3335
3336 hc.max_rate = 0;
3337 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3338 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3339
3340 if (!start_queue_dump(netdev, &dump)) {
3341 return ENODEV;
3342 }
3343
3344 while (nl_dump_next(&dump, &msg)) {
3345 unsigned int queue_id;
3346
3347 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3348 hfsc_update_queue__(netdev, queue_id, &hc);
3349 }
3350 }
3351
3352 nl_dump_done(&dump);
3353 return 0;
3354}
3355
3356static void
3357hfsc_tc_destroy(struct tc *tc)
3358{
3359 struct hfsc *hfsc;
3360 struct hfsc_class *hc, *next;
3361
3362 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3363
3364 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3365 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3366 free(hc);
3367 }
3368
3369 tc_destroy(tc);
3370 free(hfsc);
3371}
3372
3373static int
79f1cbe9 3374hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
3375{
3376 const struct hfsc *hfsc;
3377 hfsc = hfsc_get__(netdev);
79f1cbe9 3378 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
3379 return 0;
3380}
3381
3382static int
79f1cbe9 3383hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3384{
3385 int error;
3386 struct hfsc_class class;
3387
3388 hfsc_parse_qdisc_details__(netdev, details, &class);
3389 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3390 tc_make_handle(1, 0), &class);
3391
3392 if (!error) {
3393 hfsc_get__(netdev)->max_rate = class.max_rate;
3394 }
3395
3396 return error;
3397}
3398
3399static int
3400hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3401 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
3402{
3403 const struct hfsc_class *hc;
3404
3405 hc = hfsc_class_cast__(queue);
79f1cbe9 3406 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 3407 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3408 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
3409 }
3410 return 0;
3411}
3412
3413static int
3414hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3415 const struct smap *details)
a339aa81
EJ
3416{
3417 int error;
3418 struct hfsc_class class;
3419
3420 error = hfsc_parse_class_details__(netdev, details, &class);
3421 if (error) {
3422 return error;
3423 }
3424
3425 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3426 tc_make_handle(1, 0xfffe), &class);
3427 if (error) {
3428 return error;
3429 }
3430
3431 hfsc_update_queue__(netdev, queue_id, &class);
3432 return 0;
3433}
3434
3435static int
3436hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3437{
3438 int error;
3439 struct hfsc *hfsc;
3440 struct hfsc_class *hc;
3441
3442 hc = hfsc_class_cast__(queue);
3443 hfsc = hfsc_get__(netdev);
3444
3445 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3446 if (!error) {
3447 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3448 free(hc);
3449 }
3450 return error;
3451}
3452
3453static int
3454hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3455 struct netdev_queue_stats *stats)
3456{
3457 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3458 tc_make_handle(1, 0xfffe), NULL, stats);
3459}
3460
3461static int
3462hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3463 const struct ofpbuf *nlmsg,
3464 netdev_dump_queue_stats_cb *cb, void *aux)
3465{
3466 struct netdev_queue_stats stats;
3467 unsigned int handle, major, minor;
3468 int error;
3469
3470 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3471 if (error) {
3472 return error;
3473 }
3474
3475 major = tc_get_major(handle);
3476 minor = tc_get_minor(handle);
3477 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3478 (*cb)(minor - 1, &stats, aux);
3479 }
3480 return 0;
3481}
3482
3483static const struct tc_ops tc_ops_hfsc = {
3484 "hfsc", /* linux_name */
3485 "linux-hfsc", /* ovs_name */
3486 HFSC_N_QUEUES, /* n_queues */
3487 hfsc_tc_install, /* tc_install */
3488 hfsc_tc_load, /* tc_load */
3489 hfsc_tc_destroy, /* tc_destroy */
3490 hfsc_qdisc_get, /* qdisc_get */
3491 hfsc_qdisc_set, /* qdisc_set */
3492 hfsc_class_get, /* class_get */
3493 hfsc_class_set, /* class_set */
3494 hfsc_class_delete, /* class_delete */
3495 hfsc_class_get_stats, /* class_get_stats */
3496 hfsc_class_dump_stats /* class_dump_stats */
3497};
3498\f
c1c9c9c4
BP
3499/* "linux-default" traffic control class.
3500 *
3501 * This class represents the default, unnamed Linux qdisc. It corresponds to
3502 * the "" (empty string) QoS type in the OVS database. */
3503
3504static void
3505default_install__(struct netdev *netdev)
3506{
3507 struct netdev_dev_linux *netdev_dev =
3508 netdev_dev_linux_cast(netdev_get_dev(netdev));
3509 static struct tc *tc;
3510
3511 if (!tc) {
3512 tc = xmalloc(sizeof *tc);
3513 tc_init(tc, &tc_ops_default);
3514 }
3515 netdev_dev->tc = tc;
3516}
3517
3518static int
3519default_tc_install(struct netdev *netdev,
79f1cbe9 3520 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
3521{
3522 default_install__(netdev);
3523 return 0;
3524}
3525
3526static int
3527default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3528{
3529 default_install__(netdev);
3530 return 0;
3531}
3532
3533static const struct tc_ops tc_ops_default = {
3534 NULL, /* linux_name */
3535 "", /* ovs_name */
3536 0, /* n_queues */
3537 default_tc_install,
3538 default_tc_load,
3539 NULL, /* tc_destroy */
3540 NULL, /* qdisc_get */
3541 NULL, /* qdisc_set */
3542 NULL, /* class_get */
3543 NULL, /* class_set */
3544 NULL, /* class_delete */
3545 NULL, /* class_get_stats */
3546 NULL /* class_dump_stats */
3547};
3548\f
3549/* "linux-other" traffic control class.
3550 *
3551 * */
3552
3553static int
3554other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3555{
3556 struct netdev_dev_linux *netdev_dev =
3557 netdev_dev_linux_cast(netdev_get_dev(netdev));
3558 static struct tc *tc;
3559
3560 if (!tc) {
3561 tc = xmalloc(sizeof *tc);
3562 tc_init(tc, &tc_ops_other);
3563 }
3564 netdev_dev->tc = tc;
3565 return 0;
3566}
3567
3568static const struct tc_ops tc_ops_other = {
3569 NULL, /* linux_name */
3570 "linux-other", /* ovs_name */
3571 0, /* n_queues */
3572 NULL, /* tc_install */
3573 other_tc_load,
3574 NULL, /* tc_destroy */
3575 NULL, /* qdisc_get */
3576 NULL, /* qdisc_set */
3577 NULL, /* class_get */
3578 NULL, /* class_set */
3579 NULL, /* class_delete */
3580 NULL, /* class_get_stats */
3581 NULL /* class_dump_stats */
3582};
3583\f
3584/* Traffic control. */
3585
3586/* Number of kernel "tc" ticks per second. */
3587static double ticks_per_s;
3588
3589/* Number of kernel "jiffies" per second. This is used for the purpose of
3590 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3591 * one jiffy's worth of data.
3592 *
3593 * There are two possibilities here:
3594 *
3595 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3596 * approximate range of 100 to 1024. That means that we really need to
3597 * make sure that the qdisc can buffer that much data.
3598 *
3599 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3600 * has finely granular timers and there's no need to fudge additional room
3601 * for buffers. (There's no extra effort needed to implement that: the
3602 * large 'buffer_hz' is used as a divisor, so practically any number will
3603 * come out as 0 in the division. Small integer results in the case of
3604 * really high dividends won't have any real effect anyhow.)
3605 */
3606static unsigned int buffer_hz;
3607
/* Combines 'major' and 'minor' into the tc handle 'major':'minor', with the
 * major number in the upper 16 bits. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int shifted_major = major << 16;

    return TC_H_MAKE(shifted_major, minor);
}
3614
/* Extracts the major number (the upper 16 bits) from tc 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3621
/* Extracts the minor number (the lower 16 bits) from tc 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3628
3629static struct tcmsg *
3630tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3631 struct ofpbuf *request)
3632{
3633 struct tcmsg *tcmsg;
3634 int ifindex;
3635 int error;
3636
3637 error = get_ifindex(netdev, &ifindex);
3638 if (error) {
3639 return NULL;
3640 }
3641
3642 ofpbuf_init(request, 512);
3643 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3644 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3645 tcmsg->tcm_family = AF_UNSPEC;
3646 tcmsg->tcm_ifindex = ifindex;
3647 /* Caller should fill in tcmsg->tcm_handle. */
3648 /* Caller should fill in tcmsg->tcm_parent. */
3649
3650 return tcmsg;
3651}
3652
3653static int
3654tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3655{
3656 int error = nl_sock_transact(rtnl_sock, request, replyp);
3657 ofpbuf_uninit(request);
3658 return error;
3659}
3660
f8500004
JP
3661/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3662 * policing configuration.
3663 *
3664 * This function is equivalent to running the following when 'add' is true:
3665 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3666 *
3667 * This function is equivalent to running the following when 'add' is false:
3668 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3669 *
3670 * The configuration and stats may be seen with the following command:
3671 * /sbin/tc -s qdisc show dev <devname>
3672 *
3673 * Returns 0 if successful, otherwise a positive errno value.
3674 */
3675static int
3676tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3677{
3678 struct ofpbuf request;
3679 struct tcmsg *tcmsg;
3680 int error;
3681 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3682 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3683
3684 tcmsg = tc_make_request(netdev, type, flags, &request);
3685 if (!tcmsg) {
3686 return ENODEV;
3687 }
3688 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3689 tcmsg->tcm_parent = TC_H_INGRESS;
3690 nl_msg_put_string(&request, TCA_KIND, "ingress");
3691 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3692
3693 error = tc_transact(&request, NULL);
3694 if (error) {
3695 /* If we're deleting the qdisc, don't worry about some of the
3696 * error conditions. */
3697 if (!add && (error == ENOENT || error == EINVAL)) {
3698 return 0;
3699 }
3700 return error;
3701 }
3702
3703 return 0;
3704}
3705
3706/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3707 * of 'kbits_burst'.
3708 *
3709 * This function is equivalent to running:
3710 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3711 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3712 * mtu 65535 drop
3713 *
3714 * The configuration and stats may be seen with the following command:
3715 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3716 *
3717 * Returns 0 if successful, otherwise a positive errno value.
3718 */
3719static int
3720tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3721{
3722 struct tc_police tc_police;
3723 struct ofpbuf request;
3724 struct tcmsg *tcmsg;
3725 size_t basic_offset;
3726 size_t police_offset;
3727 int error;
3728 int mtu = 65535;
3729
3730 memset(&tc_police, 0, sizeof tc_police);
3731 tc_police.action = TC_POLICE_SHOT;
3732 tc_police.mtu = mtu;
3733 tc_fill_rate(&tc_police.rate, kbits_rate/8 * 1000, mtu);
3734 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3735 kbits_burst * 1024);
3736
3737 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3738 NLM_F_EXCL | NLM_F_CREATE, &request);
3739 if (!tcmsg) {
3740 return ENODEV;
3741 }
3742 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3743 tcmsg->tcm_info = tc_make_handle(49,
3744 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3745
3746 nl_msg_put_string(&request, TCA_KIND, "basic");
3747 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3748 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3749 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3750 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3751 nl_msg_end_nested(&request, police_offset);
3752 nl_msg_end_nested(&request, basic_offset);
3753
3754 error = tc_transact(&request, NULL);
3755 if (error) {
3756 return error;
3757 }
3758
3759 return 0;
3760}
3761
c1c9c9c4
BP
3762static void
3763read_psched(void)
3764{
3765 /* The values in psched are not individually very meaningful, but they are
3766 * important. The tables below show some values seen in the wild.
3767 *
3768 * Some notes:
3769 *
3770 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3771 * (Before that, there are hints that it was 1000000000.)
3772 *
3773 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3774 * above.
3775 *
3776 * /proc/net/psched
3777 * -----------------------------------
3778 * [1] 000c8000 000f4240 000f4240 00000064
3779 * [2] 000003e8 00000400 000f4240 3b9aca00
3780 * [3] 000003e8 00000400 000f4240 3b9aca00
3781 * [4] 000003e8 00000400 000f4240 00000064
3782 * [5] 000003e8 00000040 000f4240 3b9aca00
3783 * [6] 000003e8 00000040 000f4240 000000f9
3784 *
3785 * a b c d ticks_per_s buffer_hz
3786 * ------- --------- ---------- ------------- ----------- -------------
3787 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3788 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3789 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3790 * [4] 1,000 1,024 1,000,000 100 976,562 100
3791 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3792 * [6] 1,000 64 1,000,000 249 15,625,000 249
3793 *
3794 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3795 * [2] 2.6.26-1-686-bigmem from Debian lenny
3796 * [3] 2.6.26-2-sparc64 from Debian lenny
3797 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3798 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3799 * [6] 2.6.34 from kernel.org on KVM
3800 */
3801 static const char fn[] = "/proc/net/psched";
3802 unsigned int a, b, c, d;
3803 FILE *stream;
3804
3805 ticks_per_s = 1.0;
3806 buffer_hz = 100;
3807
3808 stream = fopen(fn, "r");
3809 if (!stream) {
3810 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3811 return;
3812 }
3813
3814 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3815 VLOG_WARN("%s: read failed", fn);
3816 fclose(stream);
3817 return;
3818 }
3819 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3820 fclose(stream);
3821
3822 if (!a || !c) {
3823 VLOG_WARN("%s: invalid scheduler parameters", fn);
3824 return;
3825 }
3826
3827 ticks_per_s = (double) a * c / b;
3828 if (c == 1000000) {
3829 buffer_hz = d;
3830 } else {
3831 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3832 fn, a, b, c, d);
3833 }
3834 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3835}
3836
3837/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3838 * rate of 'rate' bytes per second. */
3839static unsigned int
3840tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3841{
3842 if (!buffer_hz) {
3843 read_psched();
3844 }
3845 return (rate * ticks) / ticks_per_s;
3846}
3847
3848/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3849 * rate of 'rate' bytes per second. */
3850static unsigned int
3851tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3852{
3853 if (!buffer_hz) {
3854 read_psched();
3855 }
015c93a4 3856 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3857}
3858
3859/* Returns the number of bytes that need to be reserved for qdisc buffering at
3860 * a transmission rate of 'rate' bytes per second. */
3861static unsigned int
3862tc_buffer_per_jiffy(unsigned int rate)
3863{
3864 if (!buffer_hz) {
3865 read_psched();
3866 }
3867 return rate / buffer_hz;
3868}
3869
3870/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3871 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3872 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3873 * stores NULL into it if it is absent.
3874 *
3875 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3876 * 'msg'.
3877 *
3878 * Returns 0 if successful, otherwise a positive errno value. */
3879static int
3880tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3881 struct nlattr **options)
3882{
3883 static const struct nl_policy tca_policy[] = {
3884 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3885 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3886 };
3887 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3888
3889 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3890 tca_policy, ta, ARRAY_SIZE(ta))) {
3891 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3892 goto error;
3893 }
3894
3895 if (kind) {
3896 *kind = nl_attr_get_string(ta[TCA_KIND]);
3897 }
3898
3899 if (options) {
3900 *options = ta[TCA_OPTIONS];
3901 }
3902
3903 return 0;
3904
3905error:
3906 if (kind) {
3907 *kind = NULL;
3908 }
3909 if (options) {
3910 *options = NULL;
3911 }
3912 return EPROTO;
3913}
3914
3915/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3916 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3917 * into '*options', and its queue statistics into '*stats'. Any of the output
3918 * arguments may be null.
3919 *
3920 * Returns 0 if successful, otherwise a positive errno value. */
3921static int
3922tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3923 struct nlattr **options, struct netdev_queue_stats *stats)
3924{
3925 static const struct nl_policy tca_policy[] = {
3926 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3927 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3928 };
3929 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3930
3931 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3932 tca_policy, ta, ARRAY_SIZE(ta))) {
3933 VLOG_WARN_RL(&rl, "failed to parse class message");
3934 goto error;
3935 }
3936
3937 if (handlep) {
3938 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3939 *handlep = tc->tcm_handle;
3940 }
3941
3942 if (options) {
3943 *options = ta[TCA_OPTIONS];
3944 }
3945
3946 if (stats) {
3947 const struct gnet_stats_queue *gsq;
3948 struct gnet_stats_basic gsb;
3949
3950 static const struct nl_policy stats_policy[] = {
3951 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3952 .min_len = sizeof gsb },
3953 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3954 .min_len = sizeof *gsq },
3955 };
3956 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3957
3958 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3959 sa, ARRAY_SIZE(sa))) {
3960 VLOG_WARN_RL(&rl, "failed to parse class stats");
3961 goto error;
3962 }
3963
3964 /* Alignment issues screw up the length of struct gnet_stats_basic on
3965 * some arch/bitsize combinations. Newer versions of Linux have a
3966 * struct gnet_stats_basic_packed, but we can't depend on that. The
3967 * easiest thing to do is just to make a copy. */
3968 memset(&gsb, 0, sizeof gsb);
3969 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3970 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3971 stats->tx_bytes = gsb.bytes;
3972 stats->tx_packets = gsb.packets;
3973
3974 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3975 stats->tx_errors = gsq->drops;
3976 }
3977
3978 return 0;
3979
3980error:
3981 if (options) {
3982 *options = NULL;
3983 }
3984 if (stats) {
3985 memset(stats, 0, sizeof *stats);
3986 }
3987 return EPROTO;
3988}
3989
3990/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3991 * on 'netdev'. */
3992static int
3993tc_query_class(const struct netdev *netdev,
3994 unsigned int handle, unsigned int parent,
3995 struct ofpbuf **replyp)
3996{
3997 struct ofpbuf request;
3998 struct tcmsg *tcmsg;
3999 int error;
4000
4001 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
4002 if (!tcmsg) {
4003 return ENODEV;
4004 }
c1c9c9c4
BP
4005 tcmsg->tcm_handle = handle;
4006 tcmsg->tcm_parent = parent;
4007
4008 error = tc_transact(&request, replyp);
4009 if (error) {
4010 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4011 netdev_get_name(netdev),
4012 tc_get_major(handle), tc_get_minor(handle),
4013 tc_get_major(parent), tc_get_minor(parent),
4014 strerror(error));
4015 }
4016 return error;
4017}
4018
4019/* Equivalent to "tc class del dev <name> handle <handle>". */
4020static int
4021tc_delete_class(const struct netdev *netdev, unsigned int handle)
4022{
4023 struct ofpbuf request;
4024 struct tcmsg *tcmsg;
4025 int error;
4026
4027 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
4028 if (!tcmsg) {
4029 return ENODEV;
4030 }
c1c9c9c4
BP
4031 tcmsg->tcm_handle = handle;
4032 tcmsg->tcm_parent = 0;
4033
4034 error = tc_transact(&request, NULL);
4035 if (error) {
4036 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4037 netdev_get_name(netdev),
4038 tc_get_major(handle), tc_get_minor(handle),
4039 strerror(error));
4040 }
4041 return error;
4042}
4043
4044/* Equivalent to "tc qdisc del dev <name> root". */
4045static int
4046tc_del_qdisc(struct netdev *netdev)
4047{
4048 struct netdev_dev_linux *netdev_dev =
4049 netdev_dev_linux_cast(netdev_get_dev(netdev));
4050 struct ofpbuf request;
4051 struct tcmsg *tcmsg;
4052 int error;
4053
4054 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
4055 if (!tcmsg) {
4056 return ENODEV;
4057 }
c1c9c9c4
BP
4058 tcmsg->tcm_handle = tc_make_handle(1, 0);
4059 tcmsg->tcm_parent = TC_H_ROOT;
4060
4061 error = tc_transact(&request, NULL);
4062 if (error == EINVAL) {
4063 /* EINVAL probably means that the default qdisc was in use, in which
4064 * case we've accomplished our purpose. */
4065 error = 0;
4066 }
4067 if (!error && netdev_dev->tc) {
4068 if (netdev_dev->tc->ops->tc_destroy) {
4069 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4070 }
4071 netdev_dev->tc = NULL;
4072 }
4073 return error;
4074}
4075
4076/* If 'netdev''s qdisc type and parameters are not yet known, queries the
4077 * kernel to determine what they are. Returns 0 if successful, otherwise a
4078 * positive errno value. */
4079static int
4080tc_query_qdisc(const struct netdev *netdev)
4081{
4082 struct netdev_dev_linux *netdev_dev =
4083 netdev_dev_linux_cast(netdev_get_dev(netdev));
4084 struct ofpbuf request, *qdisc;
4085 const struct tc_ops *ops;
4086 struct tcmsg *tcmsg;
4087 int load_error;
4088 int error;
4089
4090 if (netdev_dev->tc) {
4091 return 0;
4092 }
4093
4094 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4095 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4096 * 2.6.35 without that fix backported to it.
4097 *
4098 * To avoid the OOPS, we must not make a request that would attempt to dump
4099 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4100 * few others. There are a few ways that I can see to do this, but most of
4101 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4102 * technique chosen here is to assume that any non-default qdisc that we
4103 * create will have a class with handle 1:0. The built-in qdiscs only have
4104 * a class with handle 0:0.
4105 *
4106 * We could check for Linux 2.6.35+ and use a more straightforward method
4107 * there. */
4108 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
4109 if (!tcmsg) {
4110 return ENODEV;
4111 }
c1c9c9c4
BP
4112 tcmsg->tcm_handle = tc_make_handle(1, 0);
4113 tcmsg->tcm_parent = 0;
4114
4115 /* Figure out what tc class to instantiate. */
4116 error = tc_transact(&request, &qdisc);
4117 if (!error) {
4118 const char *kind;
4119
4120 error = tc_parse_qdisc(qdisc, &kind, NULL);
4121 if (error) {
4122 ops = &tc_ops_other;
4123 } else {
4124 ops = tc_lookup_linux_name(kind);
4125 if (!ops) {
4126 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4127 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4128
4129 ops = &tc_ops_other;
4130 }
4131 }
4132 } else if (error == ENOENT) {
4133 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4134 * other entity that doesn't have a handle 1:0. We will assume
4135 * that it's the system default qdisc. */
4136 ops = &tc_ops_default;
4137 error = 0;
4138 } else {
4139 /* Who knows? Maybe the device got deleted. */
4140 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4141 netdev_get_name(netdev), strerror(error));
4142 ops = &tc_ops_other;
4143 }
4144
4145 /* Instantiate it. */
ebc56baa 4146 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
cb22974d 4147 ovs_assert((load_error == 0) == (netdev_dev->tc != NULL));
c1c9c9c4
BP
4148 ofpbuf_delete(qdisc);
4149
4150 return error ? error : load_error;
4151}
4152
4153/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4154 approximate the time to transmit packets of various lengths. For an MTU of
4155 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4156 represents two possible packet lengths; for a MTU of 513 through 1024, four
4157 possible lengths; and so on.
4158
4159 Returns, for the specified 'mtu', the number of bits that packet lengths
4160 need to be shifted right to fit within such a 256-entry table. */
4161static int
4162tc_calc_cell_log(unsigned int mtu)
4163{
4164 int cell_log;
4165
4166 if (!mtu) {
4167 mtu = ETH_PAYLOAD_MAX;
4168 }
4169 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4170
4171 for (cell_log = 0; mtu >= 256; cell_log++) {
4172 mtu >>= 1;
4173 }
4174
4175 return cell_log;
4176}
4177
4178/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4179 * of 'mtu'. */
4180static void
4181tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4182{
4183 memset(rate, 0, sizeof *rate);
4184 rate->cell_log = tc_calc_cell_log(mtu);
4185 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4186 /* rate->cell_align = 0; */ /* distro headers. */
4187 rate->mpu = ETH_TOTAL_MIN;
4188 rate->rate = Bps;
4189}
4190
4191/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4192 * attribute of the specified "type".
4193 *
4194 * See tc_calc_cell_log() above for a description of "rtab"s. */
4195static void
4196tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4197{
4198 uint32_t *rtab;
4199 unsigned int i;
4200
4201 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4202 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4203 unsigned packet_size = (i + 1) << rate->cell_log;
4204 if (packet_size < rate->mpu) {
4205 packet_size = rate->mpu;
4206 }
4207 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4208 }
4209}
4210
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given
 * a rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (Pass 0 for 'burst_bytes' if no value was
 * requested.)
 *
 * The burst actually used is never smaller than tc_buffer_per_jiffy(Bps)
 * plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > min_burst ? burst_bytes : min_burst;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 4221\f
aaf2fb1a
BP
4222/* Linux-only functions declared in netdev-linux.h */
4223
025e874a
BP
4224/* Returns a fd for an AF_INET socket or a negative errno value. */
4225int
4226netdev_linux_get_af_inet_sock(void)
4227{
4228 int error = netdev_linux_init();
4229 return error ? -error : af_inet_sock;
4230}
4231
aaf2fb1a
BP
4232/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4233 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4234int
4235netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4236 const char *flag_name, bool enable)
4237{
4238 const char *netdev_name = netdev_get_name(netdev);
4239 struct ethtool_value evalue;
4240 uint32_t new_flags;
4241 int error;
4242
ab985a77 4243 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4244 memset(&evalue, 0, sizeof evalue);
4245 error = netdev_linux_do_ethtool(netdev_name,
4246 (struct ethtool_cmd *)&evalue,
4247 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4248 if (error) {
4249 return error;
4250 }
4251
ab985a77 4252 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
4253 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4254 error = netdev_linux_do_ethtool(netdev_name,
4255 (struct ethtool_cmd *)&evalue,
4256 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4257 if (error) {
4258 return error;
4259 }
4260
ab985a77 4261 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4262 memset(&evalue, 0, sizeof evalue);
4263 error = netdev_linux_do_ethtool(netdev_name,
4264 (struct ethtool_cmd *)&evalue,
4265 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4266 if (error) {
4267 return error;
4268 }
4269
4270 if (new_flags != evalue.data) {
4271 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4272 "device %s failed", enable ? "enable" : "disable",
4273 flag_name, netdev_name);
4274 return EOPNOTSUPP;
4275 }
4276
4277 return 0;
4278}
4279\f
4280/* Utility functions. */
4281
d3980822 4282/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4283static void
d3980822
BP
4284netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4285 const struct rtnl_link_stats *src)
4286{
f613a0d7
PS
4287 dst->rx_packets = src->rx_packets;
4288 dst->tx_packets = src->tx_packets;
4289 dst->rx_bytes = src->rx_bytes;
4290 dst->tx_bytes = src->tx_bytes;
4291 dst->rx_errors = src->rx_errors;
4292 dst->tx_errors = src->tx_errors;
4293 dst->rx_dropped = src->rx_dropped;
4294 dst->tx_dropped = src->tx_dropped;
4295 dst->multicast = src->multicast;
4296 dst->collisions = src->collisions;
4297 dst->rx_length_errors = src->rx_length_errors;
4298 dst->rx_over_errors = src->rx_over_errors;
4299 dst->rx_crc_errors = src->rx_crc_errors;
4300 dst->rx_frame_errors = src->rx_frame_errors;
4301 dst->rx_fifo_errors = src->rx_fifo_errors;
4302 dst->rx_missed_errors = src->rx_missed_errors;
4303 dst->tx_aborted_errors = src->tx_aborted_errors;
4304 dst->tx_carrier_errors = src->tx_carrier_errors;
4305 dst->tx_fifo_errors = src->tx_fifo_errors;
4306 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4307 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4308}
4309
c1c9c9c4
BP
4310static int
4311get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4312{
4313 /* Policy for RTNLGRP_LINK messages.
4314 *
4315 * There are *many* more fields in these messages, but currently we only
4316 * care about these fields. */
4317 static const struct nl_policy rtnlgrp_link_policy[] = {
4318 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4319 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4320 .min_len = sizeof(struct rtnl_link_stats) },
4321 };
4322
4323 struct ofpbuf request;
4324 struct ofpbuf *reply;
4325 struct ifinfomsg *ifi;
c1c9c9c4
BP
4326 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4327 int error;
4328
4329 ofpbuf_init(&request, 0);
4330 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4331 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4332 ifi->ifi_family = PF_UNSPEC;
4333 ifi->ifi_index = ifindex;
4334 error = nl_sock_transact(rtnl_sock, &request, &reply);
4335 ofpbuf_uninit(&request);
4336 if (error) {
4337 return error;
4338 }
4339
4340 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4341 rtnlgrp_link_policy,
4342 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4343 ofpbuf_delete(reply);
4344 return EPROTO;
4345 }
4346
4347 if (!attrs[IFLA_STATS]) {
4348 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4349 ofpbuf_delete(reply);
4350 return EPROTO;
4351 }
8b61709d 4352
d3980822 4353 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4354
576e26d7
BP
4355 ofpbuf_delete(reply);
4356
8b61709d
BP
4357 return 0;
4358}
4359
4360static int
4361get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4362{
4363 static const char fn[] = "/proc/net/dev";
4364 char line[1024];
4365 FILE *stream;
4366 int ln;
4367
4368 stream = fopen(fn, "r");
4369 if (!stream) {
4370 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4371 return errno;
4372 }
4373
4374 ln = 0;
4375 while (fgets(line, sizeof line, stream)) {
4376 if (++ln >= 3) {
4377 char devname[16];
4378#define X64 "%"SCNu64
4379 if (sscanf(line,
4380 " %15[^:]:"
4381 X64 X64 X64 X64 X64 X64 X64 "%*u"
4382 X64 X64 X64 X64 X64 X64 X64 "%*u",
4383 devname,
4384 &stats->rx_bytes,
4385 &stats->rx_packets,
4386 &stats->rx_errors,
4387 &stats->rx_dropped,
4388 &stats->rx_fifo_errors,
4389 &stats->rx_frame_errors,
4390 &stats->multicast,
4391 &stats->tx_bytes,
4392 &stats->tx_packets,
4393 &stats->tx_errors,
4394 &stats->tx_dropped,
4395 &stats->tx_fifo_errors,
4396 &stats->collisions,
4397 &stats->tx_carrier_errors) != 15) {
4398 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4399 } else if (!strcmp(devname, netdev_name)) {
4400 stats->rx_length_errors = UINT64_MAX;
4401 stats->rx_over_errors = UINT64_MAX;
4402 stats->rx_crc_errors = UINT64_MAX;
4403 stats->rx_missed_errors = UINT64_MAX;
4404 stats->tx_aborted_errors = UINT64_MAX;
4405 stats->tx_heartbeat_errors = UINT64_MAX;
4406 stats->tx_window_errors = UINT64_MAX;
4407 fclose(stream);
4408 return 0;
4409 }
4410 }
4411 }
4412 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4413 fclose(stream);
4414 return ENODEV;
4415}
c1c9c9c4 4416
3a183124 4417static int
059e5f4f 4418get_flags(const struct netdev_dev *dev, unsigned int *flags)
8b61709d
BP
4419{
4420 struct ifreq ifr;
4421 int error;
4422
755be9ea
EJ
4423 *flags = 0;
4424 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
149f577a 4425 "SIOCGIFFLAGS");
755be9ea
EJ
4426 if (!error) {
4427 *flags = ifr.ifr_flags;
4428 }
8b61709d
BP
4429 return error;
4430}
4431
4432static int
059e5f4f 4433set_flags(struct netdev *netdev, unsigned int flags)
8b61709d
BP
4434{
4435 struct ifreq ifr;
4436
4437 ifr.ifr_flags = flags;
149f577a
JG
4438 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4439 "SIOCSIFFLAGS");
8b61709d
BP
4440}
4441
4442static int
4443do_get_ifindex(const char *netdev_name)
4444{
4445 struct ifreq ifr;
4446
71d7c22f 4447 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4448 COVERAGE_INC(netdev_get_ifindex);
4449 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4450 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4451 netdev_name, strerror(errno));
4452 return -errno;
4453 }
4454 return ifr.ifr_ifindex;
4455}
4456
4457static int
4458get_ifindex(const struct netdev *netdev_, int *ifindexp)
4459{
149f577a
JG
4460 struct netdev_dev_linux *netdev_dev =
4461 netdev_dev_linux_cast(netdev_get_dev(netdev_));
c7b1b0a5 4462
149f577a 4463 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d 4464 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 4465
8b61709d 4466 if (ifindex < 0) {
c7b1b0a5
PS
4467 netdev_dev->get_ifindex_error = -ifindex;
4468 netdev_dev->ifindex = 0;
4469 } else {
4470 netdev_dev->get_ifindex_error = 0;
4471 netdev_dev->ifindex = ifindex;
8b61709d 4472 }
149f577a 4473 netdev_dev->cache_valid |= VALID_IFINDEX;
8b61709d 4474 }
c7b1b0a5 4475
149f577a 4476 *ifindexp = netdev_dev->ifindex;
c7b1b0a5 4477 return netdev_dev->get_ifindex_error;
8b61709d
BP
4478}
4479
4480static int
4481get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4482{
4483 struct ifreq ifr;
4484 int hwaddr_family;
4485
4486 memset(&ifr, 0, sizeof ifr);
71d7c22f 4487 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4488 COVERAGE_INC(netdev_get_hwaddr);
4489 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4490 /* ENODEV probably means that a vif disappeared asynchronously and
4491 * hasn't been removed from the database yet, so reduce the log level
4492 * to INFO for that case. */
4493 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4494 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4495 netdev_name, strerror(errno));
8b61709d
BP
4496 return errno;
4497 }
4498 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4499 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4500 VLOG_WARN("%s device has unknown hardware address family %d",
4501 netdev_name, hwaddr_family);
4502 }
4503 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4504 return 0;
4505}
4506
4507static int
44445cac 4508set_etheraddr(const char *netdev_name,
8b61709d
BP
4509 const uint8_t mac[ETH_ADDR_LEN])
4510{
4511 struct ifreq ifr;
4512
4513 memset(&ifr, 0, sizeof ifr);
71d7c22f 4514 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 4515 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
4516 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4517 COVERAGE_INC(netdev_set_hwaddr);
4518 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4519 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4520 netdev_name, strerror(errno));
4521 return errno;
4522 }
4523 return 0;
4524}
4525
4526static int
0b0544d7 4527netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4528 int cmd, const char *cmd_name)
4529{
4530 struct ifreq ifr;
4531
4532 memset(&ifr, 0, sizeof ifr);
71d7c22f 4533 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4534 ifr.ifr_data = (caddr_t) ecmd;
4535
4536 ecmd->cmd = cmd;
8b61709d
BP
4537 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4538 return 0;
4539 } else {
4540 if (errno != EOPNOTSUPP) {
4541 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4542 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4543 } else {
4544 /* The device doesn't support this operation. That's pretty
4545 * common, so there's no point in logging anything. */
4546 }
4547 return errno;
4548 }
4549}
4550
4551static int
149f577a
JG
4552netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4553 const char *cmd_name)
8b61709d 4554{
71d7c22f 4555 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4556 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4557 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4558 strerror(errno));
8b61709d
BP
4559 return errno;
4560 }
4561 return 0;
4562}
f1acd62b
BP
4563
4564static int
4565netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4566 int cmd, const char *cmd_name)
4567{
4568 struct ifreq ifr;
4569 int error;
4570
4571 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4572 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4573 if (!error) {
4574 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4575 *ip = sin->sin_addr;
4576 }
4577 return error;
4578}
488d734d
BP
4579
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created on the first call only; the outcome of that attempt,
 * whether a file descriptor or an error, is returned by every later call. */
static int
af_packet_sock(void)
{
    /* INT_MIN marks "not yet attempted". */
    static int sock = INT_MIN;

    if (sock != INT_MIN) {
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        set_nonblocking(sock);
    }

    return sock;
}