]> git.proxmox.com Git - ovs.git/blame - lib/netdev-linux.c
meta-flow: Simplify mf_from_ofp_port_string()
[ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
275707c3 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d
BP
22#include <fcntl.h>
23#include <arpa/inet.h>
24#include <inttypes.h>
c1c9c9c4 25#include <linux/gen_stats.h>
bb7d0e22 26#include <linux/if_ether.h>
8b61709d
BP
27#include <linux/if_tun.h>
28#include <linux/types.h>
29#include <linux/ethtool.h>
63331829 30#include <linux/mii.h>
f8500004 31#include <linux/pkt_cls.h>
6f42c8ea 32#include <linux/pkt_sched.h>
e9e28be3 33#include <linux/rtnetlink.h>
8b61709d
BP
34#include <linux/sockios.h>
35#include <linux/version.h>
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
39#include <netpacket/packet.h>
8b61709d
BP
40#include <net/if.h>
41#include <net/if_arp.h>
42#include <net/if_packet.h>
43#include <net/route.h>
44#include <netinet/in.h>
e9e28be3 45#include <poll.h>
8b61709d
BP
46#include <stdlib.h>
47#include <string.h>
48#include <unistd.h>
e9e28be3
BP
49
50#include "coverage.h"
9fe3b9a2 51#include "dpif-linux.h"
8b61709d
BP
52#include "dynamic-string.h"
53#include "fatal-signal.h"
93b13be8
BP
54#include "hash.h"
55#include "hmap.h"
8b61709d 56#include "netdev-provider.h"
7fbef77a 57#include "netdev-vport.h"
45c8d3a1 58#include "netlink-notifier.h"
2fe27d5a 59#include "netlink-socket.h"
c060c4cf 60#include "netlink.h"
e9e28be3 61#include "ofpbuf.h"
8b61709d
BP
62#include "openflow/openflow.h"
63#include "packets.h"
64#include "poll-loop.h"
21d6e22e 65#include "rtnetlink-link.h"
8b61709d 66#include "shash.h"
c060c4cf 67#include "socket-util.h"
19993ef3 68#include "sset.h"
1670c579 69#include "timer.h"
c060c4cf 70#include "unaligned.h"
e9e28be3 71#include "vlog.h"
5136ce49 72
/* Log module used by the VLOG_* calls in this file. */
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters defined for this file. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 82
8b61709d
BP
83\f
/* Fallback definitions for constants that older kernel headers lack. */

/* Added in Linux 2.6.14. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* Added in Linux 2.6.24. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* Added in Linux 2.6.25. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
107
/* Link-change notifier shared by all devices in this file, plus the number of
 * references to it.  cache_notifier_ref() creates the notifier on the first
 * reference and cache_notifier_unref() destroys it on the last. */
static struct nln_notifier *netdev_linux_cache_notifier = NULL;
static int cache_notifier_refcount;
8b61709d
BP
110
/* Bits for 'cache_valid' in struct netdev_dev_linux.  A set bit means the
 * corresponding cached member of that struct is currently valid. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
122
149f577a
JG
/* Per-device state kept only for tap devices. */
struct tap_state {
    int fd;         /* Descriptor from opening "/dev/net/tun". */
    bool opened;    /* Set once netdev_linux_listen() has handed 'fd' out. */
};
c1c9c9c4
BP
127\f
128/* Traffic control. */
129
130/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
131 * network device.
132 *
133 * Each TC implementation subclasses this with whatever additional data it
134 * needs. */
c1c9c9c4
BP
135struct tc {
136 const struct tc_ops *ops;
93b13be8
BP
137 struct hmap queues; /* Contains "struct tc_queue"s.
138 * Read by generic TC layer.
139 * Written only by TC implementation. */
140};
c1c9c9c4 141
559eb230
BP
/* Static initializer for struct tc 'TC' with operations 'OPS' and an empty
 * 'queues' map. */
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
143
93b13be8
BP
144/* One traffic control queue.
145 *
146 * Each TC implementation subclasses this with whatever additional data it
147 * needs. */
148struct tc_queue {
149 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
150 unsigned int queue_id; /* OpenFlow queue ID. */
c1c9c9c4
BP
151};
152
153/* A particular kind of traffic control. Each implementation generally maps to
154 * one particular Linux qdisc class.
155 *
156 * The functions below return 0 if successful or a positive errno value on
157 * failure, except where otherwise noted. All of them must be provided, except
158 * where otherwise noted. */
159struct tc_ops {
160 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
161 * This is null for tc_ops_default and tc_ops_other, for which there are no
162 * appropriate values. */
163 const char *linux_name;
164
165 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
166 const char *ovs_name;
167
168 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
169 * queues. The queues are numbered 0 through n_queues - 1. */
170 unsigned int n_queues;
171
172 /* Called to install this TC class on 'netdev'. The implementation should
173 * make the Netlink calls required to set up 'netdev' with the right qdisc
174 * and configure it according to 'details'. The implementation may assume
175 * that the current qdisc is the default; that is, there is no need for it
176 * to delete the current qdisc before installing itself.
177 *
178 * The contents of 'details' should be documented as valid for 'ovs_name'
179 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
180 * (which is built as ovs-vswitchd.conf.db(8)).
181 *
182 * This function must return 0 if and only if it sets 'netdev->tc' to an
183 * initialized 'struct tc'.
184 *
185 * (This function is null for tc_ops_other, which cannot be installed. For
186 * other TC classes it should always be nonnull.) */
79f1cbe9 187 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
188
189 /* Called when the netdev code determines (through a Netlink query) that
190 * this TC class's qdisc is installed on 'netdev', but we didn't install
191 * it ourselves and so don't know any of the details.
192 *
193 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
194 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
195 * implementation should parse the other attributes of 'nlmsg' as
196 * necessary to determine its configuration. If necessary it should also
197 * use Netlink queries to determine the configuration of queues on
198 * 'netdev'.
199 *
200 * This function must return 0 if and only if it sets 'netdev->tc' to an
201 * initialized 'struct tc'. */
202 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
203
204 /* Destroys the data structures allocated by the implementation as part of
205 * 'tc'. (This includes destroying 'tc->queues' by calling
206 * tc_destroy(tc).
207 *
208 * The implementation should not need to perform any Netlink calls. If
209 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
210 * (But it may not be desirable.)
211 *
212 * This function may be null if 'tc' is trivial. */
213 void (*tc_destroy)(struct tc *tc);
214
215 /* Retrieves details of 'netdev->tc' configuration into 'details'.
216 *
217 * The implementation should not need to perform any Netlink calls, because
218 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
219 * cached the configuration.
220 *
221 * The contents of 'details' should be documented as valid for 'ovs_name'
222 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
223 * (which is built as ovs-vswitchd.conf.db(8)).
224 *
225 * This function may be null if 'tc' is not configurable.
226 */
79f1cbe9 227 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
228
229 /* Reconfigures 'netdev->tc' according to 'details', performing any
230 * required Netlink calls to complete the reconfiguration.
231 *
232 * The contents of 'details' should be documented as valid for 'ovs_name'
233 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
234 * (which is built as ovs-vswitchd.conf.db(8)).
235 *
236 * This function may be null if 'tc' is not configurable.
237 */
79f1cbe9 238 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 239
93b13be8
BP
240 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
241 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
242 *
243 * The contents of 'details' should be documented as valid for 'ovs_name'
244 * in the "other_config" column in the "Queue" table in
245 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
246 *
247 * The implementation should not need to perform any Netlink calls, because
248 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
249 * cached the queue configuration.
250 *
251 * This function may be null if 'tc' does not have queues ('n_queues' is
252 * 0). */
93b13be8 253 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 254 struct smap *details);
c1c9c9c4
BP
255
256 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
257 * 'details', perfoming any required Netlink calls to complete the
258 * reconfiguration. The caller ensures that 'queue_id' is less than
259 * 'n_queues'.
260 *
261 * The contents of 'details' should be documented as valid for 'ovs_name'
262 * in the "other_config" column in the "Queue" table in
263 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
264 *
265 * This function may be null if 'tc' does not have queues or its queues are
266 * not configurable. */
267 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 268 const struct smap *details);
c1c9c9c4 269
93b13be8
BP
270 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
271 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
272 *
273 * This function may be null if 'tc' does not have queues or its queues
274 * cannot be deleted. */
93b13be8 275 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 276
93b13be8
BP
277 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
278 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
279 *
280 * On success, initializes '*stats'.
281 *
282 * This function may be null if 'tc' does not have queues or if it cannot
283 * report queue statistics. */
93b13be8
BP
284 int (*class_get_stats)(const struct netdev *netdev,
285 const struct tc_queue *queue,
c1c9c9c4
BP
286 struct netdev_queue_stats *stats);
287
288 /* Extracts queue stats from 'nlmsg', which is a response to a
289 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
290 *
291 * This function may be null if 'tc' does not have queues or if it cannot
292 * report queue statistics. */
293 int (*class_dump_stats)(const struct netdev *netdev,
294 const struct ofpbuf *nlmsg,
295 netdev_dump_queue_stats_cb *cb, void *aux);
296};
297
298static void
299tc_init(struct tc *tc, const struct tc_ops *ops)
300{
301 tc->ops = ops;
93b13be8 302 hmap_init(&tc->queues);
c1c9c9c4
BP
303}
304
305static void
306tc_destroy(struct tc *tc)
307{
93b13be8 308 hmap_destroy(&tc->queues);
c1c9c9c4
BP
309}
310
311static const struct tc_ops tc_ops_htb;
a339aa81 312static const struct tc_ops tc_ops_hfsc;
c1c9c9c4
BP
313static const struct tc_ops tc_ops_default;
314static const struct tc_ops tc_ops_other;
315
559eb230 316static const struct tc_ops *const tcs[] = {
c1c9c9c4 317 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 318 &tc_ops_hfsc, /* Hierarchical fair service curve. */
c1c9c9c4
BP
319 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
320 &tc_ops_other, /* Some other qdisc. */
321 NULL
322};
149f577a 323
c1c9c9c4
BP
/* Prototypes for traffic control helpers defined elsewhere in this file. */

/* TC handle packing and unpacking. */
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
static unsigned int tc_get_major(unsigned int handle);
static unsigned int tc_get_minor(unsigned int handle);

/* Rate and tick conversions. */
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);

/* Netlink requests, ingress qdisc and policer. */
static struct tcmsg *tc_make_request(const struct netdev *, int type,
                                     unsigned int flags, struct ofpbuf *);
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
static int tc_add_policer(struct netdev *netdev, int kbits_rate,
                          int kbits_burst);

/* Qdisc and class parsing, queries and deletion. */
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

/* Rate tables and buffer sizing. */
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
                        const struct tc_ratespec *rate);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
357\f
149f577a
JG
358struct netdev_dev_linux {
359 struct netdev_dev netdev_dev;
360
8b61709d 361 struct shash_node *shash_node;
149f577a 362 unsigned int cache_valid;
ac4d3bcb 363 unsigned int change_seq;
8b61709d 364
1670c579
EJ
365 bool miimon; /* Link status of last poll. */
366 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
367 struct timer miimon_timer;
368
8722022c
BP
369 /* The following are figured out "on demand" only. They are only valid
370 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d
BP
371 int ifindex;
372 uint8_t etheraddr[ETH_ADDR_LEN];
f1acd62b 373 struct in_addr address, netmask;
8b61709d
BP
374 struct in6_addr in6;
375 int mtu;
059e5f4f 376 unsigned int ifi_flags;
65c3058c 377 long long int carrier_resets;
80a86fbe
BP
378 uint32_t kbits_rate; /* Policing data. */
379 uint32_t kbits_burst;
bba1e6f3
PS
380 int vport_stats_error; /* Cached error code from vport_get_stats().
381 0 or an errno value. */
90a6637d 382 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 383 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 384 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 385 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 386 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 387
a00ca915
EJ
388 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
389 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
390 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
391 enum netdev_features peer; /* Cached from ETHTOOL_GSET. */
90a6637d 392
4f925bd3 393 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 394 struct tc *tc;
149f577a
JG
395
396 union {
397 struct tap_state tap;
398 } state;
8b61709d
BP
399};
400
149f577a
JG
401struct netdev_linux {
402 struct netdev netdev;
5b7448ed 403 int fd;
149f577a 404};
8b61709d 405
76c308b5
BP
406/* Sockets used for ioctl operations. */
407static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
8b61709d 408
ff4ed3c9
BP
409/* A Netlink routing socket that is not subscribed to any multicast groups. */
410static struct nl_sock *rtnl_sock;
411
8b61709d
BP
412/* This is set pretty low because we probably won't learn anything from the
413 * additional log messages. */
414static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
415
15b3596a 416static int netdev_linux_init(void);
6f643e49 417
0b0544d7 418static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 419 int cmd, const char *cmd_name);
149f577a
JG
420static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
421 const char *cmd_name);
f1acd62b
BP
422static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
423 int cmd, const char *cmd_name);
059e5f4f 424static int get_flags(const struct netdev_dev *, unsigned int *flags);
4b609110 425static int set_flags(const char *, unsigned int flags);
8b61709d
BP
426static int do_get_ifindex(const char *netdev_name);
427static int get_ifindex(const struct netdev *, int *ifindexp);
428static int do_set_addr(struct netdev *netdev,
429 int ioctl_nr, const char *ioctl_name,
430 struct in_addr addr);
431static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
44445cac 432static int set_etheraddr(const char *netdev_name, const uint8_t[ETH_ADDR_LEN]);
8b61709d
BP
433static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
434static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
488d734d 435static int af_packet_sock(void);
1670c579
EJ
436static void netdev_linux_miimon_run(void);
437static void netdev_linux_miimon_wait(void);
8b61709d 438
15b3596a
JG
439static bool
440is_netdev_linux_class(const struct netdev_class *netdev_class)
441{
442 return netdev_class->init == netdev_linux_init;
443}
444
149f577a
JG
445static struct netdev_dev_linux *
446netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
6c88d577 447{
15b3596a 448 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
cb22974d 449 ovs_assert(is_netdev_linux_class(netdev_class));
15b3596a 450
149f577a 451 return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
6c88d577
JP
452}
453
8b61709d
BP
454static struct netdev_linux *
455netdev_linux_cast(const struct netdev *netdev)
456{
15b3596a
JG
457 struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
458 const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
cb22974d 459 ovs_assert(is_netdev_linux_class(netdev_class));
15b3596a 460
8b61709d
BP
461 return CONTAINER_OF(netdev, struct netdev_linux, netdev);
462}
ff4ed3c9 463\f
8b61709d
BP
464static int
465netdev_linux_init(void)
466{
467 static int status = -1;
468 if (status < 0) {
ff4ed3c9 469 /* Create AF_INET socket. */
8b61709d
BP
470 af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
471 status = af_inet_sock >= 0 ? 0 : errno;
472 if (status) {
473 VLOG_ERR("failed to create inet socket: %s", strerror(status));
474 }
ff4ed3c9
BP
475
476 /* Create rtnetlink socket. */
477 if (!status) {
cceb11f5 478 status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
ff4ed3c9
BP
479 if (status) {
480 VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
481 strerror(status));
482 }
483 }
8b61709d
BP
484 }
485 return status;
486}
487
/* Periodic work for this file: runs the rtnetlink link notifier, then the MII
 * monitor. */
static void
netdev_linux_run(void)
{
    rtnetlink_link_run();
    netdev_linux_miimon_run();
}
494
/* Wait-side counterpart of netdev_linux_run(): calls the wait functions of the
 * rtnetlink link notifier and of the MII monitor. */
static void
netdev_linux_wait(void)
{
    rtnetlink_link_wait();
    netdev_linux_miimon_wait();
}
501
ac4d3bcb 502static void
4f925bd3
PS
503netdev_dev_linux_changed(struct netdev_dev_linux *dev,
504 unsigned int ifi_flags,
505 unsigned int mask)
ac4d3bcb
EJ
506{
507 dev->change_seq++;
508 if (!dev->change_seq) {
509 dev->change_seq++;
510 }
8aa77183
BP
511
512 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
513 dev->carrier_resets++;
514 }
515 dev->ifi_flags = ifi_flags;
516
4f925bd3
PS
517 dev->cache_valid &= mask;
518}
519
520static void
521netdev_dev_linux_update(struct netdev_dev_linux *dev,
522 const struct rtnetlink_link_change *change)
523{
524 if (change->nlmsg_type == RTM_NEWLINK) {
525 /* Keep drv-info */
526 netdev_dev_linux_changed(dev, change->ifi_flags, VALID_DRVINFO);
90a6637d 527
c7b1b0a5 528 /* Update netdev from rtnl-change msg. */
90a6637d
PS
529 if (change->mtu) {
530 dev->mtu = change->mtu;
531 dev->cache_valid |= VALID_MTU;
532 dev->netdev_mtu_error = 0;
533 }
534
44445cac
PS
535 if (!eth_addr_is_zero(change->addr)) {
536 memcpy(dev->etheraddr, change->addr, ETH_ADDR_LEN);
537 dev->cache_valid |= VALID_ETHERADDR;
538 dev->ether_addr_error = 0;
539 }
540
c7b1b0a5
PS
541 dev->ifindex = change->ifi_index;
542 dev->cache_valid |= VALID_IFINDEX;
543 dev->get_ifindex_error = 0;
544
4f925bd3
PS
545 } else {
546 netdev_dev_linux_changed(dev, change->ifi_flags, 0);
547 }
ac4d3bcb
EJ
548}
549
8b61709d 550static void
21d6e22e 551netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
67a4917b 552 void *aux OVS_UNUSED)
8b61709d 553{
149f577a 554 struct netdev_dev_linux *dev;
8b61709d 555 if (change) {
46415c90
JG
556 struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
557 if (base_dev) {
15b3596a
JG
558 const struct netdev_class *netdev_class =
559 netdev_dev_get_class(base_dev);
560
561 if (is_netdev_linux_class(netdev_class)) {
562 dev = netdev_dev_linux_cast(base_dev);
4f925bd3 563 netdev_dev_linux_update(dev, change);
15b3596a 564 }
8b61709d
BP
565 }
566 } else {
46415c90 567 struct shash device_shash;
8b61709d 568 struct shash_node *node;
46415c90
JG
569
570 shash_init(&device_shash);
571 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
572 SHASH_FOR_EACH (node, &device_shash) {
059e5f4f 573 unsigned int flags;
3a183124 574
149f577a 575 dev = node->data;
3a183124 576
755be9ea 577 get_flags(&dev->netdev_dev, &flags);
4f925bd3 578 netdev_dev_linux_changed(dev, flags, 0);
8b61709d 579 }
46415c90 580 shash_destroy(&device_shash);
8b61709d
BP
581 }
582}
583
584static int
1f6e0fbd 585cache_notifier_ref(void)
6c88d577 586{
46415c90 587 if (!cache_notifier_refcount) {
cb22974d 588 ovs_assert(!netdev_linux_cache_notifier);
2ee6545f
EJ
589
590 netdev_linux_cache_notifier =
591 rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
592
593 if (!netdev_linux_cache_notifier) {
594 return EINVAL;
149f577a
JG
595 }
596 }
46415c90 597 cache_notifier_refcount++;
6c88d577 598
1f6e0fbd
BP
599 return 0;
600}
601
602static void
603cache_notifier_unref(void)
604{
cb22974d 605 ovs_assert(cache_notifier_refcount > 0);
1f6e0fbd 606 if (!--cache_notifier_refcount) {
cb22974d 607 ovs_assert(netdev_linux_cache_notifier);
1f6e0fbd
BP
608 rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
609 netdev_linux_cache_notifier = NULL;
610 }
611}
612
613/* Creates system and internal devices. */
614static int
615netdev_linux_create(const struct netdev_class *class, const char *name,
616 struct netdev_dev **netdev_devp)
617{
618 struct netdev_dev_linux *netdev_dev;
619 int error;
620
621 error = cache_notifier_ref();
622 if (error) {
623 return error;
624 }
625
149f577a 626 netdev_dev = xzalloc(sizeof *netdev_dev);
ac4d3bcb 627 netdev_dev->change_seq = 1;
de5cdb90 628 netdev_dev_init(&netdev_dev->netdev_dev, name, class);
c37d4da4 629 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
46415c90 630
149f577a 631 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
632 return 0;
633}
634
5b7448ed
JG
635/* For most types of netdevs we open the device for each call of
636 * netdev_open(). However, this is not the case with tap devices,
637 * since it is only possible to open the device once. In this
638 * situation we share a single file descriptor, and consequently
639 * buffers, across all readers. Therefore once data is read it will
640 * be unavailable to other reads for tap devices. */
a740f0de 641static int
b8dcf5e9 642netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
de5cdb90 643 const char *name, struct netdev_dev **netdev_devp)
a740f0de 644{
149f577a 645 struct netdev_dev_linux *netdev_dev;
a740f0de
JG
646 struct tap_state *state;
647 static const char tap_dev[] = "/dev/net/tun";
648 struct ifreq ifr;
649 int error;
650
149f577a
JG
651 netdev_dev = xzalloc(sizeof *netdev_dev);
652 state = &netdev_dev->state.tap;
a740f0de 653
1f6e0fbd
BP
654 error = cache_notifier_ref();
655 if (error) {
656 goto error;
657 }
658
6c88d577 659 /* Open tap device. */
149f577a
JG
660 state->fd = open(tap_dev, O_RDWR);
661 if (state->fd < 0) {
6c88d577
JP
662 error = errno;
663 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
1f6e0fbd 664 goto error_unref_notifier;
6c88d577
JP
665 }
666
667 /* Create tap device. */
668 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 669 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
149f577a 670 if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
6c88d577
JP
671 VLOG_WARN("%s: creating tap device failed: %s", name,
672 strerror(errno));
673 error = errno;
1f6e0fbd 674 goto error_unref_notifier;
6c88d577
JP
675 }
676
677 /* Make non-blocking. */
149f577a 678 error = set_nonblocking(state->fd);
a740f0de 679 if (error) {
1f6e0fbd 680 goto error_unref_notifier;
a740f0de
JG
681 }
682
de5cdb90 683 netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
149f577a 684 *netdev_devp = &netdev_dev->netdev_dev;
a740f0de
JG
685 return 0;
686
1f6e0fbd
BP
687error_unref_notifier:
688 cache_notifier_unref();
a740f0de 689error:
149f577a 690 free(netdev_dev);
a740f0de
JG
691 return error;
692}
693
a740f0de 694static void
149f577a 695destroy_tap(struct netdev_dev_linux *netdev_dev)
a740f0de 696{
149f577a
JG
697 struct tap_state *state = &netdev_dev->state.tap;
698
699 if (state->fd >= 0) {
700 close(state->fd);
a740f0de
JG
701 }
702}
703
149f577a 704/* Destroys the netdev device 'netdev_dev_'. */
6c88d577 705static void
149f577a 706netdev_linux_destroy(struct netdev_dev *netdev_dev_)
6c88d577 707{
149f577a 708 struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
d2bb2799 709 const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
6c88d577 710
c1c9c9c4
BP
711 if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
712 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
713 }
714
1f6e0fbd 715 if (class == &netdev_tap_class) {
149f577a 716 destroy_tap(netdev_dev);
6c88d577 717 }
658797c8 718 free(netdev_dev);
1f6e0fbd
BP
719
720 cache_notifier_unref();
6c88d577
JP
721}
722
8b61709d 723static int
7b6b0ef4 724netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
8b61709d
BP
725{
726 struct netdev_linux *netdev;
727 enum netdev_flags flags;
728 int error;
729
730 /* Allocate network device. */
ec6fde61 731 netdev = xzalloc(sizeof *netdev);
49a6a163 732 netdev->fd = -1;
5b7448ed 733 netdev_init(&netdev->netdev, netdev_dev_);
8b61709d 734
c3827f61
BP
735 /* Verify that the device really exists, by attempting to read its flags.
736 * (The flags might be cached, in which case this won't actually do an
737 * ioctl.)
738 *
739 * Don't do this for "internal" netdevs, though, because those have to be
740 * created as netdev objects before they exist in the kernel, because
741 * creating them in the kernel happens by passing a netdev object to
742 * dpif_port_add(). */
743 if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
744 error = netdev_get_flags(&netdev->netdev, &flags);
745 if (error == ENODEV) {
746 goto error;
747 }
8b61709d
BP
748 }
749
8b61709d
BP
750 *netdevp = &netdev->netdev;
751 return 0;
752
753error:
149f577a 754 netdev_uninit(&netdev->netdev, true);
8b61709d
BP
755 return error;
756}
757
758/* Closes and destroys 'netdev'. */
759static void
760netdev_linux_close(struct netdev *netdev_)
761{
762 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
763
49a6a163 764 if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
5b7448ed 765 close(netdev->fd);
8b61709d
BP
766 }
767 free(netdev);
768}
e9e28be3 769
7b6b0ef4
BP
770static int
771netdev_linux_listen(struct netdev *netdev_)
772{
773 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
33d82a56
JP
774 struct netdev_dev_linux *netdev_dev =
775 netdev_dev_linux_cast(netdev_get_dev(netdev_));
7b6b0ef4
BP
776 struct sockaddr_ll sll;
777 int ifindex;
778 int error;
779 int fd;
780
781 if (netdev->fd >= 0) {
782 return 0;
783 }
784
33d82a56
JP
785 if (!strcmp(netdev_get_type(netdev_), "tap")
786 && !netdev_dev->state.tap.opened) {
787 netdev->fd = netdev_dev->state.tap.fd;
788 netdev_dev->state.tap.opened = true;
789 return 0;
790 }
791
7b6b0ef4
BP
792 /* Create file descriptor. */
793 fd = socket(PF_PACKET, SOCK_RAW, 0);
794 if (fd < 0) {
795 error = errno;
796 VLOG_ERR("failed to create raw socket (%s)", strerror(error));
797 goto error;
798 }
799
800 /* Set non-blocking mode. */
801 error = set_nonblocking(fd);
802 if (error) {
803 goto error;
804 }
805
806 /* Get ethernet device index. */
807 error = get_ifindex(&netdev->netdev, &ifindex);
808 if (error) {
809 goto error;
810 }
811
812 /* Bind to specific ethernet device. */
813 memset(&sll, 0, sizeof sll);
814 sll.sll_family = AF_PACKET;
815 sll.sll_ifindex = ifindex;
816 sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
817 if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
818 error = errno;
819 VLOG_ERR("%s: failed to bind raw socket (%s)",
820 netdev_get_name(netdev_), strerror(error));
821 goto error;
822 }
823
824 netdev->fd = fd;
825 return 0;
826
827error:
828 if (fd >= 0) {
829 close(fd);
830 }
831 return error;
832}
833
8b61709d
BP
834static int
835netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
836{
837 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
838
5b7448ed 839 if (netdev->fd < 0) {
7b6b0ef4 840 /* Device is not listening. */
c0e5f6ca 841 return -EAGAIN;
8b61709d
BP
842 }
843
844 for (;;) {
8e8cddf7
BP
845 ssize_t retval;
846
847 retval = (netdev_->netdev_dev->netdev_class == &netdev_tap_class
848 ? read(netdev->fd, data, size)
849 : recv(netdev->fd, data, size, MSG_TRUNC));
0e15264f
BP
850 if (retval >= 0) {
851 return retval <= size ? retval : -EMSGSIZE;
8b61709d
BP
852 } else if (errno != EINTR) {
853 if (errno != EAGAIN) {
854 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
855 strerror(errno), netdev_get_name(netdev_));
856 }
c0e5f6ca 857 return -errno;
8b61709d
BP
858 }
859 }
860}
861
862/* Registers with the poll loop to wake up from the next call to poll_block()
863 * when a packet is ready to be received with netdev_recv() on 'netdev'. */
864static void
865netdev_linux_recv_wait(struct netdev *netdev_)
866{
867 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed
JG
868 if (netdev->fd >= 0) {
869 poll_fd_wait(netdev->fd, POLLIN);
8b61709d
BP
870 }
871}
872
873/* Discards all packets waiting to be received from 'netdev'. */
874static int
875netdev_linux_drain(struct netdev *netdev_)
876{
877 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 878 if (netdev->fd < 0) {
8b61709d 879 return 0;
5b7448ed 880 } else if (!strcmp(netdev_get_type(netdev_), "tap")) {
8b61709d 881 struct ifreq ifr;
149f577a 882 int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
8b61709d
BP
883 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
884 if (error) {
885 return error;
886 }
5b7448ed 887 drain_fd(netdev->fd, ifr.ifr_qlen);
8b61709d
BP
888 return 0;
889 } else {
5b7448ed 890 return drain_rcvbuf(netdev->fd);
8b61709d
BP
891 }
892}
893
894/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
895 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
896 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
897 * the packet is too big or too small to transmit on the device.
898 *
899 * The caller retains ownership of 'buffer' in all cases.
900 *
901 * The kernel maintains a packet transmission queue, so the caller is not
902 * expected to do additional queuing of packets. */
903static int
904netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
905{
f23347ea
BP
906 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
907 for (;;) {
908 ssize_t retval;
8b61709d 909
f23347ea
BP
910 if (netdev->fd < 0) {
911 /* Use our AF_PACKET socket to send to this device. */
912 struct sockaddr_ll sll;
913 struct msghdr msg;
914 struct iovec iov;
915 int ifindex;
916 int error;
488d734d
BP
917 int sock;
918
919 sock = af_packet_sock();
920 if (sock < 0) {
c4c7a3d7 921 return -sock;
488d734d 922 }
f23347ea
BP
923
924 error = get_ifindex(netdev_, &ifindex);
925 if (error) {
926 return error;
927 }
8b61709d 928
f23347ea
BP
929 /* We don't bother setting most fields in sockaddr_ll because the
930 * kernel ignores them for SOCK_RAW. */
931 memset(&sll, 0, sizeof sll);
932 sll.sll_family = AF_PACKET;
933 sll.sll_ifindex = ifindex;
76c308b5 934
ebc56baa 935 iov.iov_base = CONST_CAST(void *, data);
f23347ea 936 iov.iov_len = size;
76c308b5 937
f23347ea
BP
938 msg.msg_name = &sll;
939 msg.msg_namelen = sizeof sll;
940 msg.msg_iov = &iov;
941 msg.msg_iovlen = 1;
942 msg.msg_control = NULL;
943 msg.msg_controllen = 0;
944 msg.msg_flags = 0;
945
488d734d 946 retval = sendmsg(sock, &msg, 0);
f23347ea
BP
947 } else {
948 /* Use the netdev's own fd to send to this device. This is
949 * essential for tap devices, because packets sent to a tap device
950 * with an AF_PACKET socket will loop back to be *received* again
951 * on the tap device. */
952 retval = write(netdev->fd, data, size);
953 }
76c308b5 954
8b61709d
BP
955 if (retval < 0) {
956 /* The Linux AF_PACKET implementation never blocks waiting for room
957 * for packets, instead returning ENOBUFS. Translate this into
958 * EAGAIN for the caller. */
959 if (errno == ENOBUFS) {
960 return EAGAIN;
961 } else if (errno == EINTR) {
962 continue;
963 } else if (errno != EAGAIN) {
964 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
965 netdev_get_name(netdev_), strerror(errno));
966 }
967 return errno;
968 } else if (retval != size) {
969 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
970 "%zu) on %s", retval, size, netdev_get_name(netdev_));
971 return EMSGSIZE;
972 } else {
973 return 0;
974 }
975 }
976}
977
978/* Registers with the poll loop to wake up from the next call to poll_block()
979 * when the packet transmission queue has sufficient room to transmit a packet
980 * with netdev_send().
981 *
982 * The kernel maintains a packet transmission queue, so the client is not
983 * expected to do additional queuing of packets. Thus, this function is
984 * unlikely to ever be used. It is included for completeness. */
985static void
986netdev_linux_send_wait(struct netdev *netdev_)
987{
988 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5b7448ed 989 if (netdev->fd < 0) {
8b61709d 990 /* Nothing to do. */
5b7448ed
JG
991 } else if (strcmp(netdev_get_type(netdev_), "tap")) {
992 poll_fd_wait(netdev->fd, POLLOUT);
8b61709d
BP
993 } else {
994 /* TAP device always accepts packets.*/
995 poll_immediate_wake();
996 }
997}
998
999/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1000 * otherwise a positive errno value. */
1001static int
1002netdev_linux_set_etheraddr(struct netdev *netdev_,
1003 const uint8_t mac[ETH_ADDR_LEN])
1004{
149f577a
JG
1005 struct netdev_dev_linux *netdev_dev =
1006 netdev_dev_linux_cast(netdev_get_dev(netdev_));
4b609110 1007 struct netdev_saved_flags *sf = NULL;
eb395f2e
BP
1008 int error;
1009
44445cac
PS
1010 if (netdev_dev->cache_valid & VALID_ETHERADDR) {
1011 if (netdev_dev->ether_addr_error) {
1012 return netdev_dev->ether_addr_error;
1013 }
1014 if (eth_addr_equals(netdev_dev->etheraddr, mac)) {
1015 return 0;
1016 }
1017 netdev_dev->cache_valid &= ~VALID_ETHERADDR;
1018 }
1019
7eb1bd81
JP
1020 /* Tap devices must be brought down before setting the address. */
1021 if (!strcmp(netdev_get_type(netdev_), "tap")) {
1022 enum netdev_flags flags;
1023
1024 if (!netdev_get_flags(netdev_, &flags) && (flags & NETDEV_UP)) {
4b609110 1025 netdev_turn_flags_off(netdev_, NETDEV_UP, &sf);
7eb1bd81
JP
1026 }
1027 }
44445cac
PS
1028 error = set_etheraddr(netdev_get_name(netdev_), mac);
1029 if (!error || error == ENODEV) {
1030 netdev_dev->ether_addr_error = error;
1031 netdev_dev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1032 if (!error) {
149f577a 1033 memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
eb395f2e 1034 }
8b61709d 1035 }
44445cac 1036
4b609110 1037 netdev_restore_flags(sf);
7eb1bd81 1038
8b61709d
BP
1039 return error;
1040}
1041
44445cac 1042/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d
BP
1043static int
1044netdev_linux_get_etheraddr(const struct netdev *netdev_,
1045 uint8_t mac[ETH_ADDR_LEN])
1046{
149f577a
JG
1047 struct netdev_dev_linux *netdev_dev =
1048 netdev_dev_linux_cast(netdev_get_dev(netdev_));
44445cac 1049
149f577a 1050 if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
8b61709d 1051 int error = get_etheraddr(netdev_get_name(netdev_),
149f577a 1052 netdev_dev->etheraddr);
44445cac
PS
1053
1054 netdev_dev->ether_addr_error = error;
149f577a 1055 netdev_dev->cache_valid |= VALID_ETHERADDR;
8b61709d 1056 }
44445cac
PS
1057
1058 if (!netdev_dev->ether_addr_error) {
1059 memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
1060 }
1061
1062 return netdev_dev->ether_addr_error;
8b61709d
BP
1063}
1064
1065/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1066 * in bytes, not including the hardware header; thus, this is typically 1500
1067 * bytes for Ethernet devices. */
1068static int
1069netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1070{
149f577a
JG
1071 struct netdev_dev_linux *netdev_dev =
1072 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1073 if (!(netdev_dev->cache_valid & VALID_MTU)) {
8b61709d
BP
1074 struct ifreq ifr;
1075 int error;
1076
149f577a
JG
1077 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1078 SIOCGIFMTU, "SIOCGIFMTU");
90a6637d
PS
1079
1080 netdev_dev->netdev_mtu_error = error;
149f577a
JG
1081 netdev_dev->mtu = ifr.ifr_mtu;
1082 netdev_dev->cache_valid |= VALID_MTU;
8b61709d 1083 }
90a6637d
PS
1084
1085 if (!netdev_dev->netdev_mtu_error) {
1086 *mtup = netdev_dev->mtu;
1087 }
1088 return netdev_dev->netdev_mtu_error;
8b61709d
BP
1089}
1090
9b020780
PS
1091/* Sets the maximum size of transmitted (MTU) for given device using linux
1092 * networking ioctl interface.
1093 */
1094static int
1095netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1096{
1097 struct netdev_dev_linux *netdev_dev =
1098 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1099 struct ifreq ifr;
1100 int error;
1101
90a6637d
PS
1102 if (netdev_dev->cache_valid & VALID_MTU) {
1103 if (netdev_dev->netdev_mtu_error) {
1104 return netdev_dev->netdev_mtu_error;
1105 }
1106 if (netdev_dev->mtu == mtu) {
1107 return 0;
1108 }
1109 netdev_dev->cache_valid &= ~VALID_MTU;
153e5481 1110 }
9b020780
PS
1111 ifr.ifr_mtu = mtu;
1112 error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1113 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d
PS
1114 if (!error || error == ENODEV) {
1115 netdev_dev->netdev_mtu_error = error;
1116 netdev_dev->mtu = ifr.ifr_mtu;
1117 netdev_dev->cache_valid |= VALID_MTU;
9b020780 1118 }
90a6637d 1119 return error;
9b020780
PS
1120}
1121
9ab3d9a3
BP
/* Looks up the ifindex of 'netdev'.  Returns it, as a positive number, if
 * successful.  On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev)
{
    int ifindex;
    int error = get_ifindex(netdev, &ifindex);

    if (error) {
        return -error;
    }
    return ifindex;
}
1132
8b61709d
BP
1133static int
1134netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1135{
149f577a
JG
1136 struct netdev_dev_linux *netdev_dev =
1137 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1138
1670c579
EJ
1139 if (netdev_dev->miimon_interval > 0) {
1140 *carrier = netdev_dev->miimon;
3a183124 1141 } else {
c37d4da4 1142 *carrier = (netdev_dev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1143 }
8b61709d 1144
3a183124 1145 return 0;
8b61709d
BP
1146}
1147
65c3058c
EJ
1148static long long int
1149netdev_linux_get_carrier_resets(const struct netdev *netdev)
1150{
1151 return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1152}
1153
63331829 1154static int
1670c579
EJ
1155netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1156 struct mii_ioctl_data *data)
63331829 1157{
63331829 1158 struct ifreq ifr;
782e6111 1159 int error;
63331829 1160
63331829 1161 memset(&ifr, 0, sizeof ifr);
782e6111 1162 memcpy(&ifr.ifr_data, data, sizeof *data);
1670c579 1163 error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1164 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1165
782e6111
EJ
1166 return error;
1167}
1168
1169static int
1670c579 1170netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1171{
782e6111
EJ
1172 struct mii_ioctl_data data;
1173 int error;
63331829 1174
782e6111
EJ
1175 *miimon = false;
1176
1177 memset(&data, 0, sizeof data);
1670c579 1178 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1179 if (!error) {
1180 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1181 data.reg_num = MII_BMSR;
1670c579 1182 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1183 &data);
63331829
EJ
1184
1185 if (!error) {
782e6111 1186 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829
EJ
1187 } else {
1188 VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1189 }
1190 } else {
1191 struct ethtool_cmd ecmd;
63331829
EJ
1192
1193 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1194 name);
1195
ab985a77 1196 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1197 memset(&ecmd, 0, sizeof ecmd);
1198 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1199 "ETHTOOL_GLINK");
1200 if (!error) {
782e6111
EJ
1201 struct ethtool_value eval;
1202
1203 memcpy(&eval, &ecmd, sizeof eval);
1204 *miimon = !!eval.data;
63331829
EJ
1205 } else {
1206 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1207 }
1208 }
1209
1210 return error;
1211}
1212
1670c579
EJ
1213static int
1214netdev_linux_set_miimon_interval(struct netdev *netdev_,
1215 long long int interval)
1216{
1217 struct netdev_dev_linux *netdev_dev;
1218
1219 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1220
1221 interval = interval > 0 ? MAX(interval, 100) : 0;
1222 if (netdev_dev->miimon_interval != interval) {
1223 netdev_dev->miimon_interval = interval;
1224 timer_set_expired(&netdev_dev->miimon_timer);
1225 }
1226
1227 return 0;
1228}
1229
1230static void
1231netdev_linux_miimon_run(void)
1232{
1233 struct shash device_shash;
1234 struct shash_node *node;
1235
1236 shash_init(&device_shash);
1237 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1238 SHASH_FOR_EACH (node, &device_shash) {
1239 struct netdev_dev_linux *dev = node->data;
1240 bool miimon;
1241
1242 if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1243 continue;
1244 }
1245
1246 netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1247 if (miimon != dev->miimon) {
1670c579 1248 dev->miimon = miimon;
4f925bd3 1249 netdev_dev_linux_changed(dev, dev->ifi_flags, 0);
1670c579
EJ
1250 }
1251
1252 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1253 }
1254
1255 shash_destroy(&device_shash);
1256}
1257
1258static void
1259netdev_linux_miimon_wait(void)
1260{
1261 struct shash device_shash;
1262 struct shash_node *node;
1263
1264 shash_init(&device_shash);
1265 netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1266 SHASH_FOR_EACH (node, &device_shash) {
1267 struct netdev_dev_linux *dev = node->data;
1268
1269 if (dev->miimon_interval > 0) {
1270 timer_wait(&dev->miimon_timer);
1271 }
1272 }
1273 shash_destroy(&device_shash);
1274}
1275
8b61709d
BP
1276/* Check whether we can we use RTM_GETLINK to get network device statistics.
1277 * In pre-2.6.19 kernels, this was only available if wireless extensions were
1278 * enabled. */
1279static bool
1280check_for_working_netlink_stats(void)
1281{
1282 /* Decide on the netdev_get_stats() implementation to use. Netlink is
1283 * preferable, so if that works, we'll use it. */
1284 int ifindex = do_get_ifindex("lo");
1285 if (ifindex < 0) {
1286 VLOG_WARN("failed to get ifindex for lo, "
1287 "obtaining netdev stats from proc");
1288 return false;
1289 } else {
1290 struct netdev_stats stats;
1291 int error = get_stats_via_netlink(ifindex, &stats);
1292 if (!error) {
1293 VLOG_DBG("obtaining netdev stats via rtnetlink");
1294 return true;
1295 } else {
1296 VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1297 "via proc (you are probably running a pre-2.6.19 "
1298 "kernel)", strerror(error));
1299 return false;
1300 }
1301 }
1302}
1303
92df599c
JG
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t saved_b = *b;

    *b = *a;
    *a = saved_b;
}
1311
c060c4cf
EJ
1312/* Copies 'src' into 'dst', performing format conversion in the process.
1313 *
1314 * 'src' is allowed to be misaligned. */
1315static void
1316netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1317 const struct ovs_vport_stats *src)
1318{
1319 dst->rx_packets = get_unaligned_u64(&src->rx_packets);
1320 dst->tx_packets = get_unaligned_u64(&src->tx_packets);
1321 dst->rx_bytes = get_unaligned_u64(&src->rx_bytes);
1322 dst->tx_bytes = get_unaligned_u64(&src->tx_bytes);
1323 dst->rx_errors = get_unaligned_u64(&src->rx_errors);
1324 dst->tx_errors = get_unaligned_u64(&src->tx_errors);
1325 dst->rx_dropped = get_unaligned_u64(&src->rx_dropped);
1326 dst->tx_dropped = get_unaligned_u64(&src->tx_dropped);
1327 dst->multicast = 0;
1328 dst->collisions = 0;
1329 dst->rx_length_errors = 0;
1330 dst->rx_over_errors = 0;
1331 dst->rx_crc_errors = 0;
1332 dst->rx_frame_errors = 0;
1333 dst->rx_fifo_errors = 0;
1334 dst->rx_missed_errors = 0;
1335 dst->tx_aborted_errors = 0;
1336 dst->tx_carrier_errors = 0;
1337 dst->tx_fifo_errors = 0;
1338 dst->tx_heartbeat_errors = 0;
1339 dst->tx_window_errors = 0;
1340}
1341
1342static int
1343get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1344{
1345 struct dpif_linux_vport reply;
1346 struct ofpbuf *buf;
1347 int error;
1348
1349 error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
1350 if (error) {
1351 return error;
1352 } else if (!reply.stats) {
1353 ofpbuf_delete(buf);
1354 return EOPNOTSUPP;
1355 }
1356
1357 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1358
1359 ofpbuf_delete(buf);
1360
1361 return 0;
1362}
1363
f613a0d7
PS
1364static void
1365get_stats_via_vport(const struct netdev *netdev_,
1366 struct netdev_stats *stats)
8b61709d 1367{
149f577a
JG
1368 struct netdev_dev_linux *netdev_dev =
1369 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d 1370
bba1e6f3
PS
1371 if (!netdev_dev->vport_stats_error ||
1372 !(netdev_dev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1373 int error;
7fbef77a 1374
c060c4cf 1375 error = get_stats_via_vport__(netdev_, stats);
bcb1f5a1 1376 if (error && error != ENOENT) {
a57a8488
BP
1377 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
1378 "(%s)", netdev_get_name(netdev_), strerror(error));
f613a0d7 1379 }
bba1e6f3
PS
1380 netdev_dev->vport_stats_error = error;
1381 netdev_dev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1382 }
f613a0d7 1383}
8b61709d 1384
f613a0d7
PS
1385static int
1386netdev_linux_sys_get_stats(const struct netdev *netdev_,
1387 struct netdev_stats *stats)
1388{
1389 static int use_netlink_stats = -1;
1390 int error;
1391
1392 if (use_netlink_stats < 0) {
1393 use_netlink_stats = check_for_working_netlink_stats();
1394 }
1395
1396 if (use_netlink_stats) {
1397 int ifindex;
1398
1399 error = get_ifindex(netdev_, &ifindex);
1400 if (!error) {
1401 error = get_stats_via_netlink(ifindex, stats);
7fbef77a 1402 }
f613a0d7
PS
1403 } else {
1404 error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1405 }
7fbef77a 1406
f613a0d7
PS
1407 if (error) {
1408 VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1409 netdev_get_name(netdev_), error);
1410 }
1411 return error;
1412
1413}
1414
1415/* Retrieves current device stats for 'netdev-linux'. */
1416static int
1417netdev_linux_get_stats(const struct netdev *netdev_,
1418 struct netdev_stats *stats)
1419{
1420 struct netdev_dev_linux *netdev_dev =
1421 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1422 struct netdev_stats dev_stats;
1423 int error;
1424
1425 get_stats_via_vport(netdev_, stats);
1426
1427 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1428
1429 if (error) {
bba1e6f3 1430 if (netdev_dev->vport_stats_error) {
f613a0d7 1431 return error;
7fbef77a 1432 } else {
f613a0d7
PS
1433 return 0;
1434 }
1435 }
1436
bba1e6f3 1437 if (netdev_dev->vport_stats_error) {
f613a0d7
PS
1438 /* stats not available from OVS then use ioctl stats. */
1439 *stats = dev_stats;
1440 } else {
1441 stats->rx_errors += dev_stats.rx_errors;
1442 stats->tx_errors += dev_stats.tx_errors;
1443 stats->rx_dropped += dev_stats.rx_dropped;
1444 stats->tx_dropped += dev_stats.tx_dropped;
1445 stats->multicast += dev_stats.multicast;
1446 stats->collisions += dev_stats.collisions;
1447 stats->rx_length_errors += dev_stats.rx_length_errors;
1448 stats->rx_over_errors += dev_stats.rx_over_errors;
1449 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1450 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1451 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1452 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1453 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1454 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1455 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1456 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1457 stats->tx_window_errors += dev_stats.tx_window_errors;
1458 }
1459 return 0;
1460}
1461
1462/* Retrieves current device stats for 'netdev-tap' netdev or
1463 * netdev-internal. */
1464static int
bba1e6f3 1465netdev_tap_get_stats(const struct netdev *netdev_,
f613a0d7
PS
1466 struct netdev_stats *stats)
1467{
1468 struct netdev_dev_linux *netdev_dev =
1469 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1470 struct netdev_stats dev_stats;
1471 int error;
1472
1473 get_stats_via_vport(netdev_, stats);
1474
1475 error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1476 if (error) {
bba1e6f3 1477 if (netdev_dev->vport_stats_error) {
f613a0d7
PS
1478 return error;
1479 } else {
1480 return 0;
8b61709d 1481 }
8b61709d 1482 }
fe6b0e03
JG
1483
1484 /* If this port is an internal port then the transmit and receive stats
1485 * will appear to be swapped relative to the other ports since we are the
1486 * one sending the data, not a remote computer. For consistency, we swap
7fbef77a
JG
1487 * them back here. This does not apply if we are getting stats from the
1488 * vport layer because it always tracks stats from the perspective of the
1489 * switch. */
bba1e6f3 1490 if (netdev_dev->vport_stats_error) {
f613a0d7 1491 *stats = dev_stats;
92df599c
JG
1492 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1493 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1494 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1495 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1496 stats->rx_length_errors = 0;
1497 stats->rx_over_errors = 0;
1498 stats->rx_crc_errors = 0;
1499 stats->rx_frame_errors = 0;
1500 stats->rx_fifo_errors = 0;
1501 stats->rx_missed_errors = 0;
1502 stats->tx_aborted_errors = 0;
1503 stats->tx_carrier_errors = 0;
1504 stats->tx_fifo_errors = 0;
1505 stats->tx_heartbeat_errors = 0;
1506 stats->tx_window_errors = 0;
f613a0d7
PS
1507 } else {
1508 stats->rx_dropped += dev_stats.tx_dropped;
1509 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1510
f613a0d7
PS
1511 stats->rx_errors += dev_stats.tx_errors;
1512 stats->tx_errors += dev_stats.rx_errors;
1513
1514 stats->multicast += dev_stats.multicast;
1515 stats->collisions += dev_stats.collisions;
1516 }
1517 return 0;
8b61709d
BP
1518}
1519
bba1e6f3
PS
1520static int
1521netdev_internal_get_stats(const struct netdev *netdev_,
1522 struct netdev_stats *stats)
1523{
1524 struct netdev_dev_linux *netdev_dev =
1525 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1526
1527 get_stats_via_vport(netdev_, stats);
1528 return netdev_dev->vport_stats_error;
1529}
1530
2f31a822
EJ
1531static int
1532netdev_internal_set_stats(struct netdev *netdev,
1533 const struct netdev_stats *stats)
1534{
1535 struct ovs_vport_stats vport_stats;
1536 struct dpif_linux_vport vport;
1537 int err;
1538
1539 vport_stats.rx_packets = stats->rx_packets;
1540 vport_stats.tx_packets = stats->tx_packets;
1541 vport_stats.rx_bytes = stats->rx_bytes;
1542 vport_stats.tx_bytes = stats->tx_bytes;
1543 vport_stats.rx_errors = stats->rx_errors;
1544 vport_stats.tx_errors = stats->tx_errors;
1545 vport_stats.rx_dropped = stats->rx_dropped;
1546 vport_stats.tx_dropped = stats->tx_dropped;
1547
1548 dpif_linux_vport_init(&vport);
1549 vport.cmd = OVS_VPORT_CMD_SET;
1550 vport.name = netdev_get_name(netdev);
1551 vport.stats = &vport_stats;
1552
1553 err = dpif_linux_vport_transact(&vport, NULL, NULL);
1554
1555 /* If the vport layer doesn't know about the device, that doesn't mean it
1556 * doesn't exist (after all were able to open it when netdev_open() was
1557 * called), it just means that it isn't attached and we'll be getting
1558 * stats a different way. */
1559 if (err == ENODEV) {
1560 err = EOPNOTSUPP;
1561 }
1562
1563 return err;
1564}
1565
51f87458
PS
1566static void
1567netdev_linux_read_features(struct netdev_dev_linux *netdev_dev)
8b61709d
BP
1568{
1569 struct ethtool_cmd ecmd;
6c038611 1570 uint32_t speed;
8b61709d
BP
1571 int error;
1572
51f87458
PS
1573 if (netdev_dev->cache_valid & VALID_FEATURES) {
1574 return;
1575 }
1576
ab985a77 1577 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1578 memset(&ecmd, 0, sizeof ecmd);
51f87458 1579 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name, &ecmd,
8b61709d
BP
1580 ETHTOOL_GSET, "ETHTOOL_GSET");
1581 if (error) {
51f87458 1582 goto out;
8b61709d
BP
1583 }
1584
1585 /* Supported features. */
51f87458 1586 netdev_dev->supported = 0;
8b61709d 1587 if (ecmd.supported & SUPPORTED_10baseT_Half) {
51f87458 1588 netdev_dev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1589 }
1590 if (ecmd.supported & SUPPORTED_10baseT_Full) {
51f87458 1591 netdev_dev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1592 }
1593 if (ecmd.supported & SUPPORTED_100baseT_Half) {
51f87458 1594 netdev_dev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1595 }
1596 if (ecmd.supported & SUPPORTED_100baseT_Full) {
51f87458 1597 netdev_dev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1598 }
1599 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
51f87458 1600 netdev_dev->supported |= NETDEV_F_1GB_HD;
8b61709d
BP
1601 }
1602 if (ecmd.supported & SUPPORTED_1000baseT_Full) {
51f87458 1603 netdev_dev->supported |= NETDEV_F_1GB_FD;
8b61709d
BP
1604 }
1605 if (ecmd.supported & SUPPORTED_10000baseT_Full) {
51f87458 1606 netdev_dev->supported |= NETDEV_F_10GB_FD;
8b61709d
BP
1607 }
1608 if (ecmd.supported & SUPPORTED_TP) {
51f87458 1609 netdev_dev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1610 }
1611 if (ecmd.supported & SUPPORTED_FIBRE) {
51f87458 1612 netdev_dev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1613 }
1614 if (ecmd.supported & SUPPORTED_Autoneg) {
51f87458 1615 netdev_dev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1616 }
1617 if (ecmd.supported & SUPPORTED_Pause) {
51f87458 1618 netdev_dev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1619 }
1620 if (ecmd.supported & SUPPORTED_Asym_Pause) {
51f87458 1621 netdev_dev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1622 }
1623
1624 /* Advertised features. */
51f87458 1625 netdev_dev->advertised = 0;
8b61709d 1626 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
51f87458 1627 netdev_dev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1628 }
1629 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
51f87458 1630 netdev_dev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1631 }
1632 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
51f87458 1633 netdev_dev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1634 }
1635 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
51f87458 1636 netdev_dev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1637 }
1638 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
51f87458 1639 netdev_dev->advertised |= NETDEV_F_1GB_HD;
8b61709d
BP
1640 }
1641 if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
51f87458 1642 netdev_dev->advertised |= NETDEV_F_1GB_FD;
8b61709d
BP
1643 }
1644 if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
51f87458 1645 netdev_dev->advertised |= NETDEV_F_10GB_FD;
8b61709d
BP
1646 }
1647 if (ecmd.advertising & ADVERTISED_TP) {
51f87458 1648 netdev_dev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1649 }
1650 if (ecmd.advertising & ADVERTISED_FIBRE) {
51f87458 1651 netdev_dev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1652 }
1653 if (ecmd.advertising & ADVERTISED_Autoneg) {
51f87458 1654 netdev_dev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1655 }
1656 if (ecmd.advertising & ADVERTISED_Pause) {
51f87458 1657 netdev_dev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1658 }
1659 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
51f87458 1660 netdev_dev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1661 }
1662
1663 /* Current settings. */
2a529ead 1664 speed = ecmd.speed;
6c038611 1665 if (speed == SPEED_10) {
51f87458 1666 netdev_dev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1667 } else if (speed == SPEED_100) {
51f87458 1668 netdev_dev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1669 } else if (speed == SPEED_1000) {
51f87458 1670 netdev_dev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1671 } else if (speed == SPEED_10000) {
51f87458 1672 netdev_dev->current = NETDEV_F_10GB_FD;
6c038611 1673 } else if (speed == 40000) {
51f87458 1674 netdev_dev->current = NETDEV_F_40GB_FD;
6c038611 1675 } else if (speed == 100000) {
51f87458 1676 netdev_dev->current = NETDEV_F_100GB_FD;
6c038611 1677 } else if (speed == 1000000) {
51f87458 1678 netdev_dev->current = NETDEV_F_1TB_FD;
8b61709d 1679 } else {
51f87458 1680 netdev_dev->current = 0;
8b61709d
BP
1681 }
1682
1683 if (ecmd.port == PORT_TP) {
51f87458 1684 netdev_dev->current |= NETDEV_F_COPPER;
8b61709d 1685 } else if (ecmd.port == PORT_FIBRE) {
51f87458 1686 netdev_dev->current |= NETDEV_F_FIBER;
8b61709d
BP
1687 }
1688
1689 if (ecmd.autoneg) {
51f87458 1690 netdev_dev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1691 }
1692
1693 /* Peer advertisements. */
51f87458 1694 netdev_dev->peer = 0; /* XXX */
8b61709d 1695
51f87458
PS
1696out:
1697 netdev_dev->cache_valid |= VALID_FEATURES;
1698 netdev_dev->get_features_error = error;
1699}
1700
1701/* Stores the features supported by 'netdev' into each of '*current',
1702 * '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1703 * bitmap of NETDEV_* bits. Returns 0 if successful, otherwise a positive
1704 * errno value. */
1705static int
1706netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1707 enum netdev_features *current,
1708 enum netdev_features *advertised,
1709 enum netdev_features *supported,
1710 enum netdev_features *peer)
51f87458
PS
1711{
1712 struct netdev_dev_linux *netdev_dev =
1713 netdev_dev_linux_cast(netdev_get_dev(netdev_));
1714
1715 netdev_linux_read_features(netdev_dev);
1716
1717 if (!netdev_dev->get_features_error) {
1718 *current = netdev_dev->current;
1719 *advertised = netdev_dev->advertised;
1720 *supported = netdev_dev->supported;
1721 *peer = netdev_dev->peer;
1722 }
1723 return netdev_dev->get_features_error;
8b61709d
BP
1724}
1725
1726/* Set the features advertised by 'netdev' to 'advertise'. */
1727static int
6c038611
BP
1728netdev_linux_set_advertisements(struct netdev *netdev,
1729 enum netdev_features advertise)
8b61709d
BP
1730{
1731 struct ethtool_cmd ecmd;
1732 int error;
1733
ab985a77 1734 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1735 memset(&ecmd, 0, sizeof ecmd);
0b0544d7 1736 error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1737 ETHTOOL_GSET, "ETHTOOL_GSET");
1738 if (error) {
1739 return error;
1740 }
1741
1742 ecmd.advertising = 0;
6c038611 1743 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
1744 ecmd.advertising |= ADVERTISED_10baseT_Half;
1745 }
6c038611 1746 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
1747 ecmd.advertising |= ADVERTISED_10baseT_Full;
1748 }
6c038611 1749 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
1750 ecmd.advertising |= ADVERTISED_100baseT_Half;
1751 }
6c038611 1752 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
1753 ecmd.advertising |= ADVERTISED_100baseT_Full;
1754 }
6c038611 1755 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
1756 ecmd.advertising |= ADVERTISED_1000baseT_Half;
1757 }
6c038611 1758 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
1759 ecmd.advertising |= ADVERTISED_1000baseT_Full;
1760 }
6c038611 1761 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
1762 ecmd.advertising |= ADVERTISED_10000baseT_Full;
1763 }
6c038611 1764 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
1765 ecmd.advertising |= ADVERTISED_TP;
1766 }
6c038611 1767 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
1768 ecmd.advertising |= ADVERTISED_FIBRE;
1769 }
6c038611 1770 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
1771 ecmd.advertising |= ADVERTISED_Autoneg;
1772 }
6c038611 1773 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
1774 ecmd.advertising |= ADVERTISED_Pause;
1775 }
6c038611 1776 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
1777 ecmd.advertising |= ADVERTISED_Asym_Pause;
1778 }
ab985a77 1779 COVERAGE_INC(netdev_set_ethtool);
0b0544d7 1780 return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
8b61709d
BP
1781 ETHTOOL_SSET, "ETHTOOL_SSET");
1782}
1783
f8500004
JP
1784/* Attempts to set input rate limiting (policing) policy. Returns 0 if
1785 * successful, otherwise a positive errno value. */
8b61709d
BP
1786static int
1787netdev_linux_set_policing(struct netdev *netdev,
1788 uint32_t kbits_rate, uint32_t kbits_burst)
1789{
80a86fbe
BP
1790 struct netdev_dev_linux *netdev_dev =
1791 netdev_dev_linux_cast(netdev_get_dev(netdev));
8b61709d 1792 const char *netdev_name = netdev_get_name(netdev);
f8500004 1793 int error;
8b61709d 1794
8e460221 1795
80a86fbe
BP
1796 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1797 : !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1798 : kbits_burst); /* Stick with user-specified value. */
1799
c9f71668
PS
1800 if (netdev_dev->cache_valid & VALID_POLICING) {
1801 if (netdev_dev->netdev_policing_error) {
1802 return netdev_dev->netdev_policing_error;
1803 }
1804
1805 if (netdev_dev->kbits_rate == kbits_rate &&
1806 netdev_dev->kbits_burst == kbits_burst) {
1807 /* Assume that settings haven't changed since we last set them. */
1808 return 0;
1809 }
1810 netdev_dev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
1811 }
1812
ac8c3412 1813 COVERAGE_INC(netdev_set_policing);
f8500004
JP
1814 /* Remove any existing ingress qdisc. */
1815 error = tc_add_del_ingress_qdisc(netdev, false);
1816 if (error) {
1817 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1818 netdev_name, strerror(error));
c9f71668 1819 goto out;
f8500004
JP
1820 }
1821
8b61709d 1822 if (kbits_rate) {
f8500004
JP
1823 error = tc_add_del_ingress_qdisc(netdev, true);
1824 if (error) {
1825 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
1826 netdev_name, strerror(error));
c9f71668 1827 goto out;
8b61709d
BP
1828 }
1829
f8500004
JP
1830 error = tc_add_policer(netdev, kbits_rate, kbits_burst);
1831 if (error){
1832 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
1833 netdev_name, strerror(error));
c9f71668 1834 goto out;
8b61709d 1835 }
8b61709d
BP
1836 }
1837
f8500004
JP
1838 netdev_dev->kbits_rate = kbits_rate;
1839 netdev_dev->kbits_burst = kbits_burst;
f8500004 1840
c9f71668
PS
1841out:
1842 if (!error || error == ENODEV) {
1843 netdev_dev->netdev_policing_error = error;
1844 netdev_dev->cache_valid |= VALID_POLICING;
1845 }
1846 return error;
8b61709d
BP
1847}
1848
c1c9c9c4
BP
1849static int
1850netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 1851 struct sset *types)
c1c9c9c4 1852{
559eb230 1853 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1854
1855 for (opsp = tcs; *opsp != NULL; opsp++) {
1856 const struct tc_ops *ops = *opsp;
1857 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 1858 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
1859 }
1860 }
1861 return 0;
1862}
1863
1864static const struct tc_ops *
1865tc_lookup_ovs_name(const char *name)
1866{
559eb230 1867 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1868
1869 for (opsp = tcs; *opsp != NULL; opsp++) {
1870 const struct tc_ops *ops = *opsp;
1871 if (!strcmp(name, ops->ovs_name)) {
1872 return ops;
1873 }
1874 }
1875 return NULL;
1876}
1877
1878static const struct tc_ops *
1879tc_lookup_linux_name(const char *name)
1880{
559eb230 1881 const struct tc_ops *const *opsp;
c1c9c9c4
BP
1882
1883 for (opsp = tcs; *opsp != NULL; opsp++) {
1884 const struct tc_ops *ops = *opsp;
1885 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1886 return ops;
1887 }
1888 }
1889 return NULL;
1890}
1891
93b13be8
BP
1892static struct tc_queue *
1893tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1894 size_t hash)
1895{
1896 struct netdev_dev_linux *netdev_dev =
1897 netdev_dev_linux_cast(netdev_get_dev(netdev));
1898 struct tc_queue *queue;
1899
4e8e4213 1900 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
93b13be8
BP
1901 if (queue->queue_id == queue_id) {
1902 return queue;
1903 }
1904 }
1905 return NULL;
1906}
1907
/* Returns the queue numbered 'queue_id' on 'netdev', or NULL if there is
 * none.  The bucket is selected with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
1913
c1c9c9c4
BP
1914static int
1915netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1916 const char *type,
1917 struct netdev_qos_capabilities *caps)
1918{
1919 const struct tc_ops *ops = tc_lookup_ovs_name(type);
1920 if (!ops) {
1921 return EOPNOTSUPP;
1922 }
1923 caps->n_queues = ops->n_queues;
1924 return 0;
1925}
1926
1927static int
1928netdev_linux_get_qos(const struct netdev *netdev,
79f1cbe9 1929 const char **typep, struct smap *details)
c1c9c9c4
BP
1930{
1931 struct netdev_dev_linux *netdev_dev =
1932 netdev_dev_linux_cast(netdev_get_dev(netdev));
1933 int error;
1934
1935 error = tc_query_qdisc(netdev);
1936 if (error) {
1937 return error;
1938 }
1939
1940 *typep = netdev_dev->tc->ops->ovs_name;
1941 return (netdev_dev->tc->ops->qdisc_get
1942 ? netdev_dev->tc->ops->qdisc_get(netdev, details)
1943 : 0);
1944}
1945
1946static int
1947netdev_linux_set_qos(struct netdev *netdev,
79f1cbe9 1948 const char *type, const struct smap *details)
c1c9c9c4
BP
1949{
1950 struct netdev_dev_linux *netdev_dev =
1951 netdev_dev_linux_cast(netdev_get_dev(netdev));
1952 const struct tc_ops *new_ops;
1953 int error;
1954
1955 new_ops = tc_lookup_ovs_name(type);
1956 if (!new_ops || !new_ops->tc_install) {
1957 return EOPNOTSUPP;
1958 }
1959
1960 error = tc_query_qdisc(netdev);
1961 if (error) {
1962 return error;
1963 }
1964
1965 if (new_ops == netdev_dev->tc->ops) {
1966 return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1967 } else {
1968 /* Delete existing qdisc. */
1969 error = tc_del_qdisc(netdev);
1970 if (error) {
1971 return error;
1972 }
cb22974d 1973 ovs_assert(netdev_dev->tc == NULL);
c1c9c9c4
BP
1974
1975 /* Install new qdisc. */
1976 error = new_ops->tc_install(netdev, details);
cb22974d 1977 ovs_assert((error == 0) == (netdev_dev->tc != NULL));
c1c9c9c4
BP
1978
1979 return error;
1980 }
1981}
1982
1983static int
1984netdev_linux_get_queue(const struct netdev *netdev,
79f1cbe9 1985 unsigned int queue_id, struct smap *details)
c1c9c9c4
BP
1986{
1987 struct netdev_dev_linux *netdev_dev =
1988 netdev_dev_linux_cast(netdev_get_dev(netdev));
1989 int error;
1990
1991 error = tc_query_qdisc(netdev);
1992 if (error) {
1993 return error;
93b13be8
BP
1994 } else {
1995 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1996 return (queue
1997 ? netdev_dev->tc->ops->class_get(netdev, queue, details)
1998 : ENOENT);
c1c9c9c4 1999 }
c1c9c9c4
BP
2000}
2001
2002static int
2003netdev_linux_set_queue(struct netdev *netdev,
79f1cbe9 2004 unsigned int queue_id, const struct smap *details)
c1c9c9c4
BP
2005{
2006 struct netdev_dev_linux *netdev_dev =
2007 netdev_dev_linux_cast(netdev_get_dev(netdev));
2008 int error;
2009
2010 error = tc_query_qdisc(netdev);
2011 if (error) {
2012 return error;
2013 } else if (queue_id >= netdev_dev->tc->ops->n_queues
2014 || !netdev_dev->tc->ops->class_set) {
2015 return EINVAL;
2016 }
2017
2018 return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
2019}
2020
2021static int
2022netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
2023{
2024 struct netdev_dev_linux *netdev_dev =
2025 netdev_dev_linux_cast(netdev_get_dev(netdev));
2026 int error;
2027
2028 error = tc_query_qdisc(netdev);
2029 if (error) {
2030 return error;
2031 } else if (!netdev_dev->tc->ops->class_delete) {
2032 return EINVAL;
93b13be8
BP
2033 } else {
2034 struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2035 return (queue
2036 ? netdev_dev->tc->ops->class_delete(netdev, queue)
2037 : ENOENT);
c1c9c9c4 2038 }
c1c9c9c4
BP
2039}
2040
2041static int
2042netdev_linux_get_queue_stats(const struct netdev *netdev,
2043 unsigned int queue_id,
2044 struct netdev_queue_stats *stats)
2045{
2046 struct netdev_dev_linux *netdev_dev =
2047 netdev_dev_linux_cast(netdev_get_dev(netdev));
2048 int error;
2049
2050 error = tc_query_qdisc(netdev);
2051 if (error) {
2052 return error;
c1c9c9c4
BP
2053 } else if (!netdev_dev->tc->ops->class_get_stats) {
2054 return EOPNOTSUPP;
93b13be8
BP
2055 } else {
2056 const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
2057 return (queue
2058 ? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
2059 : ENOENT);
c1c9c9c4 2060 }
c1c9c9c4
BP
2061}
2062
23a98ffe 2063static bool
c1c9c9c4
BP
2064start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
2065{
2066 struct ofpbuf request;
2067 struct tcmsg *tcmsg;
2068
2069 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2070 if (!tcmsg) {
2071 return false;
2072 }
3c4de644 2073 tcmsg->tcm_parent = 0;
c1c9c9c4
BP
2074 nl_dump_start(dump, rtnl_sock, &request);
2075 ofpbuf_uninit(&request);
23a98ffe 2076 return true;
c1c9c9c4
BP
2077}
2078
2079static int
2080netdev_linux_dump_queues(const struct netdev *netdev,
2081 netdev_dump_queues_cb *cb, void *aux)
2082{
2083 struct netdev_dev_linux *netdev_dev =
2084 netdev_dev_linux_cast(netdev_get_dev(netdev));
f486e840 2085 struct tc_queue *queue, *next_queue;
79f1cbe9 2086 struct smap details;
c1c9c9c4 2087 int last_error;
c1c9c9c4
BP
2088 int error;
2089
2090 error = tc_query_qdisc(netdev);
2091 if (error) {
2092 return error;
2093 } else if (!netdev_dev->tc->ops->class_get) {
2094 return EOPNOTSUPP;
2095 }
2096
2097 last_error = 0;
79f1cbe9 2098 smap_init(&details);
f486e840
BP
2099 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
2100 &netdev_dev->tc->queues) {
79f1cbe9 2101 smap_clear(&details);
c1c9c9c4 2102
93b13be8 2103 error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
c1c9c9c4 2104 if (!error) {
93b13be8 2105 (*cb)(queue->queue_id, &details, aux);
c1c9c9c4
BP
2106 } else {
2107 last_error = error;
2108 }
2109 }
79f1cbe9 2110 smap_destroy(&details);
c1c9c9c4
BP
2111
2112 return last_error;
2113}
2114
2115static int
2116netdev_linux_dump_queue_stats(const struct netdev *netdev,
2117 netdev_dump_queue_stats_cb *cb, void *aux)
2118{
2119 struct netdev_dev_linux *netdev_dev =
2120 netdev_dev_linux_cast(netdev_get_dev(netdev));
2121 struct nl_dump dump;
2122 struct ofpbuf msg;
2123 int last_error;
2124 int error;
2125
2126 error = tc_query_qdisc(netdev);
2127 if (error) {
2128 return error;
2129 } else if (!netdev_dev->tc->ops->class_dump_stats) {
2130 return EOPNOTSUPP;
2131 }
2132
2133 last_error = 0;
23a98ffe
BP
2134 if (!start_queue_dump(netdev, &dump)) {
2135 return ENODEV;
2136 }
c1c9c9c4
BP
2137 while (nl_dump_next(&dump, &msg)) {
2138 error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
2139 if (error) {
2140 last_error = error;
2141 }
2142 }
2143
2144 error = nl_dump_done(&dump);
2145 return error ? error : last_error;
2146}
2147
8b61709d 2148static int
f1acd62b
BP
2149netdev_linux_get_in4(const struct netdev *netdev_,
2150 struct in_addr *address, struct in_addr *netmask)
8b61709d 2151{
149f577a
JG
2152 struct netdev_dev_linux *netdev_dev =
2153 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2154
2155 if (!(netdev_dev->cache_valid & VALID_IN4)) {
8b61709d
BP
2156 int error;
2157
149f577a 2158 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
8b61709d
BP
2159 SIOCGIFADDR, "SIOCGIFADDR");
2160 if (error) {
2161 return error;
2162 }
2163
149f577a 2164 error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
f1acd62b
BP
2165 SIOCGIFNETMASK, "SIOCGIFNETMASK");
2166 if (error) {
2167 return error;
2168 }
2169
149f577a 2170 netdev_dev->cache_valid |= VALID_IN4;
8b61709d 2171 }
149f577a
JG
2172 *address = netdev_dev->address;
2173 *netmask = netdev_dev->netmask;
f1acd62b 2174 return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
8b61709d
BP
2175}
2176
8b61709d 2177static int
f1acd62b
BP
2178netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2179 struct in_addr netmask)
8b61709d 2180{
149f577a
JG
2181 struct netdev_dev_linux *netdev_dev =
2182 netdev_dev_linux_cast(netdev_get_dev(netdev_));
8b61709d
BP
2183 int error;
2184
f1acd62b 2185 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2186 if (!error) {
149f577a
JG
2187 netdev_dev->cache_valid |= VALID_IN4;
2188 netdev_dev->address = address;
2189 netdev_dev->netmask = netmask;
f1acd62b 2190 if (address.s_addr != INADDR_ANY) {
8b61709d 2191 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2192 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2193 }
2194 }
2195 return error;
2196}
2197
/* Parses 'line', one line in the format of /proc/net/if_inet6: 32 hex digits
 * of IPv6 address, four hex fields that are skipped, and an interface name.
 * Stores the 16 address bytes in '*in6' and the name (at most 16 characters)
 * in 'ifname'.
 *
 * Returns true if all 17 fields were parsed, false otherwise (in which case
 * the outputs may have been partially written). */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
    bool ok;

#define X8 "%2"SCNx8
    ok = sscanf(line,
                " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                "%*x %*x %*x %*x %16s\n",
                &s6[0], &s6[1], &s6[2], &s6[3],
                &s6[4], &s6[5], &s6[6], &s6[7],
                &s6[8], &s6[9], &s6[10], &s6[11],
                &s6[12], &s6[13], &s6[14], &s6[15],
                ifname) == 17;
    /* Keep the helper macro from leaking into the rest of the file. */
#undef X8
    return ok;
}
2213
2214/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2215 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2216static int
2217netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2218{
149f577a
JG
2219 struct netdev_dev_linux *netdev_dev =
2220 netdev_dev_linux_cast(netdev_get_dev(netdev_));
2221 if (!(netdev_dev->cache_valid & VALID_IN6)) {
8b61709d
BP
2222 FILE *file;
2223 char line[128];
2224
149f577a 2225 netdev_dev->in6 = in6addr_any;
8b61709d
BP
2226
2227 file = fopen("/proc/net/if_inet6", "r");
2228 if (file != NULL) {
2229 const char *name = netdev_get_name(netdev_);
2230 while (fgets(line, sizeof line, file)) {
2a022368 2231 struct in6_addr in6_tmp;
8b61709d 2232 char ifname[16 + 1];
2a022368 2233 if (parse_if_inet6_line(line, &in6_tmp, ifname)
8b61709d
BP
2234 && !strcmp(name, ifname))
2235 {
2a022368 2236 netdev_dev->in6 = in6_tmp;
8b61709d
BP
2237 break;
2238 }
2239 }
2240 fclose(file);
2241 }
149f577a 2242 netdev_dev->cache_valid |= VALID_IN6;
8b61709d 2243 }
149f577a 2244 *in6 = netdev_dev->in6;
8b61709d
BP
2245 return 0;
2246}
2247
/* Fills '*sa' with an AF_INET socket address for 'addr' with port 0.  '*sa'
 * is zeroed first, then a 'struct sockaddr_in' is copied over its start. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memcpy(sa, &in4, sizeof in4);
}
2260
2261static int
2262do_set_addr(struct netdev *netdev,
2263 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2264{
2265 struct ifreq ifr;
71d7c22f 2266 ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
8b61709d 2267 make_in4_sockaddr(&ifr.ifr_addr, addr);
149f577a
JG
2268
2269 return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2270 ioctl_name);
8b61709d
BP
2271}
2272
2273/* Adds 'router' as a default IP gateway. */
2274static int
67a4917b 2275netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2276{
2277 struct in_addr any = { INADDR_ANY };
2278 struct rtentry rt;
2279 int error;
2280
2281 memset(&rt, 0, sizeof rt);
2282 make_in4_sockaddr(&rt.rt_dst, any);
2283 make_in4_sockaddr(&rt.rt_gateway, router);
2284 make_in4_sockaddr(&rt.rt_genmask, any);
2285 rt.rt_flags = RTF_UP | RTF_GATEWAY;
8b61709d
BP
2286 error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2287 if (error) {
2288 VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2289 }
2290 return error;
2291}
2292
f1acd62b
BP
2293static int
2294netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2295 char **netdev_name)
2296{
2297 static const char fn[] = "/proc/net/route";
2298 FILE *stream;
2299 char line[256];
2300 int ln;
2301
2302 *netdev_name = NULL;
2303 stream = fopen(fn, "r");
2304 if (stream == NULL) {
2305 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2306 return errno;
2307 }
2308
2309 ln = 0;
2310 while (fgets(line, sizeof line, stream)) {
2311 if (++ln >= 2) {
2312 char iface[17];
dbba996b 2313 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2314 int refcnt, metric, mtu;
2315 unsigned int flags, use, window, irtt;
2316
2317 if (sscanf(line,
2318 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2319 " %d %u %u\n",
2320 iface, &dest, &gateway, &flags, &refcnt,
2321 &use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2322
d295e8e9 2323 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2324 fn, ln, line);
2325 continue;
2326 }
2327 if (!(flags & RTF_UP)) {
2328 /* Skip routes that aren't up. */
2329 continue;
2330 }
2331
2332 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2333 * network byte order, so we don't need need any endian
f1acd62b
BP
2334 * conversions here. */
2335 if ((dest & mask) == (host->s_addr & mask)) {
2336 if (!gateway) {
2337 /* The host is directly reachable. */
2338 next_hop->s_addr = 0;
2339 } else {
2340 /* To reach the host, we must go through a gateway. */
2341 next_hop->s_addr = gateway;
2342 }
2343 *netdev_name = xstrdup(iface);
2344 fclose(stream);
2345 return 0;
2346 }
2347 }
2348 }
2349
2350 fclose(stream);
2351 return ENXIO;
2352}
2353
e210037e 2354static int
275707c3 2355netdev_linux_get_status(const struct netdev *netdev, struct smap *smap)
e210037e 2356{
275707c3
EJ
2357 struct netdev_dev_linux *netdev_dev;
2358 int error = 0;
2359
2360 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2361 if (!(netdev_dev->cache_valid & VALID_DRVINFO)) {
2362 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev_dev->drvinfo;
2363
2364 COVERAGE_INC(netdev_get_ethtool);
2365 memset(&netdev_dev->drvinfo, 0, sizeof netdev_dev->drvinfo);
2366 error = netdev_linux_do_ethtool(netdev_dev->netdev_dev.name,
2367 cmd,
2368 ETHTOOL_GDRVINFO,
2369 "ETHTOOL_GDRVINFO");
2370 if (!error) {
2371 netdev_dev->cache_valid |= VALID_DRVINFO;
2372 }
2373 }
e210037e 2374
e210037e 2375 if (!error) {
79f1cbe9
EJ
2376 smap_add(smap, "driver_name", netdev_dev->drvinfo.driver);
2377 smap_add(smap, "driver_version", netdev_dev->drvinfo.version);
2378 smap_add(smap, "firmware_version", netdev_dev->drvinfo.fw_version);
e210037e 2379 }
e210037e
AE
2380 return error;
2381}
2382
4f925bd3 2383static int
275707c3
EJ
2384netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2385 struct smap *smap)
4f925bd3 2386{
79f1cbe9 2387 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2388 return 0;
2389}
2390
8b61709d
BP
2391/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2392 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2393 * returns 0. Otherwise, it returns a positive errno value; in particular,
2394 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2395static int
2396netdev_linux_arp_lookup(const struct netdev *netdev,
dbba996b 2397 ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
8b61709d
BP
2398{
2399 struct arpreq r;
c100e025 2400 struct sockaddr_in sin;
8b61709d
BP
2401 int retval;
2402
2403 memset(&r, 0, sizeof r);
f2cc621b 2404 memset(&sin, 0, sizeof sin);
c100e025
BP
2405 sin.sin_family = AF_INET;
2406 sin.sin_addr.s_addr = ip;
2407 sin.sin_port = 0;
2408 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2409 r.arp_ha.sa_family = ARPHRD_ETHER;
2410 r.arp_flags = 0;
71d7c22f 2411 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d
BP
2412 COVERAGE_INC(netdev_arp_lookup);
2413 retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2414 if (!retval) {
2415 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2416 } else if (retval != ENXIO) {
2417 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
ed36537e 2418 netdev_get_name(netdev), IP_ARGS(ip), strerror(retval));
8b61709d
BP
2419 }
2420 return retval;
2421}
2422
2423static int
2424nd_to_iff_flags(enum netdev_flags nd)
2425{
2426 int iff = 0;
2427 if (nd & NETDEV_UP) {
2428 iff |= IFF_UP;
2429 }
2430 if (nd & NETDEV_PROMISC) {
2431 iff |= IFF_PROMISC;
2432 }
2433 return iff;
2434}
2435
2436static int
2437iff_to_nd_flags(int iff)
2438{
2439 enum netdev_flags nd = 0;
2440 if (iff & IFF_UP) {
2441 nd |= NETDEV_UP;
2442 }
2443 if (iff & IFF_PROMISC) {
2444 nd |= NETDEV_PROMISC;
2445 }
2446 return nd;
2447}
2448
2449static int
4b609110 2450netdev_linux_update_flags(struct netdev_dev *dev_, enum netdev_flags off,
8b61709d
BP
2451 enum netdev_flags on, enum netdev_flags *old_flagsp)
2452{
c37d4da4 2453 struct netdev_dev_linux *netdev_dev;
8b61709d 2454 int old_flags, new_flags;
c37d4da4
EJ
2455 int error = 0;
2456
4b609110 2457 netdev_dev = netdev_dev_linux_cast(dev_);
c37d4da4
EJ
2458 old_flags = netdev_dev->ifi_flags;
2459 *old_flagsp = iff_to_nd_flags(old_flags);
2460 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2461 if (new_flags != old_flags) {
4b609110 2462 error = set_flags(netdev_dev_get_name(dev_), new_flags);
c37d4da4 2463 get_flags(&netdev_dev->netdev_dev, &netdev_dev->ifi_flags);
8b61709d
BP
2464 }
2465 return error;
2466}
2467
ac4d3bcb
EJ
2468static unsigned int
2469netdev_linux_change_seq(const struct netdev *netdev)
2470{
2471 return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2472}
2473
/* Expands to a positional initializer for a 'struct netdev_class'.  Only the
 * class name, the create function, the get/set stats functions, the
 * get-features function, and the get-status function vary between classes;
 * every other slot is shared.  The entries are positional, so their order
 * must not be changed. */
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS,      \
                           GET_FEATURES, GET_STATUS)                \
{                                                                   \
    NAME,                                                           \
                                                                    \
    netdev_linux_init,                                              \
    netdev_linux_run,                                               \
    netdev_linux_wait,                                              \
                                                                    \
    CREATE,                                                         \
    netdev_linux_destroy,                                           \
    NULL,                       /* get_config */                    \
    NULL,                       /* set_config */                    \
    NULL,                       /* get_tunnel_config */             \
                                                                    \
    netdev_linux_open,                                              \
    netdev_linux_close,                                             \
                                                                    \
    netdev_linux_listen,                                            \
    netdev_linux_recv,                                              \
    netdev_linux_recv_wait,                                         \
    netdev_linux_drain,                                             \
                                                                    \
    netdev_linux_send,                                              \
    netdev_linux_send_wait,                                         \
                                                                    \
    netdev_linux_set_etheraddr,                                     \
    netdev_linux_get_etheraddr,                                     \
    netdev_linux_get_mtu,                                           \
    netdev_linux_set_mtu,                                           \
    netdev_linux_get_ifindex,                                       \
    netdev_linux_get_carrier,                                       \
    netdev_linux_get_carrier_resets,                                \
    netdev_linux_set_miimon_interval,                               \
    GET_STATS,                                                      \
    SET_STATS,                                                      \
                                                                    \
    GET_FEATURES,                                                   \
    netdev_linux_set_advertisements,                                \
                                                                    \
    netdev_linux_set_policing,                                      \
    netdev_linux_get_qos_types,                                     \
    netdev_linux_get_qos_capabilities,                              \
    netdev_linux_get_qos,                                           \
    netdev_linux_set_qos,                                           \
    netdev_linux_get_queue,                                         \
    netdev_linux_set_queue,                                         \
    netdev_linux_delete_queue,                                      \
    netdev_linux_get_queue_stats,                                   \
    netdev_linux_dump_queues,                                       \
    netdev_linux_dump_queue_stats,                                  \
                                                                    \
    netdev_linux_get_in4,                                           \
    netdev_linux_set_in4,                                           \
    netdev_linux_get_in6,                                           \
    netdev_linux_add_router,                                        \
    netdev_linux_get_next_hop,                                      \
    GET_STATUS,                                                     \
    netdev_linux_arp_lookup,                                        \
                                                                    \
    netdev_linux_update_flags,                                      \
                                                                    \
    netdev_linux_change_seq                                         \
}
2538
2539const struct netdev_class netdev_linux_class =
2540 NETDEV_LINUX_CLASS(
2541 "system",
2542 netdev_linux_create,
f613a0d7 2543 netdev_linux_get_stats,
4f925bd3 2544 NULL, /* set_stats */
51f87458 2545 netdev_linux_get_features,
275707c3 2546 netdev_linux_get_status);
c3827f61
BP
2547
2548const struct netdev_class netdev_tap_class =
2549 NETDEV_LINUX_CLASS(
2550 "tap",
2551 netdev_linux_create_tap,
bba1e6f3 2552 netdev_tap_get_stats,
4f925bd3 2553 NULL, /* set_stats */
51f87458 2554 netdev_linux_get_features,
275707c3 2555 netdev_linux_get_status);
c3827f61
BP
2556
2557const struct netdev_class netdev_internal_class =
2558 NETDEV_LINUX_CLASS(
2559 "internal",
2560 netdev_linux_create,
bba1e6f3 2561 netdev_internal_get_stats,
2f31a822 2562 netdev_internal_set_stats,
51f87458 2563 NULL, /* get_features */
275707c3 2564 netdev_internal_get_status);
8b61709d 2565\f
c1c9c9c4 2566/* HTB traffic control class. */
559843ed 2567
c1c9c9c4 2568#define HTB_N_QUEUES 0xf000
8b61709d 2569
c1c9c9c4
BP
2570struct htb {
2571 struct tc tc;
2572 unsigned int max_rate; /* In bytes/s. */
2573};
8b61709d 2574
c1c9c9c4 2575struct htb_class {
93b13be8 2576 struct tc_queue tc_queue;
c1c9c9c4
BP
2577 unsigned int min_rate; /* In bytes/s. */
2578 unsigned int max_rate; /* In bytes/s. */
2579 unsigned int burst; /* In bytes. */
2580 unsigned int priority; /* Lower values are higher priorities. */
2581};
8b61709d 2582
c1c9c9c4
BP
2583static struct htb *
2584htb_get__(const struct netdev *netdev)
2585{
2586 struct netdev_dev_linux *netdev_dev =
2587 netdev_dev_linux_cast(netdev_get_dev(netdev));
2588 return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2589}
2590
24045e35 2591static void
c1c9c9c4
BP
2592htb_install__(struct netdev *netdev, uint64_t max_rate)
2593{
2594 struct netdev_dev_linux *netdev_dev =
2595 netdev_dev_linux_cast(netdev_get_dev(netdev));
2596 struct htb *htb;
2597
2598 htb = xmalloc(sizeof *htb);
2599 tc_init(&htb->tc, &tc_ops_htb);
2600 htb->max_rate = max_rate;
2601
2602 netdev_dev->tc = &htb->tc;
c1c9c9c4
BP
2603}
2604
2605/* Create an HTB qdisc.
2606 *
a339aa81 2607 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
2608static int
2609htb_setup_qdisc__(struct netdev *netdev)
2610{
2611 size_t opt_offset;
2612 struct tc_htb_glob opt;
2613 struct ofpbuf request;
2614 struct tcmsg *tcmsg;
2615
2616 tc_del_qdisc(netdev);
2617
2618 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2619 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
2620 if (!tcmsg) {
2621 return ENODEV;
2622 }
c1c9c9c4
BP
2623 tcmsg->tcm_handle = tc_make_handle(1, 0);
2624 tcmsg->tcm_parent = TC_H_ROOT;
2625
2626 nl_msg_put_string(&request, TCA_KIND, "htb");
2627
2628 memset(&opt, 0, sizeof opt);
2629 opt.rate2quantum = 10;
2630 opt.version = 3;
4ecf12d5 2631 opt.defcls = 1;
c1c9c9c4
BP
2632
2633 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2634 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2635 nl_msg_end_nested(&request, opt_offset);
2636
2637 return tc_transact(&request, NULL);
2638}
2639
2640/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2641 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2642static int
2643htb_setup_class__(struct netdev *netdev, unsigned int handle,
2644 unsigned int parent, struct htb_class *class)
2645{
2646 size_t opt_offset;
2647 struct tc_htb_opt opt;
2648 struct ofpbuf request;
2649 struct tcmsg *tcmsg;
2650 int error;
2651 int mtu;
2652
9b020780
PS
2653 error = netdev_get_mtu(netdev, &mtu);
2654 if (error) {
f915f1a8
BP
2655 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2656 netdev_get_name(netdev));
9b020780 2657 return error;
f915f1a8 2658 }
c1c9c9c4
BP
2659
2660 memset(&opt, 0, sizeof opt);
2661 tc_fill_rate(&opt.rate, class->min_rate, mtu);
2662 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2663 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2664 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2665 opt.prio = class->priority;
2666
2667 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
2668 if (!tcmsg) {
2669 return ENODEV;
2670 }
c1c9c9c4
BP
2671 tcmsg->tcm_handle = handle;
2672 tcmsg->tcm_parent = parent;
2673
2674 nl_msg_put_string(&request, TCA_KIND, "htb");
2675 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2676 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2677 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2678 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2679 nl_msg_end_nested(&request, opt_offset);
2680
2681 error = tc_transact(&request, NULL);
2682 if (error) {
2683 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2684 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2685 netdev_get_name(netdev),
2686 tc_get_major(handle), tc_get_minor(handle),
2687 tc_get_major(parent), tc_get_minor(parent),
2688 class->min_rate, class->max_rate,
2689 class->burst, class->priority, strerror(error));
2690 }
2691 return error;
2692}
2693
2694/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2695 * description of them into 'details'. The description complies with the
2696 * specification given in the vswitch database documentation for linux-htb
2697 * queue details. */
2698static int
2699htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2700{
2701 static const struct nl_policy tca_htb_policy[] = {
2702 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2703 .min_len = sizeof(struct tc_htb_opt) },
2704 };
2705
2706 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2707 const struct tc_htb_opt *htb;
2708
2709 if (!nl_parse_nested(nl_options, tca_htb_policy,
2710 attrs, ARRAY_SIZE(tca_htb_policy))) {
2711 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2712 return EPROTO;
2713 }
2714
2715 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2716 class->min_rate = htb->rate.rate;
2717 class->max_rate = htb->ceil.rate;
2718 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2719 class->priority = htb->prio;
2720 return 0;
2721}
2722
2723static int
2724htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2725 struct htb_class *options,
2726 struct netdev_queue_stats *stats)
2727{
2728 struct nlattr *nl_options;
2729 unsigned int handle;
2730 int error;
2731
2732 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2733 if (!error && queue_id) {
17ee3c1f
BP
2734 unsigned int major = tc_get_major(handle);
2735 unsigned int minor = tc_get_minor(handle);
2736 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2737 *queue_id = minor - 1;
c1c9c9c4
BP
2738 } else {
2739 error = EPROTO;
2740 }
2741 }
2742 if (!error && options) {
2743 error = htb_parse_tca_options__(nl_options, options);
2744 }
2745 return error;
2746}
2747
2748static void
2749htb_parse_qdisc_details__(struct netdev *netdev,
79f1cbe9 2750 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2751{
2752 const char *max_rate_s;
2753
79f1cbe9 2754 max_rate_s = smap_get(details, "max-rate");
c1c9c9c4
BP
2755 hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2756 if (!hc->max_rate) {
a00ca915 2757 enum netdev_features current;
c1c9c9c4
BP
2758
2759 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 2760 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
2761 }
2762 hc->min_rate = hc->max_rate;
2763 hc->burst = 0;
2764 hc->priority = 0;
2765}
2766
2767static int
2768htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 2769 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
2770{
2771 const struct htb *htb = htb_get__(netdev);
79f1cbe9
EJ
2772 const char *min_rate_s = smap_get(details, "min-rate");
2773 const char *max_rate_s = smap_get(details, "max-rate");
2774 const char *burst_s = smap_get(details, "burst");
2775 const char *priority_s = smap_get(details, "priority");
9b020780 2776 int mtu, error;
c1c9c9c4 2777
9b020780
PS
2778 error = netdev_get_mtu(netdev, &mtu);
2779 if (error) {
f915f1a8
BP
2780 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2781 netdev_get_name(netdev));
9b020780 2782 return error;
f915f1a8
BP
2783 }
2784
4f104611
EJ
2785 /* HTB requires at least an mtu sized min-rate to send any traffic even
2786 * on uncongested links. */
c45ab5e9 2787 hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
4f104611 2788 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
2789 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2790
2791 /* max-rate */
2792 hc->max_rate = (max_rate_s
2793 ? strtoull(max_rate_s, NULL, 10) / 8
2794 : htb->max_rate);
2795 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2796 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2797
2798 /* burst
2799 *
2800 * According to hints in the documentation that I've read, it is important
2801 * that 'burst' be at least as big as the largest frame that might be
2802 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2803 * but having it a bit too small is a problem. Since netdev_get_mtu()
2804 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
2805 * the MTU. We actually add 64, instead of 14, as a guard against
2806 * additional headers get tacked on somewhere that we're not aware of. */
c1c9c9c4
BP
2807 hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2808 hc->burst = MAX(hc->burst, mtu + 64);
2809
2810 /* priority */
2811 hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
2812
2813 return 0;
2814}
2815
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and parses
 * the reply into 'options' and 'stats' (see htb_parse_tcmsg__()).  Returns 0
 * on success, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error = tc_query_class(netdev, handle, parent, &reply);

    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2831
2832static int
79f1cbe9 2833htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2834{
2835 int error;
2836
2837 error = htb_setup_qdisc__(netdev);
2838 if (!error) {
2839 struct htb_class hc;
2840
2841 htb_parse_qdisc_details__(netdev, details, &hc);
2842 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2843 tc_make_handle(1, 0), &hc);
2844 if (!error) {
2845 htb_install__(netdev, hc.max_rate);
2846 }
2847 }
2848 return error;
2849}
2850
93b13be8
BP
2851static struct htb_class *
2852htb_class_cast__(const struct tc_queue *queue)
2853{
2854 return CONTAINER_OF(queue, struct htb_class, tc_queue);
2855}
2856
c1c9c9c4
BP
2857static void
2858htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2859 const struct htb_class *hc)
2860{
2861 struct htb *htb = htb_get__(netdev);
93b13be8
BP
2862 size_t hash = hash_int(queue_id, 0);
2863 struct tc_queue *queue;
c1c9c9c4
BP
2864 struct htb_class *hcp;
2865
93b13be8
BP
2866 queue = tc_find_queue__(netdev, queue_id, hash);
2867 if (queue) {
2868 hcp = htb_class_cast__(queue);
2869 } else {
c1c9c9c4 2870 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
2871 queue = &hcp->tc_queue;
2872 queue->queue_id = queue_id;
2873 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 2874 }
93b13be8
BP
2875
2876 hcp->min_rate = hc->min_rate;
2877 hcp->max_rate = hc->max_rate;
2878 hcp->burst = hc->burst;
2879 hcp->priority = hc->priority;
c1c9c9c4
BP
2880}
2881
2882static int
2883htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2884{
c1c9c9c4
BP
2885 struct ofpbuf msg;
2886 struct nl_dump dump;
2887 struct htb_class hc;
c1c9c9c4
BP
2888
2889 /* Get qdisc options. */
2890 hc.max_rate = 0;
2891 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 2892 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
2893
2894 /* Get queues. */
23a98ffe
BP
2895 if (!start_queue_dump(netdev, &dump)) {
2896 return ENODEV;
2897 }
c1c9c9c4
BP
2898 while (nl_dump_next(&dump, &msg)) {
2899 unsigned int queue_id;
2900
2901 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2902 htb_update_queue__(netdev, queue_id, &hc);
2903 }
2904 }
2905 nl_dump_done(&dump);
2906
2907 return 0;
2908}
2909
2910static void
2911htb_tc_destroy(struct tc *tc)
2912{
2913 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
93b13be8 2914 struct htb_class *hc, *next;
c1c9c9c4 2915
4e8e4213 2916 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
93b13be8 2917 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4
BP
2918 free(hc);
2919 }
2920 tc_destroy(tc);
2921 free(htb);
2922}
2923
2924static int
79f1cbe9 2925htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
2926{
2927 const struct htb *htb = htb_get__(netdev);
79f1cbe9 2928 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
2929 return 0;
2930}
2931
2932static int
79f1cbe9 2933htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
2934{
2935 struct htb_class hc;
2936 int error;
2937
2938 htb_parse_qdisc_details__(netdev, details, &hc);
2939 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2940 tc_make_handle(1, 0), &hc);
2941 if (!error) {
2942 htb_get__(netdev)->max_rate = hc.max_rate;
2943 }
2944 return error;
2945}
2946
2947static int
93b13be8 2948htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 2949 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 2950{
93b13be8 2951 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2952
79f1cbe9 2953 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 2954 if (hc->min_rate != hc->max_rate) {
79f1cbe9 2955 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 2956 }
79f1cbe9 2957 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 2958 if (hc->priority) {
79f1cbe9 2959 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
2960 }
2961 return 0;
2962}
2963
2964static int
2965htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 2966 const struct smap *details)
c1c9c9c4
BP
2967{
2968 struct htb_class hc;
2969 int error;
2970
2971 error = htb_parse_class_details__(netdev, details, &hc);
2972 if (error) {
2973 return error;
2974 }
2975
17ee3c1f 2976 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
2977 tc_make_handle(1, 0xfffe), &hc);
2978 if (error) {
2979 return error;
2980 }
2981
2982 htb_update_queue__(netdev, queue_id, &hc);
2983 return 0;
2984}
2985
2986static int
93b13be8 2987htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 2988{
93b13be8 2989 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 2990 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
2991 int error;
2992
93b13be8 2993 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 2994 if (!error) {
93b13be8 2995 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 2996 free(hc);
c1c9c9c4
BP
2997 }
2998 return error;
2999}
3000
3001static int
93b13be8 3002htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3003 struct netdev_queue_stats *stats)
3004{
93b13be8 3005 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3006 tc_make_handle(1, 0xfffe), NULL, stats);
3007}
3008
3009static int
3010htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3011 const struct ofpbuf *nlmsg,
3012 netdev_dump_queue_stats_cb *cb, void *aux)
3013{
3014 struct netdev_queue_stats stats;
17ee3c1f 3015 unsigned int handle, major, minor;
c1c9c9c4
BP
3016 int error;
3017
3018 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3019 if (error) {
3020 return error;
3021 }
3022
17ee3c1f
BP
3023 major = tc_get_major(handle);
3024 minor = tc_get_minor(handle);
3025 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3026 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3027 }
3028 return 0;
3029}
3030
3031static const struct tc_ops tc_ops_htb = {
3032 "htb", /* linux_name */
3033 "linux-htb", /* ovs_name */
3034 HTB_N_QUEUES, /* n_queues */
3035 htb_tc_install,
3036 htb_tc_load,
3037 htb_tc_destroy,
3038 htb_qdisc_get,
3039 htb_qdisc_set,
3040 htb_class_get,
3041 htb_class_set,
3042 htb_class_delete,
3043 htb_class_get_stats,
3044 htb_class_dump_stats
3045};
3046\f
a339aa81
EJ
3047/* "linux-hfsc" traffic control class. */
3048
/* Number of queues advertised for "linux-hfsc".  Class minor numbers 1 through
 * HFSC_N_QUEUES map to queue IDs 0 through HFSC_N_QUEUES - 1. */
#define HFSC_N_QUEUES 0xf000
3050
3051struct hfsc {
3052 struct tc tc;
3053 uint32_t max_rate;
3054};
3055
3056struct hfsc_class {
3057 struct tc_queue tc_queue;
3058 uint32_t min_rate;
3059 uint32_t max_rate;
3060};
3061
3062static struct hfsc *
3063hfsc_get__(const struct netdev *netdev)
3064{
3065 struct netdev_dev_linux *netdev_dev;
3066 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3067 return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
3068}
3069
3070static struct hfsc_class *
3071hfsc_class_cast__(const struct tc_queue *queue)
3072{
3073 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3074}
3075
24045e35 3076static void
a339aa81
EJ
3077hfsc_install__(struct netdev *netdev, uint32_t max_rate)
3078{
3079 struct netdev_dev_linux * netdev_dev;
3080 struct hfsc *hfsc;
3081
3082 netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
3083 hfsc = xmalloc(sizeof *hfsc);
3084 tc_init(&hfsc->tc, &tc_ops_hfsc);
3085 hfsc->max_rate = max_rate;
3086 netdev_dev->tc = &hfsc->tc;
a339aa81
EJ
3087}
3088
3089static void
3090hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
3091 const struct hfsc_class *hc)
3092{
3093 size_t hash;
3094 struct hfsc *hfsc;
3095 struct hfsc_class *hcp;
3096 struct tc_queue *queue;
3097
3098 hfsc = hfsc_get__(netdev);
3099 hash = hash_int(queue_id, 0);
3100
3101 queue = tc_find_queue__(netdev, queue_id, hash);
3102 if (queue) {
3103 hcp = hfsc_class_cast__(queue);
3104 } else {
3105 hcp = xmalloc(sizeof *hcp);
3106 queue = &hcp->tc_queue;
3107 queue->queue_id = queue_id;
3108 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
3109 }
3110
3111 hcp->min_rate = hc->min_rate;
3112 hcp->max_rate = hc->max_rate;
3113}
3114
3115static int
3116hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
3117{
3118 const struct tc_service_curve *rsc, *fsc, *usc;
3119 static const struct nl_policy tca_hfsc_policy[] = {
3120 [TCA_HFSC_RSC] = {
3121 .type = NL_A_UNSPEC,
3122 .optional = false,
3123 .min_len = sizeof(struct tc_service_curve),
3124 },
3125 [TCA_HFSC_FSC] = {
3126 .type = NL_A_UNSPEC,
3127 .optional = false,
3128 .min_len = sizeof(struct tc_service_curve),
3129 },
3130 [TCA_HFSC_USC] = {
3131 .type = NL_A_UNSPEC,
3132 .optional = false,
3133 .min_len = sizeof(struct tc_service_curve),
3134 },
3135 };
3136 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
3137
3138 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
3139 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
3140 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
3141 return EPROTO;
3142 }
3143
3144 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
3145 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
3146 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
3147
3148 if (rsc->m1 != 0 || rsc->d != 0 ||
3149 fsc->m1 != 0 || fsc->d != 0 ||
3150 usc->m1 != 0 || usc->d != 0) {
3151 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3152 "Non-linear service curves are not supported.");
3153 return EPROTO;
3154 }
3155
3156 if (rsc->m2 != fsc->m2) {
3157 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3158 "Real-time service curves are not supported ");
3159 return EPROTO;
3160 }
3161
3162 if (rsc->m2 > usc->m2) {
3163 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
3164 "Min-rate service curve is greater than "
3165 "the max-rate service curve.");
3166 return EPROTO;
3167 }
3168
3169 class->min_rate = fsc->m2;
3170 class->max_rate = usc->m2;
3171 return 0;
3172}
3173
3174static int
3175hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3176 struct hfsc_class *options,
3177 struct netdev_queue_stats *stats)
3178{
3179 int error;
3180 unsigned int handle;
3181 struct nlattr *nl_options;
3182
3183 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3184 if (error) {
3185 return error;
3186 }
3187
3188 if (queue_id) {
3189 unsigned int major, minor;
3190
3191 major = tc_get_major(handle);
3192 minor = tc_get_minor(handle);
3193 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3194 *queue_id = minor - 1;
3195 } else {
3196 return EPROTO;
3197 }
3198 }
3199
3200 if (options) {
3201 error = hfsc_parse_tca_options__(nl_options, options);
3202 }
3203
3204 return error;
3205}
3206
/* Queries the kernel for the class identified by 'handle' and 'parent' on
 * 'netdev', then passes the reply to hfsc_parse_tcmsg__() along with 'options'
 * and 'stats'.
 *
 * Returns 0 if successful, otherwise the nonzero error reported by the query
 * or by the parse step. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *msg;
    int retval;

    retval = tc_query_class(netdev, handle, parent, &msg);
    if (!retval) {
        retval = hfsc_parse_tcmsg__(msg, NULL, options, stats);
        ofpbuf_delete(msg);     /* The reply is ours to free. */
    }
    return retval;
}
3224
3225static void
79f1cbe9 3226hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
a339aa81
EJ
3227 struct hfsc_class *class)
3228{
3229 uint32_t max_rate;
3230 const char *max_rate_s;
3231
79f1cbe9 3232 max_rate_s = smap_get(details, "max-rate");
a339aa81
EJ
3233 max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
3234
3235 if (!max_rate) {
a00ca915 3236 enum netdev_features current;
a339aa81
EJ
3237
3238 netdev_get_features(netdev, &current, NULL, NULL, NULL);
d02a5f8e 3239 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
3240 }
3241
3242 class->min_rate = max_rate;
3243 class->max_rate = max_rate;
3244}
3245
3246static int
3247hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 3248 const struct smap *details,
a339aa81
EJ
3249 struct hfsc_class * class)
3250{
3251 const struct hfsc *hfsc;
3252 uint32_t min_rate, max_rate;
3253 const char *min_rate_s, *max_rate_s;
3254
3255 hfsc = hfsc_get__(netdev);
79f1cbe9
EJ
3256 min_rate_s = smap_get(details, "min-rate");
3257 max_rate_s = smap_get(details, "max-rate");
a339aa81 3258
c45ab5e9 3259 min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
79398bad 3260 min_rate = MAX(min_rate, 1);
a339aa81
EJ
3261 min_rate = MIN(min_rate, hfsc->max_rate);
3262
3263 max_rate = (max_rate_s
3264 ? strtoull(max_rate_s, NULL, 10) / 8
3265 : hfsc->max_rate);
3266 max_rate = MAX(max_rate, min_rate);
3267 max_rate = MIN(max_rate, hfsc->max_rate);
3268
3269 class->min_rate = min_rate;
3270 class->max_rate = max_rate;
3271
3272 return 0;
3273}
3274
3275/* Create an HFSC qdisc.
3276 *
3277 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3278static int
3279hfsc_setup_qdisc__(struct netdev * netdev)
3280{
3281 struct tcmsg *tcmsg;
3282 struct ofpbuf request;
3283 struct tc_hfsc_qopt opt;
3284
3285 tc_del_qdisc(netdev);
3286
3287 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3288 NLM_F_EXCL | NLM_F_CREATE, &request);
3289
3290 if (!tcmsg) {
3291 return ENODEV;
3292 }
3293
3294 tcmsg->tcm_handle = tc_make_handle(1, 0);
3295 tcmsg->tcm_parent = TC_H_ROOT;
3296
3297 memset(&opt, 0, sizeof opt);
3298 opt.defcls = 1;
3299
3300 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3301 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3302
3303 return tc_transact(&request, NULL);
3304}
3305
3306/* Create an HFSC class.
3307 *
3308 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3309 * sc rate <min_rate> ul rate <max_rate>" */
3310static int
3311hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3312 unsigned int parent, struct hfsc_class *class)
3313{
3314 int error;
3315 size_t opt_offset;
3316 struct tcmsg *tcmsg;
3317 struct ofpbuf request;
3318 struct tc_service_curve min, max;
3319
3320 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3321
3322 if (!tcmsg) {
3323 return ENODEV;
3324 }
3325
3326 tcmsg->tcm_handle = handle;
3327 tcmsg->tcm_parent = parent;
3328
3329 min.m1 = 0;
3330 min.d = 0;
3331 min.m2 = class->min_rate;
3332
3333 max.m1 = 0;
3334 max.d = 0;
3335 max.m2 = class->max_rate;
3336
3337 nl_msg_put_string(&request, TCA_KIND, "hfsc");
3338 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3339 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3340 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3341 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3342 nl_msg_end_nested(&request, opt_offset);
3343
3344 error = tc_transact(&request, NULL);
3345 if (error) {
3346 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3347 "min-rate %ubps, max-rate %ubps (%s)",
3348 netdev_get_name(netdev),
3349 tc_get_major(handle), tc_get_minor(handle),
3350 tc_get_major(parent), tc_get_minor(parent),
3351 class->min_rate, class->max_rate, strerror(error));
3352 }
3353
3354 return error;
3355}
3356
3357static int
79f1cbe9 3358hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3359{
3360 int error;
3361 struct hfsc_class class;
3362
3363 error = hfsc_setup_qdisc__(netdev);
3364
3365 if (error) {
3366 return error;
3367 }
3368
3369 hfsc_parse_qdisc_details__(netdev, details, &class);
3370 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3371 tc_make_handle(1, 0), &class);
3372
3373 if (error) {
3374 return error;
3375 }
3376
3377 hfsc_install__(netdev, class.max_rate);
3378 return 0;
3379}
3380
3381static int
3382hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3383{
3384 struct ofpbuf msg;
a339aa81
EJ
3385 struct nl_dump dump;
3386 struct hfsc_class hc;
3387
3388 hc.max_rate = 0;
3389 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3390 hfsc_install__(netdev, hc.max_rate);
a339aa81
EJ
3391
3392 if (!start_queue_dump(netdev, &dump)) {
3393 return ENODEV;
3394 }
3395
3396 while (nl_dump_next(&dump, &msg)) {
3397 unsigned int queue_id;
3398
3399 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3400 hfsc_update_queue__(netdev, queue_id, &hc);
3401 }
3402 }
3403
3404 nl_dump_done(&dump);
3405 return 0;
3406}
3407
3408static void
3409hfsc_tc_destroy(struct tc *tc)
3410{
3411 struct hfsc *hfsc;
3412 struct hfsc_class *hc, *next;
3413
3414 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3415
3416 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3417 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3418 free(hc);
3419 }
3420
3421 tc_destroy(tc);
3422 free(hfsc);
3423}
3424
3425static int
79f1cbe9 3426hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
3427{
3428 const struct hfsc *hfsc;
3429 hfsc = hfsc_get__(netdev);
79f1cbe9 3430 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
3431 return 0;
3432}
3433
3434static int
79f1cbe9 3435hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
3436{
3437 int error;
3438 struct hfsc_class class;
3439
3440 hfsc_parse_qdisc_details__(netdev, details, &class);
3441 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3442 tc_make_handle(1, 0), &class);
3443
3444 if (!error) {
3445 hfsc_get__(netdev)->max_rate = class.max_rate;
3446 }
3447
3448 return error;
3449}
3450
3451static int
3452hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3453 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
3454{
3455 const struct hfsc_class *hc;
3456
3457 hc = hfsc_class_cast__(queue);
79f1cbe9 3458 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 3459 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3460 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
3461 }
3462 return 0;
3463}
3464
3465static int
3466hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3467 const struct smap *details)
a339aa81
EJ
3468{
3469 int error;
3470 struct hfsc_class class;
3471
3472 error = hfsc_parse_class_details__(netdev, details, &class);
3473 if (error) {
3474 return error;
3475 }
3476
3477 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3478 tc_make_handle(1, 0xfffe), &class);
3479 if (error) {
3480 return error;
3481 }
3482
3483 hfsc_update_queue__(netdev, queue_id, &class);
3484 return 0;
3485}
3486
3487static int
3488hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3489{
3490 int error;
3491 struct hfsc *hfsc;
3492 struct hfsc_class *hc;
3493
3494 hc = hfsc_class_cast__(queue);
3495 hfsc = hfsc_get__(netdev);
3496
3497 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3498 if (!error) {
3499 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3500 free(hc);
3501 }
3502 return error;
3503}
3504
3505static int
3506hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3507 struct netdev_queue_stats *stats)
3508{
3509 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3510 tc_make_handle(1, 0xfffe), NULL, stats);
3511}
3512
3513static int
3514hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3515 const struct ofpbuf *nlmsg,
3516 netdev_dump_queue_stats_cb *cb, void *aux)
3517{
3518 struct netdev_queue_stats stats;
3519 unsigned int handle, major, minor;
3520 int error;
3521
3522 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3523 if (error) {
3524 return error;
3525 }
3526
3527 major = tc_get_major(handle);
3528 minor = tc_get_minor(handle);
3529 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3530 (*cb)(minor - 1, &stats, aux);
3531 }
3532 return 0;
3533}
3534
3535static const struct tc_ops tc_ops_hfsc = {
3536 "hfsc", /* linux_name */
3537 "linux-hfsc", /* ovs_name */
3538 HFSC_N_QUEUES, /* n_queues */
3539 hfsc_tc_install, /* tc_install */
3540 hfsc_tc_load, /* tc_load */
3541 hfsc_tc_destroy, /* tc_destroy */
3542 hfsc_qdisc_get, /* qdisc_get */
3543 hfsc_qdisc_set, /* qdisc_set */
3544 hfsc_class_get, /* class_get */
3545 hfsc_class_set, /* class_set */
3546 hfsc_class_delete, /* class_delete */
3547 hfsc_class_get_stats, /* class_get_stats */
3548 hfsc_class_dump_stats /* class_dump_stats */
3549};
3550\f
c1c9c9c4
BP
3551/* "linux-default" traffic control class.
3552 *
3553 * This class represents the default, unnamed Linux qdisc. It corresponds to
3554 * the "" (empty string) QoS type in the OVS database. */
3555
3556static void
3557default_install__(struct netdev *netdev)
3558{
3559 struct netdev_dev_linux *netdev_dev =
3560 netdev_dev_linux_cast(netdev_get_dev(netdev));
559eb230 3561 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 3562
559eb230
BP
3563 /* Nothing but a tc class implementation is allowed to write to a tc. This
3564 * class never does that, so we can legitimately use a const tc object. */
3565 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3566}
3567
3568static int
3569default_tc_install(struct netdev *netdev,
79f1cbe9 3570 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
3571{
3572 default_install__(netdev);
3573 return 0;
3574}
3575
3576static int
3577default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3578{
3579 default_install__(netdev);
3580 return 0;
3581}
3582
3583static const struct tc_ops tc_ops_default = {
3584 NULL, /* linux_name */
3585 "", /* ovs_name */
3586 0, /* n_queues */
3587 default_tc_install,
3588 default_tc_load,
3589 NULL, /* tc_destroy */
3590 NULL, /* qdisc_get */
3591 NULL, /* qdisc_set */
3592 NULL, /* class_get */
3593 NULL, /* class_set */
3594 NULL, /* class_delete */
3595 NULL, /* class_get_stats */
3596 NULL /* class_dump_stats */
3597};
3598\f
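/* "linux-other" traffic control class.
 *
 * This class implements only the tc_load operation (see 'tc_ops_other'
 * below): it has no tc_install, so it cannot be installed from OVS, and it
 * exposes no queues. */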
3602
3603static int
3604other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3605{
3606 struct netdev_dev_linux *netdev_dev =
3607 netdev_dev_linux_cast(netdev_get_dev(netdev));
559eb230 3608 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 3609
559eb230
BP
3610 /* Nothing but a tc class implementation is allowed to write to a tc. This
3611 * class never does that, so we can legitimately use a const tc object. */
3612 netdev_dev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
3613 return 0;
3614}
3615
3616static const struct tc_ops tc_ops_other = {
3617 NULL, /* linux_name */
3618 "linux-other", /* ovs_name */
3619 0, /* n_queues */
3620 NULL, /* tc_install */
3621 other_tc_load,
3622 NULL, /* tc_destroy */
3623 NULL, /* qdisc_get */
3624 NULL, /* qdisc_set */
3625 NULL, /* class_get */
3626 NULL, /* class_set */
3627 NULL, /* class_delete */
3628 NULL, /* class_get_stats */
3629 NULL /* class_dump_stats */
3630};
3631\f
3632/* Traffic control. */
3633
/* Number of kernel "tc" ticks per second.  Set by read_psched(). */
static double ticks_per_s;
3636
/* Number of kernel "jiffies" per second, used to size qdisc buffers (a qdisc
 * generally has to be able to buffer one jiffy's worth of data).  Set by
 * read_psched(); a value of 0 means that it has not been read yet.
 *
 * The value falls into one of two categories:
 *
 *    - It is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  Then the qdisc really must be able
 *      to buffer that much data.
 *
 *    - It is an absurdly large number, which means that the kernel has finely
 *      granular timers and no extra buffer room has to be fudged in.  This
 *      case needs no special handling: 'buffer_hz' is only ever used as a
 *      divisor, so practically any rate divided by it comes out as 0, and the
 *      small integer results for really high rates have no real effect.
 */
static unsigned int buffer_hz;
3655
/* Combines 'major' and 'minor' into the tc handle 'major':'minor'.  Only the
 * low 16 bits of each argument end up in the result. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
3662
/* Extracts the major number (the upper 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
3669
/* Extracts the minor number (the lower 16 bits) from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3676
/* Initializes 'request' as a Netlink traffic-control message of the given
 * 'type' with NLM_F_REQUEST plus 'flags', addressed to 'netdev''s interface.
 *
 * Returns a pointer to the zeroed 'struct tcmsg' inside 'request', in which
 * the caller must still fill in tcm_handle and tcm_parent.  Returns NULL,
 * leaving 'request' uninitialized, if 'netdev''s ifindex cannot be
 * obtained. */
static struct tcmsg *
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
                struct ofpbuf *request)
{
    struct tcmsg *hdr;
    int ifindex;

    if (get_ifindex(netdev, &ifindex)) {
        return NULL;
    }

    ofpbuf_init(request, 512);
    nl_msg_put_nlmsghdr(request, sizeof *hdr, type, NLM_F_REQUEST | flags);

    hdr = ofpbuf_put_zeros(request, sizeof *hdr);
    hdr->tcm_family = AF_UNSPEC;
    hdr->tcm_ifindex = ifindex;
    /* Caller should fill in hdr->tcm_handle. */
    /* Caller should fill in hdr->tcm_parent. */

    return hdr;
}
3700
3701static int
3702tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3703{
3704 int error = nl_sock_transact(rtnl_sock, request, replyp);
3705 ofpbuf_uninit(request);
3706 return error;
3707}
3708
f8500004
JP
3709/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3710 * policing configuration.
3711 *
3712 * This function is equivalent to running the following when 'add' is true:
3713 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3714 *
3715 * This function is equivalent to running the following when 'add' is false:
3716 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3717 *
3718 * The configuration and stats may be seen with the following command:
3719 * /sbin/tc -s qdisc show dev <devname>
3720 *
3721 * Returns 0 if successful, otherwise a positive errno value.
3722 */
3723static int
3724tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
3725{
3726 struct ofpbuf request;
3727 struct tcmsg *tcmsg;
3728 int error;
3729 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
3730 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
3731
3732 tcmsg = tc_make_request(netdev, type, flags, &request);
3733 if (!tcmsg) {
3734 return ENODEV;
3735 }
3736 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
3737 tcmsg->tcm_parent = TC_H_INGRESS;
3738 nl_msg_put_string(&request, TCA_KIND, "ingress");
3739 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
3740
3741 error = tc_transact(&request, NULL);
3742 if (error) {
3743 /* If we're deleting the qdisc, don't worry about some of the
3744 * error conditions. */
3745 if (!add && (error == ENOENT || error == EINVAL)) {
3746 return 0;
3747 }
3748 return error;
3749 }
3750
3751 return 0;
3752}
3753
3754/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
3755 * of 'kbits_burst'.
3756 *
3757 * This function is equivalent to running:
3758 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
3759 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
3760 * mtu 65535 drop
3761 *
3762 * The configuration and stats may be seen with the following command:
3763 * /sbin/tc -s filter show <devname> eth0 parent ffff:
3764 *
3765 * Returns 0 if successful, otherwise a positive errno value.
3766 */
3767static int
3768tc_add_policer(struct netdev *netdev, int kbits_rate, int kbits_burst)
3769{
3770 struct tc_police tc_police;
3771 struct ofpbuf request;
3772 struct tcmsg *tcmsg;
3773 size_t basic_offset;
3774 size_t police_offset;
3775 int error;
3776 int mtu = 65535;
3777
3778 memset(&tc_police, 0, sizeof tc_police);
3779 tc_police.action = TC_POLICE_SHOT;
3780 tc_police.mtu = mtu;
e5c08015 3781 tc_fill_rate(&tc_police.rate, (kbits_rate * 1000)/8, mtu);
f8500004
JP
3782 tc_police.burst = tc_bytes_to_ticks(tc_police.rate.rate,
3783 kbits_burst * 1024);
3784
3785 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
3786 NLM_F_EXCL | NLM_F_CREATE, &request);
3787 if (!tcmsg) {
3788 return ENODEV;
3789 }
3790 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
3791 tcmsg->tcm_info = tc_make_handle(49,
3792 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
3793
3794 nl_msg_put_string(&request, TCA_KIND, "basic");
3795 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3796 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
3797 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
3798 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
3799 nl_msg_end_nested(&request, police_offset);
3800 nl_msg_end_nested(&request, basic_offset);
3801
3802 error = tc_transact(&request, NULL);
3803 if (error) {
3804 return error;
3805 }
3806
3807 return 0;
3808}
3809
c1c9c9c4
BP
3810static void
3811read_psched(void)
3812{
3813 /* The values in psched are not individually very meaningful, but they are
3814 * important. The tables below show some values seen in the wild.
3815 *
3816 * Some notes:
3817 *
3818 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3819 * (Before that, there are hints that it was 1000000000.)
3820 *
3821 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
3822 * above.
3823 *
3824 * /proc/net/psched
3825 * -----------------------------------
3826 * [1] 000c8000 000f4240 000f4240 00000064
3827 * [2] 000003e8 00000400 000f4240 3b9aca00
3828 * [3] 000003e8 00000400 000f4240 3b9aca00
3829 * [4] 000003e8 00000400 000f4240 00000064
3830 * [5] 000003e8 00000040 000f4240 3b9aca00
3831 * [6] 000003e8 00000040 000f4240 000000f9
3832 *
3833 * a b c d ticks_per_s buffer_hz
3834 * ------- --------- ---------- ------------- ----------- -------------
3835 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
3836 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3837 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3838 * [4] 1,000 1,024 1,000,000 100 976,562 100
3839 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3840 * [6] 1,000 64 1,000,000 249 15,625,000 249
3841 *
3842 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3843 * [2] 2.6.26-1-686-bigmem from Debian lenny
3844 * [3] 2.6.26-2-sparc64 from Debian lenny
3845 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3846 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3847 * [6] 2.6.34 from kernel.org on KVM
3848 */
3849 static const char fn[] = "/proc/net/psched";
3850 unsigned int a, b, c, d;
3851 FILE *stream;
3852
3853 ticks_per_s = 1.0;
3854 buffer_hz = 100;
3855
3856 stream = fopen(fn, "r");
3857 if (!stream) {
3858 VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3859 return;
3860 }
3861
3862 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3863 VLOG_WARN("%s: read failed", fn);
3864 fclose(stream);
3865 return;
3866 }
3867 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3868 fclose(stream);
3869
3870 if (!a || !c) {
3871 VLOG_WARN("%s: invalid scheduler parameters", fn);
3872 return;
3873 }
3874
3875 ticks_per_s = (double) a * c / b;
3876 if (c == 1000000) {
3877 buffer_hz = d;
3878 } else {
3879 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3880 fn, a, b, c, d);
3881 }
3882 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3883}
3884
3885/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3886 * rate of 'rate' bytes per second. */
3887static unsigned int
3888tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3889{
3890 if (!buffer_hz) {
3891 read_psched();
3892 }
3893 return (rate * ticks) / ticks_per_s;
3894}
3895
3896/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3897 * rate of 'rate' bytes per second. */
3898static unsigned int
3899tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3900{
3901 if (!buffer_hz) {
3902 read_psched();
3903 }
015c93a4 3904 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
3905}
3906
3907/* Returns the number of bytes that need to be reserved for qdisc buffering at
3908 * a transmission rate of 'rate' bytes per second. */
3909static unsigned int
3910tc_buffer_per_jiffy(unsigned int rate)
3911{
3912 if (!buffer_hz) {
3913 read_psched();
3914 }
3915 return rate / buffer_hz;
3916}
3917
3918/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3919 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3920 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3921 * stores NULL into it if it is absent.
3922 *
3923 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3924 * 'msg'.
3925 *
3926 * Returns 0 if successful, otherwise a positive errno value. */
3927static int
3928tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3929 struct nlattr **options)
3930{
3931 static const struct nl_policy tca_policy[] = {
3932 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3933 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3934 };
3935 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3936
3937 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3938 tca_policy, ta, ARRAY_SIZE(ta))) {
3939 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3940 goto error;
3941 }
3942
3943 if (kind) {
3944 *kind = nl_attr_get_string(ta[TCA_KIND]);
3945 }
3946
3947 if (options) {
3948 *options = ta[TCA_OPTIONS];
3949 }
3950
3951 return 0;
3952
3953error:
3954 if (kind) {
3955 *kind = NULL;
3956 }
3957 if (options) {
3958 *options = NULL;
3959 }
3960 return EPROTO;
3961}
3962
3963/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3964 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3965 * into '*options', and its queue statistics into '*stats'. Any of the output
3966 * arguments may be null.
3967 *
3968 * Returns 0 if successful, otherwise a positive errno value. */
3969static int
3970tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3971 struct nlattr **options, struct netdev_queue_stats *stats)
3972{
3973 static const struct nl_policy tca_policy[] = {
3974 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3975 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3976 };
3977 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3978
3979 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3980 tca_policy, ta, ARRAY_SIZE(ta))) {
3981 VLOG_WARN_RL(&rl, "failed to parse class message");
3982 goto error;
3983 }
3984
3985 if (handlep) {
3986 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3987 *handlep = tc->tcm_handle;
3988 }
3989
3990 if (options) {
3991 *options = ta[TCA_OPTIONS];
3992 }
3993
3994 if (stats) {
3995 const struct gnet_stats_queue *gsq;
3996 struct gnet_stats_basic gsb;
3997
3998 static const struct nl_policy stats_policy[] = {
3999 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4000 .min_len = sizeof gsb },
4001 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4002 .min_len = sizeof *gsq },
4003 };
4004 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4005
4006 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4007 sa, ARRAY_SIZE(sa))) {
4008 VLOG_WARN_RL(&rl, "failed to parse class stats");
4009 goto error;
4010 }
4011
4012 /* Alignment issues screw up the length of struct gnet_stats_basic on
4013 * some arch/bitsize combinations. Newer versions of Linux have a
4014 * struct gnet_stats_basic_packed, but we can't depend on that. The
4015 * easiest thing to do is just to make a copy. */
4016 memset(&gsb, 0, sizeof gsb);
4017 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4018 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4019 stats->tx_bytes = gsb.bytes;
4020 stats->tx_packets = gsb.packets;
4021
4022 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4023 stats->tx_errors = gsq->drops;
4024 }
4025
4026 return 0;
4027
4028error:
4029 if (options) {
4030 *options = NULL;
4031 }
4032 if (stats) {
4033 memset(stats, 0, sizeof *stats);
4034 }
4035 return EPROTO;
4036}
4037
4038/* Queries the kernel for class with identifier 'handle' and parent 'parent'
4039 * on 'netdev'. */
4040static int
4041tc_query_class(const struct netdev *netdev,
4042 unsigned int handle, unsigned int parent,
4043 struct ofpbuf **replyp)
4044{
4045 struct ofpbuf request;
4046 struct tcmsg *tcmsg;
4047 int error;
4048
4049 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
4050 if (!tcmsg) {
4051 return ENODEV;
4052 }
c1c9c9c4
BP
4053 tcmsg->tcm_handle = handle;
4054 tcmsg->tcm_parent = parent;
4055
4056 error = tc_transact(&request, replyp);
4057 if (error) {
4058 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
4059 netdev_get_name(netdev),
4060 tc_get_major(handle), tc_get_minor(handle),
4061 tc_get_major(parent), tc_get_minor(parent),
4062 strerror(error));
4063 }
4064 return error;
4065}
4066
4067/* Equivalent to "tc class del dev <name> handle <handle>". */
4068static int
4069tc_delete_class(const struct netdev *netdev, unsigned int handle)
4070{
4071 struct ofpbuf request;
4072 struct tcmsg *tcmsg;
4073 int error;
4074
4075 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
4076 if (!tcmsg) {
4077 return ENODEV;
4078 }
c1c9c9c4
BP
4079 tcmsg->tcm_handle = handle;
4080 tcmsg->tcm_parent = 0;
4081
4082 error = tc_transact(&request, NULL);
4083 if (error) {
4084 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
4085 netdev_get_name(netdev),
4086 tc_get_major(handle), tc_get_minor(handle),
4087 strerror(error));
4088 }
4089 return error;
4090}
4091
4092/* Equivalent to "tc qdisc del dev <name> root". */
4093static int
4094tc_del_qdisc(struct netdev *netdev)
4095{
4096 struct netdev_dev_linux *netdev_dev =
4097 netdev_dev_linux_cast(netdev_get_dev(netdev));
4098 struct ofpbuf request;
4099 struct tcmsg *tcmsg;
4100 int error;
4101
4102 tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
23a98ffe
BP
4103 if (!tcmsg) {
4104 return ENODEV;
4105 }
c1c9c9c4
BP
4106 tcmsg->tcm_handle = tc_make_handle(1, 0);
4107 tcmsg->tcm_parent = TC_H_ROOT;
4108
4109 error = tc_transact(&request, NULL);
4110 if (error == EINVAL) {
4111 /* EINVAL probably means that the default qdisc was in use, in which
4112 * case we've accomplished our purpose. */
4113 error = 0;
4114 }
4115 if (!error && netdev_dev->tc) {
4116 if (netdev_dev->tc->ops->tc_destroy) {
4117 netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
4118 }
4119 netdev_dev->tc = NULL;
4120 }
4121 return error;
4122}
4123
4124/* If 'netdev''s qdisc type and parameters are not yet known, queries the
4125 * kernel to determine what they are. Returns 0 if successful, otherwise a
4126 * positive errno value. */
4127static int
4128tc_query_qdisc(const struct netdev *netdev)
4129{
4130 struct netdev_dev_linux *netdev_dev =
4131 netdev_dev_linux_cast(netdev_get_dev(netdev));
4132 struct ofpbuf request, *qdisc;
4133 const struct tc_ops *ops;
4134 struct tcmsg *tcmsg;
4135 int load_error;
4136 int error;
4137
4138 if (netdev_dev->tc) {
4139 return 0;
4140 }
4141
4142 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4143 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4144 * 2.6.35 without that fix backported to it.
4145 *
4146 * To avoid the OOPS, we must not make a request that would attempt to dump
4147 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4148 * few others. There are a few ways that I can see to do this, but most of
4149 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4150 * technique chosen here is to assume that any non-default qdisc that we
4151 * create will have a class with handle 1:0. The built-in qdiscs only have
4152 * a class with handle 0:0.
4153 *
4154 * We could check for Linux 2.6.35+ and use a more straightforward method
4155 * there. */
4156 tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
4157 if (!tcmsg) {
4158 return ENODEV;
4159 }
c1c9c9c4
BP
4160 tcmsg->tcm_handle = tc_make_handle(1, 0);
4161 tcmsg->tcm_parent = 0;
4162
4163 /* Figure out what tc class to instantiate. */
4164 error = tc_transact(&request, &qdisc);
4165 if (!error) {
4166 const char *kind;
4167
4168 error = tc_parse_qdisc(qdisc, &kind, NULL);
4169 if (error) {
4170 ops = &tc_ops_other;
4171 } else {
4172 ops = tc_lookup_linux_name(kind);
4173 if (!ops) {
4174 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
4175 VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
4176
4177 ops = &tc_ops_other;
4178 }
4179 }
4180 } else if (error == ENOENT) {
4181 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4182 * other entity that doesn't have a handle 1:0. We will assume
4183 * that it's the system default qdisc. */
4184 ops = &tc_ops_default;
4185 error = 0;
4186 } else {
4187 /* Who knows? Maybe the device got deleted. */
4188 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
4189 netdev_get_name(netdev), strerror(error));
4190 ops = &tc_ops_other;
4191 }
4192
4193 /* Instantiate it. */
ebc56baa 4194 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev), qdisc);
cb22974d 4195 ovs_assert((load_error == 0) == (netdev_dev->tc != NULL));
c1c9c9c4
BP
4196 ofpbuf_delete(qdisc);
4197
4198 return error ? error : load_error;
4199}
4200
4201/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4202 approximate the time to transmit packets of various lengths. For an MTU of
4203 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4204 represents two possible packet lengths; for a MTU of 513 through 1024, four
4205 possible lengths; and so on.
4206
4207 Returns, for the specified 'mtu', the number of bits that packet lengths
4208 need to be shifted right to fit within such a 256-entry table. */
4209static int
4210tc_calc_cell_log(unsigned int mtu)
4211{
4212 int cell_log;
4213
4214 if (!mtu) {
4215 mtu = ETH_PAYLOAD_MAX;
4216 }
4217 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
4218
4219 for (cell_log = 0; mtu >= 256; cell_log++) {
4220 mtu >>= 1;
4221 }
4222
4223 return cell_log;
4224}
4225
4226/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4227 * of 'mtu'. */
4228static void
4229tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
4230{
4231 memset(rate, 0, sizeof *rate);
4232 rate->cell_log = tc_calc_cell_log(mtu);
4233 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4234 /* rate->cell_align = 0; */ /* distro headers. */
4235 rate->mpu = ETH_TOTAL_MIN;
4236 rate->rate = Bps;
4237}
4238
4239/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4240 * attribute of the specified "type".
4241 *
4242 * See tc_calc_cell_log() above for a description of "rtab"s. */
4243static void
4244tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
4245{
4246 uint32_t *rtab;
4247 unsigned int i;
4248
4249 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
4250 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
4251 unsigned packet_size = (i + 1) << rate->cell_log;
4252 if (packet_size < rate->mpu) {
4253 packet_size = rate->mpu;
4254 }
4255 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
4256 }
4257}
4258
/* Computes the value of 'buffer' or 'cbuffer' in HTB options for a rate of
 * 'Bps' bytes per second, the specified 'mtu', and a user-requested burst
 * size of 'burst_bytes'.  The burst is never taken to be smaller than one
 * jiffy's worth of data plus one MTU, so a 'burst_bytes' of 0 is fine when no
 * value was requested. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 4269\f
aaf2fb1a
BP
4270/* Linux-only functions declared in netdev-linux.h */
4271
025e874a
BP
4272/* Returns a fd for an AF_INET socket or a negative errno value. */
4273int
4274netdev_linux_get_af_inet_sock(void)
4275{
4276 int error = netdev_linux_init();
4277 return error ? -error : af_inet_sock;
4278}
4279
aaf2fb1a
BP
4280/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4281 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4282int
4283netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
4284 const char *flag_name, bool enable)
4285{
4286 const char *netdev_name = netdev_get_name(netdev);
4287 struct ethtool_value evalue;
4288 uint32_t new_flags;
4289 int error;
4290
ab985a77 4291 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4292 memset(&evalue, 0, sizeof evalue);
4293 error = netdev_linux_do_ethtool(netdev_name,
4294 (struct ethtool_cmd *)&evalue,
4295 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4296 if (error) {
4297 return error;
4298 }
4299
ab985a77 4300 COVERAGE_INC(netdev_set_ethtool);
aaf2fb1a
BP
4301 evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
4302 error = netdev_linux_do_ethtool(netdev_name,
4303 (struct ethtool_cmd *)&evalue,
4304 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
4305 if (error) {
4306 return error;
4307 }
4308
ab985a77 4309 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
4310 memset(&evalue, 0, sizeof evalue);
4311 error = netdev_linux_do_ethtool(netdev_name,
4312 (struct ethtool_cmd *)&evalue,
4313 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
4314 if (error) {
4315 return error;
4316 }
4317
4318 if (new_flags != evalue.data) {
4319 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
4320 "device %s failed", enable ? "enable" : "disable",
4321 flag_name, netdev_name);
4322 return EOPNOTSUPP;
4323 }
4324
4325 return 0;
4326}
4327\f
4328/* Utility functions. */
4329
d3980822 4330/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 4331static void
d3980822
BP
4332netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
4333 const struct rtnl_link_stats *src)
4334{
f613a0d7
PS
4335 dst->rx_packets = src->rx_packets;
4336 dst->tx_packets = src->tx_packets;
4337 dst->rx_bytes = src->rx_bytes;
4338 dst->tx_bytes = src->tx_bytes;
4339 dst->rx_errors = src->rx_errors;
4340 dst->tx_errors = src->tx_errors;
4341 dst->rx_dropped = src->rx_dropped;
4342 dst->tx_dropped = src->tx_dropped;
4343 dst->multicast = src->multicast;
4344 dst->collisions = src->collisions;
4345 dst->rx_length_errors = src->rx_length_errors;
4346 dst->rx_over_errors = src->rx_over_errors;
4347 dst->rx_crc_errors = src->rx_crc_errors;
4348 dst->rx_frame_errors = src->rx_frame_errors;
4349 dst->rx_fifo_errors = src->rx_fifo_errors;
4350 dst->rx_missed_errors = src->rx_missed_errors;
4351 dst->tx_aborted_errors = src->tx_aborted_errors;
4352 dst->tx_carrier_errors = src->tx_carrier_errors;
4353 dst->tx_fifo_errors = src->tx_fifo_errors;
4354 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4355 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
4356}
4357
c1c9c9c4
BP
4358static int
4359get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4360{
4361 /* Policy for RTNLGRP_LINK messages.
4362 *
4363 * There are *many* more fields in these messages, but currently we only
4364 * care about these fields. */
4365 static const struct nl_policy rtnlgrp_link_policy[] = {
4366 [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4367 [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4368 .min_len = sizeof(struct rtnl_link_stats) },
4369 };
4370
4371 struct ofpbuf request;
4372 struct ofpbuf *reply;
4373 struct ifinfomsg *ifi;
c1c9c9c4
BP
4374 struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4375 int error;
4376
4377 ofpbuf_init(&request, 0);
4378 nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4379 ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4380 ifi->ifi_family = PF_UNSPEC;
4381 ifi->ifi_index = ifindex;
4382 error = nl_sock_transact(rtnl_sock, &request, &reply);
4383 ofpbuf_uninit(&request);
4384 if (error) {
4385 return error;
4386 }
4387
4388 if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4389 rtnlgrp_link_policy,
4390 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4391 ofpbuf_delete(reply);
4392 return EPROTO;
4393 }
4394
4395 if (!attrs[IFLA_STATS]) {
4396 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4397 ofpbuf_delete(reply);
4398 return EPROTO;
4399 }
8b61709d 4400
d3980822 4401 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
8b61709d 4402
576e26d7
BP
4403 ofpbuf_delete(reply);
4404
8b61709d
BP
4405 return 0;
4406}
4407
4408static int
4409get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4410{
4411 static const char fn[] = "/proc/net/dev";
4412 char line[1024];
4413 FILE *stream;
4414 int ln;
4415
4416 stream = fopen(fn, "r");
4417 if (!stream) {
4418 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4419 return errno;
4420 }
4421
4422 ln = 0;
4423 while (fgets(line, sizeof line, stream)) {
4424 if (++ln >= 3) {
4425 char devname[16];
4426#define X64 "%"SCNu64
4427 if (sscanf(line,
4428 " %15[^:]:"
4429 X64 X64 X64 X64 X64 X64 X64 "%*u"
4430 X64 X64 X64 X64 X64 X64 X64 "%*u",
4431 devname,
4432 &stats->rx_bytes,
4433 &stats->rx_packets,
4434 &stats->rx_errors,
4435 &stats->rx_dropped,
4436 &stats->rx_fifo_errors,
4437 &stats->rx_frame_errors,
4438 &stats->multicast,
4439 &stats->tx_bytes,
4440 &stats->tx_packets,
4441 &stats->tx_errors,
4442 &stats->tx_dropped,
4443 &stats->tx_fifo_errors,
4444 &stats->collisions,
4445 &stats->tx_carrier_errors) != 15) {
4446 VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4447 } else if (!strcmp(devname, netdev_name)) {
4448 stats->rx_length_errors = UINT64_MAX;
4449 stats->rx_over_errors = UINT64_MAX;
4450 stats->rx_crc_errors = UINT64_MAX;
4451 stats->rx_missed_errors = UINT64_MAX;
4452 stats->tx_aborted_errors = UINT64_MAX;
4453 stats->tx_heartbeat_errors = UINT64_MAX;
4454 stats->tx_window_errors = UINT64_MAX;
4455 fclose(stream);
4456 return 0;
4457 }
4458 }
4459 }
4460 VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4461 fclose(stream);
4462 return ENODEV;
4463}
c1c9c9c4 4464
3a183124 4465static int
059e5f4f 4466get_flags(const struct netdev_dev *dev, unsigned int *flags)
8b61709d
BP
4467{
4468 struct ifreq ifr;
4469 int error;
4470
755be9ea
EJ
4471 *flags = 0;
4472 error = netdev_linux_do_ioctl(dev->name, &ifr, SIOCGIFFLAGS,
149f577a 4473 "SIOCGIFFLAGS");
755be9ea
EJ
4474 if (!error) {
4475 *flags = ifr.ifr_flags;
4476 }
8b61709d
BP
4477 return error;
4478}
4479
4480static int
4b609110 4481set_flags(const char *name, unsigned int flags)
8b61709d
BP
4482{
4483 struct ifreq ifr;
4484
4485 ifr.ifr_flags = flags;
4b609110 4486 return netdev_linux_do_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
4487}
4488
4489static int
4490do_get_ifindex(const char *netdev_name)
4491{
4492 struct ifreq ifr;
4493
71d7c22f 4494 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4495 COVERAGE_INC(netdev_get_ifindex);
4496 if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4497 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4498 netdev_name, strerror(errno));
4499 return -errno;
4500 }
4501 return ifr.ifr_ifindex;
4502}
4503
4504static int
4505get_ifindex(const struct netdev *netdev_, int *ifindexp)
4506{
149f577a
JG
4507 struct netdev_dev_linux *netdev_dev =
4508 netdev_dev_linux_cast(netdev_get_dev(netdev_));
c7b1b0a5 4509
149f577a 4510 if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
8b61709d 4511 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 4512
8b61709d 4513 if (ifindex < 0) {
c7b1b0a5
PS
4514 netdev_dev->get_ifindex_error = -ifindex;
4515 netdev_dev->ifindex = 0;
4516 } else {
4517 netdev_dev->get_ifindex_error = 0;
4518 netdev_dev->ifindex = ifindex;
8b61709d 4519 }
149f577a 4520 netdev_dev->cache_valid |= VALID_IFINDEX;
8b61709d 4521 }
c7b1b0a5 4522
149f577a 4523 *ifindexp = netdev_dev->ifindex;
c7b1b0a5 4524 return netdev_dev->get_ifindex_error;
8b61709d
BP
4525}
4526
4527static int
4528get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4529{
4530 struct ifreq ifr;
4531 int hwaddr_family;
4532
4533 memset(&ifr, 0, sizeof ifr);
71d7c22f 4534 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d
BP
4535 COVERAGE_INC(netdev_get_hwaddr);
4536 if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
78857dfb
BP
4537 /* ENODEV probably means that a vif disappeared asynchronously and
4538 * hasn't been removed from the database yet, so reduce the log level
4539 * to INFO for that case. */
4540 VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4541 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4542 netdev_name, strerror(errno));
8b61709d
BP
4543 return errno;
4544 }
4545 hwaddr_family = ifr.ifr_hwaddr.sa_family;
4546 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4547 VLOG_WARN("%s device has unknown hardware address family %d",
4548 netdev_name, hwaddr_family);
4549 }
4550 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4551 return 0;
4552}
4553
4554static int
44445cac 4555set_etheraddr(const char *netdev_name,
8b61709d
BP
4556 const uint8_t mac[ETH_ADDR_LEN])
4557{
4558 struct ifreq ifr;
4559
4560 memset(&ifr, 0, sizeof ifr);
71d7c22f 4561 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 4562 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
8b61709d
BP
4563 memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4564 COVERAGE_INC(netdev_set_hwaddr);
4565 if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4566 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4567 netdev_name, strerror(errno));
4568 return errno;
4569 }
4570 return 0;
4571}
4572
4573static int
0b0544d7 4574netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
4575 int cmd, const char *cmd_name)
4576{
4577 struct ifreq ifr;
4578
4579 memset(&ifr, 0, sizeof ifr);
71d7c22f 4580 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
4581 ifr.ifr_data = (caddr_t) ecmd;
4582
4583 ecmd->cmd = cmd;
8b61709d
BP
4584 if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4585 return 0;
4586 } else {
4587 if (errno != EOPNOTSUPP) {
4588 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
0b0544d7 4589 "failed: %s", cmd_name, name, strerror(errno));
8b61709d
BP
4590 } else {
4591 /* The device doesn't support this operation. That's pretty
4592 * common, so there's no point in logging anything. */
4593 }
4594 return errno;
4595 }
4596}
4597
4598static int
149f577a
JG
4599netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4600 const char *cmd_name)
8b61709d 4601{
71d7c22f 4602 ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
8b61709d 4603 if (ioctl(af_inet_sock, cmd, ifr) == -1) {
149f577a
JG
4604 VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4605 strerror(errno));
8b61709d
BP
4606 return errno;
4607 }
4608 return 0;
4609}
f1acd62b
BP
4610
4611static int
4612netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4613 int cmd, const char *cmd_name)
4614{
4615 struct ifreq ifr;
4616 int error;
4617
4618 ifr.ifr_addr.sa_family = AF_INET;
149f577a 4619 error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
f1acd62b
BP
4620 if (!error) {
4621 const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4622 *ip = sin->sin_addr;
4623 }
4624 return error;
4625}
488d734d
BP
4626
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created and made nonblocking on the first call.  The outcome,
 * whether an fd or a negative errno value, is kept in a static variable and
 * returned unchanged by every later call. */
static int
af_packet_sock(void)
{
    static int sock = INT_MIN;

    if (sock != INT_MIN) {
        /* Already attempted. */
        return sock;
    }

    sock = socket(AF_PACKET, SOCK_RAW, 0);
    if (sock < 0) {
        sock = -errno;
        VLOG_ERR("failed to create packet socket: %s", strerror(errno));
    } else {
        int error = set_nonblocking(sock);

        if (error) {
            close(sock);
            sock = -error;
        }
    }

    return sock;
}