]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
dpctl: add examples to the manpage.
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
13c1637f 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
55bc98d6 23#include <arpa/inet.h>
8b61709d 24#include <inttypes.h>
32383c3b 25#include <linux/filter.h>
c1c9c9c4 26#include <linux/gen_stats.h>
bb7d0e22 27#include <linux/if_ether.h>
8b61709d
BP
28#include <linux/if_tun.h>
29#include <linux/types.h>
30#include <linux/ethtool.h>
63331829 31#include <linux/mii.h>
f8500004 32#include <linux/pkt_cls.h>
6f42c8ea 33#include <linux/pkt_sched.h>
e9e28be3 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/types.h>
37#include <sys/ioctl.h>
38#include <sys/socket.h>
ac3e3aaa 39#include <sys/utsname.h>
55bc98d6 40#include <netpacket/packet.h>
8b61709d
BP
41#include <net/if.h>
42#include <net/if_arp.h>
55bc98d6 43#include <net/if_packet.h>
8b61709d
BP
44#include <net/route.h>
45#include <netinet/in.h>
e9e28be3 46#include <poll.h>
8b61709d
BP
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
e9e28be3
BP
50
51#include "coverage.h"
e14deea0 52#include "dp-packet.h"
93451a0a 53#include "dpif-netlink.h"
df1e5a3b 54#include "dpif-netdev.h"
3e8a2ad1 55#include "openvswitch/dynamic-string.h"
8b61709d 56#include "fatal-signal.h"
93b13be8 57#include "hash.h"
ee89ea7b 58#include "openvswitch/hmap.h"
8b61709d 59#include "netdev-provider.h"
7fbef77a 60#include "netdev-vport.h"
45c8d3a1 61#include "netlink-notifier.h"
2fe27d5a 62#include "netlink-socket.h"
c060c4cf 63#include "netlink.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d
BP
67#include "packets.h"
68#include "poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
ee89ea7b 70#include "openvswitch/shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
1670c579 73#include "timer.h"
c060c4cf 74#include "unaligned.h"
e6211adc 75#include "openvswitch/vlog.h"
ee89ea7b 76#include "util.h"
5136ce49 77
d98e6007 78VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 79
d76f09ea
BP
80COVERAGE_DEFINE(netdev_set_policing);
81COVERAGE_DEFINE(netdev_arp_lookup);
82COVERAGE_DEFINE(netdev_get_ifindex);
83COVERAGE_DEFINE(netdev_get_hwaddr);
84COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
85COVERAGE_DEFINE(netdev_get_ethtool);
86COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 87
8b61709d
BP
88\f
89/* These were introduced in Linux 2.6.14, so they might be missing if we have
90 * old headers. */
91#ifndef ADVERTISED_Pause
92#define ADVERTISED_Pause (1 << 13)
93#endif
94#ifndef ADVERTISED_Asym_Pause
95#define ADVERTISED_Asym_Pause (1 << 14)
96#endif
97
e47bd51a
JP
98/* These were introduced in Linux 2.6.24, so they might be missing if we
99 * have old headers. */
100#ifndef ETHTOOL_GFLAGS
101#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102#endif
103#ifndef ETHTOOL_SFLAGS
104#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105#endif
106
c1c9c9c4
BP
107/* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 * headers. */
109#ifndef TC_RTAB_SIZE
110#define TC_RTAB_SIZE 1024
111#endif
112
b73c8518
SH
113/* Linux 2.6.21 introduced struct tpacket_auxdata.
114 * Linux 2.6.27 added the tp_vlan_tci member.
115 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
116 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
117 * TP_STATUS_VLAN_TPID_VALID.
118 *
119 * With all this churn it's easiest to unconditionally define a replacement
120 * structure that has everything we want.
121 */
55bc98d6
BP
122#ifndef PACKET_AUXDATA
123#define PACKET_AUXDATA 8
124#endif
b73c8518
SH
125#ifndef TP_STATUS_VLAN_VALID
126#define TP_STATUS_VLAN_VALID (1 << 4)
127#endif
128#ifndef TP_STATUS_VLAN_TPID_VALID
129#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130#endif
131#undef tpacket_auxdata
132#define tpacket_auxdata rpl_tpacket_auxdata
133struct tpacket_auxdata {
134 uint32_t tp_status;
135 uint32_t tp_len;
136 uint32_t tp_snaplen;
137 uint16_t tp_mac;
138 uint16_t tp_net;
139 uint16_t tp_vlan_tci;
140 uint16_t tp_vlan_tpid;
141};
142
0c615356
SH
143/* Linux 2.6.27 introduced ethtool_cmd_speed
144 *
145 * To avoid revisiting problems reported with using configure to detect
146 * compatibility (see report at
147 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
148 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed recorded in 'ep', formed by placing 'ep->speed_hi'
 * in the upper 16 bits and 'ep->speed' in the lower 16 bits of the result.
 *
 * Both halves are widened to uint32_t before they are combined.  Without the
 * cast, 'speed_hi' is promoted to a signed int, and shifting a value of
 * 0x8000 or more (e.g. 0xffff) left by 16 bits overflows a 32-bit int, which
 * is undefined behavior. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return (uint32_t) ep->speed | ((uint32_t) ep->speed_hi << 16);
}
154
67bed84c
SH
155/* Linux 2.6.30 introduced supported and advertised flags for
156 * 1G base KX, and 10G base KX4, KR and R. */
157#ifndef SUPPORTED_1000baseKX_Full
158#define SUPPORTED_1000baseKX_Full (1 << 17)
159#define SUPPORTED_10000baseKX4_Full (1 << 18)
160#define SUPPORTED_10000baseKR_Full (1 << 19)
161#define SUPPORTED_10000baseR_FEC (1 << 20)
162#define ADVERTISED_1000baseKX_Full (1 << 17)
163#define ADVERTISED_10000baseKX4_Full (1 << 18)
164#define ADVERTISED_10000baseKR_Full (1 << 19)
165#define ADVERTISED_10000baseR_FEC (1 << 20)
166#endif
167
168/* Linux 3.5 introduced supported and advertised flags for
169 * 40G base KR4, CR4, SR4 and LR4. */
170#ifndef SUPPORTED_40000baseKR4_Full
171#define SUPPORTED_40000baseKR4_Full (1 << 23)
172#define SUPPORTED_40000baseCR4_Full (1 << 24)
173#define SUPPORTED_40000baseSR4_Full (1 << 25)
174#define SUPPORTED_40000baseLR4_Full (1 << 26)
175#define ADVERTISED_40000baseKR4_Full (1 << 23)
176#define ADVERTISED_40000baseCR4_Full (1 << 24)
177#define ADVERTISED_40000baseSR4_Full (1 << 25)
178#define ADVERTISED_40000baseLR4_Full (1 << 26)
179#endif
180
fa373af4
BP
181/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 *
183 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
184 * 2.6.32-431.29.2.el6.x86_64 (see report at
185 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
186 * if_link.h is not self-contained on those kernels. It is easiest to
187 * unconditionally define a replacement. */
188#ifndef IFLA_STATS64
337c9b99 189#define IFLA_STATS64 23
fa373af4
BP
190#endif
191#define rtnl_link_stats64 rpl_rtnl_link_stats64
337c9b99
BP
192struct rtnl_link_stats64 {
193 uint64_t rx_packets;
194 uint64_t tx_packets;
195 uint64_t rx_bytes;
196 uint64_t tx_bytes;
197 uint64_t rx_errors;
198 uint64_t tx_errors;
199 uint64_t rx_dropped;
200 uint64_t tx_dropped;
201 uint64_t multicast;
202 uint64_t collisions;
203
204 uint64_t rx_length_errors;
205 uint64_t rx_over_errors;
206 uint64_t rx_crc_errors;
207 uint64_t rx_frame_errors;
208 uint64_t rx_fifo_errors;
209 uint64_t rx_missed_errors;
210
211 uint64_t tx_aborted_errors;
212 uint64_t tx_carrier_errors;
213 uint64_t tx_fifo_errors;
214 uint64_t tx_heartbeat_errors;
215 uint64_t tx_window_errors;
216
217 uint64_t rx_compressed;
218 uint64_t tx_compressed;
219};
337c9b99 220
8b61709d 221enum {
7fbef77a
JG
222 VALID_IFINDEX = 1 << 0,
223 VALID_ETHERADDR = 1 << 1,
6b6e1329
PS
224 VALID_IN = 1 << 2,
225 VALID_MTU = 1 << 3,
226 VALID_POLICING = 1 << 4,
227 VALID_VPORT_STAT_ERROR = 1 << 5,
228 VALID_DRVINFO = 1 << 6,
229 VALID_FEATURES = 1 << 7,
8b61709d 230};
c1c9c9c4
BP
231\f
232/* Traffic control. */
233
234/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
235 * network device.
236 *
237 * Each TC implementation subclasses this with whatever additional data it
238 * needs. */
c1c9c9c4
BP
239struct tc {
240 const struct tc_ops *ops;
93b13be8
BP
241 struct hmap queues; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
244};
c1c9c9c4 245
559eb230
BP
246#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247
93b13be8
BP
248/* One traffic control queue.
249 *
250 * Each TC implementation subclasses this with whatever additional data it
251 * needs. */
252struct tc_queue {
253 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 255 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
256};
257
258/* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
260 *
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
264struct tc_ops {
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name;
269
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name;
272
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues;
276
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
282 *
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
286 *
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
289 *
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
79f1cbe9 292 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
293
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
297 *
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
303 * 'netdev'.
304 *
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
308
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
311 * tc_destroy(tc).)
312 *
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
316 *
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy)(struct tc *tc);
319
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 *
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
325 *
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
329 *
330 * This function may be null if 'tc' is not configurable.
331 */
79f1cbe9 332 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
333
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
336 *
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
340 *
341 * This function may be null if 'tc' is not configurable.
342 */
79f1cbe9 343 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 344
93b13be8
BP
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
347 *
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 *
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
355 *
356 * This function may be null if 'tc' does not have queues ('n_queues' is
357 * 0). */
93b13be8 358 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 359 struct smap *details);
c1c9c9c4
BP
360
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', performing any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
364 * 'n_queues'.
365 *
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 *
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 373 const struct smap *details);
c1c9c9c4 374
93b13be8
BP
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
377 *
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
93b13be8 380 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 381
93b13be8
BP
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
384 *
385 * On success, initializes '*stats'.
386 *
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
93b13be8
BP
389 int (*class_get_stats)(const struct netdev *netdev,
390 const struct tc_queue *queue,
c1c9c9c4
BP
391 struct netdev_queue_stats *stats);
392
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 *
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats)(const struct netdev *netdev,
399 const struct ofpbuf *nlmsg,
400 netdev_dump_queue_stats_cb *cb, void *aux);
401};
402
403static void
404tc_init(struct tc *tc, const struct tc_ops *ops)
405{
406 tc->ops = ops;
93b13be8 407 hmap_init(&tc->queues);
c1c9c9c4
BP
408}
409
410static void
411tc_destroy(struct tc *tc)
412{
93b13be8 413 hmap_destroy(&tc->queues);
c1c9c9c4
BP
414}
415
416static const struct tc_ops tc_ops_htb;
a339aa81 417static const struct tc_ops tc_ops_hfsc;
677d9158
JV
418static const struct tc_ops tc_ops_codel;
419static const struct tc_ops tc_ops_fqcodel;
420static const struct tc_ops tc_ops_sfq;
c1c9c9c4 421static const struct tc_ops tc_ops_default;
6cf888b8 422static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
423static const struct tc_ops tc_ops_other;
424
559eb230 425static const struct tc_ops *const tcs[] = {
c1c9c9c4 426 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 427 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
428 &tc_ops_codel, /* Controlled delay */
429 &tc_ops_fqcodel, /* Fair queue controlled delay */
430 &tc_ops_sfq, /* Stochastic fair queueing */
6cf888b8 431 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
432 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other, /* Some other qdisc. */
434 NULL
435};
149f577a 436
c1c9c9c4
BP
437static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
438static unsigned int tc_get_major(unsigned int handle);
439static unsigned int tc_get_minor(unsigned int handle);
440
441static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
445static struct tcmsg *tc_make_request(const struct netdev *, int type,
446 unsigned int flags, struct ofpbuf *);
447static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
f8500004 448static int tc_add_del_ingress_qdisc(struct netdev *netdev, bool add);
c7952afb
BP
449static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
451
452static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462static int tc_del_qdisc(struct netdev *netdev);
463static int tc_query_qdisc(const struct netdev *netdev);
464
465static int tc_calc_cell_log(unsigned int mtu);
466static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470\f
b5d57fc8
BP
471struct netdev_linux {
472 struct netdev up;
149f577a 473
86383816
BP
474 /* Protects all members below. */
475 struct ovs_mutex mutex;
476
149f577a 477 unsigned int cache_valid;
8b61709d 478
1670c579
EJ
479 bool miimon; /* Link status of last poll. */
480 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer;
482
8722022c
BP
483 /* The following are figured out "on demand" only. They are only valid
484 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 485 int ifindex;
74ff3298 486 struct eth_addr etheraddr;
8b61709d 487 int mtu;
059e5f4f 488 unsigned int ifi_flags;
65c3058c 489 long long int carrier_resets;
80a86fbe
BP
490 uint32_t kbits_rate; /* Policing data. */
491 uint32_t kbits_burst;
bba1e6f3
PS
492 int vport_stats_error; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
90a6637d 494 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 495 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 496 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 497 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 498 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 499
a00ca915
EJ
500 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 503
4f925bd3 504 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 505 struct tc *tc;
149f577a 506
d0d08f8a
BP
507 /* For devices of class netdev_tap_class only. */
508 int tap_fd;
8b61709d
BP
509};
510
f7791740
PS
511struct netdev_rxq_linux {
512 struct netdev_rxq up;
796223f5 513 bool is_tap;
5b7448ed 514 int fd;
149f577a 515};
8b61709d 516
8b61709d
BP
517/* This is set pretty low because we probably won't learn anything from the
518 * additional log messages. */
519static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
520
19c8e9c1
JS
521/* Polling miimon status for all ports causes performance degradation when
522 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
523 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 *
525 * Readers do not depend on this variable synchronizing with the related
526 * changes in the device miimon status, so we can use atomic_count. */
527static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 528
1c33f0c3 529static void netdev_linux_run(const struct netdev_class *);
6f643e49 530
0b0544d7 531static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 532 int cmd, const char *cmd_name);
b5d57fc8 533static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 534static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
535static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
536 enum netdev_flags on, enum netdev_flags *old_flagsp)
537 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
538static int do_get_ifindex(const char *netdev_name);
539static int get_ifindex(const struct netdev *, int *ifindexp);
540static int do_set_addr(struct netdev *netdev,
541 int ioctl_nr, const char *ioctl_name,
542 struct in_addr addr);
74ff3298
JR
543static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
544static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 545static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 546static int af_packet_sock(void);
19c8e9c1 547static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
548static void netdev_linux_miimon_run(void);
549static void netdev_linux_miimon_wait(void);
df1e5a3b 550static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 551
15b3596a
JG
552static bool
553is_netdev_linux_class(const struct netdev_class *netdev_class)
554{
259e0b1a 555 return netdev_class->run == netdev_linux_run;
15b3596a
JG
556}
557
796223f5
BP
558static bool
559is_tap_netdev(const struct netdev *netdev)
560{
b5d57fc8 561 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
562}
563
8b61709d
BP
564static struct netdev_linux *
565netdev_linux_cast(const struct netdev *netdev)
566{
b5d57fc8 567 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 568
180c6d0b 569 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 570}
796223f5 571
f7791740
PS
572static struct netdev_rxq_linux *
573netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 574{
9dc63482 575 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 576 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 577}
ff4ed3c9 578\f
cee87338 579static void netdev_linux_update(struct netdev_linux *netdev,
7e9dcc0f 580 const struct rtnetlink_change *)
86383816 581 OVS_REQUIRES(netdev->mutex);
cee87338 582static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
583 unsigned int ifi_flags, unsigned int mask)
584 OVS_REQUIRES(netdev->mutex);
cee87338 585
d6384a3a
AW
586/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
587 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
588 * if no such socket could be created. */
589static struct nl_sock *
590netdev_linux_notify_sock(void)
591{
592 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
593 static struct nl_sock *sock;
989d7135
PS
594 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
595 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
596
597 if (ovsthread_once_start(&once)) {
598 int error;
599
600 error = nl_sock_create(NETLINK_ROUTE, &sock);
601 if (!error) {
d6384a3a
AW
602 size_t i;
603
604 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
605 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
606 if (error) {
607 nl_sock_destroy(sock);
608 sock = NULL;
609 break;
610 }
cee87338
BP
611 }
612 }
613 ovsthread_once_done(&once);
614 }
615
616 return sock;
617}
618
19c8e9c1
JS
619static bool
620netdev_linux_miimon_enabled(void)
621{
812c272c 622 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
623}
624
8b61709d 625static void
1c33f0c3 626netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 627{
cee87338
BP
628 struct nl_sock *sock;
629 int error;
630
19c8e9c1
JS
631 if (netdev_linux_miimon_enabled()) {
632 netdev_linux_miimon_run();
633 }
cee87338
BP
634
635 sock = netdev_linux_notify_sock();
636 if (!sock) {
637 return;
638 }
639
640 do {
641 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
642 uint64_t buf_stub[4096 / 8];
643 struct ofpbuf buf;
644
645 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
646 error = nl_sock_recv(sock, &buf, false);
647 if (!error) {
7e9dcc0f 648 struct rtnetlink_change change;
cee87338 649
7e9dcc0f 650 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
651 struct netdev *netdev_ = NULL;
652 char dev_name[IFNAMSIZ];
653
654 if (!change.ifname) {
655 change.ifname = if_indextoname(change.if_index, dev_name);
656 }
657
658 if (change.ifname) {
659 netdev_ = netdev_from_name(change.ifname);
660 }
cee87338
BP
661 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
662 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
663
664 ovs_mutex_lock(&netdev->mutex);
cee87338 665 netdev_linux_update(netdev, &change);
86383816 666 ovs_mutex_unlock(&netdev->mutex);
cee87338 667 }
38e0065b 668 netdev_close(netdev_);
cee87338
BP
669 }
670 } else if (error == ENOBUFS) {
671 struct shash device_shash;
672 struct shash_node *node;
673
674 nl_sock_drain(sock);
675
676 shash_init(&device_shash);
677 netdev_get_devices(&netdev_linux_class, &device_shash);
678 SHASH_FOR_EACH (node, &device_shash) {
679 struct netdev *netdev_ = node->data;
680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
681 unsigned int flags;
682
86383816 683 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
684 get_flags(netdev_, &flags);
685 netdev_linux_changed(netdev, flags, 0);
86383816
BP
686 ovs_mutex_unlock(&netdev->mutex);
687
cee87338
BP
688 netdev_close(netdev_);
689 }
690 shash_destroy(&device_shash);
691 } else if (error != EAGAIN) {
692 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
693 ovs_strerror(error));
694 }
695 ofpbuf_uninit(&buf);
696 } while (!error);
8b61709d
BP
697}
698
699static void
1c33f0c3 700netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 701{
cee87338
BP
702 struct nl_sock *sock;
703
19c8e9c1
JS
704 if (netdev_linux_miimon_enabled()) {
705 netdev_linux_miimon_wait();
706 }
cee87338
BP
707 sock = netdev_linux_notify_sock();
708 if (sock) {
709 nl_sock_wait(sock, POLLIN);
710 }
8b61709d
BP
711}
712
ac4d3bcb 713static void
b5d57fc8
BP
714netdev_linux_changed(struct netdev_linux *dev,
715 unsigned int ifi_flags, unsigned int mask)
86383816 716 OVS_REQUIRES(dev->mutex)
ac4d3bcb 717{
3e912ffc 718 netdev_change_seq_changed(&dev->up);
8aa77183
BP
719
720 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
721 dev->carrier_resets++;
722 }
723 dev->ifi_flags = ifi_flags;
724
4f925bd3 725 dev->cache_valid &= mask;
6b6e1329 726 if (!(mask & VALID_IN)) {
a8704b50
PS
727 netdev_get_addrs_list_flush();
728 }
4f925bd3
PS
729}
730
731static void
b5d57fc8 732netdev_linux_update(struct netdev_linux *dev,
7e9dcc0f 733 const struct rtnetlink_change *change)
86383816 734 OVS_REQUIRES(dev->mutex)
4f925bd3 735{
d6384a3a
AW
736 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)){
737 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 738 /* Keep drv-info, and ip addresses. */
d6384a3a 739 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 740 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
741
742 /* Update netdev from rtnl-change msg. */
743 if (change->mtu) {
744 dev->mtu = change->mtu;
745 dev->cache_valid |= VALID_MTU;
746 dev->netdev_mtu_error = 0;
747 }
90a6637d 748
74ff3298
JR
749 if (!eth_addr_is_zero(change->mac)) {
750 dev->etheraddr = change->mac;
d6384a3a
AW
751 dev->cache_valid |= VALID_ETHERADDR;
752 dev->ether_addr_error = 0;
753 }
44445cac 754
d6384a3a
AW
755 dev->ifindex = change->if_index;
756 dev->cache_valid |= VALID_IFINDEX;
757 dev->get_ifindex_error = 0;
758 } else {
759 netdev_linux_changed(dev, change->ifi_flags, 0);
760 }
761 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
762 /* Invalidates in4, in6. */
6b6e1329 763 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 764 } else {
d6384a3a 765 OVS_NOT_REACHED();
4f925bd3 766 }
ac4d3bcb
EJ
767}
768
9dc63482
BP
769static struct netdev *
770netdev_linux_alloc(void)
771{
772 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
773 return &netdev->up;
774}
775
cee87338 776static void
9dc63482
BP
777netdev_linux_common_construct(struct netdev_linux *netdev)
778{
834d6caf 779 ovs_mutex_init(&netdev->mutex);
9dc63482
BP
780}
781
1f6e0fbd
BP
782/* Creates system and internal devices. */
783static int
9dc63482 784netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 785{
9dc63482 786 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1f6e0fbd
BP
787 int error;
788
cee87338 789 netdev_linux_common_construct(netdev);
1f6e0fbd 790
b5d57fc8
BP
791 error = get_flags(&netdev->up, &netdev->ifi_flags);
792 if (error == ENODEV) {
9dc63482 793 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 794 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
795 return ENODEV;
796 } else {
797 /* "Internal" netdevs have to be created as netdev objects before
798 * they exist in the kernel, because creating them in the kernel
799 * happens by passing a netdev object to dpif_port_add().
800 * Therefore, ignore the error. */
801 }
802 }
46415c90 803
a740f0de
JG
804 return 0;
805}
806
5b7448ed
JG
807/* For most types of netdevs we open the device for each call of
808 * netdev_open(). However, this is not the case with tap devices,
809 * since it is only possible to open the device once. In this
810 * situation we share a single file descriptor, and consequently
811 * buffers, across all readers. Therefore once data is read it will
812 * be unavailable to other reads for tap devices. */
a740f0de 813static int
9dc63482 814netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 815{
9dc63482 816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 817 static const char tap_dev[] = "/dev/net/tun";
9dc63482 818 const char *name = netdev_->name;
a740f0de
JG
819 struct ifreq ifr;
820 int error;
821
cee87338 822 netdev_linux_common_construct(netdev);
1f6e0fbd 823
6c88d577 824 /* Open tap device. */
d0d08f8a
BP
825 netdev->tap_fd = open(tap_dev, O_RDWR);
826 if (netdev->tap_fd < 0) {
6c88d577 827 error = errno;
10a89ef0 828 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 829 return error;
6c88d577
JP
830 }
831
832 /* Create tap device. */
833 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 834 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 835 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 836 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 837 ovs_strerror(errno));
6c88d577 838 error = errno;
f61d8d29 839 goto error_close;
6c88d577
JP
840 }
841
842 /* Make non-blocking. */
d0d08f8a 843 error = set_nonblocking(netdev->tap_fd);
a740f0de 844 if (error) {
f61d8d29 845 goto error_close;
a740f0de
JG
846 }
847
848 return 0;
849
f61d8d29 850error_close:
d0d08f8a 851 close(netdev->tap_fd);
a740f0de
JG
852 return error;
853}
854
6c88d577 855static void
9dc63482 856netdev_linux_destruct(struct netdev *netdev_)
6c88d577 857{
b5d57fc8 858 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 859
b5d57fc8
BP
860 if (netdev->tc && netdev->tc->ops->tc_destroy) {
861 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
862 }
863
d0d08f8a
BP
864 if (netdev_get_class(netdev_) == &netdev_tap_class
865 && netdev->tap_fd >= 0)
866 {
867 close(netdev->tap_fd);
6c88d577 868 }
86383816 869
19c8e9c1 870 if (netdev->miimon_interval > 0) {
812c272c 871 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
872 }
873
86383816 874 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
875}
876
9dc63482
BP
/* Frees the memory of the netdev_linux that embeds 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
883
f7791740
PS
884static struct netdev_rxq *
885netdev_linux_rxq_alloc(void)
9dc63482 886{
f7791740 887 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
888 return &rx->up;
889}
890
7b6b0ef4 891static int
f7791740 892netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 893{
f7791740 894 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 895 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 896 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 897 int error;
7b6b0ef4 898
86383816 899 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
900 rx->is_tap = is_tap_netdev(netdev_);
901 if (rx->is_tap) {
902 rx->fd = netdev->tap_fd;
796223f5
BP
903 } else {
904 struct sockaddr_ll sll;
b73c8518 905 int ifindex, val;
32383c3b 906 /* Result of tcpdump -dd inbound */
259e0b1a 907 static const struct sock_filter filt[] = {
32383c3b
MM
908 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
909 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
910 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
911 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
912 };
259e0b1a
BP
913 static const struct sock_fprog fprog = {
914 ARRAY_SIZE(filt), (struct sock_filter *) filt
915 };
7b6b0ef4 916
796223f5 917 /* Create file descriptor. */
9dc63482
BP
918 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
919 if (rx->fd < 0) {
796223f5 920 error = errno;
10a89ef0 921 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
922 goto error;
923 }
33d82a56 924
b73c8518
SH
925 val = 1;
926 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
927 error = errno;
928 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
929 netdev_get_name(netdev_), ovs_strerror(error));
930 goto error;
931 }
932
796223f5 933 /* Set non-blocking mode. */
9dc63482 934 error = set_nonblocking(rx->fd);
796223f5
BP
935 if (error) {
936 goto error;
937 }
7b6b0ef4 938
796223f5 939 /* Get ethernet device index. */
180c6d0b 940 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
941 if (error) {
942 goto error;
943 }
7b6b0ef4 944
796223f5
BP
945 /* Bind to specific ethernet device. */
946 memset(&sll, 0, sizeof sll);
947 sll.sll_family = AF_PACKET;
948 sll.sll_ifindex = ifindex;
b73c8518 949 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 950 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
951 error = errno;
952 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 953 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
954 goto error;
955 }
32383c3b
MM
956
957 /* Filter for only inbound packets. */
9dc63482 958 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
959 sizeof fprog);
960 if (error) {
961 error = errno;
259e0b1a 962 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 963 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
964 goto error;
965 }
7b6b0ef4 966 }
86383816 967 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 968
7b6b0ef4
BP
969 return 0;
970
971error:
9dc63482
BP
972 if (rx->fd >= 0) {
973 close(rx->fd);
7b6b0ef4 974 }
86383816 975 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
976 return error;
977}
978
796223f5 979static void
f7791740 980netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 981{
f7791740 982 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 983
796223f5
BP
984 if (!rx->is_tap) {
985 close(rx->fd);
8b61709d 986 }
9dc63482
BP
987}
988
/* Frees the memory of the netdev_rxq_linux that embeds 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 996
b73c8518 997static ovs_be16
1ebdc7eb 998auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
999{
1000 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1001 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1002 } else if (double_tagged) {
1003 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1004 } else {
1ebdc7eb 1005 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1006 }
1007}
1008
1009static bool
1010auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1011{
1012 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1013}
1014
796223f5 1015static int
cf62fa4c 1016netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1017{
b73c8518 1018 size_t size;
796223f5 1019 ssize_t retval;
b73c8518
SH
1020 struct iovec iov;
1021 struct cmsghdr *cmsg;
1022 union {
1023 struct cmsghdr cmsg;
1024 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1025 } cmsg_buffer;
1026 struct msghdr msgh;
1027
1028 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1029 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1030 size = dp_packet_tailroom(buffer);
b73c8518 1031
cf62fa4c 1032 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1033 iov.iov_len = size;
1034 msgh.msg_name = NULL;
1035 msgh.msg_namelen = 0;
1036 msgh.msg_iov = &iov;
1037 msgh.msg_iovlen = 1;
1038 msgh.msg_control = &cmsg_buffer;
1039 msgh.msg_controllen = sizeof cmsg_buffer;
1040 msgh.msg_flags = 0;
8e8cddf7 1041
796223f5 1042 do {
b73c8518 1043 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1044 } while (retval < 0 && errno == EINTR);
1045
bfd3367b 1046 if (retval < 0) {
b73c8518
SH
1047 return errno;
1048 } else if (retval > size) {
1049 return EMSGSIZE;
1050 }
1051
cf62fa4c 1052 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1053
1054 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1055 const struct tpacket_auxdata *aux;
1056
1057 if (cmsg->cmsg_level != SOL_PACKET
1058 || cmsg->cmsg_type != PACKET_AUXDATA
1059 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1060 continue;
8b61709d 1061 }
b73c8518
SH
1062
1063 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1064 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1065 struct eth_header *eth;
1066 bool double_tagged;
1067
b73c8518
SH
1068 if (retval < ETH_HEADER_LEN) {
1069 return EINVAL;
1070 }
1071
1ebdc7eb
EG
1072 eth = dp_packet_data(buffer);
1073 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1074
1075 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1076 htons(aux->tp_vlan_tci));
1077 break;
1078 }
1079 }
1080
1081 return 0;
1082}
1083
/* Reads one packet from tap descriptor 'fd' into the tailroom of 'buffer',
 * retrying while read() is interrupted (EINTR), and grows 'buffer' by the
 * number of bytes read.
 *
 * Returns 0 if successful, otherwise the errno value of the failed read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n_read;

    for (;;) {
        n_read = read(fd, dp_packet_data(buffer), room);
        if (n_read >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n_read < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n_read);
    return 0;
}
1101
1102static int
64839cf4 1103netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
b73c8518 1104{
f7791740 1105 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1106 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1107 struct dp_packet *buffer;
df1e5a3b
PS
1108 ssize_t retval;
1109 int mtu;
1110
1111 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1112 mtu = ETH_PAYLOAD_MAX;
1113 }
1114
cf62fa4c 1115 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1116 DP_NETDEV_HEADROOM);
b73c8518 1117 retval = (rx->is_tap
f7791740
PS
1118 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1119 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1120
1121 if (retval) {
1122 if (retval != EAGAIN && retval != EMSGSIZE) {
1123 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1124 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1125 }
cf62fa4c 1126 dp_packet_delete(buffer);
df1e5a3b 1127 } else {
72c84bc2 1128 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1129 }
1130
1131 return retval;
8b61709d
BP
1132}
1133
8b61709d 1134static void
f7791740 1135netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1136{
f7791740 1137 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1138 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1139}
1140
8b61709d 1141static int
f7791740 1142netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1143{
f7791740 1144 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1145 if (rx->is_tap) {
8b61709d 1146 struct ifreq ifr;
f7791740 1147 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1148 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1149 if (error) {
1150 return error;
1151 }
796223f5 1152 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1153 return 0;
1154 } else {
796223f5 1155 return drain_rcvbuf(rx->fd);
8b61709d
BP
1156 }
1157}
1158
1159/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1160 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1161 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1162 * the packet is too big or too small to transmit on the device.
1163 *
1164 * The caller retains ownership of 'buffer' in all cases.
1165 *
1166 * The kernel maintains a packet transmission queue, so the caller is not
1167 * expected to do additional queuing of packets. */
1168static int
f00fa8cb 1169netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
324c8374
IM
1170 struct dp_packet_batch *batch, bool may_steal,
1171 bool concurrent_txq OVS_UNUSED)
8b61709d 1172{
f4fd623c
DDP
1173 int i;
1174 int error = 0;
40d26f04 1175
f4fd623c 1176 /* 'i' is incremented only if there's no error */
64839cf4
WT
1177 for (i = 0; i < batch->count;) {
1178 const void *data = dp_packet_data(batch->packets[i]);
1179 size_t size = dp_packet_size(batch->packets[i]);
f23347ea 1180 ssize_t retval;
8b61709d 1181
aaca4fe0 1182 /* Truncate the packet if it is configured. */
64839cf4 1183 size -= dp_packet_get_cutlen(batch->packets[i]);
aaca4fe0 1184
796223f5 1185 if (!is_tap_netdev(netdev_)) {
f23347ea
BP
1186 /* Use our AF_PACKET socket to send to this device. */
1187 struct sockaddr_ll sll;
1188 struct msghdr msg;
1189 struct iovec iov;
1190 int ifindex;
488d734d
BP
1191 int sock;
1192
1193 sock = af_packet_sock();
1194 if (sock < 0) {
c4c7a3d7 1195 return -sock;
488d734d 1196 }
f23347ea 1197
86383816
BP
1198 ifindex = netdev_get_ifindex(netdev_);
1199 if (ifindex < 0) {
1200 return -ifindex;
f23347ea 1201 }
8b61709d 1202
f23347ea
BP
1203 /* We don't bother setting most fields in sockaddr_ll because the
1204 * kernel ignores them for SOCK_RAW. */
1205 memset(&sll, 0, sizeof sll);
1206 sll.sll_family = AF_PACKET;
1207 sll.sll_ifindex = ifindex;
76c308b5 1208
ebc56baa 1209 iov.iov_base = CONST_CAST(void *, data);
f23347ea 1210 iov.iov_len = size;
76c308b5 1211
f23347ea
BP
1212 msg.msg_name = &sll;
1213 msg.msg_namelen = sizeof sll;
1214 msg.msg_iov = &iov;
1215 msg.msg_iovlen = 1;
1216 msg.msg_control = NULL;
1217 msg.msg_controllen = 0;
1218 msg.msg_flags = 0;
1219
488d734d 1220 retval = sendmsg(sock, &msg, 0);
f23347ea 1221 } else {
796223f5
BP
1222 /* Use the tap fd to send to this device. This is essential for
1223 * tap devices, because packets sent to a tap device with an
1224 * AF_PACKET socket will loop back to be *received* again on the
32383c3b
MM
1225 * tap device. This doesn't occur on other interface types
1226 * because we attach a socket filter to the rx socket. */
b5d57fc8 1227 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
796223f5 1228
d0d08f8a 1229 retval = write(netdev->tap_fd, data, size);
f23347ea 1230 }
76c308b5 1231
8b61709d 1232 if (retval < 0) {
29736cc0
DDP
1233 if (errno == EINTR) {
1234 /* The send was interrupted by a signal. Retry the packet by
1235 * continuing without incrementing 'i'.*/
8b61709d 1236 continue;
29736cc0
DDP
1237 } else if (errno == EIO && is_tap_netdev(netdev_)) {
1238 /* The Linux tap driver returns EIO if the device is not up.
1239 * From the OVS side this is not an error, so ignore it. */
1240 } else {
1241 /* The Linux AF_PACKET implementation never blocks waiting for
1242 * room for packets, instead returning ENOBUFS. Translate this
1243 * into EAGAIN for the caller. */
1244 error = errno == ENOBUFS ? EAGAIN : errno;
1245 break;
8b61709d 1246 }
8b61709d 1247 } else if (retval != size) {
f4fd623c
DDP
1248 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" bytes"
1249 " of %"PRIuSIZE") on %s", retval, size,
1250 netdev_get_name(netdev_));
1251 error = EMSGSIZE;
1252 break;
1253 }
1254
1255 /* Process the next packet in the batch */
1256 i++;
1257 }
1258
64839cf4 1259 dp_packet_delete_batch(batch, may_steal);
f4fd623c
DDP
1260
1261 if (error && error != EAGAIN) {
1262 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1263 netdev_get_name(netdev_), ovs_strerror(error));
1264 }
1265
1266 return error;
1267
8b61709d
BP
1268}
1269
1270/* Registers with the poll loop to wake up from the next call to poll_block()
1271 * when the packet transmission queue has sufficient room to transmit a packet
1272 * with netdev_send().
1273 *
1274 * The kernel maintains a packet transmission queue, so the client is not
1275 * expected to do additional queuing of packets. Thus, this function is
1276 * unlikely to ever be used. It is included for completeness. */
1277static void
f00fa8cb 1278netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1279{
796223f5 1280 if (is_tap_netdev(netdev)) {
8b61709d
BP
1281 /* TAP device always accepts packets.*/
1282 poll_immediate_wake();
1283 }
1284}
1285
1286/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1287 * otherwise a positive errno value. */
1288static int
74ff3298 1289netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1290{
b5d57fc8 1291 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1292 enum netdev_flags old_flags = 0;
eb395f2e
BP
1293 int error;
1294
86383816
BP
1295 ovs_mutex_lock(&netdev->mutex);
1296
b5d57fc8 1297 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1298 error = netdev->ether_addr_error;
1299 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1300 goto exit;
44445cac 1301 }
b5d57fc8 1302 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1303 }
1304
7eb1bd81 1305 /* Tap devices must be brought down before setting the address. */
796223f5 1306 if (is_tap_netdev(netdev_)) {
4f9f3f21 1307 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1308 }
44445cac
PS
1309 error = set_etheraddr(netdev_get_name(netdev_), mac);
1310 if (!error || error == ENODEV) {
b5d57fc8
BP
1311 netdev->ether_addr_error = error;
1312 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1313 if (!error) {
74ff3298 1314 netdev->etheraddr = mac;
eb395f2e 1315 }
8b61709d 1316 }
44445cac 1317
4f9f3f21
BP
1318 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1319 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1320 }
7eb1bd81 1321
86383816
BP
1322exit:
1323 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1324 return error;
1325}
1326
44445cac 1327/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1328static int
74ff3298 1329netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1330{
b5d57fc8 1331 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1332 int error;
44445cac 1333
86383816 1334 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1335 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
86383816 1336 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1337 &netdev->etheraddr);
b5d57fc8 1338 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1339 }
44445cac 1340
86383816
BP
1341 error = netdev->ether_addr_error;
1342 if (!error) {
74ff3298 1343 *mac = netdev->etheraddr;
44445cac 1344 }
86383816 1345 ovs_mutex_unlock(&netdev->mutex);
44445cac 1346
86383816 1347 return error;
8b61709d
BP
1348}
1349
8b61709d 1350static int
73371c09 1351netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1352{
86383816
BP
1353 int error;
1354
b5d57fc8 1355 if (!(netdev->cache_valid & VALID_MTU)) {
8b61709d 1356 struct ifreq ifr;
90a6637d 1357
86383816 1358 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1359 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1360 netdev->mtu = ifr.ifr_mtu;
1361 netdev->cache_valid |= VALID_MTU;
8b61709d 1362 }
90a6637d 1363
86383816
BP
1364 error = netdev->netdev_mtu_error;
1365 if (!error) {
b5d57fc8 1366 *mtup = netdev->mtu;
90a6637d 1367 }
73371c09
BP
1368
1369 return error;
1370}
1371
1372/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1373 * in bytes, not including the hardware header; thus, this is typically 1500
1374 * bytes for Ethernet devices. */
1375static int
1376netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1377{
1378 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1379 int error;
1380
1381 ovs_mutex_lock(&netdev->mutex);
1382 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1383 ovs_mutex_unlock(&netdev->mutex);
1384
1385 return error;
8b61709d
BP
1386}
1387
9b020780
PS
1388/* Sets the maximum size of transmitted (MTU) for given device using linux
1389 * networking ioctl interface.
1390 */
1391static int
4124cb12 1392netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1393{
b5d57fc8 1394 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1395 struct ifreq ifr;
1396 int error;
1397
86383816 1398 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1399 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1400 error = netdev->netdev_mtu_error;
1401 if (error || netdev->mtu == mtu) {
1402 goto exit;
90a6637d 1403 }
b5d57fc8 1404 netdev->cache_valid &= ~VALID_MTU;
153e5481 1405 }
9b020780 1406 ifr.ifr_mtu = mtu;
259e0b1a
BP
1407 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1408 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1409 if (!error || error == ENODEV) {
b5d57fc8
BP
1410 netdev->netdev_mtu_error = error;
1411 netdev->mtu = ifr.ifr_mtu;
1412 netdev->cache_valid |= VALID_MTU;
9b020780 1413 }
86383816
BP
1414exit:
1415 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1416 return error;
9b020780
PS
1417}
1418
9ab3d9a3
BP
1419/* Returns the ifindex of 'netdev', if successful, as a positive number.
1420 * On failure, returns a negative errno value. */
1421static int
86383816 1422netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1423{
86383816 1424 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1425 int ifindex, error;
1426
86383816
BP
1427 ovs_mutex_lock(&netdev->mutex);
1428 error = get_ifindex(netdev_, &ifindex);
1429 ovs_mutex_unlock(&netdev->mutex);
1430
9ab3d9a3
BP
1431 return error ? -error : ifindex;
1432}
1433
8b61709d
BP
1434static int
1435netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1436{
b5d57fc8 1437 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1438
86383816 1439 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1440 if (netdev->miimon_interval > 0) {
1441 *carrier = netdev->miimon;
3a183124 1442 } else {
b5d57fc8 1443 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1444 }
86383816 1445 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1446
3a183124 1447 return 0;
8b61709d
BP
1448}
1449
65c3058c 1450static long long int
86383816 1451netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1452{
86383816
BP
1453 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1454 long long int carrier_resets;
1455
1456 ovs_mutex_lock(&netdev->mutex);
1457 carrier_resets = netdev->carrier_resets;
1458 ovs_mutex_unlock(&netdev->mutex);
1459
1460 return carrier_resets;
65c3058c
EJ
1461}
1462
63331829 1463static int
1670c579
EJ
1464netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1465 struct mii_ioctl_data *data)
63331829 1466{
63331829 1467 struct ifreq ifr;
782e6111 1468 int error;
63331829 1469
63331829 1470 memset(&ifr, 0, sizeof ifr);
782e6111 1471 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1472 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1473 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1474
782e6111
EJ
1475 return error;
1476}
1477
1478static int
1670c579 1479netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1480{
782e6111
EJ
1481 struct mii_ioctl_data data;
1482 int error;
63331829 1483
782e6111
EJ
1484 *miimon = false;
1485
1486 memset(&data, 0, sizeof data);
1670c579 1487 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1488 if (!error) {
1489 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1490 data.reg_num = MII_BMSR;
1670c579 1491 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1492 &data);
63331829
EJ
1493
1494 if (!error) {
782e6111 1495 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1496 }
9120cfc0
DH
1497 }
1498 if (error) {
63331829 1499 struct ethtool_cmd ecmd;
63331829
EJ
1500
1501 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1502 name);
1503
ab985a77 1504 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1505 memset(&ecmd, 0, sizeof ecmd);
1506 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1507 "ETHTOOL_GLINK");
1508 if (!error) {
782e6111
EJ
1509 struct ethtool_value eval;
1510
1511 memcpy(&eval, &ecmd, sizeof eval);
1512 *miimon = !!eval.data;
63331829
EJ
1513 } else {
1514 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1515 }
1516 }
1517
1518 return error;
1519}
1520
1670c579
EJ
1521static int
1522netdev_linux_set_miimon_interval(struct netdev *netdev_,
1523 long long int interval)
1524{
b5d57fc8 1525 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1526
86383816 1527 ovs_mutex_lock(&netdev->mutex);
1670c579 1528 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1529 if (netdev->miimon_interval != interval) {
19c8e9c1 1530 if (interval && !netdev->miimon_interval) {
812c272c 1531 atomic_count_inc(&miimon_cnt);
19c8e9c1 1532 } else if (!interval && netdev->miimon_interval) {
812c272c 1533 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1534 }
1535
b5d57fc8
BP
1536 netdev->miimon_interval = interval;
1537 timer_set_expired(&netdev->miimon_timer);
1670c579 1538 }
86383816 1539 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1540
1541 return 0;
1542}
1543
1544static void
1545netdev_linux_miimon_run(void)
1546{
1547 struct shash device_shash;
1548 struct shash_node *node;
1549
1550 shash_init(&device_shash);
b5d57fc8 1551 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1552 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1553 struct netdev *netdev = node->data;
1554 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1555 bool miimon;
1556
86383816
BP
1557 ovs_mutex_lock(&dev->mutex);
1558 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1559 netdev_linux_get_miimon(dev->up.name, &miimon);
1560 if (miimon != dev->miimon) {
1561 dev->miimon = miimon;
1562 netdev_linux_changed(dev, dev->ifi_flags, 0);
1563 }
1670c579 1564
86383816 1565 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1566 }
86383816 1567 ovs_mutex_unlock(&dev->mutex);
2f980d74 1568 netdev_close(netdev);
1670c579
EJ
1569 }
1570
1571 shash_destroy(&device_shash);
1572}
1573
1574static void
1575netdev_linux_miimon_wait(void)
1576{
1577 struct shash device_shash;
1578 struct shash_node *node;
1579
1580 shash_init(&device_shash);
b5d57fc8 1581 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1582 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1583 struct netdev *netdev = node->data;
1584 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1585
86383816 1586 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1587 if (dev->miimon_interval > 0) {
1588 timer_wait(&dev->miimon_timer);
1589 }
86383816 1590 ovs_mutex_unlock(&dev->mutex);
2f980d74 1591 netdev_close(netdev);
1670c579
EJ
1592 }
1593 shash_destroy(&device_shash);
1594}
1595
92df599c
JG
/* Exchanges the values stored at 'a' and 'b'.  Also correct when 'a' and 'b'
 * point to the same object. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
1603
c060c4cf
EJ
1604/* Copies 'src' into 'dst', performing format conversion in the process.
1605 *
1606 * 'src' is allowed to be misaligned. */
1607static void
1608netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1609 const struct ovs_vport_stats *src)
1610{
6a54dedc
BP
1611 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1612 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1613 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1614 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1615 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1616 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1617 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1618 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1619 dst->multicast = 0;
1620 dst->collisions = 0;
1621 dst->rx_length_errors = 0;
1622 dst->rx_over_errors = 0;
1623 dst->rx_crc_errors = 0;
1624 dst->rx_frame_errors = 0;
1625 dst->rx_fifo_errors = 0;
1626 dst->rx_missed_errors = 0;
1627 dst->tx_aborted_errors = 0;
1628 dst->tx_carrier_errors = 0;
1629 dst->tx_fifo_errors = 0;
1630 dst->tx_heartbeat_errors = 0;
1631 dst->tx_window_errors = 0;
1632}
1633
1634static int
1635get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1636{
93451a0a 1637 struct dpif_netlink_vport reply;
c060c4cf
EJ
1638 struct ofpbuf *buf;
1639 int error;
1640
93451a0a 1641 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1642 if (error) {
1643 return error;
1644 } else if (!reply.stats) {
1645 ofpbuf_delete(buf);
1646 return EOPNOTSUPP;
1647 }
1648
1649 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1650
1651 ofpbuf_delete(buf);
1652
1653 return 0;
1654}
1655
f613a0d7
PS
1656static void
1657get_stats_via_vport(const struct netdev *netdev_,
1658 struct netdev_stats *stats)
8b61709d 1659{
b5d57fc8 1660 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1661
b5d57fc8
BP
1662 if (!netdev->vport_stats_error ||
1663 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1664 int error;
7fbef77a 1665
c060c4cf 1666 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1667 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1668 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1669 "(%s)",
1670 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1671 }
b5d57fc8
BP
1672 netdev->vport_stats_error = error;
1673 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1674 }
f613a0d7 1675}
8b61709d 1676
f613a0d7
PS
1677/* Retrieves current device stats for 'netdev-linux'. */
1678static int
1679netdev_linux_get_stats(const struct netdev *netdev_,
1680 struct netdev_stats *stats)
1681{
b5d57fc8 1682 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1683 struct netdev_stats dev_stats;
1684 int error;
1685
86383816 1686 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1687 get_stats_via_vport(netdev_, stats);
35eef899 1688 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1689 if (error) {
86383816
BP
1690 if (!netdev->vport_stats_error) {
1691 error = 0;
f613a0d7 1692 }
86383816 1693 } else if (netdev->vport_stats_error) {
04c881eb 1694 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1695 *stats = dev_stats;
1696 } else {
04c881eb
AZ
1697 /* Use kernel netdev's packet and byte counts since vport's counters
1698 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1699 * enabled. */
1700 stats->rx_packets = dev_stats.rx_packets;
1701 stats->rx_bytes = dev_stats.rx_bytes;
1702 stats->tx_packets = dev_stats.tx_packets;
1703 stats->tx_bytes = dev_stats.tx_bytes;
1704
f613a0d7
PS
1705 stats->rx_errors += dev_stats.rx_errors;
1706 stats->tx_errors += dev_stats.tx_errors;
1707 stats->rx_dropped += dev_stats.rx_dropped;
1708 stats->tx_dropped += dev_stats.tx_dropped;
1709 stats->multicast += dev_stats.multicast;
1710 stats->collisions += dev_stats.collisions;
1711 stats->rx_length_errors += dev_stats.rx_length_errors;
1712 stats->rx_over_errors += dev_stats.rx_over_errors;
1713 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1714 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1715 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1716 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1717 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1718 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1719 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1720 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1721 stats->tx_window_errors += dev_stats.tx_window_errors;
1722 }
86383816
BP
1723 ovs_mutex_unlock(&netdev->mutex);
1724
1725 return error;
f613a0d7
PS
1726}
1727
1728/* Retrieves current device stats for 'netdev-tap' netdev or
1729 * netdev-internal. */
1730static int
15aee116 1731netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1732{
b5d57fc8 1733 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1734 struct netdev_stats dev_stats;
1735 int error;
1736
86383816 1737 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1738 get_stats_via_vport(netdev_, stats);
35eef899 1739 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1740 if (error) {
86383816
BP
1741 if (!netdev->vport_stats_error) {
1742 error = 0;
8b61709d 1743 }
86383816
BP
1744 } else if (netdev->vport_stats_error) {
1745 /* Transmit and receive stats will appear to be swapped relative to the
1746 * other ports since we are the one sending the data, not a remote
1747 * computer. For consistency, we swap them back here. This does not
1748 * apply if we are getting stats from the vport layer because it always
1749 * tracks stats from the perspective of the switch. */
fe6b0e03 1750
f613a0d7 1751 *stats = dev_stats;
92df599c
JG
1752 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1753 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1754 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1755 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1756 stats->rx_length_errors = 0;
1757 stats->rx_over_errors = 0;
1758 stats->rx_crc_errors = 0;
1759 stats->rx_frame_errors = 0;
1760 stats->rx_fifo_errors = 0;
1761 stats->rx_missed_errors = 0;
1762 stats->tx_aborted_errors = 0;
1763 stats->tx_carrier_errors = 0;
1764 stats->tx_fifo_errors = 0;
1765 stats->tx_heartbeat_errors = 0;
1766 stats->tx_window_errors = 0;
f613a0d7 1767 } else {
04c881eb
AZ
1768 /* Use kernel netdev's packet and byte counts since vport counters
1769 * do not reflect packet counts on the wire when GSO, TSO or GRO
1770 * are enabled. */
1771 stats->rx_packets = dev_stats.tx_packets;
1772 stats->rx_bytes = dev_stats.tx_bytes;
1773 stats->tx_packets = dev_stats.rx_packets;
1774 stats->tx_bytes = dev_stats.rx_bytes;
1775
f613a0d7
PS
1776 stats->rx_dropped += dev_stats.tx_dropped;
1777 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1778
f613a0d7
PS
1779 stats->rx_errors += dev_stats.tx_errors;
1780 stats->tx_errors += dev_stats.rx_errors;
1781
1782 stats->multicast += dev_stats.multicast;
1783 stats->collisions += dev_stats.collisions;
1784 }
86383816
BP
1785 ovs_mutex_unlock(&netdev->mutex);
1786
1787 return error;
8b61709d
BP
1788}
1789
bba1e6f3
PS
1790static int
1791netdev_internal_get_stats(const struct netdev *netdev_,
1792 struct netdev_stats *stats)
1793{
b5d57fc8 1794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1795 int error;
bba1e6f3 1796
86383816 1797 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1798 get_stats_via_vport(netdev_, stats);
86383816
BP
1799 error = netdev->vport_stats_error;
1800 ovs_mutex_unlock(&netdev->mutex);
1801
1802 return error;
bba1e6f3
PS
1803}
1804
51f87458 1805static void
b5d57fc8 1806netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1807{
1808 struct ethtool_cmd ecmd;
6c038611 1809 uint32_t speed;
8b61709d
BP
1810 int error;
1811
b5d57fc8 1812 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1813 return;
1814 }
1815
ab985a77 1816 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1817 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1818 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1819 ETHTOOL_GSET, "ETHTOOL_GSET");
1820 if (error) {
51f87458 1821 goto out;
8b61709d
BP
1822 }
1823
1824 /* Supported features. */
b5d57fc8 1825 netdev->supported = 0;
8b61709d 1826 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1827 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1828 }
1829 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1830 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1831 }
1832 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1833 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1834 }
1835 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1836 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1837 }
1838 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1839 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 1840 }
67bed84c
SH
1841 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1842 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 1843 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 1844 }
67bed84c
SH
1845 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1846 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1847 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1848 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 1849 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 1850 }
67bed84c
SH
1851 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1852 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1853 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1854 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1855 netdev->supported |= NETDEV_F_40GB_FD;
1856 }
8b61709d 1857 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1858 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1859 }
1860 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1861 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1862 }
1863 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1864 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
1865 }
1866 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 1867 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
1868 }
1869 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 1870 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1871 }
1872
1873 /* Advertised features. */
b5d57fc8 1874 netdev->advertised = 0;
8b61709d 1875 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 1876 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
1877 }
1878 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 1879 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
1880 }
1881 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 1882 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
1883 }
1884 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 1885 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
1886 }
1887 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 1888 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 1889 }
67bed84c
SH
1890 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
1891 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 1892 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 1893 }
67bed84c
SH
1894 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
1895 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
1896 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
1897 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 1898 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 1899 }
67bed84c
SH
1900 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
1901 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
1902 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
1903 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
1904 netdev->advertised |= NETDEV_F_40GB_FD;
1905 }
8b61709d 1906 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 1907 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
1908 }
1909 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 1910 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
1911 }
1912 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 1913 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
1914 }
1915 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 1916 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
1917 }
1918 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 1919 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
1920 }
1921
1922 /* Current settings. */
0c615356 1923 speed = ethtool_cmd_speed(&ecmd);
6c038611 1924 if (speed == SPEED_10) {
b5d57fc8 1925 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 1926 } else if (speed == SPEED_100) {
b5d57fc8 1927 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 1928 } else if (speed == SPEED_1000) {
b5d57fc8 1929 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 1930 } else if (speed == SPEED_10000) {
b5d57fc8 1931 netdev->current = NETDEV_F_10GB_FD;
6c038611 1932 } else if (speed == 40000) {
b5d57fc8 1933 netdev->current = NETDEV_F_40GB_FD;
6c038611 1934 } else if (speed == 100000) {
b5d57fc8 1935 netdev->current = NETDEV_F_100GB_FD;
6c038611 1936 } else if (speed == 1000000) {
b5d57fc8 1937 netdev->current = NETDEV_F_1TB_FD;
8b61709d 1938 } else {
b5d57fc8 1939 netdev->current = 0;
8b61709d
BP
1940 }
1941
1942 if (ecmd.port == PORT_TP) {
b5d57fc8 1943 netdev->current |= NETDEV_F_COPPER;
8b61709d 1944 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 1945 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
1946 }
1947
1948 if (ecmd.autoneg) {
b5d57fc8 1949 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
1950 }
1951
51f87458 1952out:
b5d57fc8
BP
1953 netdev->cache_valid |= VALID_FEATURES;
1954 netdev->get_features_error = error;
51f87458
PS
1955}
1956
887ed8b2
BP
1957/* Stores the features supported by 'netdev' into of '*current', '*advertised',
1958 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1959 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
1960static int
1961netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
1962 enum netdev_features *current,
1963 enum netdev_features *advertised,
1964 enum netdev_features *supported,
1965 enum netdev_features *peer)
51f87458 1966{
b5d57fc8 1967 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1968 int error;
51f87458 1969
86383816 1970 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1971 netdev_linux_read_features(netdev);
b5d57fc8
BP
1972 if (!netdev->get_features_error) {
1973 *current = netdev->current;
1974 *advertised = netdev->advertised;
1975 *supported = netdev->supported;
887ed8b2 1976 *peer = 0; /* XXX */
51f87458 1977 }
86383816
BP
1978 error = netdev->get_features_error;
1979 ovs_mutex_unlock(&netdev->mutex);
1980
1981 return error;
8b61709d
BP
1982}
1983
1984/* Set the features advertised by 'netdev' to 'advertise'. */
1985static int
86383816 1986netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 1987 enum netdev_features advertise)
8b61709d 1988{
86383816 1989 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
1990 struct ethtool_cmd ecmd;
1991 int error;
1992
86383816
BP
1993 ovs_mutex_lock(&netdev->mutex);
1994
ab985a77 1995 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1996 memset(&ecmd, 0, sizeof ecmd);
86383816 1997 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
1998 ETHTOOL_GSET, "ETHTOOL_GSET");
1999 if (error) {
86383816 2000 goto exit;
8b61709d
BP
2001 }
2002
2003 ecmd.advertising = 0;
6c038611 2004 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2005 ecmd.advertising |= ADVERTISED_10baseT_Half;
2006 }
6c038611 2007 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2008 ecmd.advertising |= ADVERTISED_10baseT_Full;
2009 }
6c038611 2010 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2011 ecmd.advertising |= ADVERTISED_100baseT_Half;
2012 }
6c038611 2013 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2014 ecmd.advertising |= ADVERTISED_100baseT_Full;
2015 }
6c038611 2016 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2017 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2018 }
6c038611 2019 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2020 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2021 }
6c038611 2022 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2023 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2024 }
6c038611 2025 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2026 ecmd.advertising |= ADVERTISED_TP;
2027 }
6c038611 2028 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2029 ecmd.advertising |= ADVERTISED_FIBRE;
2030 }
6c038611 2031 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2032 ecmd.advertising |= ADVERTISED_Autoneg;
2033 }
6c038611 2034 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2035 ecmd.advertising |= ADVERTISED_Pause;
2036 }
6c038611 2037 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2038 ecmd.advertising |= ADVERTISED_Asym_Pause;
2039 }
ab985a77 2040 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2041 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2042 ETHTOOL_SSET, "ETHTOOL_SSET");
2043
2044exit:
2045 ovs_mutex_unlock(&netdev->mutex);
2046 return error;
8b61709d
BP
2047}
2048
f8500004
JP
2049/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2050 * successful, otherwise a positive errno value. */
8b61709d 2051static int
b5d57fc8 2052netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2053 uint32_t kbits_rate, uint32_t kbits_burst)
2054{
b5d57fc8
BP
2055 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2056 const char *netdev_name = netdev_get_name(netdev_);
f8500004 2057 int error;
8b61709d 2058
80a86fbe 2059 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2060 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2061 : kbits_burst); /* Stick with user-specified value. */
2062
86383816 2063 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2064 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2065 error = netdev->netdev_policing_error;
2066 if (error || (netdev->kbits_rate == kbits_rate &&
2067 netdev->kbits_burst == kbits_burst)) {
c9f71668 2068 /* Assume that settings haven't changed since we last set them. */
86383816 2069 goto out;
c9f71668 2070 }
b5d57fc8 2071 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2072 }
2073
ac8c3412 2074 COVERAGE_INC(netdev_set_policing);
f8500004 2075 /* Remove any existing ingress qdisc. */
b5d57fc8 2076 error = tc_add_del_ingress_qdisc(netdev_, false);
f8500004
JP
2077 if (error) {
2078 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2079 netdev_name, ovs_strerror(error));
c9f71668 2080 goto out;
f8500004
JP
2081 }
2082
8b61709d 2083 if (kbits_rate) {
b5d57fc8 2084 error = tc_add_del_ingress_qdisc(netdev_, true);
f8500004
JP
2085 if (error) {
2086 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2087 netdev_name, ovs_strerror(error));
c9f71668 2088 goto out;
8b61709d
BP
2089 }
2090
b5d57fc8 2091 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2092 if (error){
2093 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2094 netdev_name, ovs_strerror(error));
c9f71668 2095 goto out;
8b61709d 2096 }
8b61709d
BP
2097 }
2098
b5d57fc8
BP
2099 netdev->kbits_rate = kbits_rate;
2100 netdev->kbits_burst = kbits_burst;
f8500004 2101
c9f71668
PS
2102out:
2103 if (!error || error == ENODEV) {
b5d57fc8
BP
2104 netdev->netdev_policing_error = error;
2105 netdev->cache_valid |= VALID_POLICING;
c9f71668 2106 }
86383816 2107 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2108 return error;
8b61709d
BP
2109}
2110
c1c9c9c4
BP
2111static int
2112netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2113 struct sset *types)
c1c9c9c4 2114{
559eb230 2115 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2116 for (opsp = tcs; *opsp != NULL; opsp++) {
2117 const struct tc_ops *ops = *opsp;
2118 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2119 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2120 }
2121 }
2122 return 0;
2123}
2124
2125static const struct tc_ops *
2126tc_lookup_ovs_name(const char *name)
2127{
559eb230 2128 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2129
2130 for (opsp = tcs; *opsp != NULL; opsp++) {
2131 const struct tc_ops *ops = *opsp;
2132 if (!strcmp(name, ops->ovs_name)) {
2133 return ops;
2134 }
2135 }
2136 return NULL;
2137}
2138
2139static const struct tc_ops *
2140tc_lookup_linux_name(const char *name)
2141{
559eb230 2142 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2143
2144 for (opsp = tcs; *opsp != NULL; opsp++) {
2145 const struct tc_ops *ops = *opsp;
2146 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2147 return ops;
2148 }
2149 }
2150 return NULL;
2151}
2152
93b13be8 2153static struct tc_queue *
b5d57fc8 2154tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2155 size_t hash)
2156{
b5d57fc8 2157 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2158 struct tc_queue *queue;
2159
b5d57fc8 2160 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2161 if (queue->queue_id == queue_id) {
2162 return queue;
2163 }
2164 }
2165 return NULL;
2166}
2167
/* Returns the queue of 'netdev' identified by 'queue_id', or NULL if there
 * is none.  The hash is derived from 'queue_id' with hash_int(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, hash);
}
2173
c1c9c9c4
BP
2174static int
2175netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2176 const char *type,
2177 struct netdev_qos_capabilities *caps)
2178{
2179 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2180 if (!ops) {
2181 return EOPNOTSUPP;
2182 }
2183 caps->n_queues = ops->n_queues;
2184 return 0;
2185}
2186
2187static int
b5d57fc8 2188netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2189 const char **typep, struct smap *details)
c1c9c9c4 2190{
b5d57fc8 2191 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2192 int error;
2193
86383816 2194 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2195 error = tc_query_qdisc(netdev_);
86383816
BP
2196 if (!error) {
2197 *typep = netdev->tc->ops->ovs_name;
2198 error = (netdev->tc->ops->qdisc_get
2199 ? netdev->tc->ops->qdisc_get(netdev_, details)
2200 : 0);
c1c9c9c4 2201 }
86383816 2202 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2203
86383816 2204 return error;
c1c9c9c4
BP
2205}
2206
2207static int
b5d57fc8 2208netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2209 const char *type, const struct smap *details)
c1c9c9c4 2210{
b5d57fc8 2211 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2212 const struct tc_ops *new_ops;
2213 int error;
2214
2215 new_ops = tc_lookup_ovs_name(type);
2216 if (!new_ops || !new_ops->tc_install) {
2217 return EOPNOTSUPP;
2218 }
2219
6cf888b8
BS
2220 if (new_ops == &tc_ops_noop) {
2221 return new_ops->tc_install(netdev_, details);
2222 }
2223
86383816 2224 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2225 error = tc_query_qdisc(netdev_);
c1c9c9c4 2226 if (error) {
86383816 2227 goto exit;
c1c9c9c4
BP
2228 }
2229
b5d57fc8 2230 if (new_ops == netdev->tc->ops) {
86383816 2231 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2232 } else {
2233 /* Delete existing qdisc. */
b5d57fc8 2234 error = tc_del_qdisc(netdev_);
c1c9c9c4 2235 if (error) {
86383816 2236 goto exit;
c1c9c9c4 2237 }
b5d57fc8 2238 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2239
2240 /* Install new qdisc. */
b5d57fc8
BP
2241 error = new_ops->tc_install(netdev_, details);
2242 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2243 }
86383816
BP
2244
2245exit:
2246 ovs_mutex_unlock(&netdev->mutex);
2247 return error;
c1c9c9c4
BP
2248}
2249
2250static int
b5d57fc8 2251netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2252 unsigned int queue_id, struct smap *details)
c1c9c9c4 2253{
b5d57fc8 2254 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2255 int error;
2256
86383816 2257 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2258 error = tc_query_qdisc(netdev_);
86383816 2259 if (!error) {
b5d57fc8 2260 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2261 error = (queue
b5d57fc8 2262 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2263 : ENOENT);
c1c9c9c4 2264 }
86383816
BP
2265 ovs_mutex_unlock(&netdev->mutex);
2266
2267 return error;
c1c9c9c4
BP
2268}
2269
2270static int
b5d57fc8 2271netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2272 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2273{
b5d57fc8 2274 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2275 int error;
2276
86383816 2277 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2278 error = tc_query_qdisc(netdev_);
86383816
BP
2279 if (!error) {
2280 error = (queue_id < netdev->tc->ops->n_queues
2281 && netdev->tc->ops->class_set
2282 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2283 : EINVAL);
c1c9c9c4 2284 }
86383816 2285 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2286
86383816 2287 return error;
c1c9c9c4
BP
2288}
2289
2290static int
b5d57fc8 2291netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2292{
b5d57fc8 2293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2294 int error;
2295
86383816 2296 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2297 error = tc_query_qdisc(netdev_);
86383816
BP
2298 if (!error) {
2299 if (netdev->tc->ops->class_delete) {
2300 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2301 error = (queue
2302 ? netdev->tc->ops->class_delete(netdev_, queue)
2303 : ENOENT);
2304 } else {
2305 error = EINVAL;
2306 }
c1c9c9c4 2307 }
86383816
BP
2308 ovs_mutex_unlock(&netdev->mutex);
2309
2310 return error;
c1c9c9c4
BP
2311}
2312
2313static int
b5d57fc8 2314netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2315 unsigned int queue_id,
2316 struct netdev_queue_stats *stats)
2317{
b5d57fc8 2318 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2319 int error;
2320
86383816 2321 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2322 error = tc_query_qdisc(netdev_);
86383816
BP
2323 if (!error) {
2324 if (netdev->tc->ops->class_get_stats) {
2325 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2326 if (queue) {
2327 stats->created = queue->created;
2328 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2329 stats);
2330 } else {
2331 error = ENOENT;
2332 }
2333 } else {
2334 error = EOPNOTSUPP;
6dc34a0d 2335 }
c1c9c9c4 2336 }
86383816
BP
2337 ovs_mutex_unlock(&netdev->mutex);
2338
2339 return error;
c1c9c9c4
BP
2340}
2341
d57695d7
JS
2342struct queue_dump_state {
2343 struct nl_dump dump;
2344 struct ofpbuf buf;
2345};
2346
23a98ffe 2347static bool
d57695d7 2348start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2349{
2350 struct ofpbuf request;
2351 struct tcmsg *tcmsg;
2352
2353 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2354 if (!tcmsg) {
2355 return false;
2356 }
3c4de644 2357 tcmsg->tcm_parent = 0;
d57695d7 2358 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2359 ofpbuf_uninit(&request);
d57695d7
JS
2360
2361 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2362 return true;
c1c9c9c4
BP
2363}
2364
d57695d7
JS
2365static int
2366finish_queue_dump(struct queue_dump_state *state)
2367{
2368 ofpbuf_uninit(&state->buf);
2369 return nl_dump_done(&state->dump);
2370}
2371
89454bf4
BP
/* Iteration state for the netdev_linux_queue_dump_*() functions: a snapshot
 * of the queue IDs taken when the dump starts, plus a cursor into it. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue IDs (heap-allocated). */
    size_t cur_queue;           /* Index of the next entry to visit. */
    size_t n_queues;            /* Number of entries in 'queues'. */
};
2377
c1c9c9c4 2378static int
89454bf4 2379netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2380{
89454bf4 2381 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2382 int error;
2383
86383816 2384 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2385 error = tc_query_qdisc(netdev_);
86383816
BP
2386 if (!error) {
2387 if (netdev->tc->ops->class_get) {
89454bf4
BP
2388 struct netdev_linux_queue_state *state;
2389 struct tc_queue *queue;
2390 size_t i;
2391
2392 *statep = state = xmalloc(sizeof *state);
2393 state->n_queues = hmap_count(&netdev->tc->queues);
2394 state->cur_queue = 0;
2395 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2396
2397 i = 0;
2398 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2399 state->queues[i++] = queue->queue_id;
86383816 2400 }
c1c9c9c4 2401 } else {
86383816 2402 error = EOPNOTSUPP;
c1c9c9c4
BP
2403 }
2404 }
86383816 2405 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2406
86383816 2407 return error;
c1c9c9c4
BP
2408}
2409
89454bf4
BP
2410static int
2411netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2412 unsigned int *queue_idp, struct smap *details)
2413{
2414 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2415 struct netdev_linux_queue_state *state = state_;
2416 int error = EOF;
2417
2418 ovs_mutex_lock(&netdev->mutex);
2419 while (state->cur_queue < state->n_queues) {
2420 unsigned int queue_id = state->queues[state->cur_queue++];
2421 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2422
2423 if (queue) {
2424 *queue_idp = queue_id;
2425 error = netdev->tc->ops->class_get(netdev_, queue, details);
2426 break;
2427 }
2428 }
2429 ovs_mutex_unlock(&netdev->mutex);
2430
2431 return error;
2432}
2433
2434static int
2435netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2436 void *state_)
2437{
2438 struct netdev_linux_queue_state *state = state_;
2439
2440 free(state->queues);
2441 free(state);
2442 return 0;
2443}
2444
c1c9c9c4 2445static int
b5d57fc8 2446netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2447 netdev_dump_queue_stats_cb *cb, void *aux)
2448{
b5d57fc8 2449 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2450 int error;
2451
86383816 2452 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2453 error = tc_query_qdisc(netdev_);
86383816 2454 if (!error) {
d57695d7 2455 struct queue_dump_state state;
c1c9c9c4 2456
86383816
BP
2457 if (!netdev->tc->ops->class_dump_stats) {
2458 error = EOPNOTSUPP;
d57695d7 2459 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2460 error = ENODEV;
2461 } else {
2462 struct ofpbuf msg;
2463 int retval;
2464
d57695d7 2465 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2466 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2467 cb, aux);
2468 if (retval) {
2469 error = retval;
2470 }
2471 }
2472
d57695d7 2473 retval = finish_queue_dump(&state);
86383816
BP
2474 if (retval) {
2475 error = retval;
2476 }
c1c9c9c4
BP
2477 }
2478 }
86383816 2479 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2480
86383816 2481 return error;
c1c9c9c4
BP
2482}
2483
8b61709d 2484static int
f1acd62b
BP
2485netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2486 struct in_addr netmask)
8b61709d 2487{
b5d57fc8 2488 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2489 int error;
2490
86383816 2491 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2492 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2493 if (!error) {
f1acd62b 2494 if (address.s_addr != INADDR_ANY) {
8b61709d 2495 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2496 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2497 }
2498 }
49af9a3d 2499
86383816
BP
2500 ovs_mutex_unlock(&netdev->mutex);
2501
8b61709d
BP
2502 return error;
2503}
2504
7df6932e
AW
2505/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2506 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2507 * error. */
8b61709d 2508static int
a8704b50
PS
2509netdev_linux_get_addr_list(const struct netdev *netdev_,
2510 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2511{
b5d57fc8 2512 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2513 int error;
86383816
BP
2514
2515 ovs_mutex_lock(&netdev->mutex);
a8704b50 2516 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816
BP
2517 ovs_mutex_unlock(&netdev->mutex);
2518
7df6932e 2519 return error;
8b61709d
BP
2520}
2521
/* Fills '*sa' with an AF_INET socket address for 'addr' and port 0.
 *
 * The address is assembled in a local 'struct sockaddr_in' and then copied
 * over a zeroed '*sa'. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_port = 0;
    in4.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &in4, sizeof in4);
}
2534
2535static int
2536do_set_addr(struct netdev *netdev,
2537 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2538{
2539 struct ifreq ifr;
149f577a 2540
259e0b1a
BP
2541 make_in4_sockaddr(&ifr.ifr_addr, addr);
2542 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2543 ioctl_name);
8b61709d
BP
2544}
2545
2546/* Adds 'router' as a default IP gateway. */
2547static int
67a4917b 2548netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2549{
2550 struct in_addr any = { INADDR_ANY };
2551 struct rtentry rt;
2552 int error;
2553
2554 memset(&rt, 0, sizeof rt);
2555 make_in4_sockaddr(&rt.rt_dst, any);
2556 make_in4_sockaddr(&rt.rt_gateway, router);
2557 make_in4_sockaddr(&rt.rt_genmask, any);
2558 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2559 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2560 if (error) {
10a89ef0 2561 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2562 }
2563 return error;
2564}
2565
f1acd62b
BP
2566static int
2567netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2568 char **netdev_name)
2569{
2570 static const char fn[] = "/proc/net/route";
2571 FILE *stream;
2572 char line[256];
2573 int ln;
2574
2575 *netdev_name = NULL;
2576 stream = fopen(fn, "r");
2577 if (stream == NULL) {
10a89ef0 2578 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2579 return errno;
2580 }
2581
2582 ln = 0;
2583 while (fgets(line, sizeof line, stream)) {
2584 if (++ln >= 2) {
2585 char iface[17];
dbba996b 2586 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2587 int refcnt, metric, mtu;
2588 unsigned int flags, use, window, irtt;
2589
c2c28dfd
BP
2590 if (!ovs_scan(line,
2591 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2592 " %d %u %u\n",
2593 iface, &dest, &gateway, &flags, &refcnt,
2594 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2595 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2596 fn, ln, line);
2597 continue;
2598 }
2599 if (!(flags & RTF_UP)) {
2600 /* Skip routes that aren't up. */
2601 continue;
2602 }
2603
2604 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2605 * network byte order, so we don't need need any endian
f1acd62b
BP
2606 * conversions here. */
2607 if ((dest & mask) == (host->s_addr & mask)) {
2608 if (!gateway) {
2609 /* The host is directly reachable. */
2610 next_hop->s_addr = 0;
2611 } else {
2612 /* To reach the host, we must go through a gateway. */
2613 next_hop->s_addr = gateway;
2614 }
2615 *netdev_name = xstrdup(iface);
2616 fclose(stream);
2617 return 0;
2618 }
2619 }
2620 }
2621
2622 fclose(stream);
2623 return ENXIO;
2624}
2625
e210037e 2626static int
b5d57fc8 2627netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2628{
b5d57fc8 2629 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2630 int error = 0;
2631
86383816 2632 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2633 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2634 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2635
2636 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2637 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2638 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2639 cmd,
2640 ETHTOOL_GDRVINFO,
2641 "ETHTOOL_GDRVINFO");
2642 if (!error) {
b5d57fc8 2643 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2644 }
2645 }
e210037e 2646
e210037e 2647 if (!error) {
b5d57fc8
BP
2648 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2649 smap_add(smap, "driver_version", netdev->drvinfo.version);
2650 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2651 }
86383816
BP
2652 ovs_mutex_unlock(&netdev->mutex);
2653
e210037e
AE
2654 return error;
2655}
2656
4f925bd3 2657static int
275707c3
EJ
2658netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2659 struct smap *smap)
4f925bd3 2660{
79f1cbe9 2661 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2662 return 0;
2663}
2664
8b61709d
BP
2665/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2666 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2667 * returns 0. Otherwise, it returns a positive errno value; in particular,
2668 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2669static int
2670netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2671 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2672{
2673 struct arpreq r;
c100e025 2674 struct sockaddr_in sin;
8b61709d
BP
2675 int retval;
2676
2677 memset(&r, 0, sizeof r);
f2cc621b 2678 memset(&sin, 0, sizeof sin);
c100e025
BP
2679 sin.sin_family = AF_INET;
2680 sin.sin_addr.s_addr = ip;
2681 sin.sin_port = 0;
2682 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2683 r.arp_ha.sa_family = ARPHRD_ETHER;
2684 r.arp_flags = 0;
71d7c22f 2685 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2686 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2687 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2688 if (!retval) {
2689 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2690 } else if (retval != ENXIO) {
2691 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2692 netdev_get_name(netdev), IP_ARGS(ip),
2693 ovs_strerror(retval));
8b61709d
BP
2694 }
2695 return retval;
2696}
2697
2698static int
2699nd_to_iff_flags(enum netdev_flags nd)
2700{
2701 int iff = 0;
2702 if (nd & NETDEV_UP) {
2703 iff |= IFF_UP;
2704 }
2705 if (nd & NETDEV_PROMISC) {
2706 iff |= IFF_PROMISC;
2707 }
7ba19d41
AC
2708 if (nd & NETDEV_LOOPBACK) {
2709 iff |= IFF_LOOPBACK;
2710 }
8b61709d
BP
2711 return iff;
2712}
2713
2714static int
2715iff_to_nd_flags(int iff)
2716{
2717 enum netdev_flags nd = 0;
2718 if (iff & IFF_UP) {
2719 nd |= NETDEV_UP;
2720 }
2721 if (iff & IFF_PROMISC) {
2722 nd |= NETDEV_PROMISC;
2723 }
7ba19d41
AC
2724 if (iff & IFF_LOOPBACK) {
2725 nd |= NETDEV_LOOPBACK;
2726 }
8b61709d
BP
2727 return nd;
2728}
2729
2730static int
4f9f3f21
BP
2731update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2732 enum netdev_flags on, enum netdev_flags *old_flagsp)
2733 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2734{
2735 int old_flags, new_flags;
c37d4da4
EJ
2736 int error = 0;
2737
b5d57fc8 2738 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2739 *old_flagsp = iff_to_nd_flags(old_flags);
2740 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2741 if (new_flags != old_flags) {
4f9f3f21
BP
2742 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2743 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2744 }
4f9f3f21
BP
2745
2746 return error;
2747}
2748
2749static int
2750netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2751 enum netdev_flags on, enum netdev_flags *old_flagsp)
2752{
2753 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2754 int error;
2755
2756 ovs_mutex_lock(&netdev->mutex);
2757 error = update_flags(netdev, off, on, old_flagsp);
86383816
BP
2758 ovs_mutex_unlock(&netdev->mutex);
2759
8b61709d
BP
2760 return error;
2761}
2762
/* Expands to a brace-enclosed, positional initializer for a
 * 'struct netdev_class'.
 *
 * NAME, CONSTRUCT, GET_STATS, GET_FEATURES and GET_STATUS are substituted
 * into their slots; every other slot is either a fixed netdev_linux_*
 * function or NULL.  Because the initializer is positional, the order of the
 * entries must match the member order of the structure being initialized. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2834
/* The "system" class: constructed with netdev_linux_construct() and using
 * the netdev_linux_* statistics, features and status callbacks. */
const struct netdev_class netdev_linux_class =
    NETDEV_LINUX_CLASS(
        "system",
        netdev_linux_construct,
        netdev_linux_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* The "tap" class: differs from "system" in its constructor
 * (netdev_linux_construct_tap) and statistics callback
 * (netdev_tap_get_stats). */
const struct netdev_class netdev_tap_class =
    NETDEV_LINUX_CLASS(
        "tap",
        netdev_linux_construct_tap,
        netdev_tap_get_stats,
        netdev_linux_get_features,
        netdev_linux_get_status);

/* The "internal" class: uses its own statistics and status callbacks and
 * provides no get_features callback. */
const struct netdev_class netdev_internal_class =
    NETDEV_LINUX_CLASS(
        "internal",
        netdev_linux_construct,
        netdev_internal_get_stats,
        NULL,                  /* get_features */
        netdev_internal_get_status);
8b61709d 2858\f
677d9158
JV
2859
2860#define CODEL_N_QUEUES 0x0000
2861
2f4298ce
BP
2862/* In sufficiently new kernel headers these are defined as enums in
2863 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2864 * kernels. (This overrides any enum definition in the header file but that's
2865 * harmless.) */
2866#define TCA_CODEL_TARGET 1
2867#define TCA_CODEL_LIMIT 2
2868#define TCA_CODEL_INTERVAL 3
2869
677d9158
JV
/* Cached configuration of a CoDel qdisc.  'tc' is the embedded generic
 * traffic-control state; codel_get__() recovers this structure from it. */
struct codel {
    struct tc tc;
    uint32_t target;            /* Sent to the kernel as TCA_CODEL_TARGET. */
    uint32_t limit;             /* Sent to the kernel as TCA_CODEL_LIMIT. */
    uint32_t interval;          /* Sent to the kernel as TCA_CODEL_INTERVAL. */
};
2876
2877static struct codel *
2878codel_get__(const struct netdev *netdev_)
2879{
2880 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2881 return CONTAINER_OF(netdev->tc, struct codel, tc);
2882}
2883
2884static void
2885codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
2886 uint32_t interval)
2887{
2888 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2889 struct codel *codel;
2890
2891 codel = xmalloc(sizeof *codel);
2892 tc_init(&codel->tc, &tc_ops_codel);
2893 codel->target = target;
2894 codel->limit = limit;
2895 codel->interval = interval;
2896
2897 netdev->tc = &codel->tc;
2898}
2899
2900static int
2901codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
2902 uint32_t interval)
2903{
2904 size_t opt_offset;
2905 struct ofpbuf request;
2906 struct tcmsg *tcmsg;
2907 uint32_t otarget, olimit, ointerval;
2908 int error;
2909
2910 tc_del_qdisc(netdev);
2911
2912 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2913 NLM_F_EXCL | NLM_F_CREATE, &request);
2914 if (!tcmsg) {
2915 return ENODEV;
2916 }
2917 tcmsg->tcm_handle = tc_make_handle(1, 0);
2918 tcmsg->tcm_parent = TC_H_ROOT;
2919
2920 otarget = target ? target : 5000;
2921 olimit = limit ? limit : 10240;
2922 ointerval = interval ? interval : 100000;
2923
2924 nl_msg_put_string(&request, TCA_KIND, "codel");
2925 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2926 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
2927 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
2928 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
2929 nl_msg_end_nested(&request, opt_offset);
2930
2931 error = tc_transact(&request, NULL);
2932 if (error) {
2933 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
2934 "target %u, limit %u, interval %u error %d(%s)",
2935 netdev_get_name(netdev),
2936 otarget, olimit, ointerval,
2937 error, ovs_strerror(error));
2938 }
2939 return error;
2940}
2941
2942static void
2943codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
2944 const struct smap *details, struct codel *codel)
2945{
13c1637f
BP
2946 codel->target = smap_get_ullong(details, "target", 0);
2947 codel->limit = smap_get_ullong(details, "limit", 0);
2948 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
2949
2950 if (!codel->target) {
2951 codel->target = 5000;
2952 }
2953 if (!codel->limit) {
2954 codel->limit = 10240;
2955 }
2956 if (!codel->interval) {
2957 codel->interval = 100000;
2958 }
2959}
2960
2961static int
2962codel_tc_install(struct netdev *netdev, const struct smap *details)
2963{
2964 int error;
2965 struct codel codel;
2966
2967 codel_parse_qdisc_details__(netdev, details, &codel);
2968 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
2969 codel.interval);
2970 if (!error) {
2971 codel_install__(netdev, codel.target, codel.limit, codel.interval);
2972 }
2973 return error;
2974}
2975
2976static int
2977codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
2978{
2979 static const struct nl_policy tca_codel_policy[] = {
2980 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
2981 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
2982 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
2983 };
2984
2985 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
2986
2987 if (!nl_parse_nested(nl_options, tca_codel_policy,
2988 attrs, ARRAY_SIZE(tca_codel_policy))) {
2989 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
2990 return EPROTO;
2991 }
2992
2993 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
2994 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
2995 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
2996 return 0;
2997}
2998
2999static int
3000codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3001{
3002 struct nlattr *nlattr;
3003 const char * kind;
3004 int error;
3005 struct codel codel;
3006
3007 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3008 if (error != 0) {
3009 return error;
3010 }
3011
3012 error = codel_parse_tca_options__(nlattr, &codel);
3013 if (error != 0) {
3014 return error;
3015 }
3016
3017 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3018 return 0;
3019}
3020
3021
3022static void
3023codel_tc_destroy(struct tc *tc)
3024{
3025 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3026 tc_destroy(tc);
3027 free(codel);
3028}
3029
3030static int
3031codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3032{
3033 const struct codel *codel = codel_get__(netdev);
3034 smap_add_format(details, "target", "%u", codel->target);
3035 smap_add_format(details, "limit", "%u", codel->limit);
3036 smap_add_format(details, "interval", "%u", codel->interval);
3037 return 0;
3038}
3039
3040static int
3041codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3042{
3043 struct codel codel;
3044
3045 codel_parse_qdisc_details__(netdev, details, &codel);
3046 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3047 codel_get__(netdev)->target = codel.target;
3048 codel_get__(netdev)->limit = codel.limit;
3049 codel_get__(netdev)->interval = codel.interval;
3050 return 0;
3051}
3052
/* Operations table for the CoDel qdisc.  The initializer is positional.
 * The five trailing NULL entries leave the remaining 'struct tc_ops'
 * callbacks unimplemented (presumably the per-queue operations, since
 * n_queues is 0 -- confirm against the 'struct tc_ops' definition). */
static const struct tc_ops tc_ops_codel = {
    "codel",                      /* linux_name */
    "linux-codel",                /* ovs_name */
    CODEL_N_QUEUES,               /* n_queues */
    codel_tc_install,
    codel_tc_load,
    codel_tc_destroy,
    codel_qdisc_get,
    codel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3068\f
3069/* FQ-CoDel traffic control class. */
3070
3071#define FQCODEL_N_QUEUES 0x0000
3072
2f4298ce
BP
3073/* In sufficiently new kernel headers these are defined as enums in
3074 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3075 * kernels. (This overrides any enum definition in the header file but that's
3076 * harmless.) */
3077#define TCA_FQ_CODEL_TARGET 1
3078#define TCA_FQ_CODEL_LIMIT 2
3079#define TCA_FQ_CODEL_INTERVAL 3
3080#define TCA_FQ_CODEL_ECN 4
3081#define TCA_FQ_CODEL_FLOWS 5
3082#define TCA_FQ_CODEL_QUANTUM 6
3083
677d9158
JV
/* Cached configuration of an FQ-CoDel qdisc.  'tc' is the embedded generic
 * traffic-control state; fqcodel_get__() recovers this structure from it. */
struct fqcodel {
    struct tc tc;
    uint32_t target;            /* Sent as TCA_FQ_CODEL_TARGET. */
    uint32_t limit;             /* Sent as TCA_FQ_CODEL_LIMIT. */
    uint32_t interval;          /* Sent as TCA_FQ_CODEL_INTERVAL. */
    uint32_t flows;             /* Sent as TCA_FQ_CODEL_FLOWS. */
    uint32_t quantum;           /* Sent as TCA_FQ_CODEL_QUANTUM. */
};
3092
3093static struct fqcodel *
3094fqcodel_get__(const struct netdev *netdev_)
3095{
3096 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3097 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3098}
3099
3100static void
3101fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3102 uint32_t interval, uint32_t flows, uint32_t quantum)
3103{
3104 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3105 struct fqcodel *fqcodel;
3106
3107 fqcodel = xmalloc(sizeof *fqcodel);
3108 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3109 fqcodel->target = target;
3110 fqcodel->limit = limit;
3111 fqcodel->interval = interval;
3112 fqcodel->flows = flows;
3113 fqcodel->quantum = quantum;
3114
3115 netdev->tc = &fqcodel->tc;
3116}
3117
3118static int
3119fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3120 uint32_t interval, uint32_t flows, uint32_t quantum)
3121{
3122 size_t opt_offset;
3123 struct ofpbuf request;
3124 struct tcmsg *tcmsg;
3125 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3126 int error;
3127
3128 tc_del_qdisc(netdev);
3129
3130 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3131 NLM_F_EXCL | NLM_F_CREATE, &request);
3132 if (!tcmsg) {
3133 return ENODEV;
3134 }
3135 tcmsg->tcm_handle = tc_make_handle(1, 0);
3136 tcmsg->tcm_parent = TC_H_ROOT;
3137
3138 otarget = target ? target : 5000;
3139 olimit = limit ? limit : 10240;
3140 ointerval = interval ? interval : 100000;
3141 oflows = flows ? flows : 1024;
3142 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3143 not mtu */
3144
3145 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3146 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3147 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3148 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3149 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3150 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3151 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3152 nl_msg_end_nested(&request, opt_offset);
3153
3154 error = tc_transact(&request, NULL);
3155 if (error) {
3156 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3157 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3158 netdev_get_name(netdev),
3159 otarget, olimit, ointerval, oflows, oquantum,
3160 error, ovs_strerror(error));
3161 }
3162 return error;
3163}
3164
3165static void
3166fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3167 const struct smap *details, struct fqcodel *fqcodel)
3168{
13c1637f
BP
3169 fqcodel->target = smap_get_ullong(details, "target", 0);
3170 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3171 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3172 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3173 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3174
677d9158
JV
3175 if (!fqcodel->target) {
3176 fqcodel->target = 5000;
3177 }
3178 if (!fqcodel->limit) {
3179 fqcodel->limit = 10240;
3180 }
3181 if (!fqcodel->interval) {
3182 fqcodel->interval = 1000000;
3183 }
3184 if (!fqcodel->flows) {
3185 fqcodel->flows = 1024;
3186 }
3187 if (!fqcodel->quantum) {
3188 fqcodel->quantum = 1514;
3189 }
3190}
3191
3192static int
3193fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3194{
3195 int error;
3196 struct fqcodel fqcodel;
3197
3198 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3199 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3200 fqcodel.interval, fqcodel.flows,
3201 fqcodel.quantum);
3202 if (!error) {
3203 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3204 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3205 }
3206 return error;
3207}
3208
3209static int
3210fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3211{
3212 static const struct nl_policy tca_fqcodel_policy[] = {
3213 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3214 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3215 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3216 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3217 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3218 };
3219
3220 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3221
3222 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3223 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3224 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3225 return EPROTO;
3226 }
3227
3228 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3229 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3230 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3231 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3232 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3233 return 0;
3234}
3235
3236static int
3237fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3238{
3239 struct nlattr *nlattr;
3240 const char * kind;
3241 int error;
3242 struct fqcodel fqcodel;
3243
3244 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3245 if (error != 0) {
3246 return error;
3247 }
3248
3249 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3250 if (error != 0) {
3251 return error;
3252 }
3253
3254 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3255 fqcodel.flows, fqcodel.quantum);
3256 return 0;
3257}
3258
3259static void
3260fqcodel_tc_destroy(struct tc *tc)
3261{
3262 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3263 tc_destroy(tc);
3264 free(fqcodel);
3265}
3266
3267static int
3268fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3269{
3270 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3271 smap_add_format(details, "target", "%u", fqcodel->target);
3272 smap_add_format(details, "limit", "%u", fqcodel->limit);
3273 smap_add_format(details, "interval", "%u", fqcodel->interval);
3274 smap_add_format(details, "flows", "%u", fqcodel->flows);
3275 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3276 return 0;
3277}
3278
3279static int
3280fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3281{
3282 struct fqcodel fqcodel;
3283
3284 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3285 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3286 fqcodel.flows, fqcodel.quantum);
3287 fqcodel_get__(netdev)->target = fqcodel.target;
3288 fqcodel_get__(netdev)->limit = fqcodel.limit;
3289 fqcodel_get__(netdev)->interval = fqcodel.interval;
3290 fqcodel_get__(netdev)->flows = fqcodel.flows;
3291 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3292 return 0;
3293}
3294
/* Operations table for the FQ-CoDel qdisc.  The initializer is positional.
 * The five trailing NULL entries leave the remaining 'struct tc_ops'
 * callbacks unimplemented (presumably the per-queue operations, since
 * n_queues is 0 -- confirm against the 'struct tc_ops' definition). */
static const struct tc_ops tc_ops_fqcodel = {
    "fq_codel",                      /* linux_name */
    "linux-fq_codel",                /* ovs_name */
    FQCODEL_N_QUEUES,                /* n_queues */
    fqcodel_tc_install,
    fqcodel_tc_load,
    fqcodel_tc_destroy,
    fqcodel_qdisc_get,
    fqcodel_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3310\f
3311/* SFQ traffic control class. */
3312
3313#define SFQ_N_QUEUES 0x0000
3314
/* Cached configuration of an SFQ qdisc.  'tc' is the embedded generic
 * traffic-control state; sfq_get__() recovers this structure from it. */
struct sfq {
    struct tc tc;
    uint32_t quantum;           /* Sent as tc_sfq_qopt 'quantum'. */
    uint32_t perturb;           /* Sent as tc_sfq_qopt 'perturb_period'. */
};
3320
3321static struct sfq *
3322sfq_get__(const struct netdev *netdev_)
3323{
3324 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3325 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3326}
3327
3328static void
3329sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3330{
3331 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3332 struct sfq *sfq;
3333
3334 sfq = xmalloc(sizeof *sfq);
3335 tc_init(&sfq->tc, &tc_ops_sfq);
3336 sfq->perturb = perturb;
3337 sfq->quantum = quantum;
3338
3339 netdev->tc = &sfq->tc;
3340}
3341
3342static int
3343sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3344{
3345 struct tc_sfq_qopt opt;
3346 struct ofpbuf request;
3347 struct tcmsg *tcmsg;
3348 int mtu;
3349 int mtu_error, error;
3350 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3351
3352 tc_del_qdisc(netdev);
3353
3354 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3355 NLM_F_EXCL | NLM_F_CREATE, &request);
3356 if (!tcmsg) {
3357 return ENODEV;
3358 }
3359 tcmsg->tcm_handle = tc_make_handle(1, 0);
3360 tcmsg->tcm_parent = TC_H_ROOT;
3361
3362 memset(&opt, 0, sizeof opt);
3363 if (!quantum) {
3364 if (!mtu_error) {
3365 opt.quantum = mtu; /* if we cannot find mtu, use default */
3366 }
3367 } else {
3368 opt.quantum = quantum;
3369 }
3370
3371 if (!perturb) {
3372 opt.perturb_period = 10;
3373 } else {
3374 opt.perturb_period = perturb;
3375 }
3376
3377 nl_msg_put_string(&request, TCA_KIND, "sfq");
3378 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3379
3380 error = tc_transact(&request, NULL);
3381 if (error) {
3382 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3383 "quantum %u, perturb %u error %d(%s)",
3384 netdev_get_name(netdev),
3385 opt.quantum, opt.perturb_period,
3386 error, ovs_strerror(error));
3387 }
3388 return error;
3389}
3390
3391static void
3392sfq_parse_qdisc_details__(struct netdev *netdev,
3393 const struct smap *details, struct sfq *sfq)
3394{
13c1637f
BP
3395 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3396 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3397
677d9158
JV
3398 if (!sfq->perturb) {
3399 sfq->perturb = 10;
3400 }
3401
3402 if (!sfq->quantum) {
13c1637f
BP
3403 int mtu;
3404 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3405 sfq->quantum = mtu;
3406 } else {
3407 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3408 "device without mtu");
677d9158
JV
3409 }
3410 }
3411}
3412
3413static int
3414sfq_tc_install(struct netdev *netdev, const struct smap *details)
3415{
3416 int error;
3417 struct sfq sfq;
3418
3419 sfq_parse_qdisc_details__(netdev, details, &sfq);
3420 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3421 if (!error) {
3422 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3423 }
3424 return error;
3425}
3426
3427static int
3428sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3429{
3430 const struct tc_sfq_qopt *sfq;
3431 struct nlattr *nlattr;
3432 const char * kind;
3433 int error;
3434
3435 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3436 if (error == 0) {
3437 sfq = nl_attr_get(nlattr);
3438 sfq_install__(netdev, sfq->perturb_period, sfq->quantum);
3439 return 0;
3440 }
3441
3442 return error;
3443}
3444
3445static void
3446sfq_tc_destroy(struct tc *tc)
3447{
3448 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3449 tc_destroy(tc);
3450 free(sfq);
3451}
3452
3453static int
3454sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3455{
3456 const struct sfq *sfq = sfq_get__(netdev);
3457 smap_add_format(details, "quantum", "%u", sfq->quantum);
3458 smap_add_format(details, "perturb", "%u", sfq->perturb);
3459 return 0;
3460}
3461
3462static int
3463sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3464{
3465 struct sfq sfq;
3466
3467 sfq_parse_qdisc_details__(netdev, details, &sfq);
3468 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3469 sfq_get__(netdev)->quantum = sfq.quantum;
3470 sfq_get__(netdev)->perturb = sfq.perturb;
3471 return 0;
3472}
3473
/* Operations table for the SFQ qdisc.  The initializer is positional.
 * The five trailing NULL entries leave the remaining 'struct tc_ops'
 * callbacks unimplemented (presumably the per-queue operations, since
 * n_queues is 0 -- confirm against the 'struct tc_ops' definition). */
static const struct tc_ops tc_ops_sfq = {
    "sfq",                      /* linux_name */
    "linux-sfq",                /* ovs_name */
    SFQ_N_QUEUES,               /* n_queues */
    sfq_tc_install,
    sfq_tc_load,
    sfq_tc_destroy,
    sfq_qdisc_get,
    sfq_qdisc_set,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL
};
3489\f
c1c9c9c4 3490/* HTB traffic control class. */
559843ed 3491
c1c9c9c4 3492#define HTB_N_QUEUES 0xf000
4f631ccd 3493#define HTB_RATE2QUANTUM 10
8b61709d 3494
c1c9c9c4
BP
/* Cached state of an HTB qdisc.  'tc' is the embedded generic
 * traffic-control state; htb_get__() recovers this structure from it. */
struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};
8b61709d 3499
/* Parameters of one HTB class.  'tc_queue' is the embedded generic queue
 * record; htb_class_cast__() recovers this structure from it. */
struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
8b61709d 3507
c1c9c9c4 3508static struct htb *
b5d57fc8 3509htb_get__(const struct netdev *netdev_)
c1c9c9c4 3510{
b5d57fc8
BP
3511 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3512 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3513}
3514
24045e35 3515static void
b5d57fc8 3516htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3517{
b5d57fc8 3518 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3519 struct htb *htb;
3520
3521 htb = xmalloc(sizeof *htb);
3522 tc_init(&htb->tc, &tc_ops_htb);
3523 htb->max_rate = max_rate;
3524
b5d57fc8 3525 netdev->tc = &htb->tc;
c1c9c9c4
BP
3526}
3527
3528/* Create an HTB qdisc.
3529 *
a339aa81 3530 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3531static int
3532htb_setup_qdisc__(struct netdev *netdev)
3533{
3534 size_t opt_offset;
3535 struct tc_htb_glob opt;
3536 struct ofpbuf request;
3537 struct tcmsg *tcmsg;
3538
3539 tc_del_qdisc(netdev);
3540
3541 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3542 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3543 if (!tcmsg) {
3544 return ENODEV;
3545 }
c1c9c9c4
BP
3546 tcmsg->tcm_handle = tc_make_handle(1, 0);
3547 tcmsg->tcm_parent = TC_H_ROOT;
3548
3549 nl_msg_put_string(&request, TCA_KIND, "htb");
3550
3551 memset(&opt, 0, sizeof opt);
4f631ccd 3552 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3553 opt.version = 3;
4ecf12d5 3554 opt.defcls = 1;
c1c9c9c4
BP
3555
3556 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3557 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3558 nl_msg_end_nested(&request, opt_offset);
3559
3560 return tc_transact(&request, NULL);
3561}
3562
3563/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3564 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3565static int
3566htb_setup_class__(struct netdev *netdev, unsigned int handle,
3567 unsigned int parent, struct htb_class *class)
3568{
3569 size_t opt_offset;
3570 struct tc_htb_opt opt;
3571 struct ofpbuf request;
3572 struct tcmsg *tcmsg;
3573 int error;
3574 int mtu;
3575
73371c09 3576 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3577 if (error) {
f915f1a8
BP
3578 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3579 netdev_get_name(netdev));
9b020780 3580 return error;
f915f1a8 3581 }
c1c9c9c4
BP
3582
3583 memset(&opt, 0, sizeof opt);
3584 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3585 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3586 /* Makes sure the quantum is at least MTU. Setting quantum will
3587 * make htb ignore the r2q for this class. */
3588 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3589 opt.quantum = mtu;
3590 }
c1c9c9c4
BP
3591 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3592 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3593 opt.prio = class->priority;
3594
3595 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
23a98ffe
BP
3596 if (!tcmsg) {
3597 return ENODEV;
3598 }
c1c9c9c4
BP
3599 tcmsg->tcm_handle = handle;
3600 tcmsg->tcm_parent = parent;
3601
3602 nl_msg_put_string(&request, TCA_KIND, "htb");
3603 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3604 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3605 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3606 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3607 nl_msg_end_nested(&request, opt_offset);
3608
3609 error = tc_transact(&request, NULL);
3610 if (error) {
3611 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3612 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3613 netdev_get_name(netdev),
3614 tc_get_major(handle), tc_get_minor(handle),
3615 tc_get_major(parent), tc_get_minor(parent),
3616 class->min_rate, class->max_rate,
10a89ef0 3617 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3618 }
3619 return error;
3620}
3621
3622/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3623 * description of them into 'details'. The description complies with the
3624 * specification given in the vswitch database documentation for linux-htb
3625 * queue details. */
3626static int
3627htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3628{
3629 static const struct nl_policy tca_htb_policy[] = {
3630 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3631 .min_len = sizeof(struct tc_htb_opt) },
3632 };
3633
3634 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3635 const struct tc_htb_opt *htb;
3636
3637 if (!nl_parse_nested(nl_options, tca_htb_policy,
3638 attrs, ARRAY_SIZE(tca_htb_policy))) {
3639 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3640 return EPROTO;
3641 }
3642
3643 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3644 class->min_rate = htb->rate.rate;
3645 class->max_rate = htb->ceil.rate;
3646 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3647 class->priority = htb->prio;
3648 return 0;
3649}
3650
3651static int
3652htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3653 struct htb_class *options,
3654 struct netdev_queue_stats *stats)
3655{
3656 struct nlattr *nl_options;
3657 unsigned int handle;
3658 int error;
3659
3660 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3661 if (!error && queue_id) {
17ee3c1f
BP
3662 unsigned int major = tc_get_major(handle);
3663 unsigned int minor = tc_get_minor(handle);
3664 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3665 *queue_id = minor - 1;
c1c9c9c4
BP
3666 } else {
3667 error = EPROTO;
3668 }
3669 }
3670 if (!error && options) {
3671 error = htb_parse_tca_options__(nl_options, options);
3672 }
3673 return error;
3674}
3675
3676static void
73371c09 3677htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3678 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3679{
73371c09 3680 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 3681
13c1637f 3682 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 3683 if (!hc->max_rate) {
a00ca915 3684 enum netdev_features current;
c1c9c9c4 3685
73371c09
BP
3686 netdev_linux_read_features(netdev);
3687 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3688 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3689 }
3690 hc->min_rate = hc->max_rate;
3691 hc->burst = 0;
3692 hc->priority = 0;
3693}
3694
3695static int
3696htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3697 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3698{
3699 const struct htb *htb = htb_get__(netdev);
9b020780 3700 int mtu, error;
c1c9c9c4 3701
73371c09 3702 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3703 if (error) {
f915f1a8
BP
3704 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3705 netdev_get_name(netdev));
9b020780 3706 return error;
f915f1a8
BP
3707 }
3708
4f104611
EJ
3709 /* HTB requires at least an mtu sized min-rate to send any traffic even
3710 * on uncongested links. */
13c1637f 3711 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 3712 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3713 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3714
3715 /* max-rate */
13c1637f
BP
3716 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
3717 if (!hc->max_rate) {
3718 hc->max_rate = htb->max_rate;
3719 }
c1c9c9c4
BP
3720 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3721 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3722
3723 /* burst
3724 *
3725 * According to hints in the documentation that I've read, it is important
3726 * that 'burst' be at least as big as the largest frame that might be
3727 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3728 * but having it a bit too small is a problem. Since netdev_get_mtu()
3729 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3730 * the MTU. We actually add 64, instead of 14, as a guard against
3731 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 3732 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
3733 hc->burst = MAX(hc->burst, mtu + 64);
3734
3735 /* priority */
13c1637f 3736 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
3737
3738 return 0;
3739}
3740
/* Queries the kernel for the class 'handle' under 'parent' on 'netdev' and
 * parses the reply into 'options' and 'stats' (each passed through to
 * htb_parse_tcmsg__()).  The reply buffer is released before returning.
 *
 * Returns 0 on success, otherwise the error from the query or from parsing
 * the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3756
3757static int
79f1cbe9 3758htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3759{
3760 int error;
3761
3762 error = htb_setup_qdisc__(netdev);
3763 if (!error) {
3764 struct htb_class hc;
3765
3766 htb_parse_qdisc_details__(netdev, details, &hc);
3767 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3768 tc_make_handle(1, 0), &hc);
3769 if (!error) {
3770 htb_install__(netdev, hc.max_rate);
3771 }
3772 }
3773 return error;
3774}
3775
93b13be8
BP
3776static struct htb_class *
3777htb_class_cast__(const struct tc_queue *queue)
3778{
3779 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3780}
3781
c1c9c9c4
BP
3782static void
3783htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3784 const struct htb_class *hc)
3785{
3786 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3787 size_t hash = hash_int(queue_id, 0);
3788 struct tc_queue *queue;
c1c9c9c4
BP
3789 struct htb_class *hcp;
3790
93b13be8
BP
3791 queue = tc_find_queue__(netdev, queue_id, hash);
3792 if (queue) {
3793 hcp = htb_class_cast__(queue);
3794 } else {
c1c9c9c4 3795 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3796 queue = &hcp->tc_queue;
3797 queue->queue_id = queue_id;
6dc34a0d 3798 queue->created = time_msec();
93b13be8 3799 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3800 }
93b13be8
BP
3801
3802 hcp->min_rate = hc->min_rate;
3803 hcp->max_rate = hc->max_rate;
3804 hcp->burst = hc->burst;
3805 hcp->priority = hc->priority;
c1c9c9c4
BP
3806}
3807
3808static int
3809htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3810{
c1c9c9c4 3811 struct ofpbuf msg;
d57695d7 3812 struct queue_dump_state state;
c1c9c9c4 3813 struct htb_class hc;
c1c9c9c4
BP
3814
3815 /* Get qdisc options. */
3816 hc.max_rate = 0;
3817 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3818 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3819
3820 /* Get queues. */
d57695d7 3821 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3822 return ENODEV;
3823 }
d57695d7 3824 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3825 unsigned int queue_id;
3826
3827 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3828 htb_update_queue__(netdev, queue_id, &hc);
3829 }
3830 }
d57695d7 3831 finish_queue_dump(&state);
c1c9c9c4
BP
3832
3833 return 0;
3834}
3835
3836static void
3837htb_tc_destroy(struct tc *tc)
3838{
3839 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 3840 struct htb_class *hc;
c1c9c9c4 3841
4ec3d7c7 3842 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
3843 free(hc);
3844 }
3845 tc_destroy(tc);
3846 free(htb);
3847}
3848
3849static int
79f1cbe9 3850htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
3851{
3852 const struct htb *htb = htb_get__(netdev);
79f1cbe9 3853 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
3854 return 0;
3855}
3856
3857static int
79f1cbe9 3858htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3859{
3860 struct htb_class hc;
3861 int error;
3862
3863 htb_parse_qdisc_details__(netdev, details, &hc);
3864 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3865 tc_make_handle(1, 0), &hc);
3866 if (!error) {
3867 htb_get__(netdev)->max_rate = hc.max_rate;
3868 }
3869 return error;
3870}
3871
3872static int
93b13be8 3873htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 3874 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 3875{
93b13be8 3876 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3877
79f1cbe9 3878 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 3879 if (hc->min_rate != hc->max_rate) {
79f1cbe9 3880 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 3881 }
79f1cbe9 3882 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 3883 if (hc->priority) {
79f1cbe9 3884 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
3885 }
3886 return 0;
3887}
3888
3889static int
3890htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 3891 const struct smap *details)
c1c9c9c4
BP
3892{
3893 struct htb_class hc;
3894 int error;
3895
3896 error = htb_parse_class_details__(netdev, details, &hc);
3897 if (error) {
3898 return error;
3899 }
3900
17ee3c1f 3901 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
3902 tc_make_handle(1, 0xfffe), &hc);
3903 if (error) {
3904 return error;
3905 }
3906
3907 htb_update_queue__(netdev, queue_id, &hc);
3908 return 0;
3909}
3910
3911static int
93b13be8 3912htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 3913{
93b13be8 3914 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 3915 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
3916 int error;
3917
93b13be8 3918 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 3919 if (!error) {
93b13be8 3920 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 3921 free(hc);
c1c9c9c4
BP
3922 }
3923 return error;
3924}
3925
3926static int
93b13be8 3927htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
3928 struct netdev_queue_stats *stats)
3929{
93b13be8 3930 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
3931 tc_make_handle(1, 0xfffe), NULL, stats);
3932}
3933
3934static int
3935htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3936 const struct ofpbuf *nlmsg,
3937 netdev_dump_queue_stats_cb *cb, void *aux)
3938{
3939 struct netdev_queue_stats stats;
17ee3c1f 3940 unsigned int handle, major, minor;
c1c9c9c4
BP
3941 int error;
3942
3943 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3944 if (error) {
3945 return error;
3946 }
3947
17ee3c1f
BP
3948 major = tc_get_major(handle);
3949 minor = tc_get_minor(handle);
3950 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 3951 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
3952 }
3953 return 0;
3954}
3955
3956static const struct tc_ops tc_ops_htb = {
3957 "htb", /* linux_name */
3958 "linux-htb", /* ovs_name */
3959 HTB_N_QUEUES, /* n_queues */
3960 htb_tc_install,
3961 htb_tc_load,
3962 htb_tc_destroy,
3963 htb_qdisc_get,
3964 htb_qdisc_set,
3965 htb_class_get,
3966 htb_class_set,
3967 htb_class_delete,
3968 htb_class_get_stats,
3969 htb_class_dump_stats
3970};
3971\f
a339aa81
EJ
3972/* "linux-hfsc" traffic control class. */
3973
3974#define HFSC_N_QUEUES 0xf000
3975
3976struct hfsc {
3977 struct tc tc;
3978 uint32_t max_rate;
3979};
3980
3981struct hfsc_class {
3982 struct tc_queue tc_queue;
3983 uint32_t min_rate;
3984 uint32_t max_rate;
3985};
3986
3987static struct hfsc *
b5d57fc8 3988hfsc_get__(const struct netdev *netdev_)
a339aa81 3989{
b5d57fc8
BP
3990 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3991 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
3992}
3993
3994static struct hfsc_class *
3995hfsc_class_cast__(const struct tc_queue *queue)
3996{
3997 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
3998}
3999
24045e35 4000static void
b5d57fc8 4001hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4002{
b5d57fc8 4003 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4004 struct hfsc *hfsc;
4005
a339aa81
EJ
4006 hfsc = xmalloc(sizeof *hfsc);
4007 tc_init(&hfsc->tc, &tc_ops_hfsc);
4008 hfsc->max_rate = max_rate;
b5d57fc8 4009 netdev->tc = &hfsc->tc;
a339aa81
EJ
4010}
4011
4012static void
4013hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4014 const struct hfsc_class *hc)
4015{
4016 size_t hash;
4017 struct hfsc *hfsc;
4018 struct hfsc_class *hcp;
4019 struct tc_queue *queue;
4020
4021 hfsc = hfsc_get__(netdev);
4022 hash = hash_int(queue_id, 0);
4023
4024 queue = tc_find_queue__(netdev, queue_id, hash);
4025 if (queue) {
4026 hcp = hfsc_class_cast__(queue);
4027 } else {
4028 hcp = xmalloc(sizeof *hcp);
4029 queue = &hcp->tc_queue;
4030 queue->queue_id = queue_id;
6dc34a0d 4031 queue->created = time_msec();
a339aa81
EJ
4032 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4033 }
4034
4035 hcp->min_rate = hc->min_rate;
4036 hcp->max_rate = hc->max_rate;
4037}
4038
4039static int
4040hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4041{
4042 const struct tc_service_curve *rsc, *fsc, *usc;
4043 static const struct nl_policy tca_hfsc_policy[] = {
4044 [TCA_HFSC_RSC] = {
4045 .type = NL_A_UNSPEC,
4046 .optional = false,
4047 .min_len = sizeof(struct tc_service_curve),
4048 },
4049 [TCA_HFSC_FSC] = {
4050 .type = NL_A_UNSPEC,
4051 .optional = false,
4052 .min_len = sizeof(struct tc_service_curve),
4053 },
4054 [TCA_HFSC_USC] = {
4055 .type = NL_A_UNSPEC,
4056 .optional = false,
4057 .min_len = sizeof(struct tc_service_curve),
4058 },
4059 };
4060 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4061
4062 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4063 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4064 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4065 return EPROTO;
4066 }
4067
4068 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4069 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4070 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4071
4072 if (rsc->m1 != 0 || rsc->d != 0 ||
4073 fsc->m1 != 0 || fsc->d != 0 ||
4074 usc->m1 != 0 || usc->d != 0) {
4075 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4076 "Non-linear service curves are not supported.");
4077 return EPROTO;
4078 }
4079
4080 if (rsc->m2 != fsc->m2) {
4081 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4082 "Real-time service curves are not supported ");
4083 return EPROTO;
4084 }
4085
4086 if (rsc->m2 > usc->m2) {
4087 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4088 "Min-rate service curve is greater than "
4089 "the max-rate service curve.");
4090 return EPROTO;
4091 }
4092
4093 class->min_rate = fsc->m2;
4094 class->max_rate = usc->m2;
4095 return 0;
4096}
4097
4098static int
4099hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4100 struct hfsc_class *options,
4101 struct netdev_queue_stats *stats)
4102{
4103 int error;
4104 unsigned int handle;
4105 struct nlattr *nl_options;
4106
4107 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4108 if (error) {
4109 return error;
4110 }
4111
4112 if (queue_id) {
4113 unsigned int major, minor;
4114
4115 major = tc_get_major(handle);
4116 minor = tc_get_minor(handle);
4117 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4118 *queue_id = minor - 1;
4119 } else {
4120 return EPROTO;
4121 }
4122 }
4123
4124 if (options) {
4125 error = hfsc_parse_tca_options__(nl_options, options);
4126 }
4127
4128 return error;
4129}
4130
4131static int
4132hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
4133 unsigned int parent, struct hfsc_class *options,
4134 struct netdev_queue_stats *stats)
4135{
4136 int error;
4137 struct ofpbuf *reply;
4138
4139 error = tc_query_class(netdev, handle, parent, &reply);
4140 if (error) {
4141 return error;
4142 }
4143
4144 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
4145 ofpbuf_delete(reply);
4146 return error;
4147}
4148
4149static void
73371c09 4150hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4151 struct hfsc_class *class)
4152{
73371c09 4153 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4154
13c1637f 4155 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4156 if (!max_rate) {
a00ca915 4157 enum netdev_features current;
a339aa81 4158
73371c09
BP
4159 netdev_linux_read_features(netdev);
4160 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4161 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4162 }
4163
4164 class->min_rate = max_rate;
4165 class->max_rate = max_rate;
4166}
4167
4168static int
4169hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4170 const struct smap *details,
a339aa81
EJ
4171 struct hfsc_class * class)
4172{
4173 const struct hfsc *hfsc;
4174 uint32_t min_rate, max_rate;
a339aa81
EJ
4175
4176 hfsc = hfsc_get__(netdev);
a339aa81 4177
13c1637f 4178 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4179 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4180 min_rate = MIN(min_rate, hfsc->max_rate);
4181
13c1637f 4182 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4183 max_rate = MAX(max_rate, min_rate);
4184 max_rate = MIN(max_rate, hfsc->max_rate);
4185
4186 class->min_rate = min_rate;
4187 class->max_rate = max_rate;
4188
4189 return 0;
4190}
4191
4192/* Create an HFSC qdisc.
4193 *
4194 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4195static int
4196hfsc_setup_qdisc__(struct netdev * netdev)
4197{
4198 struct tcmsg *tcmsg;
4199 struct ofpbuf request;
4200 struct tc_hfsc_qopt opt;
4201
4202 tc_del_qdisc(netdev);
4203
4204 tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
4205 NLM_F_EXCL | NLM_F_CREATE, &request);
4206
4207 if (!tcmsg) {
4208 return ENODEV;
4209 }
4210
4211 tcmsg->tcm_handle = tc_make_handle(1, 0);
4212 tcmsg->tcm_parent = TC_H_ROOT;
4213
4214 memset(&opt, 0, sizeof opt);
4215 opt.defcls = 1;
4216
4217 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4218 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4219
4220 return tc_transact(&request, NULL);
4221}
4222
4223/* Create an HFSC class.
4224 *
4225 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4226 * sc rate <min_rate> ul rate <max_rate>" */
4227static int
4228hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4229 unsigned int parent, struct hfsc_class *class)
4230{
4231 int error;
4232 size_t opt_offset;
4233 struct tcmsg *tcmsg;
4234 struct ofpbuf request;
4235 struct tc_service_curve min, max;
4236
4237 tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
4238
4239 if (!tcmsg) {
4240 return ENODEV;
4241 }
4242
4243 tcmsg->tcm_handle = handle;
4244 tcmsg->tcm_parent = parent;
4245
4246 min.m1 = 0;
4247 min.d = 0;
4248 min.m2 = class->min_rate;
4249
4250 max.m1 = 0;
4251 max.d = 0;
4252 max.m2 = class->max_rate;
4253
4254 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4255 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4256 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4257 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4258 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4259 nl_msg_end_nested(&request, opt_offset);
4260
4261 error = tc_transact(&request, NULL);
4262 if (error) {
4263 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4264 "min-rate %ubps, max-rate %ubps (%s)",
4265 netdev_get_name(netdev),
4266 tc_get_major(handle), tc_get_minor(handle),
4267 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4268 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4269 }
4270
4271 return error;
4272}
4273
4274static int
79f1cbe9 4275hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4276{
4277 int error;
4278 struct hfsc_class class;
4279
4280 error = hfsc_setup_qdisc__(netdev);
4281
4282 if (error) {
4283 return error;
4284 }
4285
4286 hfsc_parse_qdisc_details__(netdev, details, &class);
4287 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4288 tc_make_handle(1, 0), &class);
4289
4290 if (error) {
4291 return error;
4292 }
4293
4294 hfsc_install__(netdev, class.max_rate);
4295 return 0;
4296}
4297
4298static int
4299hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4300{
4301 struct ofpbuf msg;
d57695d7 4302 struct queue_dump_state state;
a339aa81
EJ
4303 struct hfsc_class hc;
4304
4305 hc.max_rate = 0;
4306 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4307 hfsc_install__(netdev, hc.max_rate);
a339aa81 4308
d57695d7 4309 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4310 return ENODEV;
4311 }
4312
d57695d7 4313 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4314 unsigned int queue_id;
4315
4316 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4317 hfsc_update_queue__(netdev, queue_id, &hc);
4318 }
4319 }
4320
d57695d7 4321 finish_queue_dump(&state);
a339aa81
EJ
4322 return 0;
4323}
4324
4325static void
4326hfsc_tc_destroy(struct tc *tc)
4327{
4328 struct hfsc *hfsc;
4329 struct hfsc_class *hc, *next;
4330
4331 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4332
4333 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4334 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4335 free(hc);
4336 }
4337
4338 tc_destroy(tc);
4339 free(hfsc);
4340}
4341
4342static int
79f1cbe9 4343hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4344{
4345 const struct hfsc *hfsc;
4346 hfsc = hfsc_get__(netdev);
79f1cbe9 4347 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4348 return 0;
4349}
4350
4351static int
79f1cbe9 4352hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4353{
4354 int error;
4355 struct hfsc_class class;
4356
4357 hfsc_parse_qdisc_details__(netdev, details, &class);
4358 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4359 tc_make_handle(1, 0), &class);
4360
4361 if (!error) {
4362 hfsc_get__(netdev)->max_rate = class.max_rate;
4363 }
4364
4365 return error;
4366}
4367
4368static int
4369hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4370 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4371{
4372 const struct hfsc_class *hc;
4373
4374 hc = hfsc_class_cast__(queue);
79f1cbe9 4375 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4376 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4377 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4378 }
4379 return 0;
4380}
4381
4382static int
4383hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4384 const struct smap *details)
a339aa81
EJ
4385{
4386 int error;
4387 struct hfsc_class class;
4388
4389 error = hfsc_parse_class_details__(netdev, details, &class);
4390 if (error) {
4391 return error;
4392 }
4393
4394 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4395 tc_make_handle(1, 0xfffe), &class);
4396 if (error) {
4397 return error;
4398 }
4399
4400 hfsc_update_queue__(netdev, queue_id, &class);
4401 return 0;
4402}
4403
4404static int
4405hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4406{
4407 int error;
4408 struct hfsc *hfsc;
4409 struct hfsc_class *hc;
4410
4411 hc = hfsc_class_cast__(queue);
4412 hfsc = hfsc_get__(netdev);
4413
4414 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4415 if (!error) {
4416 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4417 free(hc);
4418 }
4419 return error;
4420}
4421
4422static int
4423hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4424 struct netdev_queue_stats *stats)
4425{
4426 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4427 tc_make_handle(1, 0xfffe), NULL, stats);
4428}
4429
4430static int
4431hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4432 const struct ofpbuf *nlmsg,
4433 netdev_dump_queue_stats_cb *cb, void *aux)
4434{
4435 struct netdev_queue_stats stats;
4436 unsigned int handle, major, minor;
4437 int error;
4438
4439 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4440 if (error) {
4441 return error;
4442 }
4443
4444 major = tc_get_major(handle);
4445 minor = tc_get_minor(handle);
4446 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4447 (*cb)(minor - 1, &stats, aux);
4448 }
4449 return 0;
4450}
4451
4452static const struct tc_ops tc_ops_hfsc = {
4453 "hfsc", /* linux_name */
4454 "linux-hfsc", /* ovs_name */
4455 HFSC_N_QUEUES, /* n_queues */
4456 hfsc_tc_install, /* tc_install */
4457 hfsc_tc_load, /* tc_load */
4458 hfsc_tc_destroy, /* tc_destroy */
4459 hfsc_qdisc_get, /* qdisc_get */
4460 hfsc_qdisc_set, /* qdisc_set */
4461 hfsc_class_get, /* class_get */
4462 hfsc_class_set, /* class_set */
4463 hfsc_class_delete, /* class_delete */
4464 hfsc_class_get_stats, /* class_get_stats */
4465 hfsc_class_dump_stats /* class_dump_stats */
4466};
4467\f
6cf888b8
BS
4468/* "linux-noop" traffic control class. */
4469
4470static void
4471noop_install__(struct netdev *netdev_)
4472{
4473 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4474 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4475
4476 netdev->tc = CONST_CAST(struct tc *, &tc);
4477}
4478
4479static int
4480noop_tc_install(struct netdev *netdev,
4481 const struct smap *details OVS_UNUSED)
4482{
4483 noop_install__(netdev);
4484 return 0;
4485}
4486
4487static int
4488noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4489{
4490 noop_install__(netdev);
4491 return 0;
4492}
4493
4494static const struct tc_ops tc_ops_noop = {
4495 NULL, /* linux_name */
4496 "linux-noop", /* ovs_name */
4497 0, /* n_queues */
4498 noop_tc_install,
4499 noop_tc_load,
4500 NULL, /* tc_destroy */
4501 NULL, /* qdisc_get */
4502 NULL, /* qdisc_set */
4503 NULL, /* class_get */
4504 NULL, /* class_set */
4505 NULL, /* class_delete */
4506 NULL, /* class_get_stats */
4507 NULL /* class_dump_stats */
4508};
4509\f
c1c9c9c4
BP
4510/* "linux-default" traffic control class.
4511 *
4512 * This class represents the default, unnamed Linux qdisc. It corresponds to
4513 * the "" (empty string) QoS type in the OVS database. */
4514
4515static void
b5d57fc8 4516default_install__(struct netdev *netdev_)
c1c9c9c4 4517{
b5d57fc8 4518 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4519 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4520
559eb230
BP
4521 /* Nothing but a tc class implementation is allowed to write to a tc. This
4522 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4523 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4524}
4525
4526static int
4527default_tc_install(struct netdev *netdev,
79f1cbe9 4528 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4529{
4530 default_install__(netdev);
4531 return 0;
4532}
4533
4534static int
4535default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4536{
4537 default_install__(netdev);
4538 return 0;
4539}
4540
4541static const struct tc_ops tc_ops_default = {
4542 NULL, /* linux_name */
4543 "", /* ovs_name */
4544 0, /* n_queues */
4545 default_tc_install,
4546 default_tc_load,
4547 NULL, /* tc_destroy */
4548 NULL, /* qdisc_get */
4549 NULL, /* qdisc_set */
4550 NULL, /* class_get */
4551 NULL, /* class_set */
4552 NULL, /* class_delete */
4553 NULL, /* class_get_stats */
4554 NULL /* class_dump_stats */
4555};
4556\f
/* "linux-other" traffic control class. */
4560
4561static int
b5d57fc8 4562other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4563{
b5d57fc8 4564 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4565 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4566
559eb230
BP
4567 /* Nothing but a tc class implementation is allowed to write to a tc. This
4568 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4569 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4570 return 0;
4571}
4572
4573static const struct tc_ops tc_ops_other = {
4574 NULL, /* linux_name */
4575 "linux-other", /* ovs_name */
4576 0, /* n_queues */
4577 NULL, /* tc_install */
4578 other_tc_load,
4579 NULL, /* tc_destroy */
4580 NULL, /* qdisc_get */
4581 NULL, /* qdisc_set */
4582 NULL, /* class_get */
4583 NULL, /* class_set */
4584 NULL, /* class_delete */
4585 NULL, /* class_get_stats */
4586 NULL /* class_dump_stats */
4587};
4588\f
4589/* Traffic control. */
4590
/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second, used when computing buffer sizes.
 * Generally kernel qdiscs need to be able to buffer one jiffy's worth of
 * data.
 *
 * Two cases arise:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in
 *      the approximate range of 100 to 1024.  Then we really do need to make
 *      sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  Then the kernel has finely
 *      granular timers and there is no need to fudge additional room for
 *      buffers.  (No extra effort is needed to implement this: the large
 *      'buffer_hz' is used as a divisor, so practically any number comes out
 *      as 0 in the division.  Small integer results in the case of really
 *      high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
4612
/* Builds the tc handle 'major':'minor': 'major' goes in the upper 16 bits
 * and 'minor' in the lower 16 bits.  Bits of either argument outside its
 * 16-bit field are masked off by TC_H_MAKE. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
4619
/* Extracts the major number (the upper 16 bits) from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
4626
/* Extracts the minor number (the lower 16 bits) from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4633
4634static struct tcmsg *
4635tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
4636 struct ofpbuf *request)
4637{
4638 struct tcmsg *tcmsg;
4639 int ifindex;
4640 int error;
4641
4642 error = get_ifindex(netdev, &ifindex);
4643 if (error) {
4644 return NULL;
4645 }
4646
4647 ofpbuf_init(request, 512);
4648 nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
4649 tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
4650 tcmsg->tcm_family = AF_UNSPEC;
4651 tcmsg->tcm_ifindex = ifindex;
4652 /* Caller should fill in tcmsg->tcm_handle. */
4653 /* Caller should fill in tcmsg->tcm_parent. */
4654
4655 return tcmsg;
4656}
4657
4658static int
4659tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
4660{
a88b4e04 4661 int error = nl_transact(NETLINK_ROUTE, request, replyp);
c1c9c9c4
BP
4662 ofpbuf_uninit(request);
4663 return error;
4664}
4665
f8500004
JP
4666/* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4667 * policing configuration.
4668 *
4669 * This function is equivalent to running the following when 'add' is true:
4670 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4671 *
4672 * This function is equivalent to running the following when 'add' is false:
4673 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4674 *
4675 * The configuration and stats may be seen with the following command:
4676 * /sbin/tc -s qdisc show dev <devname>
4677 *
4678 * Returns 0 if successful, otherwise a positive errno value.
4679 */
4680static int
4681tc_add_del_ingress_qdisc(struct netdev *netdev, bool add)
4682{
4683 struct ofpbuf request;
4684 struct tcmsg *tcmsg;
4685 int error;
4686 int type = add ? RTM_NEWQDISC : RTM_DELQDISC;
4687 int flags = add ? NLM_F_EXCL | NLM_F_CREATE : 0;
4688
4689 tcmsg = tc_make_request(netdev, type, flags, &request);
4690 if (!tcmsg) {
4691 return ENODEV;
4692 }
4693 tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
4694 tcmsg->tcm_parent = TC_H_INGRESS;
4695 nl_msg_put_string(&request, TCA_KIND, "ingress");
4696 nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
4697
4698 error = tc_transact(&request, NULL);
4699 if (error) {
4700 /* If we're deleting the qdisc, don't worry about some of the
4701 * error conditions. */
4702 if (!add && (error == ENOENT || error == EINVAL)) {
4703 return 0;
4704 }
4705 return error;
4706 }
4707
4708 return 0;
4709}
4710
4711/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4712 * of 'kbits_burst'.
4713 *
4714 * This function is equivalent to running:
4715 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4716 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4717 * mtu 65535 drop
4718 *
4719 * The configuration and stats may be seen with the following command:
c7952afb 4720 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4721 *
4722 * Returns 0 if successful, otherwise a positive errno value.
4723 */
4724static int
c7952afb
BP
4725tc_add_policer(struct netdev *netdev,
4726 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4727{
4728 struct tc_police tc_police;
4729 struct ofpbuf request;
4730 struct tcmsg *tcmsg;
4731 size_t basic_offset;
4732 size_t police_offset;
4733 int error;
4734 int mtu = 65535;
4735
4736 memset(&tc_police, 0, sizeof tc_police);
4737 tc_police.action = TC_POLICE_SHOT;
4738 tc_police.mtu = mtu;
1aca400c 4739 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4740
79abacc8
MAA
4741 /* The following appears wrong in one way: In networking a kilobit is
4742 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4743 *
4744 * However if you "fix" those problems then "tc filter show ..." shows
4745 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4746 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4747 * tc's point of view. Whatever. */
4748 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4749 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004
JP
4750
4751 tcmsg = tc_make_request(netdev, RTM_NEWTFILTER,
4752 NLM_F_EXCL | NLM_F_CREATE, &request);
4753 if (!tcmsg) {
4754 return ENODEV;
4755 }
4756 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4757 tcmsg->tcm_info = tc_make_handle(49,
4758 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4759
4760 nl_msg_put_string(&request, TCA_KIND, "basic");
4761 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4762 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4763 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4764 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4765 nl_msg_end_nested(&request, police_offset);
4766 nl_msg_end_nested(&request, basic_offset);
4767
4768 error = tc_transact(&request, NULL);
4769 if (error) {
4770 return error;
4771 }
4772
4773 return 0;
4774}
4775
c1c9c9c4
BP
4776static void
4777read_psched(void)
4778{
4779 /* The values in psched are not individually very meaningful, but they are
4780 * important. The tables below show some values seen in the wild.
4781 *
4782 * Some notes:
4783 *
4784 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4785 * (Before that, there are hints that it was 1000000000.)
4786 *
4787 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4788 * above.
4789 *
4790 * /proc/net/psched
4791 * -----------------------------------
4792 * [1] 000c8000 000f4240 000f4240 00000064
4793 * [2] 000003e8 00000400 000f4240 3b9aca00
4794 * [3] 000003e8 00000400 000f4240 3b9aca00
4795 * [4] 000003e8 00000400 000f4240 00000064
4796 * [5] 000003e8 00000040 000f4240 3b9aca00
4797 * [6] 000003e8 00000040 000f4240 000000f9
4798 *
4799 * a b c d ticks_per_s buffer_hz
4800 * ------- --------- ---------- ------------- ----------- -------------
4801 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4802 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4803 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4804 * [4] 1,000 1,024 1,000,000 100 976,562 100
4805 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4806 * [6] 1,000 64 1,000,000 249 15,625,000 249
4807 *
4808 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4809 * [2] 2.6.26-1-686-bigmem from Debian lenny
4810 * [3] 2.6.26-2-sparc64 from Debian lenny
4811 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4812 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4813 * [6] 2.6.34 from kernel.org on KVM
4814 */
23882115 4815 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4816 static const char fn[] = "/proc/net/psched";
4817 unsigned int a, b, c, d;
4818 FILE *stream;
4819
23882115
BP
4820 if (!ovsthread_once_start(&once)) {
4821 return;
4822 }
4823
c1c9c9c4
BP
4824 ticks_per_s = 1.0;
4825 buffer_hz = 100;
4826
4827 stream = fopen(fn, "r");
4828 if (!stream) {
10a89ef0 4829 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4830 goto exit;
c1c9c9c4
BP
4831 }
4832
4833 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4834 VLOG_WARN("%s: read failed", fn);
4835 fclose(stream);
23882115 4836 goto exit;
c1c9c9c4
BP
4837 }
4838 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4839 fclose(stream);
4840
4841 if (!a || !c) {
4842 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4843 goto exit;
c1c9c9c4
BP
4844 }
4845
4846 ticks_per_s = (double) a * c / b;
4847 if (c == 1000000) {
4848 buffer_hz = d;
4849 } else {
4850 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4851 fn, a, b, c, d);
4852 }
4853 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4854
4855exit:
4856 ovsthread_once_done(&once);
c1c9c9c4
BP
4857}
4858
4859/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4860 * rate of 'rate' bytes per second. */
4861static unsigned int
4862tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4863{
23882115 4864 read_psched();
c1c9c9c4
BP
4865 return (rate * ticks) / ticks_per_s;
4866}
4867
4868/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4869 * rate of 'rate' bytes per second. */
4870static unsigned int
4871tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4872{
23882115 4873 read_psched();
015c93a4 4874 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4875}
4876
4877/* Returns the number of bytes that need to be reserved for qdisc buffering at
4878 * a transmission rate of 'rate' bytes per second. */
4879static unsigned int
4880tc_buffer_per_jiffy(unsigned int rate)
4881{
23882115 4882 read_psched();
c1c9c9c4
BP
4883 return rate / buffer_hz;
4884}
4885
4886/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4887 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4888 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4889 * stores NULL into it if it is absent.
4890 *
4891 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4892 * 'msg'.
4893 *
4894 * Returns 0 if successful, otherwise a positive errno value. */
4895static int
4896tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4897 struct nlattr **options)
4898{
4899 static const struct nl_policy tca_policy[] = {
4900 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4901 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4902 };
4903 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4904
4905 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4906 tca_policy, ta, ARRAY_SIZE(ta))) {
4907 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4908 goto error;
4909 }
4910
4911 if (kind) {
4912 *kind = nl_attr_get_string(ta[TCA_KIND]);
4913 }
4914
4915 if (options) {
4916 *options = ta[TCA_OPTIONS];
4917 }
4918
4919 return 0;
4920
4921error:
4922 if (kind) {
4923 *kind = NULL;
4924 }
4925 if (options) {
4926 *options = NULL;
4927 }
4928 return EPROTO;
4929}
4930
4931/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4932 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4933 * into '*options', and its queue statistics into '*stats'. Any of the output
4934 * arguments may be null.
4935 *
4936 * Returns 0 if successful, otherwise a positive errno value. */
4937static int
4938tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
4939 struct nlattr **options, struct netdev_queue_stats *stats)
4940{
4941 static const struct nl_policy tca_policy[] = {
4942 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
4943 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
4944 };
4945 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4946
4947 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4948 tca_policy, ta, ARRAY_SIZE(ta))) {
4949 VLOG_WARN_RL(&rl, "failed to parse class message");
4950 goto error;
4951 }
4952
4953 if (handlep) {
4954 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
4955 *handlep = tc->tcm_handle;
4956 }
4957
4958 if (options) {
4959 *options = ta[TCA_OPTIONS];
4960 }
4961
4962 if (stats) {
4963 const struct gnet_stats_queue *gsq;
4964 struct gnet_stats_basic gsb;
4965
4966 static const struct nl_policy stats_policy[] = {
4967 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
4968 .min_len = sizeof gsb },
4969 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
4970 .min_len = sizeof *gsq },
4971 };
4972 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
4973
4974 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
4975 sa, ARRAY_SIZE(sa))) {
4976 VLOG_WARN_RL(&rl, "failed to parse class stats");
4977 goto error;
4978 }
4979
4980 /* Alignment issues screw up the length of struct gnet_stats_basic on
4981 * some arch/bitsize combinations. Newer versions of Linux have a
4982 * struct gnet_stats_basic_packed, but we can't depend on that. The
4983 * easiest thing to do is just to make a copy. */
4984 memset(&gsb, 0, sizeof gsb);
4985 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
4986 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
4987 stats->tx_bytes = gsb.bytes;
4988 stats->tx_packets = gsb.packets;
4989
4990 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
4991 stats->tx_errors = gsq->drops;
4992 }
4993
4994 return 0;
4995
4996error:
4997 if (options) {
4998 *options = NULL;
4999 }
5000 if (stats) {
5001 memset(stats, 0, sizeof *stats);
5002 }
5003 return EPROTO;
5004}
5005
5006/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5007 * on 'netdev'. */
5008static int
5009tc_query_class(const struct netdev *netdev,
5010 unsigned int handle, unsigned int parent,
5011 struct ofpbuf **replyp)
5012{
5013 struct ofpbuf request;
5014 struct tcmsg *tcmsg;
5015 int error;
5016
5017 tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
23a98ffe
BP
5018 if (!tcmsg) {
5019 return ENODEV;
5020 }
c1c9c9c4
BP
5021 tcmsg->tcm_handle = handle;
5022 tcmsg->tcm_parent = parent;
5023
5024 error = tc_transact(&request, replyp);
5025 if (error) {
5026 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5027 netdev_get_name(netdev),
5028 tc_get_major(handle), tc_get_minor(handle),
5029 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5030 ovs_strerror(error));
c1c9c9c4
BP
5031 }
5032 return error;
5033}
5034
5035/* Equivalent to "tc class del dev <name> handle <handle>". */
5036static int
5037tc_delete_class(const struct netdev *netdev, unsigned int handle)
5038{
5039 struct ofpbuf request;
5040 struct tcmsg *tcmsg;
5041 int error;
5042
5043 tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5044 if (!tcmsg) {
5045 return ENODEV;
5046 }
c1c9c9c4
BP
5047 tcmsg->tcm_handle = handle;
5048 tcmsg->tcm_parent = 0;
5049
5050 error = tc_transact(&request, NULL);
5051 if (error) {
5052 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5053 netdev_get_name(netdev),
5054 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5055 ovs_strerror(error));
c1c9c9c4
BP
5056 }
5057 return error;
5058}
5059
5060/* Equivalent to "tc qdisc del dev <name> root". */
5061static int
b5d57fc8 5062tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5063{
b5d57fc8 5064 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5065 struct ofpbuf request;
5066 struct tcmsg *tcmsg;
5067 int error;
5068
b5d57fc8 5069 tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5070 if (!tcmsg) {
5071 return ENODEV;
5072 }
c1c9c9c4
BP
5073 tcmsg->tcm_handle = tc_make_handle(1, 0);
5074 tcmsg->tcm_parent = TC_H_ROOT;
5075
5076 error = tc_transact(&request, NULL);
5077 if (error == EINVAL) {
5078 /* EINVAL probably means that the default qdisc was in use, in which
5079 * case we've accomplished our purpose. */
5080 error = 0;
5081 }
b5d57fc8
BP
5082 if (!error && netdev->tc) {
5083 if (netdev->tc->ops->tc_destroy) {
5084 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5085 }
b5d57fc8 5086 netdev->tc = NULL;
c1c9c9c4
BP
5087 }
5088 return error;
5089}
5090
ac3e3aaa
BP
5091static bool
5092getqdisc_is_safe(void)
5093{
5094 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5095 static bool safe = false;
5096
5097 if (ovsthread_once_start(&once)) {
5098 struct utsname utsname;
5099 int major, minor;
5100
5101 if (uname(&utsname) == -1) {
5102 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5103 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5104 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5105 } else if (major < 2 || (major == 2 && minor < 35)) {
5106 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5107 utsname.release);
5108 } else {
5109 safe = true;
5110 }
5111 ovsthread_once_done(&once);
5112 }
5113 return safe;
5114}
5115
c1c9c9c4
BP
5116/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5117 * kernel to determine what they are. Returns 0 if successful, otherwise a
5118 * positive errno value. */
5119static int
b5d57fc8 5120tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5121{
b5d57fc8 5122 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5123 struct ofpbuf request, *qdisc;
5124 const struct tc_ops *ops;
5125 struct tcmsg *tcmsg;
5126 int load_error;
5127 int error;
5128
b5d57fc8 5129 if (netdev->tc) {
c1c9c9c4
BP
5130 return 0;
5131 }
5132
5133 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5134 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5135 * 2.6.35 without that fix backported to it.
5136 *
5137 * To avoid the OOPS, we must not make a request that would attempt to dump
5138 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5139 * few others. There are a few ways that I can see to do this, but most of
5140 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5141 * technique chosen here is to assume that any non-default qdisc that we
5142 * create will have a class with handle 1:0. The built-in qdiscs only have
5143 * a class with handle 0:0.
5144 *
ac3e3aaa
BP
5145 * On Linux 2.6.35+ we use the straightforward method because it allows us
5146 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5147 * in such a case we get no response at all from the kernel (!) if a
5148 * builtin qdisc is in use (which is later caught by "!error &&
5149 * !qdisc->size"). */
b5d57fc8 5150 tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
23a98ffe
BP
5151 if (!tcmsg) {
5152 return ENODEV;
5153 }
ac3e3aaa
BP
5154 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5155 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5156
5157 /* Figure out what tc class to instantiate. */
5158 error = tc_transact(&request, &qdisc);
ac3e3aaa 5159 if (!error && qdisc->size) {
c1c9c9c4
BP
5160 const char *kind;
5161
5162 error = tc_parse_qdisc(qdisc, &kind, NULL);
5163 if (error) {
5164 ops = &tc_ops_other;
5165 } else {
5166 ops = tc_lookup_linux_name(kind);
5167 if (!ops) {
5168 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5169 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5170
5171 ops = &tc_ops_other;
5172 }
5173 }
ac3e3aaa
BP
5174 } else if ((!error && !qdisc->size) || error == ENOENT) {
5175 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5176 * set up by some other entity that doesn't have a handle 1:0. We will
5177 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5178 ops = &tc_ops_default;
5179 error = 0;
5180 } else {
5181 /* Who knows? Maybe the device got deleted. */
5182 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5183 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5184 ops = &tc_ops_other;
5185 }
5186
5187 /* Instantiate it. */
b5d57fc8
BP
5188 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5189 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5190 ofpbuf_delete(qdisc);
5191
5192 return error ? error : load_error;
5193}
5194
5195/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5196 approximate the time to transmit packets of various lengths. For an MTU of
5197 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5198 represents two possible packet lengths; for a MTU of 513 through 1024, four
5199 possible lengths; and so on.
5200
5201 Returns, for the specified 'mtu', the number of bits that packet lengths
5202 need to be shifted right to fit within such a 256-entry table. */
5203static int
5204tc_calc_cell_log(unsigned int mtu)
5205{
5206 int cell_log;
5207
5208 if (!mtu) {
5209 mtu = ETH_PAYLOAD_MAX;
5210 }
5211 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5212
5213 for (cell_log = 0; mtu >= 256; cell_log++) {
5214 mtu >>= 1;
5215 }
5216
5217 return cell_log;
5218}
5219
5220/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5221 * of 'mtu'. */
5222static void
5223tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5224{
5225 memset(rate, 0, sizeof *rate);
5226 rate->cell_log = tc_calc_cell_log(mtu);
5227 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5228 /* rate->cell_align = 0; */ /* distro headers. */
5229 rate->mpu = ETH_TOTAL_MIN;
5230 rate->rate = Bps;
5231}
5232
5233/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5234 * attribute of the specified "type".
5235 *
5236 * See tc_calc_cell_log() above for a description of "rtab"s. */
5237static void
5238tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5239{
5240 uint32_t *rtab;
5241 unsigned int i;
5242
5243 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5244 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5245 unsigned packet_size = (i + 1) << rate->cell_log;
5246 if (packet_size < rate->mpu) {
5247 packet_size = rate->mpu;
5248 }
5249 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5250 }
5251}
5252
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, for a
 * rate of 'Bps' bytes per second, the given 'mtu', and a burst size of
 * 'burst_bytes' requested by the user.  (Pass 0 for 'burst_bytes' if no
 * value was requested.)
 *
 * The burst is never taken to be smaller than one jiffy's worth of buffering
 * at 'Bps' plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t bytes = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, bytes);
}
d3980822 5263\f
aaf2fb1a
BP
5264/* Linux-only functions declared in netdev-linux.h */
5265
5266/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5267 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5268int
5269netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5270 const char *flag_name, bool enable)
5271{
5272 const char *netdev_name = netdev_get_name(netdev);
5273 struct ethtool_value evalue;
5274 uint32_t new_flags;
5275 int error;
5276
ab985a77 5277 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5278 memset(&evalue, 0, sizeof evalue);
5279 error = netdev_linux_do_ethtool(netdev_name,
5280 (struct ethtool_cmd *)&evalue,
5281 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5282 if (error) {
5283 return error;
5284 }
5285
ab985a77 5286 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5287 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5288 if (new_flags == evalue.data) {
5289 return 0;
5290 }
5291 evalue.data = new_flags;
aaf2fb1a
BP
5292 error = netdev_linux_do_ethtool(netdev_name,
5293 (struct ethtool_cmd *)&evalue,
5294 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5295 if (error) {
5296 return error;
5297 }
5298
ab985a77 5299 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5300 memset(&evalue, 0, sizeof evalue);
5301 error = netdev_linux_do_ethtool(netdev_name,
5302 (struct ethtool_cmd *)&evalue,
5303 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5304 if (error) {
5305 return error;
5306 }
5307
5308 if (new_flags != evalue.data) {
5309 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5310 "device %s failed", enable ? "enable" : "disable",
5311 flag_name, netdev_name);
5312 return EOPNOTSUPP;
5313 }
5314
5315 return 0;
5316}
5317\f
5318/* Utility functions. */
5319
d3980822 5320/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5321static void
d3980822
BP
5322netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5323 const struct rtnl_link_stats *src)
5324{
f613a0d7
PS
5325 dst->rx_packets = src->rx_packets;
5326 dst->tx_packets = src->tx_packets;
5327 dst->rx_bytes = src->rx_bytes;
5328 dst->tx_bytes = src->tx_bytes;
5329 dst->rx_errors = src->rx_errors;
5330 dst->tx_errors = src->tx_errors;
5331 dst->rx_dropped = src->rx_dropped;
5332 dst->tx_dropped = src->tx_dropped;
5333 dst->multicast = src->multicast;
5334 dst->collisions = src->collisions;
5335 dst->rx_length_errors = src->rx_length_errors;
5336 dst->rx_over_errors = src->rx_over_errors;
5337 dst->rx_crc_errors = src->rx_crc_errors;
5338 dst->rx_frame_errors = src->rx_frame_errors;
5339 dst->rx_fifo_errors = src->rx_fifo_errors;
5340 dst->rx_missed_errors = src->rx_missed_errors;
5341 dst->tx_aborted_errors = src->tx_aborted_errors;
5342 dst->tx_carrier_errors = src->tx_carrier_errors;
5343 dst->tx_fifo_errors = src->tx_fifo_errors;
5344 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5345 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5346}
5347
337c9b99
BP
5348/* Copies 'src' into 'dst', performing format conversion in the process. */
5349static void
5350netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5351 const struct rtnl_link_stats64 *src)
5352{
5353 dst->rx_packets = src->rx_packets;
5354 dst->tx_packets = src->tx_packets;
5355 dst->rx_bytes = src->rx_bytes;
5356 dst->tx_bytes = src->tx_bytes;
5357 dst->rx_errors = src->rx_errors;
5358 dst->tx_errors = src->tx_errors;
5359 dst->rx_dropped = src->rx_dropped;
5360 dst->tx_dropped = src->tx_dropped;
5361 dst->multicast = src->multicast;
5362 dst->collisions = src->collisions;
5363 dst->rx_length_errors = src->rx_length_errors;
5364 dst->rx_over_errors = src->rx_over_errors;
5365 dst->rx_crc_errors = src->rx_crc_errors;
5366 dst->rx_frame_errors = src->rx_frame_errors;
5367 dst->rx_fifo_errors = src->rx_fifo_errors;
5368 dst->rx_missed_errors = src->rx_missed_errors;
5369 dst->tx_aborted_errors = src->tx_aborted_errors;
5370 dst->tx_carrier_errors = src->tx_carrier_errors;
5371 dst->tx_fifo_errors = src->tx_fifo_errors;
5372 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5373 dst->tx_window_errors = src->tx_window_errors;
5374}
5375
c1c9c9c4 5376static int
35eef899 5377get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5378{
c1c9c9c4
BP
5379 struct ofpbuf request;
5380 struct ofpbuf *reply;
c1c9c9c4
BP
5381 int error;
5382
d6e3feb5 5383 /* Filtering all counters by default */
5384 memset(stats, 0xFF, sizeof(struct netdev_stats));
5385
c1c9c9c4 5386 ofpbuf_init(&request, 0);
13a24df8
BP
5387 nl_msg_put_nlmsghdr(&request,
5388 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5389 RTM_GETLINK, NLM_F_REQUEST);
5390 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5391 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5392 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5393 ofpbuf_uninit(&request);
5394 if (error) {
5395 return error;
5396 }
5397
13a24df8 5398 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5399 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5400 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5401 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5402 error = 0;
5403 } else {
337c9b99
BP
5404 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
5405 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5406 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5407 error = 0;
5408 } else {
5409 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5410 error = EPROTO;
5411 }
13a24df8
BP
5412 }
5413 } else {
5414 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5415 error = EPROTO;
c1c9c9c4 5416 }
8b61709d 5417
8b61709d 5418
576e26d7 5419 ofpbuf_delete(reply);
35eef899 5420 return error;
8b61709d 5421}
c1c9c9c4 5422
3a183124 5423static int
b5d57fc8 5424get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5425{
5426 struct ifreq ifr;
5427 int error;
5428
755be9ea 5429 *flags = 0;
259e0b1a 5430 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5431 if (!error) {
5432 *flags = ifr.ifr_flags;
5433 }
8b61709d
BP
5434 return error;
5435}
5436
5437static int
4b609110 5438set_flags(const char *name, unsigned int flags)
8b61709d
BP
5439{
5440 struct ifreq ifr;
5441
5442 ifr.ifr_flags = flags;
259e0b1a 5443 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5444}
5445
5446static int
5447do_get_ifindex(const char *netdev_name)
5448{
5449 struct ifreq ifr;
259e0b1a 5450 int error;
8b61709d 5451
71d7c22f 5452 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5453 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5454
5455 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5456 if (error) {
8b61709d 5457 VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
259e0b1a
BP
5458 netdev_name, ovs_strerror(error));
5459 return -error;
8b61709d
BP
5460 }
5461 return ifr.ifr_ifindex;
5462}
5463
5464static int
5465get_ifindex(const struct netdev *netdev_, int *ifindexp)
5466{
b5d57fc8 5467 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5468
b5d57fc8 5469 if (!(netdev->cache_valid & VALID_IFINDEX)) {
8b61709d 5470 int ifindex = do_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5471
8b61709d 5472 if (ifindex < 0) {
b5d57fc8
BP
5473 netdev->get_ifindex_error = -ifindex;
5474 netdev->ifindex = 0;
c7b1b0a5 5475 } else {
b5d57fc8
BP
5476 netdev->get_ifindex_error = 0;
5477 netdev->ifindex = ifindex;
8b61709d 5478 }
b5d57fc8 5479 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5480 }
c7b1b0a5 5481
b5d57fc8
BP
5482 *ifindexp = netdev->ifindex;
5483 return netdev->get_ifindex_error;
8b61709d
BP
5484}
5485
5486static int
74ff3298 5487get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5488{
5489 struct ifreq ifr;
5490 int hwaddr_family;
259e0b1a 5491 int error;
8b61709d
BP
5492
5493 memset(&ifr, 0, sizeof ifr);
71d7c22f 5494 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5495 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5496 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5497 if (error) {
78857dfb
BP
5498 /* ENODEV probably means that a vif disappeared asynchronously and
5499 * hasn't been removed from the database yet, so reduce the log level
5500 * to INFO for that case. */
259e0b1a 5501 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5502 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5503 netdev_name, ovs_strerror(error));
5504 return error;
8b61709d
BP
5505 }
5506 hwaddr_family = ifr.ifr_hwaddr.sa_family;
5507 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
c9697f35 5508 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5509 netdev_name, hwaddr_family);
c9697f35 5510 return EINVAL;
8b61709d
BP
5511 }
5512 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5513 return 0;
5514}
5515
5516static int
74ff3298 5517set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5518{
5519 struct ifreq ifr;
259e0b1a 5520 int error;
8b61709d
BP
5521
5522 memset(&ifr, 0, sizeof ifr);
71d7c22f 5523 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5524 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5525 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5526 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5527 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5528 if (error) {
8b61709d 5529 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5530 netdev_name, ovs_strerror(error));
8b61709d 5531 }
259e0b1a 5532 return error;
8b61709d
BP
5533}
5534
5535static int
0b0544d7 5536netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5537 int cmd, const char *cmd_name)
5538{
5539 struct ifreq ifr;
259e0b1a 5540 int error;
8b61709d
BP
5541
5542 memset(&ifr, 0, sizeof ifr);
71d7c22f 5543 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5544 ifr.ifr_data = (caddr_t) ecmd;
5545
5546 ecmd->cmd = cmd;
259e0b1a
BP
5547 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5548 if (error) {
5549 if (error != EOPNOTSUPP) {
8b61709d 5550 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5551 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5552 } else {
5553 /* The device doesn't support this operation. That's pretty
5554 * common, so there's no point in logging anything. */
5555 }
8b61709d 5556 }
259e0b1a 5557 return error;
8b61709d 5558}
f1acd62b 5559
488d734d
BP
5560/* Returns an AF_PACKET raw socket or a negative errno value. */
5561static int
5562af_packet_sock(void)
5563{
23882115
BP
5564 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5565 static int sock;
488d734d 5566
23882115 5567 if (ovsthread_once_start(&once)) {
488d734d
BP
5568 sock = socket(AF_PACKET, SOCK_RAW, 0);
5569 if (sock >= 0) {
8450059e
BP
5570 int error = set_nonblocking(sock);
5571 if (error) {
5572 close(sock);
5573 sock = -error;
5574 }
488d734d
BP
5575 } else {
5576 sock = -errno;
10a89ef0
BP
5577 VLOG_ERR("failed to create packet socket: %s",
5578 ovs_strerror(errno));
488d734d 5579 }
23882115 5580 ovsthread_once_done(&once);
488d734d
BP
5581 }
5582
5583 return sock;
5584}