]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netdev-linux.c
netdev-linux: use netlink to update netdev.
[mirror_ovs.git] / lib / netdev-linux.c
CommitLineData
e9e28be3 1/*
48c6733c 2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
e9e28be3
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
d3980822
BP
18
19#include "netdev-linux.h"
20
e9e28be3 21#include <errno.h>
8b61709d 22#include <fcntl.h>
b2befd5b
BP
23#include <sys/types.h>
24#include <netinet/in.h>
55bc98d6 25#include <arpa/inet.h>
8b61709d 26#include <inttypes.h>
32383c3b 27#include <linux/filter.h>
c1c9c9c4 28#include <linux/gen_stats.h>
bb7d0e22 29#include <linux/if_ether.h>
8b61709d
BP
30#include <linux/if_tun.h>
31#include <linux/types.h>
32#include <linux/ethtool.h>
63331829 33#include <linux/mii.h>
ef3767f5 34#include <linux/rtnetlink.h>
8b61709d 35#include <linux/sockios.h>
8b61709d
BP
36#include <sys/ioctl.h>
37#include <sys/socket.h>
ac3e3aaa 38#include <sys/utsname.h>
55bc98d6 39#include <netpacket/packet.h>
8b61709d
BP
40#include <net/if.h>
41#include <net/if_arp.h>
55bc98d6 42#include <net/if_packet.h>
8b61709d 43#include <net/route.h>
e9e28be3 44#include <poll.h>
8b61709d
BP
45#include <stdlib.h>
46#include <string.h>
47#include <unistd.h>
e9e28be3
BP
48
49#include "coverage.h"
e14deea0 50#include "dp-packet.h"
93451a0a 51#include "dpif-netlink.h"
df1e5a3b 52#include "dpif-netdev.h"
3e8a2ad1 53#include "openvswitch/dynamic-string.h"
8b61709d 54#include "fatal-signal.h"
93b13be8 55#include "hash.h"
ee89ea7b 56#include "openvswitch/hmap.h"
8b61709d 57#include "netdev-provider.h"
18ebd48c 58#include "netdev-tc-offloads.h"
7fbef77a 59#include "netdev-vport.h"
45c8d3a1 60#include "netlink-notifier.h"
2fe27d5a 61#include "netlink-socket.h"
c060c4cf 62#include "netlink.h"
bfda5239 63#include "netnsid.h"
64c96779 64#include "openvswitch/ofpbuf.h"
8b61709d 65#include "openflow/openflow.h"
19c8e9c1 66#include "ovs-atomic.h"
8b61709d 67#include "packets.h"
fd016ae3 68#include "openvswitch/poll-loop.h"
7e9dcc0f 69#include "rtnetlink.h"
ee89ea7b 70#include "openvswitch/shash.h"
c060c4cf 71#include "socket-util.h"
19993ef3 72#include "sset.h"
c1c5c723 73#include "tc.h"
1670c579 74#include "timer.h"
c060c4cf 75#include "unaligned.h"
e6211adc 76#include "openvswitch/vlog.h"
ee89ea7b 77#include "util.h"
5136ce49 78
d98e6007 79VLOG_DEFINE_THIS_MODULE(netdev_linux);
d76f09ea 80
d76f09ea
BP
81COVERAGE_DEFINE(netdev_set_policing);
82COVERAGE_DEFINE(netdev_arp_lookup);
83COVERAGE_DEFINE(netdev_get_ifindex);
84COVERAGE_DEFINE(netdev_get_hwaddr);
85COVERAGE_DEFINE(netdev_set_hwaddr);
ab985a77
BP
86COVERAGE_DEFINE(netdev_get_ethtool);
87COVERAGE_DEFINE(netdev_set_ethtool);
4f925bd3 88
8b61709d 89\f
756819dd
FL
90#ifndef IFLA_IF_NETNSID
91#define IFLA_IF_NETNSID 0x45
92#endif
8b61709d
BP
93/* These were introduced in Linux 2.6.14, so they might be missing if we have
94 * old headers. */
95#ifndef ADVERTISED_Pause
96#define ADVERTISED_Pause (1 << 13)
97#endif
98#ifndef ADVERTISED_Asym_Pause
99#define ADVERTISED_Asym_Pause (1 << 14)
100#endif
101
e47bd51a
JP
102/* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104#ifndef ETHTOOL_GFLAGS
105#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
106#endif
107#ifndef ETHTOOL_SFLAGS
108#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
109#endif
110
c1c9c9c4
BP
111/* This was introduced in Linux 2.6.25, so it might be missing if we have old
112 * headers. */
113#ifndef TC_RTAB_SIZE
114#define TC_RTAB_SIZE 1024
115#endif
116
b73c8518
SH
117/* Linux 2.6.21 introduced struct tpacket_auxdata.
118 * Linux 2.6.27 added the tp_vlan_tci member.
119 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
120 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
121 * TP_STATUS_VLAN_TPID_VALID.
122 *
123 * With all this churn it's easiest to unconditionally define a replacement
124 * structure that has everything we want.
125 */
55bc98d6
BP
126#ifndef PACKET_AUXDATA
127#define PACKET_AUXDATA 8
128#endif
b73c8518
SH
129#ifndef TP_STATUS_VLAN_VALID
130#define TP_STATUS_VLAN_VALID (1 << 4)
131#endif
132#ifndef TP_STATUS_VLAN_TPID_VALID
133#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
134#endif
135#undef tpacket_auxdata
136#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement layout for the kernel's PACKET_AUXDATA control message payload.
 * It is defined unconditionally so that the VLAN members exist no matter
 * which kernel headers are installed. */
struct tpacket_auxdata {
    uint32_t tp_status;         /* TP_STATUS_* bits. */
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;       /* Member added by Linux 2.6.27. */
    uint16_t tp_vlan_tpid;      /* Was padding before Linux 3.13. */
};
146
0c615356
SH
147/* Linux 2.6.27 introduced ethtool_cmd_speed
148 *
149 * To avoid revisiting problems reported with using configure to detect
150 * compatibility (see report at
8a7903c6 151 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
0c615356
SH
152 * unconditionally replace ethtool_cmd_speed. */
153#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed recorded in 'ep', assembled from its low half
 * ('speed') and its high half ('speed_hi').
 *
 * 'speed_hi' is converted to uint32_t before it is shifted.  Without the
 * conversion the operand is promoted to (signed) int, and shifting a value
 * that has bit 15 set (for example 0xffff, as used for an unknown speed) left
 * by 16 overflows int, which is undefined behavior. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t high = ep->speed_hi;
    uint32_t low = ep->speed;

    return (high << 16) | low;
}
158
67bed84c
SH
159/* Linux 2.6.30 introduced supported and advertised flags for
160 * 1G base KX, and 10G base KX4, KR and R. */
161#ifndef SUPPORTED_1000baseKX_Full
162#define SUPPORTED_1000baseKX_Full (1 << 17)
163#define SUPPORTED_10000baseKX4_Full (1 << 18)
164#define SUPPORTED_10000baseKR_Full (1 << 19)
165#define SUPPORTED_10000baseR_FEC (1 << 20)
166#define ADVERTISED_1000baseKX_Full (1 << 17)
167#define ADVERTISED_10000baseKX4_Full (1 << 18)
168#define ADVERTISED_10000baseKR_Full (1 << 19)
169#define ADVERTISED_10000baseR_FEC (1 << 20)
170#endif
171
172/* Linux 3.5 introduced supported and advertised flags for
173 * 40G base KR4, CR4, SR4 and LR4. */
174#ifndef SUPPORTED_40000baseKR4_Full
175#define SUPPORTED_40000baseKR4_Full (1 << 23)
176#define SUPPORTED_40000baseCR4_Full (1 << 24)
177#define SUPPORTED_40000baseSR4_Full (1 << 25)
178#define SUPPORTED_40000baseLR4_Full (1 << 26)
179#define ADVERTISED_40000baseKR4_Full (1 << 23)
180#define ADVERTISED_40000baseCR4_Full (1 << 24)
181#define ADVERTISED_40000baseSR4_Full (1 << 25)
182#define ADVERTISED_40000baseLR4_Full (1 << 26)
183#endif
184
fa373af4
BP
185/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
186 *
187 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
188 * 2.6.32-431.29.2.el6.x86_64 (see report at
8a7903c6
JP
189 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
190 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
fa373af4
BP
191 * unconditionally define a replacement. */
192#ifndef IFLA_STATS64
337c9b99 193#define IFLA_STATS64 23
fa373af4
BP
194#endif
195#define rtnl_link_stats64 rpl_rtnl_link_stats64
337c9b99
BP
/* Replacement for the kernel's 64-bit link statistics structure, carried in
 * the IFLA_STATS64 attribute.  Every member is a 64-bit counter; the member
 * order must not change because it mirrors the kernel's layout. */
struct rtnl_link_stats64 {
    /* Totals. */
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    /* Receive error breakdown. */
    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    /* Transmit error breakdown. */
    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    /* Compressed traffic counters. */
    uint64_t rx_compressed;
    uint64_t tx_compressed;
};
337c9b99 224
/* Bits for the 'cache_valid' member of struct netdev_linux.  A set bit means
 * that the corresponding cached value (or cached error) may be used without
 * querying the kernel again. */
enum {
    VALID_IFINDEX           = 1 << 0,   /* 'ifindex'. */
    VALID_ETHERADDR         = 1 << 1,   /* 'etheraddr'. */
    VALID_IN                = 1 << 2,   /* IP addresses. */
    VALID_MTU               = 1 << 3,   /* 'mtu'. */
    VALID_POLICING          = 1 << 4,
    VALID_VPORT_STAT_ERROR  = 1 << 5,
    VALID_DRVINFO           = 1 << 6,   /* 'drvinfo'. */
    VALID_FEATURES          = 1 << 7,
};
c1c9c9c4
BP
235\f
236/* Traffic control. */
237
238/* An instance of a traffic control class. Always associated with a particular
93b13be8
BP
239 * network device.
240 *
241 * Each TC implementation subclasses this with whatever additional data it
242 * needs. */
c1c9c9c4
BP
243struct tc {
244 const struct tc_ops *ops;
93b13be8
BP
245 struct hmap queues; /* Contains "struct tc_queue"s.
246 * Read by generic TC layer.
247 * Written only by TC implementation. */
248};
c1c9c9c4 249
559eb230
BP
250#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
251
93b13be8
BP
252/* One traffic control queue.
253 *
254 * Each TC implementation subclasses this with whatever additional data it
255 * needs. */
256struct tc_queue {
257 struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
258 unsigned int queue_id; /* OpenFlow queue ID. */
6dc34a0d 259 long long int created; /* Time queue was created, in msecs. */
c1c9c9c4
BP
260};
261
262/* A particular kind of traffic control. Each implementation generally maps to
263 * one particular Linux qdisc class.
264 *
265 * The functions below return 0 if successful or a positive errno value on
266 * failure, except where otherwise noted. All of them must be provided, except
267 * where otherwise noted. */
268struct tc_ops {
269 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
270 * This is null for tc_ops_default and tc_ops_other, for which there are no
271 * appropriate values. */
272 const char *linux_name;
273
274 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
275 const char *ovs_name;
276
277 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
278 * queues. The queues are numbered 0 through n_queues - 1. */
279 unsigned int n_queues;
280
281 /* Called to install this TC class on 'netdev'. The implementation should
282 * make the Netlink calls required to set up 'netdev' with the right qdisc
283 * and configure it according to 'details'. The implementation may assume
284 * that the current qdisc is the default; that is, there is no need for it
285 * to delete the current qdisc before installing itself.
286 *
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
289 * (which is built as ovs-vswitchd.conf.db(8)).
290 *
291 * This function must return 0 if and only if it sets 'netdev->tc' to an
292 * initialized 'struct tc'.
293 *
294 * (This function is null for tc_ops_other, which cannot be installed. For
295 * other TC classes it should always be nonnull.) */
79f1cbe9 296 int (*tc_install)(struct netdev *netdev, const struct smap *details);
c1c9c9c4
BP
297
298 /* Called when the netdev code determines (through a Netlink query) that
299 * this TC class's qdisc is installed on 'netdev', but we didn't install
300 * it ourselves and so don't know any of the details.
301 *
302 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
303 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
304 * implementation should parse the other attributes of 'nlmsg' as
305 * necessary to determine its configuration. If necessary it should also
306 * use Netlink queries to determine the configuration of queues on
307 * 'netdev'.
308 *
309 * This function must return 0 if and only if it sets 'netdev->tc' to an
310 * initialized 'struct tc'. */
311 int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
312
313 /* Destroys the data structures allocated by the implementation as part of
314 * 'tc'. (This includes destroying 'tc->queues' by calling
315 * tc_destroy(tc).
316 *
317 * The implementation should not need to perform any Netlink calls. If
318 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
319 * (But it may not be desirable.)
320 *
321 * This function may be null if 'tc' is trivial. */
322 void (*tc_destroy)(struct tc *tc);
323
324 /* Retrieves details of 'netdev->tc' configuration into 'details'.
325 *
326 * The implementation should not need to perform any Netlink calls, because
327 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
328 * cached the configuration.
329 *
330 * The contents of 'details' should be documented as valid for 'ovs_name'
331 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
332 * (which is built as ovs-vswitchd.conf.db(8)).
333 *
334 * This function may be null if 'tc' is not configurable.
335 */
79f1cbe9 336 int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
c1c9c9c4
BP
337
338 /* Reconfigures 'netdev->tc' according to 'details', performing any
339 * required Netlink calls to complete the reconfiguration.
340 *
341 * The contents of 'details' should be documented as valid for 'ovs_name'
342 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
343 * (which is built as ovs-vswitchd.conf.db(8)).
344 *
345 * This function may be null if 'tc' is not configurable.
346 */
79f1cbe9 347 int (*qdisc_set)(struct netdev *, const struct smap *details);
c1c9c9c4 348
93b13be8
BP
349 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
350 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
351 *
352 * The contents of 'details' should be documented as valid for 'ovs_name'
353 * in the "other_config" column in the "Queue" table in
354 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
355 *
356 * The implementation should not need to perform any Netlink calls, because
357 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
358 * cached the queue configuration.
359 *
360 * This function may be null if 'tc' does not have queues ('n_queues' is
361 * 0). */
93b13be8 362 int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
79f1cbe9 363 struct smap *details);
c1c9c9c4
BP
364
365 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
366 * 'details', perfoming any required Netlink calls to complete the
367 * reconfiguration. The caller ensures that 'queue_id' is less than
368 * 'n_queues'.
369 *
370 * The contents of 'details' should be documented as valid for 'ovs_name'
371 * in the "other_config" column in the "Queue" table in
372 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
373 *
374 * This function may be null if 'tc' does not have queues or its queues are
375 * not configurable. */
376 int (*class_set)(struct netdev *, unsigned int queue_id,
79f1cbe9 377 const struct smap *details);
c1c9c9c4 378
93b13be8
BP
379 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
380 * tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
381 *
382 * This function may be null if 'tc' does not have queues or its queues
383 * cannot be deleted. */
93b13be8 384 int (*class_delete)(struct netdev *, struct tc_queue *queue);
c1c9c9c4 385
93b13be8
BP
386 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
387 * 'struct tc_queue's within 'netdev->tc->queues'.
c1c9c9c4
BP
388 *
389 * On success, initializes '*stats'.
390 *
391 * This function may be null if 'tc' does not have queues or if it cannot
392 * report queue statistics. */
93b13be8
BP
393 int (*class_get_stats)(const struct netdev *netdev,
394 const struct tc_queue *queue,
c1c9c9c4
BP
395 struct netdev_queue_stats *stats);
396
397 /* Extracts queue stats from 'nlmsg', which is a response to a
398 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
399 *
400 * This function may be null if 'tc' does not have queues or if it cannot
401 * report queue statistics. */
402 int (*class_dump_stats)(const struct netdev *netdev,
403 const struct ofpbuf *nlmsg,
404 netdev_dump_queue_stats_cb *cb, void *aux);
405};
406
407static void
408tc_init(struct tc *tc, const struct tc_ops *ops)
409{
410 tc->ops = ops;
93b13be8 411 hmap_init(&tc->queues);
c1c9c9c4
BP
412}
413
414static void
415tc_destroy(struct tc *tc)
416{
93b13be8 417 hmap_destroy(&tc->queues);
c1c9c9c4
BP
418}
419
420static const struct tc_ops tc_ops_htb;
a339aa81 421static const struct tc_ops tc_ops_hfsc;
677d9158
JV
422static const struct tc_ops tc_ops_codel;
423static const struct tc_ops tc_ops_fqcodel;
424static const struct tc_ops tc_ops_sfq;
c1c9c9c4 425static const struct tc_ops tc_ops_default;
6cf888b8 426static const struct tc_ops tc_ops_noop;
c1c9c9c4
BP
427static const struct tc_ops tc_ops_other;
428
559eb230 429static const struct tc_ops *const tcs[] = {
c1c9c9c4 430 &tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
a339aa81 431 &tc_ops_hfsc, /* Hierarchical fair service curve. */
677d9158
JV
432 &tc_ops_codel, /* Controlled delay */
433 &tc_ops_fqcodel, /* Fair queue controlled delay */
434 &tc_ops_sfq, /* Stochastic fair queueing */
6cf888b8 435 &tc_ops_noop, /* Non operating qos type. */
c1c9c9c4
BP
436 &tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
437 &tc_ops_other, /* Some other qdisc. */
438 NULL
439};
149f577a 440
c1c9c9c4
BP
441static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
442static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
443static unsigned int tc_buffer_per_jiffy(unsigned int rate);
444
7874bdff
RD
445static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
446 int type,
447 unsigned int flags,
448 struct ofpbuf *);
c7952afb
BP
449static int tc_add_policer(struct netdev *,
450 uint32_t kbits_rate, uint32_t kbits_burst);
c1c9c9c4
BP
451
452static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
453 struct nlattr **options);
454static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
455 struct nlattr **options,
456 struct netdev_queue_stats *);
457static int tc_query_class(const struct netdev *,
458 unsigned int handle, unsigned int parent,
459 struct ofpbuf **replyp);
460static int tc_delete_class(const struct netdev *, unsigned int handle);
461
462static int tc_del_qdisc(struct netdev *netdev);
463static int tc_query_qdisc(const struct netdev *netdev);
464
465static int tc_calc_cell_log(unsigned int mtu);
466static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
467static void tc_put_rtab(struct ofpbuf *, uint16_t type,
468 const struct tc_ratespec *rate);
469static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
470\f
b5d57fc8
BP
471struct netdev_linux {
472 struct netdev up;
149f577a 473
86383816
BP
474 /* Protects all members below. */
475 struct ovs_mutex mutex;
476
149f577a 477 unsigned int cache_valid;
8b61709d 478
1670c579
EJ
479 bool miimon; /* Link status of last poll. */
480 long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer;
482
bfda5239 483 int netnsid; /* Network namespace ID. */
8722022c
BP
484 /* The following are figured out "on demand" only. They are only valid
485 * when the corresponding VALID_* bit in 'cache_valid' is set. */
8b61709d 486 int ifindex;
74ff3298 487 struct eth_addr etheraddr;
8b61709d 488 int mtu;
059e5f4f 489 unsigned int ifi_flags;
65c3058c 490 long long int carrier_resets;
80a86fbe
BP
491 uint32_t kbits_rate; /* Policing data. */
492 uint32_t kbits_burst;
bba1e6f3
PS
493 int vport_stats_error; /* Cached error code from vport_get_stats().
494 0 or an errno value. */
90a6637d 495 int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
44445cac 496 int ether_addr_error; /* Cached error code from set/get etheraddr. */
c9f71668 497 int netdev_policing_error; /* Cached error code from set policing. */
51f87458 498 int get_features_error; /* Cached error code from ETHTOOL_GSET. */
c7b1b0a5 499 int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
51f87458 500
a00ca915
EJ
501 enum netdev_features current; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
90a6637d 504
4f925bd3 505 struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
c1c9c9c4 506 struct tc *tc;
149f577a 507
d0d08f8a
BP
508 /* For devices of class netdev_tap_class only. */
509 int tap_fd;
22dcb534
FL
510 bool present; /* If the device is present in the namespace */
511 uint64_t tx_dropped; /* tap device can drop if the iface is down */
8b61709d
BP
512};
513
f7791740
PS
514struct netdev_rxq_linux {
515 struct netdev_rxq up;
796223f5 516 bool is_tap;
5b7448ed 517 int fd;
149f577a 518};
8b61709d 519
8b61709d
BP
520/* This is set pretty low because we probably won't learn anything from the
521 * additional log messages. */
522static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
523
19c8e9c1
JS
524/* Polling miimon status for all ports causes performance degradation when
525 * handling a large number of ports. If there are no devices using miimon, then
812c272c
JR
526 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
527 *
528 * Readers do not depend on this variable synchronizing with the related
529 * changes in the device miimon status, so we can use atomic_count. */
530static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
19c8e9c1 531
1c33f0c3 532static void netdev_linux_run(const struct netdev_class *);
6f643e49 533
0b0544d7 534static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
8b61709d 535 int cmd, const char *cmd_name);
b5d57fc8 536static int get_flags(const struct netdev *, unsigned int *flags);
4b609110 537static int set_flags(const char *, unsigned int flags);
4f9f3f21
BP
538static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
539 enum netdev_flags on, enum netdev_flags *old_flagsp)
540 OVS_REQUIRES(netdev->mutex);
8b61709d
BP
541static int get_ifindex(const struct netdev *, int *ifindexp);
542static int do_set_addr(struct netdev *netdev,
543 int ioctl_nr, const char *ioctl_name,
544 struct in_addr addr);
74ff3298
JR
545static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
546static int set_etheraddr(const char *netdev_name, const struct eth_addr);
35eef899 547static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
488d734d 548static int af_packet_sock(void);
19c8e9c1 549static bool netdev_linux_miimon_enabled(void);
1670c579
EJ
550static void netdev_linux_miimon_run(void);
551static void netdev_linux_miimon_wait(void);
df1e5a3b 552static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
8b61709d 553
15b3596a
JG
554static bool
555is_netdev_linux_class(const struct netdev_class *netdev_class)
556{
259e0b1a 557 return netdev_class->run == netdev_linux_run;
15b3596a
JG
558}
559
796223f5
BP
560static bool
561is_tap_netdev(const struct netdev *netdev)
562{
b5d57fc8 563 return netdev_get_class(netdev) == &netdev_tap_class;
6c88d577
JP
564}
565
8b61709d
BP
566static struct netdev_linux *
567netdev_linux_cast(const struct netdev *netdev)
568{
b5d57fc8 569 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
15b3596a 570
180c6d0b 571 return CONTAINER_OF(netdev, struct netdev_linux, up);
8b61709d 572}
796223f5 573
f7791740
PS
574static struct netdev_rxq_linux *
575netdev_rxq_linux_cast(const struct netdev_rxq *rx)
796223f5 576{
9dc63482 577 ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
f7791740 578 return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
796223f5 579}
ff4ed3c9 580\f
bfda5239
FL
581static int
582netdev_linux_netnsid_update__(struct netdev_linux *netdev)
583{
584 struct dpif_netlink_vport reply;
585 struct ofpbuf *buf;
586 int error;
587
588 error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
589 if (error) {
590 netnsid_unset(&netdev->netnsid);
591 return error;
592 }
593
594 netnsid_set(&netdev->netnsid, reply.netnsid);
595 ofpbuf_delete(buf);
596 return 0;
597}
598
599static int
600netdev_linux_netnsid_update(struct netdev_linux *netdev)
601{
602 if (netnsid_is_unset(netdev->netnsid)) {
603 return netdev_linux_netnsid_update__(netdev);
604 }
605
606 return 0;
607}
608
609static bool
610netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
611{
612 netdev_linux_netnsid_update(netdev);
613 return netnsid_eq(netdev->netnsid, nsid);
614}
615
756819dd
FL
616static bool
617netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
618{
619 netdev_linux_netnsid_update(netdev);
620 return netnsid_is_remote(netdev->netnsid);
621}
622
623static int netdev_linux_update_via_netlink(struct netdev_linux *);
bfda5239 624static void netdev_linux_update(struct netdev_linux *netdev, int,
7e9dcc0f 625 const struct rtnetlink_change *)
86383816 626 OVS_REQUIRES(netdev->mutex);
cee87338 627static void netdev_linux_changed(struct netdev_linux *netdev,
86383816
BP
628 unsigned int ifi_flags, unsigned int mask)
629 OVS_REQUIRES(netdev->mutex);
cee87338 630
d6384a3a
AW
631/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
632 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
cee87338
BP
633 * if no such socket could be created. */
634static struct nl_sock *
635netdev_linux_notify_sock(void)
636{
637 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
638 static struct nl_sock *sock;
989d7135
PS
639 unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
640 RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
cee87338
BP
641
642 if (ovsthread_once_start(&once)) {
643 int error;
644
645 error = nl_sock_create(NETLINK_ROUTE, &sock);
646 if (!error) {
d6384a3a
AW
647 size_t i;
648
649 for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
650 error = nl_sock_join_mcgroup(sock, mcgroups[i]);
651 if (error) {
652 nl_sock_destroy(sock);
653 sock = NULL;
654 break;
655 }
cee87338
BP
656 }
657 }
658 ovsthread_once_done(&once);
659 }
660
661 return sock;
662}
663
19c8e9c1
JS
664static bool
665netdev_linux_miimon_enabled(void)
666{
812c272c 667 return atomic_count_get(&miimon_cnt) > 0;
19c8e9c1
JS
668}
669
8b61709d 670static void
1c33f0c3 671netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 672{
cee87338
BP
673 struct nl_sock *sock;
674 int error;
675
19c8e9c1
JS
676 if (netdev_linux_miimon_enabled()) {
677 netdev_linux_miimon_run();
678 }
cee87338
BP
679
680 sock = netdev_linux_notify_sock();
681 if (!sock) {
682 return;
683 }
684
685 do {
cee87338 686 uint64_t buf_stub[4096 / 8];
bfda5239 687 int nsid;
cee87338
BP
688 struct ofpbuf buf;
689
690 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
bfda5239 691 error = nl_sock_recv(sock, &buf, &nsid, false);
cee87338 692 if (!error) {
7e9dcc0f 693 struct rtnetlink_change change;
cee87338 694
7e9dcc0f 695 if (rtnetlink_parse(&buf, &change)) {
989d7135
PS
696 struct netdev *netdev_ = NULL;
697 char dev_name[IFNAMSIZ];
698
699 if (!change.ifname) {
700 change.ifname = if_indextoname(change.if_index, dev_name);
701 }
702
703 if (change.ifname) {
704 netdev_ = netdev_from_name(change.ifname);
705 }
cee87338
BP
706 if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
707 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816
BP
708
709 ovs_mutex_lock(&netdev->mutex);
bfda5239 710 netdev_linux_update(netdev, nsid, &change);
86383816 711 ovs_mutex_unlock(&netdev->mutex);
cee87338 712 }
38e0065b 713 netdev_close(netdev_);
cee87338
BP
714 }
715 } else if (error == ENOBUFS) {
716 struct shash device_shash;
717 struct shash_node *node;
718
719 nl_sock_drain(sock);
720
721 shash_init(&device_shash);
722 netdev_get_devices(&netdev_linux_class, &device_shash);
723 SHASH_FOR_EACH (node, &device_shash) {
724 struct netdev *netdev_ = node->data;
725 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
726 unsigned int flags;
727
86383816 728 ovs_mutex_lock(&netdev->mutex);
cee87338
BP
729 get_flags(netdev_, &flags);
730 netdev_linux_changed(netdev, flags, 0);
86383816
BP
731 ovs_mutex_unlock(&netdev->mutex);
732
cee87338
BP
733 netdev_close(netdev_);
734 }
735 shash_destroy(&device_shash);
736 } else if (error != EAGAIN) {
7ed58d4a
JP
737 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
738 VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
cee87338
BP
739 ovs_strerror(error));
740 }
741 ofpbuf_uninit(&buf);
742 } while (!error);
8b61709d
BP
743}
744
745static void
1c33f0c3 746netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
8b61709d 747{
cee87338
BP
748 struct nl_sock *sock;
749
19c8e9c1
JS
750 if (netdev_linux_miimon_enabled()) {
751 netdev_linux_miimon_wait();
752 }
cee87338
BP
753 sock = netdev_linux_notify_sock();
754 if (sock) {
755 nl_sock_wait(sock, POLLIN);
756 }
8b61709d
BP
757}
758
ac4d3bcb 759static void
b5d57fc8
BP
760netdev_linux_changed(struct netdev_linux *dev,
761 unsigned int ifi_flags, unsigned int mask)
86383816 762 OVS_REQUIRES(dev->mutex)
ac4d3bcb 763{
3e912ffc 764 netdev_change_seq_changed(&dev->up);
8aa77183
BP
765
766 if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
767 dev->carrier_resets++;
768 }
769 dev->ifi_flags = ifi_flags;
770
4f925bd3 771 dev->cache_valid &= mask;
6b6e1329 772 if (!(mask & VALID_IN)) {
a8704b50
PS
773 netdev_get_addrs_list_flush();
774 }
4f925bd3
PS
775}
776
777static void
bfda5239
FL
778netdev_linux_update__(struct netdev_linux *dev,
779 const struct rtnetlink_change *change)
86383816 780 OVS_REQUIRES(dev->mutex)
4f925bd3 781{
bfda5239 782 if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
d6384a3a 783 if (change->nlmsg_type == RTM_NEWLINK) {
6b6e1329 784 /* Keep drv-info, and ip addresses. */
d6384a3a 785 netdev_linux_changed(dev, change->ifi_flags,
6b6e1329 786 VALID_DRVINFO | VALID_IN);
d6384a3a
AW
787
788 /* Update netdev from rtnl-change msg. */
789 if (change->mtu) {
790 dev->mtu = change->mtu;
791 dev->cache_valid |= VALID_MTU;
792 dev->netdev_mtu_error = 0;
793 }
90a6637d 794
74ff3298
JR
795 if (!eth_addr_is_zero(change->mac)) {
796 dev->etheraddr = change->mac;
d6384a3a
AW
797 dev->cache_valid |= VALID_ETHERADDR;
798 dev->ether_addr_error = 0;
e8e1a409
TZ
799
800 /* The mac addr has been changed, report it now. */
801 rtnetlink_report_link();
d6384a3a 802 }
44445cac 803
d6384a3a
AW
804 dev->ifindex = change->if_index;
805 dev->cache_valid |= VALID_IFINDEX;
806 dev->get_ifindex_error = 0;
22dcb534 807 dev->present = true;
d6384a3a 808 } else {
bfda5239 809 /* FIXME */
d6384a3a 810 netdev_linux_changed(dev, change->ifi_flags, 0);
22dcb534 811 dev->present = false;
bfda5239 812 netnsid_unset(&dev->netnsid);
d6384a3a
AW
813 }
814 } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
815 /* Invalidates in4, in6. */
6b6e1329 816 netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
4f925bd3 817 } else {
d6384a3a 818 OVS_NOT_REACHED();
4f925bd3 819 }
ac4d3bcb
EJ
820}
821
bfda5239
FL
822static void
823netdev_linux_update(struct netdev_linux *dev, int nsid,
824 const struct rtnetlink_change *change)
825 OVS_REQUIRES(dev->mutex)
826{
827 if (netdev_linux_netnsid_is_eq(dev, nsid)) {
828 netdev_linux_update__(dev, change);
829 }
830}
831
9dc63482
BP
832static struct netdev *
833netdev_linux_alloc(void)
834{
835 struct netdev_linux *netdev = xzalloc(sizeof *netdev);
836 return &netdev->up;
837}
838
48c6733c
WT
839static int
840netdev_linux_common_construct(struct netdev *netdev_)
9dc63482 841{
48c6733c
WT
842 /* Prevent any attempt to create (or open) a network device named "default"
843 * or "all". These device names are effectively reserved on Linux because
844 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
845 * itself this wouldn't call for any special treatment, but in practice if
846 * a program tries to create devices with these names, it causes the kernel
847 * to fire a "new device" notification event even though creation failed,
848 * and in turn that causes OVS to wake up and try to create them again,
849 * which ends up as a 100% CPU loop. */
850 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
851 const char *name = netdev_->name;
852 if (!strcmp(name, "default") || !strcmp(name, "all")) {
7ed58d4a
JP
853 static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
854 VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
48c6733c
WT
855 name);
856 return EINVAL;
857 }
858
bfda5239
FL
859 /* The device could be in the same network namespace or in another one. */
860 netnsid_unset(&netdev->netnsid);
834d6caf 861 ovs_mutex_init(&netdev->mutex);
48c6733c 862 return 0;
9dc63482
BP
863}
864
1f6e0fbd
BP
865/* Creates system and internal devices. */
866static int
9dc63482 867netdev_linux_construct(struct netdev *netdev_)
1f6e0fbd 868{
9dc63482 869 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
48c6733c
WT
870 int error = netdev_linux_common_construct(netdev_);
871 if (error) {
872 return error;
873 }
1f6e0fbd 874
b5d57fc8
BP
875 error = get_flags(&netdev->up, &netdev->ifi_flags);
876 if (error == ENODEV) {
9dc63482 877 if (netdev->up.netdev_class != &netdev_internal_class) {
b5d57fc8 878 /* The device does not exist, so don't allow it to be opened. */
b5d57fc8
BP
879 return ENODEV;
880 } else {
881 /* "Internal" netdevs have to be created as netdev objects before
882 * they exist in the kernel, because creating them in the kernel
883 * happens by passing a netdev object to dpif_port_add().
884 * Therefore, ignore the error. */
885 }
886 }
46415c90 887
a740f0de
JG
888 return 0;
889}
890
5b7448ed
JG
891/* For most types of netdevs we open the device for each call of
892 * netdev_open(). However, this is not the case with tap devices,
893 * since it is only possible to open the device once. In this
894 * situation we share a single file descriptor, and consequently
895 * buffers, across all readers. Therefore once data is read it will
896 * be unavailable to other reads for tap devices. */
a740f0de 897static int
9dc63482 898netdev_linux_construct_tap(struct netdev *netdev_)
a740f0de 899{
9dc63482 900 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a740f0de 901 static const char tap_dev[] = "/dev/net/tun";
9dc63482 902 const char *name = netdev_->name;
a740f0de 903 struct ifreq ifr;
a740f0de 904
48c6733c
WT
905 int error = netdev_linux_common_construct(netdev_);
906 if (error) {
907 return error;
908 }
1f6e0fbd 909
6c88d577 910 /* Open tap device. */
d0d08f8a
BP
911 netdev->tap_fd = open(tap_dev, O_RDWR);
912 if (netdev->tap_fd < 0) {
6c88d577 913 error = errno;
10a89ef0 914 VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
cee87338 915 return error;
6c88d577
JP
916 }
917
918 /* Create tap device. */
61b9d078 919 get_flags(&netdev->up, &netdev->ifi_flags);
6c88d577 920 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
71d7c22f 921 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
d0d08f8a 922 if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
6c88d577 923 VLOG_WARN("%s: creating tap device failed: %s", name,
10a89ef0 924 ovs_strerror(errno));
6c88d577 925 error = errno;
f61d8d29 926 goto error_close;
6c88d577
JP
927 }
928
929 /* Make non-blocking. */
d0d08f8a 930 error = set_nonblocking(netdev->tap_fd);
a740f0de 931 if (error) {
f61d8d29 932 goto error_close;
a740f0de
JG
933 }
934
0f28164b
FL
935 if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
936 VLOG_WARN("%s: creating tap device failed (persist): %s", name,
937 ovs_strerror(errno));
938 error = errno;
939 goto error_close;
940 }
941
a740f0de
JG
942 return 0;
943
f61d8d29 944error_close:
d0d08f8a 945 close(netdev->tap_fd);
a740f0de
JG
946 return error;
947}
948
6c88d577 949static void
9dc63482 950netdev_linux_destruct(struct netdev *netdev_)
6c88d577 951{
b5d57fc8 952 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6c88d577 953
b5d57fc8
BP
954 if (netdev->tc && netdev->tc->ops->tc_destroy) {
955 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4
BP
956 }
957
d0d08f8a
BP
958 if (netdev_get_class(netdev_) == &netdev_tap_class
959 && netdev->tap_fd >= 0)
960 {
0f28164b 961 ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
d0d08f8a 962 close(netdev->tap_fd);
6c88d577 963 }
86383816 964
19c8e9c1 965 if (netdev->miimon_interval > 0) {
812c272c 966 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
967 }
968
86383816 969 ovs_mutex_destroy(&netdev->mutex);
6c88d577
JP
970}
971
9dc63482
BP
/* Frees the 'struct netdev_linux' that contains 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
978
f7791740
PS
979static struct netdev_rxq *
980netdev_linux_rxq_alloc(void)
9dc63482 981{
f7791740 982 struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
9dc63482
BP
983 return &rx->up;
984}
985
7b6b0ef4 986static int
f7791740 987netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
7b6b0ef4 988{
f7791740 989 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
9dc63482 990 struct netdev *netdev_ = rx->up.netdev;
7b6b0ef4 991 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7b6b0ef4 992 int error;
7b6b0ef4 993
86383816 994 ovs_mutex_lock(&netdev->mutex);
9dc63482
BP
995 rx->is_tap = is_tap_netdev(netdev_);
996 if (rx->is_tap) {
997 rx->fd = netdev->tap_fd;
796223f5
BP
998 } else {
999 struct sockaddr_ll sll;
b73c8518 1000 int ifindex, val;
32383c3b 1001 /* Result of tcpdump -dd inbound */
259e0b1a 1002 static const struct sock_filter filt[] = {
32383c3b
MM
1003 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1004 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1005 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1006 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1007 };
259e0b1a
BP
1008 static const struct sock_fprog fprog = {
1009 ARRAY_SIZE(filt), (struct sock_filter *) filt
1010 };
7b6b0ef4 1011
796223f5 1012 /* Create file descriptor. */
9dc63482
BP
1013 rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1014 if (rx->fd < 0) {
796223f5 1015 error = errno;
10a89ef0 1016 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
796223f5
BP
1017 goto error;
1018 }
33d82a56 1019
b73c8518
SH
1020 val = 1;
1021 if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1022 error = errno;
1023 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1024 netdev_get_name(netdev_), ovs_strerror(error));
1025 goto error;
1026 }
1027
796223f5 1028 /* Set non-blocking mode. */
9dc63482 1029 error = set_nonblocking(rx->fd);
796223f5
BP
1030 if (error) {
1031 goto error;
1032 }
7b6b0ef4 1033
796223f5 1034 /* Get ethernet device index. */
180c6d0b 1035 error = get_ifindex(&netdev->up, &ifindex);
796223f5
BP
1036 if (error) {
1037 goto error;
1038 }
7b6b0ef4 1039
796223f5
BP
1040 /* Bind to specific ethernet device. */
1041 memset(&sll, 0, sizeof sll);
1042 sll.sll_family = AF_PACKET;
1043 sll.sll_ifindex = ifindex;
b73c8518 1044 sll.sll_protocol = htons(ETH_P_ALL);
9dc63482 1045 if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
796223f5
BP
1046 error = errno;
1047 VLOG_ERR("%s: failed to bind raw socket (%s)",
10a89ef0 1048 netdev_get_name(netdev_), ovs_strerror(error));
796223f5
BP
1049 goto error;
1050 }
32383c3b
MM
1051
1052 /* Filter for only inbound packets. */
9dc63482 1053 error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
32383c3b
MM
1054 sizeof fprog);
1055 if (error) {
1056 error = errno;
259e0b1a 1057 VLOG_ERR("%s: failed to attach filter (%s)",
10a89ef0 1058 netdev_get_name(netdev_), ovs_strerror(error));
32383c3b
MM
1059 goto error;
1060 }
7b6b0ef4 1061 }
86383816 1062 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4 1063
7b6b0ef4
BP
1064 return 0;
1065
1066error:
9dc63482
BP
1067 if (rx->fd >= 0) {
1068 close(rx->fd);
7b6b0ef4 1069 }
86383816 1070 ovs_mutex_unlock(&netdev->mutex);
7b6b0ef4
BP
1071 return error;
1072}
1073
796223f5 1074static void
f7791740 1075netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
8b61709d 1076{
f7791740 1077 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
8b61709d 1078
796223f5
BP
1079 if (!rx->is_tap) {
1080 close(rx->fd);
8b61709d 1081 }
9dc63482
BP
1082}
1083
/* Frees the 'struct netdev_rxq_linux' that contains 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
8b61709d 1091
b73c8518 1092static ovs_be16
1ebdc7eb 1093auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
b73c8518
SH
1094{
1095 if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1096 return htons(aux->tp_vlan_tpid);
1ebdc7eb
EG
1097 } else if (double_tagged) {
1098 return htons(ETH_TYPE_VLAN_8021AD);
b73c8518 1099 } else {
1ebdc7eb 1100 return htons(ETH_TYPE_VLAN_8021Q);
b73c8518
SH
1101 }
1102}
1103
1104static bool
1105auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
1106{
1107 return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
1108}
1109
796223f5 1110static int
cf62fa4c 1111netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
796223f5 1112{
b73c8518 1113 size_t size;
796223f5 1114 ssize_t retval;
b73c8518
SH
1115 struct iovec iov;
1116 struct cmsghdr *cmsg;
1117 union {
1118 struct cmsghdr cmsg;
1119 char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
1120 } cmsg_buffer;
1121 struct msghdr msgh;
1122
1123 /* Reserve headroom for a single VLAN tag */
cf62fa4c
PS
1124 dp_packet_reserve(buffer, VLAN_HEADER_LEN);
1125 size = dp_packet_tailroom(buffer);
b73c8518 1126
cf62fa4c 1127 iov.iov_base = dp_packet_data(buffer);
b73c8518
SH
1128 iov.iov_len = size;
1129 msgh.msg_name = NULL;
1130 msgh.msg_namelen = 0;
1131 msgh.msg_iov = &iov;
1132 msgh.msg_iovlen = 1;
1133 msgh.msg_control = &cmsg_buffer;
1134 msgh.msg_controllen = sizeof cmsg_buffer;
1135 msgh.msg_flags = 0;
8e8cddf7 1136
796223f5 1137 do {
b73c8518 1138 retval = recvmsg(fd, &msgh, MSG_TRUNC);
796223f5
BP
1139 } while (retval < 0 && errno == EINTR);
1140
bfd3367b 1141 if (retval < 0) {
b73c8518
SH
1142 return errno;
1143 } else if (retval > size) {
1144 return EMSGSIZE;
1145 }
1146
cf62fa4c 1147 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
b73c8518
SH
1148
1149 for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
1150 const struct tpacket_auxdata *aux;
1151
1152 if (cmsg->cmsg_level != SOL_PACKET
1153 || cmsg->cmsg_type != PACKET_AUXDATA
1154 || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
1155 continue;
8b61709d 1156 }
b73c8518
SH
1157
1158 aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
1159 if (auxdata_has_vlan_tci(aux)) {
1ebdc7eb
EG
1160 struct eth_header *eth;
1161 bool double_tagged;
1162
b73c8518
SH
1163 if (retval < ETH_HEADER_LEN) {
1164 return EINVAL;
1165 }
1166
1ebdc7eb
EG
1167 eth = dp_packet_data(buffer);
1168 double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
1169
1170 eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
b73c8518
SH
1171 htons(aux->tp_vlan_tci));
1172 break;
1173 }
1174 }
1175
1176 return 0;
1177}
1178
/* Reads one packet from tap descriptor 'fd' into the tailroom of 'buffer',
 * retrying when interrupted by a signal.
 *
 * Returns 0 on success, otherwise the positive errno value from read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    size_t room = dp_packet_tailroom(buffer);
    ssize_t n;

    for (;;) {
        n = read(fd, dp_packet_data(buffer), room);
        if (n >= 0 || errno != EINTR) {
            break;
        }
    }

    if (n < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + n);
    return 0;
}
1196
1197static int
64839cf4 1198netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch)
b73c8518 1199{
f7791740 1200 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1201 struct netdev *netdev = rx->up.netdev;
cf62fa4c 1202 struct dp_packet *buffer;
df1e5a3b
PS
1203 ssize_t retval;
1204 int mtu;
1205
1206 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1207 mtu = ETH_PAYLOAD_MAX;
1208 }
1209
2482b0b0 1210 /* Assume Ethernet port. No need to set packet_type. */
cf62fa4c 1211 buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
91088554 1212 DP_NETDEV_HEADROOM);
b73c8518 1213 retval = (rx->is_tap
f7791740
PS
1214 ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
1215 : netdev_linux_rxq_recv_sock(rx->fd, buffer));
df1e5a3b
PS
1216
1217 if (retval) {
1218 if (retval != EAGAIN && retval != EMSGSIZE) {
1219 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1220 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1221 }
cf62fa4c 1222 dp_packet_delete(buffer);
df1e5a3b 1223 } else {
72c84bc2 1224 dp_packet_batch_init_packet(batch, buffer);
b73c8518
SH
1225 }
1226
1227 return retval;
8b61709d
BP
1228}
1229
8b61709d 1230static void
f7791740 1231netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1232{
f7791740 1233 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1234 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1235}
1236
8b61709d 1237static int
f7791740 1238netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1239{
f7791740 1240 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1241 if (rx->is_tap) {
8b61709d 1242 struct ifreq ifr;
f7791740 1243 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1244 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1245 if (error) {
1246 return error;
1247 }
796223f5 1248 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1249 return 0;
1250 } else {
796223f5 1251 return drain_rcvbuf(rx->fd);
8b61709d
BP
1252 }
1253}
1254
d19cf8bb
ZG
1255static int
1256netdev_linux_sock_batch_send(int sock, int ifindex,
1257 struct dp_packet_batch *batch)
1258{
e0a00cee 1259 const size_t size = dp_packet_batch_size(batch);
d19cf8bb
ZG
1260 /* We don't bother setting most fields in sockaddr_ll because the
1261 * kernel ignores them for SOCK_RAW. */
1262 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1263 .sll_ifindex = ifindex };
1264
e0a00cee
BB
1265 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1266 struct iovec *iov = xmalloc(sizeof(*iov) * size);
d19cf8bb 1267
e0a00cee 1268 struct dp_packet *packet;
e883448e 1269 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
d19cf8bb 1270 iov[i].iov_base = dp_packet_data(packet);
ad8b0b4f 1271 iov[i].iov_len = dp_packet_size(packet);
d19cf8bb
ZG
1272 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1273 .msg_namelen = sizeof sll,
1274 .msg_iov = &iov[i],
1275 .msg_iovlen = 1 };
1276 }
1277
1278 int error = 0;
e0a00cee 1279 for (uint32_t ofs = 0; ofs < size; ) {
d19cf8bb
ZG
1280 ssize_t retval;
1281 do {
e0a00cee 1282 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
d19cf8bb
ZG
1283 error = retval < 0 ? errno : 0;
1284 } while (error == EINTR);
1285 if (error) {
1286 break;
1287 }
1288 ofs += retval;
1289 }
1290
1291 free(mmsg);
1292 free(iov);
1293 return error;
1294}
1295
1296/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1297 * essential, because packets sent to a tap device with an AF_PACKET socket
1298 * will loop back to be *received* again on the tap device. This doesn't occur
1299 * on other interface types because we attach a socket filter to the rx
1300 * socket. */
1301static int
1302netdev_linux_tap_batch_send(struct netdev *netdev_,
1303 struct dp_packet_batch *batch)
1304{
1305 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
13708b21 1306 struct dp_packet *packet;
22dcb534
FL
1307
1308 /* The Linux tap driver returns EIO if the device is not up,
1309 * so if the device is not up, don't waste time sending it.
1310 * However, if the device is in another network namespace
1311 * then OVS can't retrieve the state. In that case, send the
1312 * packets anyway. */
1313 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1314 netdev->tx_dropped += dp_packet_batch_size(batch);
1315 return 0;
1316 }
1317
e883448e 1318 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
ad8b0b4f 1319 size_t size = dp_packet_size(packet);
d19cf8bb
ZG
1320 ssize_t retval;
1321 int error;
1322
1323 do {
1324 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1325 error = retval < 0 ? errno : 0;
1326 } while (error == EINTR);
1327
1328 if (error) {
1329 /* The Linux tap driver returns EIO if the device is not up. From
1330 * the OVS side this is not an error, so we ignore it; otherwise,
1331 * return the erro. */
1332 if (error != EIO) {
1333 return error;
1334 }
1335 } else if (retval != size) {
1336 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1337 "bytes of %"PRIuSIZE") on %s",
1338 retval, size, netdev_get_name(netdev_));
1339 return EMSGSIZE;
1340 }
1341 }
1342 return 0;
1343}
1344
1345/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
8b61709d
BP
1346 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1347 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1348 * the packet is too big or too small to transmit on the device.
1349 *
8b61709d
BP
1350 * The kernel maintains a packet transmission queue, so the caller is not
1351 * expected to do additional queuing of packets. */
1352static int
f00fa8cb 1353netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
b30896c9 1354 struct dp_packet_batch *batch,
324c8374 1355 bool concurrent_txq OVS_UNUSED)
8b61709d 1356{
f4fd623c 1357 int error = 0;
0a62ae2c
ZG
1358 int sock = 0;
1359
0a62ae2c
ZG
1360 if (!is_tap_netdev(netdev_)) {
1361 sock = af_packet_sock();
1362 if (sock < 0) {
1363 error = -sock;
1364 goto free_batch;
1365 }
1366
1367 int ifindex = netdev_get_ifindex(netdev_);
1368 if (ifindex < 0) {
1369 error = -ifindex;
1370 goto free_batch;
1371 }
1372
d19cf8bb
ZG
1373 error = netdev_linux_sock_batch_send(sock, ifindex, batch);
1374 } else {
1375 error = netdev_linux_tap_batch_send(netdev_, batch);
0a62ae2c 1376 }
d19cf8bb
ZG
1377 if (error) {
1378 if (error == ENOBUFS) {
1379 /* The Linux AF_PACKET implementation never blocks waiting
1380 * for room for packets, instead returning ENOBUFS.
1381 * Translate this into EAGAIN for the caller. */
1382 error = EAGAIN;
f23347ea 1383 } else {
f4fd623c
DDP
1384 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1385 netdev_get_name(netdev_), ovs_strerror(error));
d19cf8bb 1386 }
f4fd623c
DDP
1387 }
1388
0a62ae2c 1389free_batch:
b30896c9 1390 dp_packet_delete_batch(batch, true);
f4fd623c 1391 return error;
8b61709d
BP
1392}
1393
1394/* Registers with the poll loop to wake up from the next call to poll_block()
1395 * when the packet transmission queue has sufficient room to transmit a packet
1396 * with netdev_send().
1397 *
1398 * The kernel maintains a packet transmission queue, so the client is not
1399 * expected to do additional queuing of packets. Thus, this function is
1400 * unlikely to ever be used. It is included for completeness. */
1401static void
f00fa8cb 1402netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1403{
796223f5 1404 if (is_tap_netdev(netdev)) {
8b61709d
BP
1405 /* TAP device always accepts packets.*/
1406 poll_immediate_wake();
1407 }
1408}
1409
1410/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1411 * otherwise a positive errno value. */
1412static int
74ff3298 1413netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1414{
b5d57fc8 1415 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1416 enum netdev_flags old_flags = 0;
eb395f2e
BP
1417 int error;
1418
86383816
BP
1419 ovs_mutex_lock(&netdev->mutex);
1420
b5d57fc8 1421 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1422 error = netdev->ether_addr_error;
1423 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1424 goto exit;
44445cac 1425 }
b5d57fc8 1426 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1427 }
1428
7eb1bd81 1429 /* Tap devices must be brought down before setting the address. */
796223f5 1430 if (is_tap_netdev(netdev_)) {
4f9f3f21 1431 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1432 }
44445cac
PS
1433 error = set_etheraddr(netdev_get_name(netdev_), mac);
1434 if (!error || error == ENODEV) {
b5d57fc8
BP
1435 netdev->ether_addr_error = error;
1436 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1437 if (!error) {
74ff3298 1438 netdev->etheraddr = mac;
eb395f2e 1439 }
8b61709d 1440 }
44445cac 1441
4f9f3f21
BP
1442 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1443 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1444 }
7eb1bd81 1445
86383816
BP
1446exit:
1447 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1448 return error;
1449}
1450
44445cac 1451/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
8b61709d 1452static int
74ff3298 1453netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1454{
b5d57fc8 1455 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1456 int error;
44445cac 1457
86383816 1458 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1459 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
756819dd
FL
1460 netdev_linux_update_via_netlink(netdev);
1461 }
1462
1463 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1464 /* Fall back to ioctl if netlink fails */
86383816 1465 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1466 &netdev->etheraddr);
b5d57fc8 1467 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1468 }
44445cac 1469
86383816
BP
1470 error = netdev->ether_addr_error;
1471 if (!error) {
74ff3298 1472 *mac = netdev->etheraddr;
44445cac 1473 }
86383816 1474 ovs_mutex_unlock(&netdev->mutex);
44445cac 1475
86383816 1476 return error;
8b61709d
BP
1477}
1478
8b61709d 1479static int
73371c09 1480netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1481{
86383816
BP
1482 int error;
1483
b5d57fc8 1484 if (!(netdev->cache_valid & VALID_MTU)) {
756819dd
FL
1485 netdev_linux_update_via_netlink(netdev);
1486 }
1487
1488 if (!(netdev->cache_valid & VALID_MTU)) {
1489 /* Fall back to ioctl if netlink fails */
8b61709d 1490 struct ifreq ifr;
90a6637d 1491
86383816 1492 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1493 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1494 netdev->mtu = ifr.ifr_mtu;
1495 netdev->cache_valid |= VALID_MTU;
8b61709d 1496 }
90a6637d 1497
86383816
BP
1498 error = netdev->netdev_mtu_error;
1499 if (!error) {
b5d57fc8 1500 *mtup = netdev->mtu;
90a6637d 1501 }
73371c09
BP
1502
1503 return error;
1504}
1505
1506/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1507 * in bytes, not including the hardware header; thus, this is typically 1500
1508 * bytes for Ethernet devices. */
1509static int
1510netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1511{
1512 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1513 int error;
1514
1515 ovs_mutex_lock(&netdev->mutex);
1516 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1517 ovs_mutex_unlock(&netdev->mutex);
1518
1519 return error;
8b61709d
BP
1520}
1521
9b020780
PS
1522/* Sets the maximum size of transmitted (MTU) for given device using linux
1523 * networking ioctl interface.
1524 */
1525static int
4124cb12 1526netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1527{
b5d57fc8 1528 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1529 struct ifreq ifr;
1530 int error;
1531
86383816 1532 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1533 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1534 error = netdev->netdev_mtu_error;
1535 if (error || netdev->mtu == mtu) {
1536 goto exit;
90a6637d 1537 }
b5d57fc8 1538 netdev->cache_valid &= ~VALID_MTU;
153e5481 1539 }
9b020780 1540 ifr.ifr_mtu = mtu;
259e0b1a
BP
1541 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1542 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1543 if (!error || error == ENODEV) {
b5d57fc8
BP
1544 netdev->netdev_mtu_error = error;
1545 netdev->mtu = ifr.ifr_mtu;
1546 netdev->cache_valid |= VALID_MTU;
9b020780 1547 }
86383816
BP
1548exit:
1549 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1550 return error;
9b020780
PS
1551}
1552
9ab3d9a3
BP
1553/* Returns the ifindex of 'netdev', if successful, as a positive number.
1554 * On failure, returns a negative errno value. */
1555static int
86383816 1556netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1557{
86383816 1558 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1559 int ifindex, error;
1560
86383816
BP
1561 ovs_mutex_lock(&netdev->mutex);
1562 error = get_ifindex(netdev_, &ifindex);
1563 ovs_mutex_unlock(&netdev->mutex);
1564
9ab3d9a3
BP
1565 return error ? -error : ifindex;
1566}
1567
8b61709d
BP
1568static int
1569netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1570{
b5d57fc8 1571 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1572
86383816 1573 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1574 if (netdev->miimon_interval > 0) {
1575 *carrier = netdev->miimon;
3a183124 1576 } else {
b5d57fc8 1577 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1578 }
86383816 1579 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1580
3a183124 1581 return 0;
8b61709d
BP
1582}
1583
65c3058c 1584static long long int
86383816 1585netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1586{
86383816
BP
1587 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1588 long long int carrier_resets;
1589
1590 ovs_mutex_lock(&netdev->mutex);
1591 carrier_resets = netdev->carrier_resets;
1592 ovs_mutex_unlock(&netdev->mutex);
1593
1594 return carrier_resets;
65c3058c
EJ
1595}
1596
63331829 1597static int
1670c579
EJ
1598netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1599 struct mii_ioctl_data *data)
63331829 1600{
63331829 1601 struct ifreq ifr;
782e6111 1602 int error;
63331829 1603
63331829 1604 memset(&ifr, 0, sizeof ifr);
782e6111 1605 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1606 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1607 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1608
782e6111
EJ
1609 return error;
1610}
1611
1612static int
1670c579 1613netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 1614{
782e6111
EJ
1615 struct mii_ioctl_data data;
1616 int error;
63331829 1617
782e6111
EJ
1618 *miimon = false;
1619
1620 memset(&data, 0, sizeof data);
1670c579 1621 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
1622 if (!error) {
1623 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1624 data.reg_num = MII_BMSR;
1670c579 1625 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 1626 &data);
63331829
EJ
1627
1628 if (!error) {
782e6111 1629 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 1630 }
9120cfc0
DH
1631 }
1632 if (error) {
63331829 1633 struct ethtool_cmd ecmd;
63331829
EJ
1634
1635 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1636 name);
1637
ab985a77 1638 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
1639 memset(&ecmd, 0, sizeof ecmd);
1640 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1641 "ETHTOOL_GLINK");
1642 if (!error) {
782e6111
EJ
1643 struct ethtool_value eval;
1644
1645 memcpy(&eval, &ecmd, sizeof eval);
1646 *miimon = !!eval.data;
63331829
EJ
1647 } else {
1648 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1649 }
1650 }
1651
1652 return error;
1653}
1654
1670c579
EJ
1655static int
1656netdev_linux_set_miimon_interval(struct netdev *netdev_,
1657 long long int interval)
1658{
b5d57fc8 1659 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 1660
86383816 1661 ovs_mutex_lock(&netdev->mutex);
1670c579 1662 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 1663 if (netdev->miimon_interval != interval) {
19c8e9c1 1664 if (interval && !netdev->miimon_interval) {
812c272c 1665 atomic_count_inc(&miimon_cnt);
19c8e9c1 1666 } else if (!interval && netdev->miimon_interval) {
812c272c 1667 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
1668 }
1669
b5d57fc8
BP
1670 netdev->miimon_interval = interval;
1671 timer_set_expired(&netdev->miimon_timer);
1670c579 1672 }
86383816 1673 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
1674
1675 return 0;
1676}
1677
1678static void
1679netdev_linux_miimon_run(void)
1680{
1681 struct shash device_shash;
1682 struct shash_node *node;
1683
1684 shash_init(&device_shash);
b5d57fc8 1685 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1686 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1687 struct netdev *netdev = node->data;
1688 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
1689 bool miimon;
1690
86383816
BP
1691 ovs_mutex_lock(&dev->mutex);
1692 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
1693 netdev_linux_get_miimon(dev->up.name, &miimon);
1694 if (miimon != dev->miimon) {
1695 dev->miimon = miimon;
1696 netdev_linux_changed(dev, dev->ifi_flags, 0);
1697 }
1670c579 1698
86383816 1699 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 1700 }
86383816 1701 ovs_mutex_unlock(&dev->mutex);
2f980d74 1702 netdev_close(netdev);
1670c579
EJ
1703 }
1704
1705 shash_destroy(&device_shash);
1706}
1707
1708static void
1709netdev_linux_miimon_wait(void)
1710{
1711 struct shash device_shash;
1712 struct shash_node *node;
1713
1714 shash_init(&device_shash);
b5d57fc8 1715 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 1716 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
1717 struct netdev *netdev = node->data;
1718 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 1719
86383816 1720 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
1721 if (dev->miimon_interval > 0) {
1722 timer_wait(&dev->miimon_timer);
1723 }
86383816 1724 ovs_mutex_unlock(&dev->mutex);
2f980d74 1725 netdev_close(netdev);
1670c579
EJ
1726 }
1727 shash_destroy(&device_shash);
1728}
1729
92df599c
JG
/* Exchanges the 64-bit values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    const uint64_t first = *a;
    const uint64_t second = *b;

    *a = second;
    *b = first;
}
1737
c060c4cf
EJ
1738/* Copies 'src' into 'dst', performing format conversion in the process.
1739 *
1740 * 'src' is allowed to be misaligned. */
1741static void
1742netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
1743 const struct ovs_vport_stats *src)
1744{
6a54dedc
BP
1745 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
1746 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
1747 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
1748 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
1749 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
1750 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
1751 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
1752 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
1753 dst->multicast = 0;
1754 dst->collisions = 0;
1755 dst->rx_length_errors = 0;
1756 dst->rx_over_errors = 0;
1757 dst->rx_crc_errors = 0;
1758 dst->rx_frame_errors = 0;
1759 dst->rx_fifo_errors = 0;
1760 dst->rx_missed_errors = 0;
1761 dst->tx_aborted_errors = 0;
1762 dst->tx_carrier_errors = 0;
1763 dst->tx_fifo_errors = 0;
1764 dst->tx_heartbeat_errors = 0;
1765 dst->tx_window_errors = 0;
1766}
1767
1768static int
1769get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
1770{
93451a0a 1771 struct dpif_netlink_vport reply;
c060c4cf
EJ
1772 struct ofpbuf *buf;
1773 int error;
1774
93451a0a 1775 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
1776 if (error) {
1777 return error;
1778 } else if (!reply.stats) {
1779 ofpbuf_delete(buf);
1780 return EOPNOTSUPP;
1781 }
1782
1783 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
1784
1785 ofpbuf_delete(buf);
1786
1787 return 0;
1788}
1789
f613a0d7
PS
1790static void
1791get_stats_via_vport(const struct netdev *netdev_,
1792 struct netdev_stats *stats)
8b61709d 1793{
b5d57fc8 1794 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1795
b5d57fc8
BP
1796 if (!netdev->vport_stats_error ||
1797 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 1798 int error;
7fbef77a 1799
c060c4cf 1800 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 1801 if (error && error != ENOENT && error != ENODEV) {
a57a8488 1802 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
1803 "(%s)",
1804 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 1805 }
b5d57fc8
BP
1806 netdev->vport_stats_error = error;
1807 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 1808 }
f613a0d7 1809}
8b61709d 1810
f613a0d7
PS
1811/* Retrieves current device stats for 'netdev-linux'. */
1812static int
1813netdev_linux_get_stats(const struct netdev *netdev_,
1814 struct netdev_stats *stats)
1815{
b5d57fc8 1816 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1817 struct netdev_stats dev_stats;
1818 int error;
1819
86383816 1820 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1821 get_stats_via_vport(netdev_, stats);
35eef899 1822 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1823 if (error) {
86383816
BP
1824 if (!netdev->vport_stats_error) {
1825 error = 0;
f613a0d7 1826 }
86383816 1827 } else if (netdev->vport_stats_error) {
04c881eb 1828 /* stats not available from OVS then use netdev stats. */
f613a0d7
PS
1829 *stats = dev_stats;
1830 } else {
04c881eb
AZ
1831 /* Use kernel netdev's packet and byte counts since vport's counters
1832 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1833 * enabled. */
1834 stats->rx_packets = dev_stats.rx_packets;
1835 stats->rx_bytes = dev_stats.rx_bytes;
1836 stats->tx_packets = dev_stats.tx_packets;
1837 stats->tx_bytes = dev_stats.tx_bytes;
1838
f613a0d7
PS
1839 stats->rx_errors += dev_stats.rx_errors;
1840 stats->tx_errors += dev_stats.tx_errors;
1841 stats->rx_dropped += dev_stats.rx_dropped;
1842 stats->tx_dropped += dev_stats.tx_dropped;
1843 stats->multicast += dev_stats.multicast;
1844 stats->collisions += dev_stats.collisions;
1845 stats->rx_length_errors += dev_stats.rx_length_errors;
1846 stats->rx_over_errors += dev_stats.rx_over_errors;
1847 stats->rx_crc_errors += dev_stats.rx_crc_errors;
1848 stats->rx_frame_errors += dev_stats.rx_frame_errors;
1849 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1850 stats->rx_missed_errors += dev_stats.rx_missed_errors;
1851 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1852 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1853 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1854 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1855 stats->tx_window_errors += dev_stats.tx_window_errors;
1856 }
86383816
BP
1857 ovs_mutex_unlock(&netdev->mutex);
1858
1859 return error;
f613a0d7
PS
1860}
1861
1862/* Retrieves current device stats for 'netdev-tap' netdev or
1863 * netdev-internal. */
1864static int
15aee116 1865netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 1866{
b5d57fc8 1867 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
1868 struct netdev_stats dev_stats;
1869 int error;
1870
86383816 1871 ovs_mutex_lock(&netdev->mutex);
f613a0d7 1872 get_stats_via_vport(netdev_, stats);
35eef899 1873 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 1874 if (error) {
86383816
BP
1875 if (!netdev->vport_stats_error) {
1876 error = 0;
8b61709d 1877 }
86383816
BP
1878 } else if (netdev->vport_stats_error) {
1879 /* Transmit and receive stats will appear to be swapped relative to the
1880 * other ports since we are the one sending the data, not a remote
1881 * computer. For consistency, we swap them back here. This does not
1882 * apply if we are getting stats from the vport layer because it always
1883 * tracks stats from the perspective of the switch. */
fe6b0e03 1884
f613a0d7 1885 *stats = dev_stats;
92df599c
JG
1886 swap_uint64(&stats->rx_packets, &stats->tx_packets);
1887 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1888 swap_uint64(&stats->rx_errors, &stats->tx_errors);
1889 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
1890 stats->rx_length_errors = 0;
1891 stats->rx_over_errors = 0;
1892 stats->rx_crc_errors = 0;
1893 stats->rx_frame_errors = 0;
1894 stats->rx_fifo_errors = 0;
1895 stats->rx_missed_errors = 0;
1896 stats->tx_aborted_errors = 0;
1897 stats->tx_carrier_errors = 0;
1898 stats->tx_fifo_errors = 0;
1899 stats->tx_heartbeat_errors = 0;
1900 stats->tx_window_errors = 0;
f613a0d7 1901 } else {
04c881eb
AZ
1902 /* Use kernel netdev's packet and byte counts since vport counters
1903 * do not reflect packet counts on the wire when GSO, TSO or GRO
1904 * are enabled. */
1905 stats->rx_packets = dev_stats.tx_packets;
1906 stats->rx_bytes = dev_stats.tx_bytes;
1907 stats->tx_packets = dev_stats.rx_packets;
1908 stats->tx_bytes = dev_stats.rx_bytes;
1909
f613a0d7
PS
1910 stats->rx_dropped += dev_stats.tx_dropped;
1911 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 1912
f613a0d7
PS
1913 stats->rx_errors += dev_stats.tx_errors;
1914 stats->tx_errors += dev_stats.rx_errors;
1915
1916 stats->multicast += dev_stats.multicast;
1917 stats->collisions += dev_stats.collisions;
1918 }
22dcb534 1919 stats->tx_dropped += netdev->tx_dropped;
86383816
BP
1920 ovs_mutex_unlock(&netdev->mutex);
1921
1922 return error;
8b61709d
BP
1923}
1924
bba1e6f3
PS
1925static int
1926netdev_internal_get_stats(const struct netdev *netdev_,
1927 struct netdev_stats *stats)
1928{
b5d57fc8 1929 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1930 int error;
bba1e6f3 1931
86383816 1932 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 1933 get_stats_via_vport(netdev_, stats);
86383816
BP
1934 error = netdev->vport_stats_error;
1935 ovs_mutex_unlock(&netdev->mutex);
1936
1937 return error;
bba1e6f3
PS
1938}
1939
51f87458 1940static void
b5d57fc8 1941netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
1942{
1943 struct ethtool_cmd ecmd;
6c038611 1944 uint32_t speed;
8b61709d
BP
1945 int error;
1946
b5d57fc8 1947 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
1948 return;
1949 }
1950
ab985a77 1951 COVERAGE_INC(netdev_get_ethtool);
8b61709d 1952 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 1953 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
1954 ETHTOOL_GSET, "ETHTOOL_GSET");
1955 if (error) {
51f87458 1956 goto out;
8b61709d
BP
1957 }
1958
1959 /* Supported features. */
b5d57fc8 1960 netdev->supported = 0;
8b61709d 1961 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 1962 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
1963 }
1964 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 1965 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
1966 }
1967 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 1968 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
1969 }
1970 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 1971 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
1972 }
1973 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 1974 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 1975 }
67bed84c
SH
1976 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
1977 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 1978 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 1979 }
67bed84c
SH
1980 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
1981 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
1982 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
1983 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 1984 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 1985 }
67bed84c
SH
1986 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
1987 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
1988 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
1989 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
1990 netdev->supported |= NETDEV_F_40GB_FD;
1991 }
8b61709d 1992 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 1993 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
1994 }
1995 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 1996 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
1997 }
1998 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 1999 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
2000 }
2001 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 2002 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
2003 }
2004 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 2005 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2006 }
2007
2008 /* Advertised features. */
b5d57fc8 2009 netdev->advertised = 0;
8b61709d 2010 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 2011 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
2012 }
2013 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 2014 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
2015 }
2016 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 2017 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
2018 }
2019 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 2020 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
2021 }
2022 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 2023 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 2024 }
67bed84c
SH
2025 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2026 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 2027 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 2028 }
67bed84c
SH
2029 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2030 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2031 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2032 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 2033 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 2034 }
67bed84c
SH
2035 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2036 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2037 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2038 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2039 netdev->advertised |= NETDEV_F_40GB_FD;
2040 }
8b61709d 2041 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 2042 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
2043 }
2044 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 2045 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
2046 }
2047 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 2048 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
2049 }
2050 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 2051 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
2052 }
2053 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 2054 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2055 }
2056
2057 /* Current settings. */
0c615356 2058 speed = ethtool_cmd_speed(&ecmd);
6c038611 2059 if (speed == SPEED_10) {
b5d57fc8 2060 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 2061 } else if (speed == SPEED_100) {
b5d57fc8 2062 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 2063 } else if (speed == SPEED_1000) {
b5d57fc8 2064 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 2065 } else if (speed == SPEED_10000) {
b5d57fc8 2066 netdev->current = NETDEV_F_10GB_FD;
6c038611 2067 } else if (speed == 40000) {
b5d57fc8 2068 netdev->current = NETDEV_F_40GB_FD;
6c038611 2069 } else if (speed == 100000) {
b5d57fc8 2070 netdev->current = NETDEV_F_100GB_FD;
6c038611 2071 } else if (speed == 1000000) {
b5d57fc8 2072 netdev->current = NETDEV_F_1TB_FD;
8b61709d 2073 } else {
b5d57fc8 2074 netdev->current = 0;
8b61709d
BP
2075 }
2076
2077 if (ecmd.port == PORT_TP) {
b5d57fc8 2078 netdev->current |= NETDEV_F_COPPER;
8b61709d 2079 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 2080 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
2081 }
2082
2083 if (ecmd.autoneg) {
b5d57fc8 2084 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
2085 }
2086
51f87458 2087out:
b5d57fc8
BP
2088 netdev->cache_valid |= VALID_FEATURES;
2089 netdev->get_features_error = error;
51f87458
PS
2090}
2091
887ed8b2
BP
2092/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2093 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2094 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2095static int
2096netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2097 enum netdev_features *current,
2098 enum netdev_features *advertised,
2099 enum netdev_features *supported,
2100 enum netdev_features *peer)
51f87458 2101{
b5d57fc8 2102 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2103 int error;
51f87458 2104
86383816 2105 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2106 netdev_linux_read_features(netdev);
b5d57fc8
BP
2107 if (!netdev->get_features_error) {
2108 *current = netdev->current;
2109 *advertised = netdev->advertised;
2110 *supported = netdev->supported;
887ed8b2 2111 *peer = 0; /* XXX */
51f87458 2112 }
86383816
BP
2113 error = netdev->get_features_error;
2114 ovs_mutex_unlock(&netdev->mutex);
2115
2116 return error;
8b61709d
BP
2117}
2118
2119/* Set the features advertised by 'netdev' to 'advertise'. */
2120static int
86383816 2121netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2122 enum netdev_features advertise)
8b61709d 2123{
86383816 2124 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2125 struct ethtool_cmd ecmd;
2126 int error;
2127
86383816
BP
2128 ovs_mutex_lock(&netdev->mutex);
2129
ab985a77 2130 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2131 memset(&ecmd, 0, sizeof ecmd);
86383816 2132 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2133 ETHTOOL_GSET, "ETHTOOL_GSET");
2134 if (error) {
86383816 2135 goto exit;
8b61709d
BP
2136 }
2137
2138 ecmd.advertising = 0;
6c038611 2139 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2140 ecmd.advertising |= ADVERTISED_10baseT_Half;
2141 }
6c038611 2142 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2143 ecmd.advertising |= ADVERTISED_10baseT_Full;
2144 }
6c038611 2145 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2146 ecmd.advertising |= ADVERTISED_100baseT_Half;
2147 }
6c038611 2148 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2149 ecmd.advertising |= ADVERTISED_100baseT_Full;
2150 }
6c038611 2151 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2152 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2153 }
6c038611 2154 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2155 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2156 }
6c038611 2157 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2158 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2159 }
6c038611 2160 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2161 ecmd.advertising |= ADVERTISED_TP;
2162 }
6c038611 2163 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2164 ecmd.advertising |= ADVERTISED_FIBRE;
2165 }
6c038611 2166 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2167 ecmd.advertising |= ADVERTISED_Autoneg;
2168 }
6c038611 2169 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2170 ecmd.advertising |= ADVERTISED_Pause;
2171 }
6c038611 2172 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2173 ecmd.advertising |= ADVERTISED_Asym_Pause;
2174 }
ab985a77 2175 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2176 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2177 ETHTOOL_SSET, "ETHTOOL_SSET");
2178
2179exit:
2180 ovs_mutex_unlock(&netdev->mutex);
2181 return error;
8b61709d
BP
2182}
2183
f8500004
JP
2184/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2185 * successful, otherwise a positive errno value. */
8b61709d 2186static int
b5d57fc8 2187netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2188 uint32_t kbits_rate, uint32_t kbits_burst)
2189{
b5d57fc8
BP
2190 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2191 const char *netdev_name = netdev_get_name(netdev_);
7874bdff 2192 int ifindex;
f8500004 2193 int error;
8b61709d 2194
d5ae4a60
PB
2195 if (netdev_is_flow_api_enabled()) {
2196 if (kbits_rate) {
2197 VLOG_WARN_RL(&rl, "%s: policing with offload isn't supported",
2198 netdev_name);
2199 }
2200 return EOPNOTSUPP;
2201 }
2202
80a86fbe 2203 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2204 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2205 : kbits_burst); /* Stick with user-specified value. */
2206
86383816 2207 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2208 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2209 error = netdev->netdev_policing_error;
2210 if (error || (netdev->kbits_rate == kbits_rate &&
2211 netdev->kbits_burst == kbits_burst)) {
c9f71668 2212 /* Assume that settings haven't changed since we last set them. */
86383816 2213 goto out;
c9f71668 2214 }
b5d57fc8 2215 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2216 }
2217
7874bdff
RD
2218 error = get_ifindex(netdev_, &ifindex);
2219 if (error) {
2220 goto out;
2221 }
2222
ac8c3412 2223 COVERAGE_INC(netdev_set_policing);
f8500004 2224 /* Remove any existing ingress qdisc. */
7874bdff 2225 error = tc_add_del_ingress_qdisc(ifindex, false);
f8500004
JP
2226 if (error) {
2227 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2228 netdev_name, ovs_strerror(error));
c9f71668 2229 goto out;
f8500004
JP
2230 }
2231
8b61709d 2232 if (kbits_rate) {
7874bdff 2233 error = tc_add_del_ingress_qdisc(ifindex, true);
f8500004
JP
2234 if (error) {
2235 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2236 netdev_name, ovs_strerror(error));
c9f71668 2237 goto out;
8b61709d
BP
2238 }
2239
b5d57fc8 2240 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
2241 if (error){
2242 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2243 netdev_name, ovs_strerror(error));
c9f71668 2244 goto out;
8b61709d 2245 }
8b61709d
BP
2246 }
2247
b5d57fc8
BP
2248 netdev->kbits_rate = kbits_rate;
2249 netdev->kbits_burst = kbits_burst;
f8500004 2250
c9f71668
PS
2251out:
2252 if (!error || error == ENODEV) {
b5d57fc8
BP
2253 netdev->netdev_policing_error = error;
2254 netdev->cache_valid |= VALID_POLICING;
c9f71668 2255 }
86383816 2256 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2257 return error;
8b61709d
BP
2258}
2259
c1c9c9c4
BP
2260static int
2261netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2262 struct sset *types)
c1c9c9c4 2263{
559eb230 2264 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2265 for (opsp = tcs; *opsp != NULL; opsp++) {
2266 const struct tc_ops *ops = *opsp;
2267 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2268 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2269 }
2270 }
2271 return 0;
2272}
2273
2274static const struct tc_ops *
2275tc_lookup_ovs_name(const char *name)
2276{
559eb230 2277 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2278
2279 for (opsp = tcs; *opsp != NULL; opsp++) {
2280 const struct tc_ops *ops = *opsp;
2281 if (!strcmp(name, ops->ovs_name)) {
2282 return ops;
2283 }
2284 }
2285 return NULL;
2286}
2287
2288static const struct tc_ops *
2289tc_lookup_linux_name(const char *name)
2290{
559eb230 2291 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2292
2293 for (opsp = tcs; *opsp != NULL; opsp++) {
2294 const struct tc_ops *ops = *opsp;
2295 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2296 return ops;
2297 }
2298 }
2299 return NULL;
2300}
2301
93b13be8 2302static struct tc_queue *
b5d57fc8 2303tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2304 size_t hash)
2305{
b5d57fc8 2306 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2307 struct tc_queue *queue;
2308
b5d57fc8 2309 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2310 if (queue->queue_id == queue_id) {
2311 return queue;
2312 }
2313 }
2314 return NULL;
2315}
2316
/* Returns the queue of 'netdev' whose id is 'queue_id', or NULL if there is
 * none.  The bucket to search is derived with hash_int(queue_id, 0). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    size_t bucket_hash = hash_int(queue_id, 0);

    return tc_find_queue__(netdev, queue_id, bucket_hash);
}
2322
c1c9c9c4
BP
2323static int
2324netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2325 const char *type,
2326 struct netdev_qos_capabilities *caps)
2327{
2328 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2329 if (!ops) {
2330 return EOPNOTSUPP;
2331 }
2332 caps->n_queues = ops->n_queues;
2333 return 0;
2334}
2335
2336static int
b5d57fc8 2337netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2338 const char **typep, struct smap *details)
c1c9c9c4 2339{
b5d57fc8 2340 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2341 int error;
2342
86383816 2343 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2344 error = tc_query_qdisc(netdev_);
86383816
BP
2345 if (!error) {
2346 *typep = netdev->tc->ops->ovs_name;
2347 error = (netdev->tc->ops->qdisc_get
2348 ? netdev->tc->ops->qdisc_get(netdev_, details)
2349 : 0);
c1c9c9c4 2350 }
86383816 2351 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2352
86383816 2353 return error;
c1c9c9c4
BP
2354}
2355
2356static int
b5d57fc8 2357netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2358 const char *type, const struct smap *details)
c1c9c9c4 2359{
b5d57fc8 2360 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2361 const struct tc_ops *new_ops;
2362 int error;
2363
2364 new_ops = tc_lookup_ovs_name(type);
2365 if (!new_ops || !new_ops->tc_install) {
2366 return EOPNOTSUPP;
2367 }
2368
6cf888b8
BS
2369 if (new_ops == &tc_ops_noop) {
2370 return new_ops->tc_install(netdev_, details);
2371 }
2372
86383816 2373 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2374 error = tc_query_qdisc(netdev_);
c1c9c9c4 2375 if (error) {
86383816 2376 goto exit;
c1c9c9c4
BP
2377 }
2378
b5d57fc8 2379 if (new_ops == netdev->tc->ops) {
86383816 2380 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2381 } else {
2382 /* Delete existing qdisc. */
b5d57fc8 2383 error = tc_del_qdisc(netdev_);
c1c9c9c4 2384 if (error) {
86383816 2385 goto exit;
c1c9c9c4 2386 }
b5d57fc8 2387 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2388
2389 /* Install new qdisc. */
b5d57fc8
BP
2390 error = new_ops->tc_install(netdev_, details);
2391 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2392 }
86383816
BP
2393
2394exit:
2395 ovs_mutex_unlock(&netdev->mutex);
2396 return error;
c1c9c9c4
BP
2397}
2398
2399static int
b5d57fc8 2400netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2401 unsigned int queue_id, struct smap *details)
c1c9c9c4 2402{
b5d57fc8 2403 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2404 int error;
2405
86383816 2406 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2407 error = tc_query_qdisc(netdev_);
86383816 2408 if (!error) {
b5d57fc8 2409 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2410 error = (queue
b5d57fc8 2411 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2412 : ENOENT);
c1c9c9c4 2413 }
86383816
BP
2414 ovs_mutex_unlock(&netdev->mutex);
2415
2416 return error;
c1c9c9c4
BP
2417}
2418
2419static int
b5d57fc8 2420netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2421 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2422{
b5d57fc8 2423 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2424 int error;
2425
86383816 2426 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2427 error = tc_query_qdisc(netdev_);
86383816
BP
2428 if (!error) {
2429 error = (queue_id < netdev->tc->ops->n_queues
2430 && netdev->tc->ops->class_set
2431 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2432 : EINVAL);
c1c9c9c4 2433 }
86383816 2434 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2435
86383816 2436 return error;
c1c9c9c4
BP
2437}
2438
2439static int
b5d57fc8 2440netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2441{
b5d57fc8 2442 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2443 int error;
2444
86383816 2445 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2446 error = tc_query_qdisc(netdev_);
86383816
BP
2447 if (!error) {
2448 if (netdev->tc->ops->class_delete) {
2449 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2450 error = (queue
2451 ? netdev->tc->ops->class_delete(netdev_, queue)
2452 : ENOENT);
2453 } else {
2454 error = EINVAL;
2455 }
c1c9c9c4 2456 }
86383816
BP
2457 ovs_mutex_unlock(&netdev->mutex);
2458
2459 return error;
c1c9c9c4
BP
2460}
2461
2462static int
b5d57fc8 2463netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2464 unsigned int queue_id,
2465 struct netdev_queue_stats *stats)
2466{
b5d57fc8 2467 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2468 int error;
2469
86383816 2470 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2471 error = tc_query_qdisc(netdev_);
86383816
BP
2472 if (!error) {
2473 if (netdev->tc->ops->class_get_stats) {
2474 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2475 if (queue) {
2476 stats->created = queue->created;
2477 error = netdev->tc->ops->class_get_stats(netdev_, queue,
2478 stats);
2479 } else {
2480 error = ENOENT;
2481 }
2482 } else {
2483 error = EOPNOTSUPP;
6dc34a0d 2484 }
c1c9c9c4 2485 }
86383816
BP
2486 ovs_mutex_unlock(&netdev->mutex);
2487
2488 return error;
c1c9c9c4
BP
2489}
2490
d57695d7
JS
2491struct queue_dump_state {
2492 struct nl_dump dump;
2493 struct ofpbuf buf;
2494};
2495
23a98ffe 2496static bool
d57695d7 2497start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
2498{
2499 struct ofpbuf request;
2500 struct tcmsg *tcmsg;
2501
7874bdff 2502 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
2503 if (!tcmsg) {
2504 return false;
2505 }
3c4de644 2506 tcmsg->tcm_parent = 0;
d57695d7 2507 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 2508 ofpbuf_uninit(&request);
d57695d7
JS
2509
2510 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 2511 return true;
c1c9c9c4
BP
2512}
2513
d57695d7
JS
2514static int
2515finish_queue_dump(struct queue_dump_state *state)
2516{
2517 ofpbuf_uninit(&state->buf);
2518 return nl_dump_done(&state->dump);
2519}
2520
89454bf4
BP
/* Iteration state for netdev_linux_queue_dump_{start,next,done}(): a snapshot
 * of the queue ids that existed when the dump was started. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Array of 'n_queues' queue ids. */
    size_t cur_queue;           /* Index in 'queues' of the next id to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2526
c1c9c9c4 2527static int
89454bf4 2528netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 2529{
89454bf4 2530 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2531 int error;
2532
86383816 2533 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2534 error = tc_query_qdisc(netdev_);
86383816
BP
2535 if (!error) {
2536 if (netdev->tc->ops->class_get) {
89454bf4
BP
2537 struct netdev_linux_queue_state *state;
2538 struct tc_queue *queue;
2539 size_t i;
2540
2541 *statep = state = xmalloc(sizeof *state);
2542 state->n_queues = hmap_count(&netdev->tc->queues);
2543 state->cur_queue = 0;
2544 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
2545
2546 i = 0;
2547 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
2548 state->queues[i++] = queue->queue_id;
86383816 2549 }
c1c9c9c4 2550 } else {
86383816 2551 error = EOPNOTSUPP;
c1c9c9c4
BP
2552 }
2553 }
86383816 2554 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2555
86383816 2556 return error;
c1c9c9c4
BP
2557}
2558
89454bf4
BP
2559static int
2560netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
2561 unsigned int *queue_idp, struct smap *details)
2562{
2563 const struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2564 struct netdev_linux_queue_state *state = state_;
2565 int error = EOF;
2566
2567 ovs_mutex_lock(&netdev->mutex);
2568 while (state->cur_queue < state->n_queues) {
2569 unsigned int queue_id = state->queues[state->cur_queue++];
2570 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2571
2572 if (queue) {
2573 *queue_idp = queue_id;
2574 error = netdev->tc->ops->class_get(netdev_, queue, details);
2575 break;
2576 }
2577 }
2578 ovs_mutex_unlock(&netdev->mutex);
2579
2580 return error;
2581}
2582
2583static int
2584netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
2585 void *state_)
2586{
2587 struct netdev_linux_queue_state *state = state_;
2588
2589 free(state->queues);
2590 free(state);
2591 return 0;
2592}
2593
c1c9c9c4 2594static int
b5d57fc8 2595netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2596 netdev_dump_queue_stats_cb *cb, void *aux)
2597{
b5d57fc8 2598 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2599 int error;
2600
86383816 2601 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 2602 error = tc_query_qdisc(netdev_);
86383816 2603 if (!error) {
d57695d7 2604 struct queue_dump_state state;
c1c9c9c4 2605
86383816
BP
2606 if (!netdev->tc->ops->class_dump_stats) {
2607 error = EOPNOTSUPP;
d57695d7 2608 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
2609 error = ENODEV;
2610 } else {
2611 struct ofpbuf msg;
2612 int retval;
2613
d57695d7 2614 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
2615 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
2616 cb, aux);
2617 if (retval) {
2618 error = retval;
2619 }
2620 }
2621
d57695d7 2622 retval = finish_queue_dump(&state);
86383816
BP
2623 if (retval) {
2624 error = retval;
2625 }
c1c9c9c4
BP
2626 }
2627 }
86383816 2628 ovs_mutex_unlock(&netdev->mutex);
c1c9c9c4 2629
86383816 2630 return error;
c1c9c9c4
BP
2631}
2632
8b61709d 2633static int
f1acd62b
BP
2634netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
2635 struct in_addr netmask)
8b61709d 2636{
b5d57fc8 2637 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2638 int error;
2639
86383816 2640 ovs_mutex_lock(&netdev->mutex);
f1acd62b 2641 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 2642 if (!error) {
f1acd62b 2643 if (address.s_addr != INADDR_ANY) {
8b61709d 2644 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 2645 "SIOCSIFNETMASK", netmask);
8b61709d
BP
2646 }
2647 }
49af9a3d 2648
86383816
BP
2649 ovs_mutex_unlock(&netdev->mutex);
2650
8b61709d
BP
2651 return error;
2652}
2653
7df6932e
AW
2654/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2655 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2656 * error. */
8b61709d 2657static int
a8704b50
PS
2658netdev_linux_get_addr_list(const struct netdev *netdev_,
2659 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 2660{
b5d57fc8 2661 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 2662 int error;
86383816
BP
2663
2664 ovs_mutex_lock(&netdev->mutex);
a8704b50 2665 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816
BP
2666 ovs_mutex_unlock(&netdev->mutex);
2667
7df6932e 2668 return error;
8b61709d
BP
2669}
2670
/* Initializes '*sa' as an AF_INET socket address for 'addr' with port 0.
 * Every byte of '*sa' that the IPv4 address does not use is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Build the address in a properly typed, fully zeroed temporary... */
    memset(&in4, 0, sizeof(in4));
    in4.sin_port = 0;
    in4.sin_addr = addr;
    in4.sin_family = AF_INET;

    /* ...then copy it over the cleared generic structure. */
    memset(sa, 0, sizeof(*sa));
    memcpy(sa, &in4, sizeof(in4));
}
2683
2684static int
2685do_set_addr(struct netdev *netdev,
2686 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2687{
2688 struct ifreq ifr;
149f577a 2689
259e0b1a
BP
2690 make_in4_sockaddr(&ifr.ifr_addr, addr);
2691 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2692 ioctl_name);
8b61709d
BP
2693}
2694
2695/* Adds 'router' as a default IP gateway. */
2696static int
67a4917b 2697netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
2698{
2699 struct in_addr any = { INADDR_ANY };
2700 struct rtentry rt;
2701 int error;
2702
2703 memset(&rt, 0, sizeof rt);
2704 make_in4_sockaddr(&rt.rt_dst, any);
2705 make_in4_sockaddr(&rt.rt_gateway, router);
2706 make_in4_sockaddr(&rt.rt_genmask, any);
2707 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 2708 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 2709 if (error) {
10a89ef0 2710 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
2711 }
2712 return error;
2713}
2714
f1acd62b
BP
2715static int
2716netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2717 char **netdev_name)
2718{
2719 static const char fn[] = "/proc/net/route";
2720 FILE *stream;
2721 char line[256];
2722 int ln;
2723
2724 *netdev_name = NULL;
2725 stream = fopen(fn, "r");
2726 if (stream == NULL) {
10a89ef0 2727 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
2728 return errno;
2729 }
2730
2731 ln = 0;
2732 while (fgets(line, sizeof line, stream)) {
2733 if (++ln >= 2) {
2734 char iface[17];
dbba996b 2735 ovs_be32 dest, gateway, mask;
f1acd62b
BP
2736 int refcnt, metric, mtu;
2737 unsigned int flags, use, window, irtt;
2738
c2c28dfd
BP
2739 if (!ovs_scan(line,
2740 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2741 " %d %u %u\n",
2742 iface, &dest, &gateway, &flags, &refcnt,
2743 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 2744 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
2745 fn, ln, line);
2746 continue;
2747 }
2748 if (!(flags & RTF_UP)) {
2749 /* Skip routes that aren't up. */
2750 continue;
2751 }
2752
2753 /* The output of 'dest', 'mask', and 'gateway' were given in
d295e8e9 2754 * network byte order, so we don't need need any endian
f1acd62b
BP
2755 * conversions here. */
2756 if ((dest & mask) == (host->s_addr & mask)) {
2757 if (!gateway) {
2758 /* The host is directly reachable. */
2759 next_hop->s_addr = 0;
2760 } else {
2761 /* To reach the host, we must go through a gateway. */
2762 next_hop->s_addr = gateway;
2763 }
2764 *netdev_name = xstrdup(iface);
2765 fclose(stream);
2766 return 0;
2767 }
2768 }
2769 }
2770
2771 fclose(stream);
2772 return ENXIO;
2773}
2774
e210037e 2775static int
b5d57fc8 2776netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 2777{
b5d57fc8 2778 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
2779 int error = 0;
2780
86383816 2781 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
2782 if (!(netdev->cache_valid & VALID_DRVINFO)) {
2783 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
2784
2785 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
2786 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
2787 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
2788 cmd,
2789 ETHTOOL_GDRVINFO,
2790 "ETHTOOL_GDRVINFO");
2791 if (!error) {
b5d57fc8 2792 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
2793 }
2794 }
e210037e 2795
e210037e 2796 if (!error) {
b5d57fc8
BP
2797 smap_add(smap, "driver_name", netdev->drvinfo.driver);
2798 smap_add(smap, "driver_version", netdev->drvinfo.version);
2799 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 2800 }
86383816
BP
2801 ovs_mutex_unlock(&netdev->mutex);
2802
e210037e
AE
2803 return error;
2804}
2805
4f925bd3 2806static int
275707c3
EJ
2807netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
2808 struct smap *smap)
4f925bd3 2809{
79f1cbe9 2810 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
2811 return 0;
2812}
2813
8b61709d
BP
2814/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2815 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2816 * returns 0. Otherwise, it returns a positive errno value; in particular,
2817 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2818static int
2819netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 2820 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
2821{
2822 struct arpreq r;
c100e025 2823 struct sockaddr_in sin;
8b61709d
BP
2824 int retval;
2825
2826 memset(&r, 0, sizeof r);
f2cc621b 2827 memset(&sin, 0, sizeof sin);
c100e025
BP
2828 sin.sin_family = AF_INET;
2829 sin.sin_addr.s_addr = ip;
2830 sin.sin_port = 0;
2831 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
2832 r.arp_ha.sa_family = ARPHRD_ETHER;
2833 r.arp_flags = 0;
71d7c22f 2834 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 2835 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 2836 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
2837 if (!retval) {
2838 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2839 } else if (retval != ENXIO) {
2840 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
2841 netdev_get_name(netdev), IP_ARGS(ip),
2842 ovs_strerror(retval));
8b61709d
BP
2843 }
2844 return retval;
2845}
2846
2847static int
2848nd_to_iff_flags(enum netdev_flags nd)
2849{
2850 int iff = 0;
2851 if (nd & NETDEV_UP) {
2852 iff |= IFF_UP;
2853 }
2854 if (nd & NETDEV_PROMISC) {
2855 iff |= IFF_PROMISC;
2856 }
7ba19d41
AC
2857 if (nd & NETDEV_LOOPBACK) {
2858 iff |= IFF_LOOPBACK;
2859 }
8b61709d
BP
2860 return iff;
2861}
2862
2863static int
2864iff_to_nd_flags(int iff)
2865{
2866 enum netdev_flags nd = 0;
2867 if (iff & IFF_UP) {
2868 nd |= NETDEV_UP;
2869 }
2870 if (iff & IFF_PROMISC) {
2871 nd |= NETDEV_PROMISC;
2872 }
7ba19d41
AC
2873 if (iff & IFF_LOOPBACK) {
2874 nd |= NETDEV_LOOPBACK;
2875 }
8b61709d
BP
2876 return nd;
2877}
2878
2879static int
4f9f3f21
BP
2880update_flags(struct netdev_linux *netdev, enum netdev_flags off,
2881 enum netdev_flags on, enum netdev_flags *old_flagsp)
2882 OVS_REQUIRES(netdev->mutex)
8b61709d
BP
2883{
2884 int old_flags, new_flags;
c37d4da4
EJ
2885 int error = 0;
2886
b5d57fc8 2887 old_flags = netdev->ifi_flags;
c37d4da4
EJ
2888 *old_flagsp = iff_to_nd_flags(old_flags);
2889 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2890 if (new_flags != old_flags) {
4f9f3f21
BP
2891 error = set_flags(netdev_get_name(&netdev->up), new_flags);
2892 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 2893 }
4f9f3f21
BP
2894
2895 return error;
2896}
2897
2898static int
2899netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
2900 enum netdev_flags on, enum netdev_flags *old_flagsp)
2901{
2902 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756819dd 2903 int error = 0;
4f9f3f21
BP
2904
2905 ovs_mutex_lock(&netdev->mutex);
756819dd
FL
2906 if (on || off) {
2907 /* Changing flags over netlink isn't support yet. */
2908 error = update_flags(netdev, off, on, old_flagsp);
2909 } else {
2910 /* Try reading flags over netlink, or fall back to ioctl. */
2911 if (!netdev_linux_update_via_netlink(netdev)) {
2912 *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
2913 } else {
2914 error = update_flags(netdev, off, on, old_flagsp);
2915 }
2916 }
86383816 2917 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
2918 return error;
2919}
2920
/* Expands to a brace-enclosed, positional initializer; below it is used to
 * initialize the 'struct netdev_class' objects netdev_linux_class,
 * netdev_tap_class, and netdev_internal_class.
 *
 * NAME, CONSTRUCT, GET_STATS, GET_FEATURES, and GET_STATUS fill the slots
 * that differ between those classes; every other slot is the same for all
 * of them.  FLOW_OFFLOAD_API is pasted in last and presumably expands to
 * the trailing flow-offload slots (its definition is not in this part of
 * the file -- confirm).
 *
 * Because the initializer is positional, the order of entries here must
 * track the member order of the structure being initialized. */
2f9dd77f 2921#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
18ebd48c
PB
2922 GET_FEATURES, GET_STATUS, \
2923 FLOW_OFFLOAD_API) \
c3827f61
BP
2924{ \
2925 NAME, \
118c77b1 2926 false, /* is_pmd */ \
c3827f61 2927 \
259e0b1a 2928 NULL, \
c3827f61
BP
2929 netdev_linux_run, \
2930 netdev_linux_wait, \
2931 \
9dc63482
BP
2932 netdev_linux_alloc, \
2933 CONSTRUCT, \
2934 netdev_linux_destruct, \
2935 netdev_linux_dealloc, \
de5cdb90 2936 NULL, /* get_config */ \
6d9e6eb4 2937 NULL, /* set_config */ \
f431bf7d 2938 NULL, /* get_tunnel_config */ \
a36de779
PS
2939 NULL, /* build header */ \
2940 NULL, /* push header */ \
2941 NULL, /* pop header */ \
7dec44fe 2942 NULL, /* get_numa_id */ \
050c60bf 2943 NULL, /* set_tx_multiq */ \
c3827f61 2944 \
c3827f61
BP
2945 netdev_linux_send, \
2946 netdev_linux_send_wait, \
2947 \
2948 netdev_linux_set_etheraddr, \
2949 netdev_linux_get_etheraddr, \
2950 netdev_linux_get_mtu, \
9b020780 2951 netdev_linux_set_mtu, \
c3827f61
BP
2952 netdev_linux_get_ifindex, \
2953 netdev_linux_get_carrier, \
65c3058c 2954 netdev_linux_get_carrier_resets, \
1670c579 2955 netdev_linux_set_miimon_interval, \
f613a0d7 2956 GET_STATS, \
971f4b39 2957 NULL, \
c3827f61 2958 \
51f87458 2959 GET_FEATURES, \
c3827f61 2960 netdev_linux_set_advertisements, \
875ab130 2961 NULL, /* get_pt_mode */ \
c3827f61
BP
2962 \
2963 netdev_linux_set_policing, \
2964 netdev_linux_get_qos_types, \
2965 netdev_linux_get_qos_capabilities, \
2966 netdev_linux_get_qos, \
2967 netdev_linux_set_qos, \
2968 netdev_linux_get_queue, \
2969 netdev_linux_set_queue, \
2970 netdev_linux_delete_queue, \
2971 netdev_linux_get_queue_stats, \
89454bf4
BP
2972 netdev_linux_queue_dump_start, \
2973 netdev_linux_queue_dump_next, \
2974 netdev_linux_queue_dump_done, \
c3827f61
BP
2975 netdev_linux_dump_queue_stats, \
2976 \
c3827f61 2977 netdev_linux_set_in4, \
a8704b50 2978 netdev_linux_get_addr_list, \
c3827f61
BP
2979 netdev_linux_add_router, \
2980 netdev_linux_get_next_hop, \
4f925bd3 2981 GET_STATUS, \
c3827f61
BP
2982 netdev_linux_arp_lookup, \
2983 \
2984 netdev_linux_update_flags, \
790fb3b7 2985 NULL, /* reconfigure */ \
c3827f61 2986 \
f7791740
PS
2987 netdev_linux_rxq_alloc, \
2988 netdev_linux_rxq_construct, \
2989 netdev_linux_rxq_destruct, \
2990 netdev_linux_rxq_dealloc, \
2991 netdev_linux_rxq_recv, \
2992 netdev_linux_rxq_wait, \
2993 netdev_linux_rxq_drain, \
18ebd48c
PB
2994 \
2995 FLOW_OFFLOAD_API \
c3827f61
BP
2996}
2997
/* The "system" class.  Built from NETDEV_LINUX_CLASS with the regular
 * netdev_linux_* construct/stats/features/status callbacks and
 * LINUX_FLOW_OFFLOAD_API as its flow-offload part. */
2998const struct netdev_class netdev_linux_class =
2999 NETDEV_LINUX_CLASS(
3000 "system",
9dc63482 3001 netdev_linux_construct,
f613a0d7 3002 netdev_linux_get_stats,
51f87458 3003 netdev_linux_get_features,
18ebd48c
PB
3004 netdev_linux_get_status,
3005 LINUX_FLOW_OFFLOAD_API);
c3827f61
BP
3006
/* The "tap" class.  Differs from netdev_linux_class in its constructor
 * (netdev_linux_construct_tap), its stats callback (netdev_tap_get_stats),
 * and in using NO_OFFLOAD_API for the flow-offload part. */
3007const struct netdev_class netdev_tap_class =
3008 NETDEV_LINUX_CLASS(
3009 "tap",
9dc63482 3010 netdev_linux_construct_tap,
bba1e6f3 3011 netdev_tap_get_stats,
51f87458 3012 netdev_linux_get_features,
18ebd48c
PB
3013 netdev_linux_get_status,
3014 NO_OFFLOAD_API);
c3827f61
BP
3015
/* The "internal" class.  Uses the regular constructor but its own stats and
 * status callbacks, provides no get_features callback, and uses
 * NO_OFFLOAD_API for the flow-offload part. */
3016const struct netdev_class netdev_internal_class =
3017 NETDEV_LINUX_CLASS(
3018 "internal",
9dc63482 3019 netdev_linux_construct,
bba1e6f3 3020 netdev_internal_get_stats,
51f87458 3021 NULL, /* get_features */
18ebd48c
PB
3022 netdev_internal_get_status,
3023 NO_OFFLOAD_API);
8b61709d 3024\f
677d9158
JV
3025
3026#define CODEL_N_QUEUES 0x0000
3027
2f4298ce
BP
3028/* In sufficiently new kernel headers these are defined as enums in
3029 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3030 * kernels. (This overrides any enum definition in the header file but that's
3031 * harmless.) */
3032#define TCA_CODEL_TARGET 1
3033#define TCA_CODEL_LIMIT 2
3034#define TCA_CODEL_INTERVAL 3
3035
677d9158
JV
/* Per-netdev CoDel qdisc state.  'tc' is the embedded generic tc object that
 * 'netdev->tc' points at; codel_get__() recovers this struct from it with
 * CONTAINER_OF. */
3036struct codel {
3037 struct tc tc;
3038 uint32_t target; /* Sent to the kernel as TCA_CODEL_TARGET. */
3039 uint32_t limit; /* Sent to the kernel as TCA_CODEL_LIMIT. */
3040 uint32_t interval; /* Sent to the kernel as TCA_CODEL_INTERVAL. */
3041};
3042
3043static struct codel *
3044codel_get__(const struct netdev *netdev_)
3045{
3046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3047 return CONTAINER_OF(netdev->tc, struct codel, tc);
3048}
3049
3050static void
3051codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3052 uint32_t interval)
3053{
3054 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3055 struct codel *codel;
3056
3057 codel = xmalloc(sizeof *codel);
3058 tc_init(&codel->tc, &tc_ops_codel);
3059 codel->target = target;
3060 codel->limit = limit;
3061 codel->interval = interval;
3062
3063 netdev->tc = &codel->tc;
3064}
3065
3066static int
3067codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3068 uint32_t interval)
3069{
3070 size_t opt_offset;
3071 struct ofpbuf request;
3072 struct tcmsg *tcmsg;
3073 uint32_t otarget, olimit, ointerval;
3074 int error;
3075
3076 tc_del_qdisc(netdev);
3077
7874bdff
RD
3078 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3079 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3080 if (!tcmsg) {
3081 return ENODEV;
3082 }
3083 tcmsg->tcm_handle = tc_make_handle(1, 0);
3084 tcmsg->tcm_parent = TC_H_ROOT;
3085
3086 otarget = target ? target : 5000;
3087 olimit = limit ? limit : 10240;
3088 ointerval = interval ? interval : 100000;
3089
3090 nl_msg_put_string(&request, TCA_KIND, "codel");
3091 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3092 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3093 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3094 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3095 nl_msg_end_nested(&request, opt_offset);
3096
3097 error = tc_transact(&request, NULL);
3098 if (error) {
3099 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3100 "target %u, limit %u, interval %u error %d(%s)",
3101 netdev_get_name(netdev),
3102 otarget, olimit, ointerval,
3103 error, ovs_strerror(error));
3104 }
3105 return error;
3106}
3107
3108static void
3109codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3110 const struct smap *details, struct codel *codel)
3111{
13c1637f
BP
3112 codel->target = smap_get_ullong(details, "target", 0);
3113 codel->limit = smap_get_ullong(details, "limit", 0);
3114 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3115
3116 if (!codel->target) {
3117 codel->target = 5000;
3118 }
3119 if (!codel->limit) {
3120 codel->limit = 10240;
3121 }
3122 if (!codel->interval) {
3123 codel->interval = 100000;
3124 }
3125}
3126
3127static int
3128codel_tc_install(struct netdev *netdev, const struct smap *details)
3129{
3130 int error;
3131 struct codel codel;
3132
3133 codel_parse_qdisc_details__(netdev, details, &codel);
3134 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3135 codel.interval);
3136 if (!error) {
3137 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3138 }
3139 return error;
3140}
3141
3142static int
3143codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3144{
3145 static const struct nl_policy tca_codel_policy[] = {
3146 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3147 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3148 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3149 };
3150
3151 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3152
3153 if (!nl_parse_nested(nl_options, tca_codel_policy,
3154 attrs, ARRAY_SIZE(tca_codel_policy))) {
3155 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3156 return EPROTO;
3157 }
3158
3159 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3160 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3161 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3162 return 0;
3163}
3164
3165static int
3166codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3167{
3168 struct nlattr *nlattr;
3169 const char * kind;
3170 int error;
3171 struct codel codel;
3172
3173 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3174 if (error != 0) {
3175 return error;
3176 }
3177
3178 error = codel_parse_tca_options__(nlattr, &codel);
3179 if (error != 0) {
3180 return error;
3181 }
3182
3183 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3184 return 0;
3185}
3186
3187
3188static void
3189codel_tc_destroy(struct tc *tc)
3190{
3191 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3192 tc_destroy(tc);
3193 free(codel);
3194}
3195
3196static int
3197codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3198{
3199 const struct codel *codel = codel_get__(netdev);
3200 smap_add_format(details, "target", "%u", codel->target);
3201 smap_add_format(details, "limit", "%u", codel->limit);
3202 smap_add_format(details, "interval", "%u", codel->interval);
3203 return 0;
3204}
3205
3206static int
3207codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3208{
3209 struct codel codel;
3210
3211 codel_parse_qdisc_details__(netdev, details, &codel);
3212 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3213 codel_get__(netdev)->target = codel.target;
3214 codel_get__(netdev)->limit = codel.limit;
3215 codel_get__(netdev)->interval = codel.interval;
3216 return 0;
3217}
3218
/* Operations table for the CoDel qdisc.  After the two names and the queue
 * count come the five callbacks implemented above; the five trailing slots
 * are left NULL (which operations they stand for is defined by
 * 'struct tc_ops', outside this part of the file). */
3219static const struct tc_ops tc_ops_codel = {
3220 "codel", /* linux_name */
3221 "linux-codel", /* ovs_name */
3222 CODEL_N_QUEUES, /* n_queues */
3223 codel_tc_install,
3224 codel_tc_load,
3225 codel_tc_destroy,
3226 codel_qdisc_get,
3227 codel_qdisc_set,
3228 NULL,
3229 NULL,
3230 NULL,
3231 NULL,
3232 NULL
3233};
3234\f
3235/* FQ-CoDel traffic control class. */
3236
3237#define FQCODEL_N_QUEUES 0x0000
3238
2f4298ce
BP
3239/* In sufficiently new kernel headers these are defined as enums in
3240 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3241 * kernels. (This overrides any enum definition in the header file but that's
3242 * harmless.) */
3243#define TCA_FQ_CODEL_TARGET 1
3244#define TCA_FQ_CODEL_LIMIT 2
3245#define TCA_FQ_CODEL_INTERVAL 3
3246#define TCA_FQ_CODEL_ECN 4
3247#define TCA_FQ_CODEL_FLOWS 5
3248#define TCA_FQ_CODEL_QUANTUM 6
3249
677d9158
JV
/* Per-netdev FQ-CoDel qdisc state.  'tc' is the embedded generic tc object
 * that 'netdev->tc' points at; fqcodel_get__() recovers this struct from it
 * with CONTAINER_OF. */
3250struct fqcodel {
3251 struct tc tc;
3252 uint32_t target; /* Sent to the kernel as TCA_FQ_CODEL_TARGET. */
3253 uint32_t limit; /* Sent to the kernel as TCA_FQ_CODEL_LIMIT. */
3254 uint32_t interval; /* Sent to the kernel as TCA_FQ_CODEL_INTERVAL. */
3255 uint32_t flows; /* Sent to the kernel as TCA_FQ_CODEL_FLOWS. */
3256 uint32_t quantum; /* Sent to the kernel as TCA_FQ_CODEL_QUANTUM. */
3257};
3258
3259static struct fqcodel *
3260fqcodel_get__(const struct netdev *netdev_)
3261{
3262 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3263 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3264}
3265
3266static void
3267fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3268 uint32_t interval, uint32_t flows, uint32_t quantum)
3269{
3270 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3271 struct fqcodel *fqcodel;
3272
3273 fqcodel = xmalloc(sizeof *fqcodel);
3274 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3275 fqcodel->target = target;
3276 fqcodel->limit = limit;
3277 fqcodel->interval = interval;
3278 fqcodel->flows = flows;
3279 fqcodel->quantum = quantum;
3280
3281 netdev->tc = &fqcodel->tc;
3282}
3283
3284static int
3285fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3286 uint32_t interval, uint32_t flows, uint32_t quantum)
3287{
3288 size_t opt_offset;
3289 struct ofpbuf request;
3290 struct tcmsg *tcmsg;
3291 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3292 int error;
3293
3294 tc_del_qdisc(netdev);
3295
7874bdff
RD
3296 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3297 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3298 if (!tcmsg) {
3299 return ENODEV;
3300 }
3301 tcmsg->tcm_handle = tc_make_handle(1, 0);
3302 tcmsg->tcm_parent = TC_H_ROOT;
3303
3304 otarget = target ? target : 5000;
3305 olimit = limit ? limit : 10240;
3306 ointerval = interval ? interval : 100000;
3307 oflows = flows ? flows : 1024;
3308 oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
3309 not mtu */
3310
3311 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3312 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3313 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3314 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3315 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3316 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3317 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3318 nl_msg_end_nested(&request, opt_offset);
3319
3320 error = tc_transact(&request, NULL);
3321 if (error) {
3322 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3323 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3324 netdev_get_name(netdev),
3325 otarget, olimit, ointerval, oflows, oquantum,
3326 error, ovs_strerror(error));
3327 }
3328 return error;
3329}
3330
3331static void
3332fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3333 const struct smap *details, struct fqcodel *fqcodel)
3334{
13c1637f
BP
3335 fqcodel->target = smap_get_ullong(details, "target", 0);
3336 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3337 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3338 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3339 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3340
677d9158
JV
3341 if (!fqcodel->target) {
3342 fqcodel->target = 5000;
3343 }
3344 if (!fqcodel->limit) {
3345 fqcodel->limit = 10240;
3346 }
3347 if (!fqcodel->interval) {
3348 fqcodel->interval = 1000000;
3349 }
3350 if (!fqcodel->flows) {
3351 fqcodel->flows = 1024;
3352 }
3353 if (!fqcodel->quantum) {
3354 fqcodel->quantum = 1514;
3355 }
3356}
3357
3358static int
3359fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3360{
3361 int error;
3362 struct fqcodel fqcodel;
3363
3364 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3365 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3366 fqcodel.interval, fqcodel.flows,
3367 fqcodel.quantum);
3368 if (!error) {
3369 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3370 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3371 }
3372 return error;
3373}
3374
3375static int
3376fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3377{
3378 static const struct nl_policy tca_fqcodel_policy[] = {
3379 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3380 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3381 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3382 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3383 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3384 };
3385
3386 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3387
3388 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3389 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3390 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3391 return EPROTO;
3392 }
3393
3394 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3395 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
3396 fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3397 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3398 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3399 return 0;
3400}
3401
3402static int
3403fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3404{
3405 struct nlattr *nlattr;
3406 const char * kind;
3407 int error;
3408 struct fqcodel fqcodel;
3409
3410 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3411 if (error != 0) {
3412 return error;
3413 }
3414
3415 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
3416 if (error != 0) {
3417 return error;
3418 }
3419
3420 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3421 fqcodel.flows, fqcodel.quantum);
3422 return 0;
3423}
3424
3425static void
3426fqcodel_tc_destroy(struct tc *tc)
3427{
3428 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
3429 tc_destroy(tc);
3430 free(fqcodel);
3431}
3432
3433static int
3434fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
3435{
3436 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
3437 smap_add_format(details, "target", "%u", fqcodel->target);
3438 smap_add_format(details, "limit", "%u", fqcodel->limit);
3439 smap_add_format(details, "interval", "%u", fqcodel->interval);
3440 smap_add_format(details, "flows", "%u", fqcodel->flows);
3441 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
3442 return 0;
3443}
3444
3445static int
3446fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
3447{
3448 struct fqcodel fqcodel;
3449
3450 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3451 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
3452 fqcodel.flows, fqcodel.quantum);
3453 fqcodel_get__(netdev)->target = fqcodel.target;
3454 fqcodel_get__(netdev)->limit = fqcodel.limit;
3455 fqcodel_get__(netdev)->interval = fqcodel.interval;
3456 fqcodel_get__(netdev)->flows = fqcodel.flows;
3457 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
3458 return 0;
3459}
3460
/* Operations table for the FQ-CoDel qdisc.  After the two names and the
 * queue count come the five callbacks implemented above; the five trailing
 * slots are left NULL (which operations they stand for is defined by
 * 'struct tc_ops', outside this part of the file). */
3461static const struct tc_ops tc_ops_fqcodel = {
3462 "fq_codel", /* linux_name */
3463 "linux-fq_codel", /* ovs_name */
3464 FQCODEL_N_QUEUES, /* n_queues */
3465 fqcodel_tc_install,
3466 fqcodel_tc_load,
3467 fqcodel_tc_destroy,
3468 fqcodel_qdisc_get,
3469 fqcodel_qdisc_set,
3470 NULL,
3471 NULL,
3472 NULL,
3473 NULL,
3474 NULL
3475};
3476\f
3477/* SFQ traffic control class. */
3478
3479#define SFQ_N_QUEUES 0x0000
3480
/* Per-netdev SFQ qdisc state.  'tc' is the embedded generic tc object that
 * 'netdev->tc' points at; sfq_get__() recovers this struct from it with
 * CONTAINER_OF. */
3481struct sfq {
3482 struct tc tc;
3483 uint32_t quantum; /* Sent to the kernel as tc_sfq_qopt.quantum. */
3484 uint32_t perturb; /* Sent to the kernel as tc_sfq_qopt.perturb_period. */
3485};
3486
3487static struct sfq *
3488sfq_get__(const struct netdev *netdev_)
3489{
3490 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3491 return CONTAINER_OF(netdev->tc, struct sfq, tc);
3492}
3493
3494static void
3495sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
3496{
3497 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3498 struct sfq *sfq;
3499
3500 sfq = xmalloc(sizeof *sfq);
3501 tc_init(&sfq->tc, &tc_ops_sfq);
3502 sfq->perturb = perturb;
3503 sfq->quantum = quantum;
3504
3505 netdev->tc = &sfq->tc;
3506}
3507
3508static int
3509sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
3510{
3511 struct tc_sfq_qopt opt;
3512 struct ofpbuf request;
3513 struct tcmsg *tcmsg;
3514 int mtu;
3515 int mtu_error, error;
3516 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
3517
3518 tc_del_qdisc(netdev);
3519
7874bdff
RD
3520 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3521 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3522 if (!tcmsg) {
3523 return ENODEV;
3524 }
3525 tcmsg->tcm_handle = tc_make_handle(1, 0);
3526 tcmsg->tcm_parent = TC_H_ROOT;
3527
3528 memset(&opt, 0, sizeof opt);
3529 if (!quantum) {
3530 if (!mtu_error) {
3531 opt.quantum = mtu; /* if we cannot find mtu, use default */
3532 }
3533 } else {
3534 opt.quantum = quantum;
3535 }
3536
3537 if (!perturb) {
3538 opt.perturb_period = 10;
3539 } else {
3540 opt.perturb_period = perturb;
3541 }
3542
3543 nl_msg_put_string(&request, TCA_KIND, "sfq");
3544 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3545
3546 error = tc_transact(&request, NULL);
3547 if (error) {
3548 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3549 "quantum %u, perturb %u error %d(%s)",
3550 netdev_get_name(netdev),
3551 opt.quantum, opt.perturb_period,
3552 error, ovs_strerror(error));
3553 }
3554 return error;
3555}
3556
3557static void
3558sfq_parse_qdisc_details__(struct netdev *netdev,
3559 const struct smap *details, struct sfq *sfq)
3560{
13c1637f
BP
3561 sfq->perturb = smap_get_ullong(details, "perturb", 0);
3562 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 3563
677d9158
JV
3564 if (!sfq->perturb) {
3565 sfq->perturb = 10;
3566 }
3567
3568 if (!sfq->quantum) {
13c1637f
BP
3569 int mtu;
3570 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
3571 sfq->quantum = mtu;
3572 } else {
3573 VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
3574 "device without mtu");
677d9158
JV
3575 }
3576 }
3577}
3578
3579static int
3580sfq_tc_install(struct netdev *netdev, const struct smap *details)
3581{
3582 int error;
3583 struct sfq sfq;
3584
3585 sfq_parse_qdisc_details__(netdev, details, &sfq);
3586 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
3587 if (!error) {
3588 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3589 }
3590 return error;
3591}
3592
/* Builds the netdev's SFQ tc object from the qdisc description in 'nlmsg'.
 * The options attribute is read as a 'struct tc_sfq_qopt'; its 'quantum' and
 * 'perturb_period' members become the installed quantum and perturb values.
 *
 * Returns 0 on success, or the error from tc_parse_qdisc(), in which case
 * nothing is installed. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error) {
        return error;
    }

    sfq = nl_attr_get(nlattr);
    /* sfq_install__() takes (netdev, quantum, perturb), in that order; the
     * two values used to be passed the wrong way around here. */
    sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
    return 0;
}
3610
3611static void
3612sfq_tc_destroy(struct tc *tc)
3613{
3614 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
3615 tc_destroy(tc);
3616 free(sfq);
3617}
3618
3619static int
3620sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
3621{
3622 const struct sfq *sfq = sfq_get__(netdev);
3623 smap_add_format(details, "quantum", "%u", sfq->quantum);
3624 smap_add_format(details, "perturb", "%u", sfq->perturb);
3625 return 0;
3626}
3627
3628static int
3629sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
3630{
3631 struct sfq sfq;
3632
3633 sfq_parse_qdisc_details__(netdev, details, &sfq);
3634 sfq_install__(netdev, sfq.quantum, sfq.perturb);
3635 sfq_get__(netdev)->quantum = sfq.quantum;
3636 sfq_get__(netdev)->perturb = sfq.perturb;
3637 return 0;
3638}
3639
/* Operations table for the SFQ qdisc.  After the two names and the queue
 * count come the five callbacks implemented above; the five trailing slots
 * are left NULL (which operations they stand for is defined by
 * 'struct tc_ops', outside this part of the file). */
3640static const struct tc_ops tc_ops_sfq = {
3641 "sfq", /* linux_name */
3642 "linux-sfq", /* ovs_name */
3643 SFQ_N_QUEUES, /* n_queues */
3644 sfq_tc_install,
3645 sfq_tc_load,
3646 sfq_tc_destroy,
3647 sfq_qdisc_get,
3648 sfq_qdisc_set,
3649 NULL,
3650 NULL,
3651 NULL,
3652 NULL,
3653 NULL
3654};
3655\f
c1c9c9c4 3656/* HTB traffic control class. */
559843ed 3657
c1c9c9c4 3658#define HTB_N_QUEUES 0xf000
4f631ccd 3659#define HTB_RATE2QUANTUM 10
8b61709d 3660
c1c9c9c4
BP
/* Per-netdev HTB qdisc state.  'tc' is the embedded generic tc object that
 * 'netdev->tc' points at; htb_get__() recovers this struct from it with
 * CONTAINER_OF. */
3661struct htb {
3662 struct tc tc;
3663 unsigned int max_rate; /* In bytes/s. */
3664};
8b61709d 3665
/* Settings of one HTB class.  htb_setup_class__() turns these into the
 * rate, ceil, buffer/cbuffer, and prio fields of the 'struct tc_htb_opt'
 * it sends to the kernel. */
c1c9c9c4 3666struct htb_class {
93b13be8 3667 struct tc_queue tc_queue;
c1c9c9c4
BP
3668 unsigned int min_rate; /* In bytes/s. */
3669 unsigned int max_rate; /* In bytes/s. */
3670 unsigned int burst; /* In bytes. */
3671 unsigned int priority; /* Lower values are higher priorities. */
3672};
8b61709d 3673
c1c9c9c4 3674static struct htb *
b5d57fc8 3675htb_get__(const struct netdev *netdev_)
c1c9c9c4 3676{
b5d57fc8
BP
3677 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3678 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
3679}
3680
24045e35 3681static void
b5d57fc8 3682htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 3683{
b5d57fc8 3684 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3685 struct htb *htb;
3686
3687 htb = xmalloc(sizeof *htb);
3688 tc_init(&htb->tc, &tc_ops_htb);
3689 htb->max_rate = max_rate;
3690
b5d57fc8 3691 netdev->tc = &htb->tc;
c1c9c9c4
BP
3692}
3693
3694/* Create an HTB qdisc.
3695 *
a339aa81 3696 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
3697static int
3698htb_setup_qdisc__(struct netdev *netdev)
3699{
3700 size_t opt_offset;
3701 struct tc_htb_glob opt;
3702 struct ofpbuf request;
3703 struct tcmsg *tcmsg;
3704
3705 tc_del_qdisc(netdev);
3706
7874bdff
RD
3707 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3708 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
3709 if (!tcmsg) {
3710 return ENODEV;
3711 }
c1c9c9c4
BP
3712 tcmsg->tcm_handle = tc_make_handle(1, 0);
3713 tcmsg->tcm_parent = TC_H_ROOT;
3714
3715 nl_msg_put_string(&request, TCA_KIND, "htb");
3716
3717 memset(&opt, 0, sizeof opt);
4f631ccd 3718 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 3719 opt.version = 3;
4ecf12d5 3720 opt.defcls = 1;
c1c9c9c4
BP
3721
3722 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3723 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
3724 nl_msg_end_nested(&request, opt_offset);
3725
3726 return tc_transact(&request, NULL);
3727}
3728
3729/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3730 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3731static int
3732htb_setup_class__(struct netdev *netdev, unsigned int handle,
3733 unsigned int parent, struct htb_class *class)
3734{
3735 size_t opt_offset;
3736 struct tc_htb_opt opt;
3737 struct ofpbuf request;
3738 struct tcmsg *tcmsg;
3739 int error;
3740 int mtu;
3741
73371c09 3742 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3743 if (error) {
f915f1a8
BP
3744 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
3745 netdev_get_name(netdev));
9b020780 3746 return error;
f915f1a8 3747 }
c1c9c9c4
BP
3748
3749 memset(&opt, 0, sizeof opt);
3750 tc_fill_rate(&opt.rate, class->min_rate, mtu);
3751 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
3752 /* Makes sure the quantum is at least MTU. Setting quantum will
3753 * make htb ignore the r2q for this class. */
3754 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
3755 opt.quantum = mtu;
3756 }
c1c9c9c4
BP
3757 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
3758 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
3759 opt.prio = class->priority;
3760
7874bdff
RD
3761 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
3762 &request);
23a98ffe
BP
3763 if (!tcmsg) {
3764 return ENODEV;
3765 }
c1c9c9c4
BP
3766 tcmsg->tcm_handle = handle;
3767 tcmsg->tcm_parent = parent;
3768
3769 nl_msg_put_string(&request, TCA_KIND, "htb");
3770 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3771 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
3772 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
3773 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
3774 nl_msg_end_nested(&request, opt_offset);
3775
3776 error = tc_transact(&request, NULL);
3777 if (error) {
3778 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3779 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3780 netdev_get_name(netdev),
3781 tc_get_major(handle), tc_get_minor(handle),
3782 tc_get_major(parent), tc_get_minor(parent),
3783 class->min_rate, class->max_rate,
10a89ef0 3784 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
3785 }
3786 return error;
3787}
3788
3789/* Parses Netlink attributes in 'options' for HTB parameters and stores a
3790 * description of them into 'details'. The description complies with the
3791 * specification given in the vswitch database documentation for linux-htb
3792 * queue details. */
3793static int
3794htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
3795{
3796 static const struct nl_policy tca_htb_policy[] = {
3797 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
3798 .min_len = sizeof(struct tc_htb_opt) },
3799 };
3800
3801 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
3802 const struct tc_htb_opt *htb;
3803
3804 if (!nl_parse_nested(nl_options, tca_htb_policy,
3805 attrs, ARRAY_SIZE(tca_htb_policy))) {
3806 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
3807 return EPROTO;
3808 }
3809
3810 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
3811 class->min_rate = htb->rate.rate;
3812 class->max_rate = htb->ceil.rate;
3813 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
3814 class->priority = htb->prio;
3815 return 0;
3816}
3817
3818static int
3819htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
3820 struct htb_class *options,
3821 struct netdev_queue_stats *stats)
3822{
3823 struct nlattr *nl_options;
3824 unsigned int handle;
3825 int error;
3826
3827 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
3828 if (!error && queue_id) {
17ee3c1f
BP
3829 unsigned int major = tc_get_major(handle);
3830 unsigned int minor = tc_get_minor(handle);
3831 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
3832 *queue_id = minor - 1;
c1c9c9c4
BP
3833 } else {
3834 error = EPROTO;
3835 }
3836 }
3837 if (!error && options) {
3838 error = htb_parse_tca_options__(nl_options, options);
3839 }
3840 return error;
3841}
3842
3843static void
73371c09 3844htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 3845 const struct smap *details, struct htb_class *hc)
c1c9c9c4 3846{
73371c09 3847 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 3848
13c1637f 3849 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 3850 if (!hc->max_rate) {
a00ca915 3851 enum netdev_features current;
c1c9c9c4 3852
73371c09
BP
3853 netdev_linux_read_features(netdev);
3854 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 3855 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
3856 }
3857 hc->min_rate = hc->max_rate;
3858 hc->burst = 0;
3859 hc->priority = 0;
3860}
3861
3862static int
3863htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 3864 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
3865{
3866 const struct htb *htb = htb_get__(netdev);
9b020780 3867 int mtu, error;
214117fd 3868 unsigned long long int max_rate_bit;
c1c9c9c4 3869
73371c09 3870 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 3871 if (error) {
f915f1a8
BP
3872 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
3873 netdev_get_name(netdev));
9b020780 3874 return error;
f915f1a8
BP
3875 }
3876
4f104611
EJ
3877 /* HTB requires at least an mtu sized min-rate to send any traffic even
3878 * on uncongested links. */
13c1637f 3879 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 3880 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
3881 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
3882
3883 /* max-rate */
214117fd
KF
3884 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
3885 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
c1c9c9c4
BP
3886 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
3887 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
3888
3889 /* burst
3890 *
3891 * According to hints in the documentation that I've read, it is important
3892 * that 'burst' be at least as big as the largest frame that might be
3893 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3894 * but having it a bit too small is a problem. Since netdev_get_mtu()
3895 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3896 * the MTU. We actually add 64, instead of 14, as a guard against
3897 * additional headers get tacked on somewhere that we're not aware of. */
13c1637f 3898 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
3899 hc->burst = MAX(hc->burst, mtu + 64);
3900
3901 /* priority */
13c1637f 3902 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
3903
3904 return 0;
3905}
3906
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and
 * parses the reply with htb_parse_tcmsg__(), passing along 'options' and
 * 'stats' (either may be null there).
 *
 * Returns 0 if successful, otherwise the error from the query or from
 * parsing the reply. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;

    int error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
3922
3923static int
79f1cbe9 3924htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
3925{
3926 int error;
3927
3928 error = htb_setup_qdisc__(netdev);
3929 if (!error) {
3930 struct htb_class hc;
3931
3932 htb_parse_qdisc_details__(netdev, details, &hc);
3933 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3934 tc_make_handle(1, 0), &hc);
3935 if (!error) {
3936 htb_install__(netdev, hc.max_rate);
3937 }
3938 }
3939 return error;
3940}
3941
93b13be8
BP
3942static struct htb_class *
3943htb_class_cast__(const struct tc_queue *queue)
3944{
3945 return CONTAINER_OF(queue, struct htb_class, tc_queue);
3946}
3947
c1c9c9c4
BP
3948static void
3949htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
3950 const struct htb_class *hc)
3951{
3952 struct htb *htb = htb_get__(netdev);
93b13be8
BP
3953 size_t hash = hash_int(queue_id, 0);
3954 struct tc_queue *queue;
c1c9c9c4
BP
3955 struct htb_class *hcp;
3956
93b13be8
BP
3957 queue = tc_find_queue__(netdev, queue_id, hash);
3958 if (queue) {
3959 hcp = htb_class_cast__(queue);
3960 } else {
c1c9c9c4 3961 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
3962 queue = &hcp->tc_queue;
3963 queue->queue_id = queue_id;
6dc34a0d 3964 queue->created = time_msec();
93b13be8 3965 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 3966 }
93b13be8
BP
3967
3968 hcp->min_rate = hc->min_rate;
3969 hcp->max_rate = hc->max_rate;
3970 hcp->burst = hc->burst;
3971 hcp->priority = hc->priority;
c1c9c9c4
BP
3972}
3973
3974static int
3975htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3976{
c1c9c9c4 3977 struct ofpbuf msg;
d57695d7 3978 struct queue_dump_state state;
c1c9c9c4 3979 struct htb_class hc;
c1c9c9c4
BP
3980
3981 /* Get qdisc options. */
3982 hc.max_rate = 0;
3983 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 3984 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
3985
3986 /* Get queues. */
d57695d7 3987 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
3988 return ENODEV;
3989 }
d57695d7 3990 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
3991 unsigned int queue_id;
3992
3993 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3994 htb_update_queue__(netdev, queue_id, &hc);
3995 }
3996 }
d57695d7 3997 finish_queue_dump(&state);
c1c9c9c4
BP
3998
3999 return 0;
4000}
4001
4002static void
4003htb_tc_destroy(struct tc *tc)
4004{
4005 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 4006 struct htb_class *hc;
c1c9c9c4 4007
4ec3d7c7 4008 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
4009 free(hc);
4010 }
4011 tc_destroy(tc);
4012 free(htb);
4013}
4014
4015static int
79f1cbe9 4016htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
4017{
4018 const struct htb *htb = htb_get__(netdev);
79f1cbe9 4019 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
4020 return 0;
4021}
4022
4023static int
79f1cbe9 4024htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4025{
4026 struct htb_class hc;
4027 int error;
4028
4029 htb_parse_qdisc_details__(netdev, details, &hc);
4030 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4031 tc_make_handle(1, 0), &hc);
4032 if (!error) {
4033 htb_get__(netdev)->max_rate = hc.max_rate;
4034 }
4035 return error;
4036}
4037
4038static int
93b13be8 4039htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4040 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 4041{
93b13be8 4042 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4043
79f1cbe9 4044 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 4045 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4046 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 4047 }
79f1cbe9 4048 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 4049 if (hc->priority) {
79f1cbe9 4050 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
4051 }
4052 return 0;
4053}
4054
4055static int
4056htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4057 const struct smap *details)
c1c9c9c4
BP
4058{
4059 struct htb_class hc;
4060 int error;
4061
4062 error = htb_parse_class_details__(netdev, details, &hc);
4063 if (error) {
4064 return error;
4065 }
4066
17ee3c1f 4067 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
4068 tc_make_handle(1, 0xfffe), &hc);
4069 if (error) {
4070 return error;
4071 }
4072
4073 htb_update_queue__(netdev, queue_id, &hc);
4074 return 0;
4075}
4076
4077static int
93b13be8 4078htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 4079{
93b13be8 4080 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4081 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4082 int error;
4083
93b13be8 4084 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4085 if (!error) {
93b13be8 4086 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4087 free(hc);
c1c9c9c4
BP
4088 }
4089 return error;
4090}
4091
4092static int
93b13be8 4093htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4094 struct netdev_queue_stats *stats)
4095{
93b13be8 4096 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4097 tc_make_handle(1, 0xfffe), NULL, stats);
4098}
4099
4100static int
4101htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4102 const struct ofpbuf *nlmsg,
4103 netdev_dump_queue_stats_cb *cb, void *aux)
4104{
4105 struct netdev_queue_stats stats;
17ee3c1f 4106 unsigned int handle, major, minor;
c1c9c9c4
BP
4107 int error;
4108
4109 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4110 if (error) {
4111 return error;
4112 }
4113
17ee3c1f
BP
4114 major = tc_get_major(handle);
4115 minor = tc_get_minor(handle);
4116 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4117 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4118 }
4119 return 0;
4120}
4121
4122static const struct tc_ops tc_ops_htb = {
4123 "htb", /* linux_name */
4124 "linux-htb", /* ovs_name */
4125 HTB_N_QUEUES, /* n_queues */
4126 htb_tc_install,
4127 htb_tc_load,
4128 htb_tc_destroy,
4129 htb_qdisc_get,
4130 htb_qdisc_set,
4131 htb_class_get,
4132 htb_class_set,
4133 htb_class_delete,
4134 htb_class_get_stats,
4135 htb_class_dump_stats
4136};
4137\f
a339aa81
EJ
4138/* "linux-hfsc" traffic control class. */
4139
/* Number of queues advertised for "linux-hfsc".  Class minor numbers
 * 1...HFSC_N_QUEUES are treated as queue IDs 0...HFSC_N_QUEUES - 1. */
#define HFSC_N_QUEUES 0xf000
4141
4142struct hfsc {
4143 struct tc tc;
4144 uint32_t max_rate;
4145};
4146
4147struct hfsc_class {
4148 struct tc_queue tc_queue;
4149 uint32_t min_rate;
4150 uint32_t max_rate;
4151};
4152
4153static struct hfsc *
b5d57fc8 4154hfsc_get__(const struct netdev *netdev_)
a339aa81 4155{
b5d57fc8
BP
4156 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4157 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4158}
4159
4160static struct hfsc_class *
4161hfsc_class_cast__(const struct tc_queue *queue)
4162{
4163 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4164}
4165
24045e35 4166static void
b5d57fc8 4167hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4168{
b5d57fc8 4169 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4170 struct hfsc *hfsc;
4171
a339aa81
EJ
4172 hfsc = xmalloc(sizeof *hfsc);
4173 tc_init(&hfsc->tc, &tc_ops_hfsc);
4174 hfsc->max_rate = max_rate;
b5d57fc8 4175 netdev->tc = &hfsc->tc;
a339aa81
EJ
4176}
4177
4178static void
4179hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4180 const struct hfsc_class *hc)
4181{
4182 size_t hash;
4183 struct hfsc *hfsc;
4184 struct hfsc_class *hcp;
4185 struct tc_queue *queue;
4186
4187 hfsc = hfsc_get__(netdev);
4188 hash = hash_int(queue_id, 0);
4189
4190 queue = tc_find_queue__(netdev, queue_id, hash);
4191 if (queue) {
4192 hcp = hfsc_class_cast__(queue);
4193 } else {
4194 hcp = xmalloc(sizeof *hcp);
4195 queue = &hcp->tc_queue;
4196 queue->queue_id = queue_id;
6dc34a0d 4197 queue->created = time_msec();
a339aa81
EJ
4198 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4199 }
4200
4201 hcp->min_rate = hc->min_rate;
4202 hcp->max_rate = hc->max_rate;
4203}
4204
4205static int
4206hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4207{
4208 const struct tc_service_curve *rsc, *fsc, *usc;
4209 static const struct nl_policy tca_hfsc_policy[] = {
4210 [TCA_HFSC_RSC] = {
4211 .type = NL_A_UNSPEC,
4212 .optional = false,
4213 .min_len = sizeof(struct tc_service_curve),
4214 },
4215 [TCA_HFSC_FSC] = {
4216 .type = NL_A_UNSPEC,
4217 .optional = false,
4218 .min_len = sizeof(struct tc_service_curve),
4219 },
4220 [TCA_HFSC_USC] = {
4221 .type = NL_A_UNSPEC,
4222 .optional = false,
4223 .min_len = sizeof(struct tc_service_curve),
4224 },
4225 };
4226 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4227
4228 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4229 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4230 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4231 return EPROTO;
4232 }
4233
4234 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4235 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4236 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4237
4238 if (rsc->m1 != 0 || rsc->d != 0 ||
4239 fsc->m1 != 0 || fsc->d != 0 ||
4240 usc->m1 != 0 || usc->d != 0) {
4241 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4242 "Non-linear service curves are not supported.");
4243 return EPROTO;
4244 }
4245
4246 if (rsc->m2 != fsc->m2) {
4247 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4248 "Real-time service curves are not supported ");
4249 return EPROTO;
4250 }
4251
4252 if (rsc->m2 > usc->m2) {
4253 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
4254 "Min-rate service curve is greater than "
4255 "the max-rate service curve.");
4256 return EPROTO;
4257 }
4258
4259 class->min_rate = fsc->m2;
4260 class->max_rate = usc->m2;
4261 return 0;
4262}
4263
4264static int
4265hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4266 struct hfsc_class *options,
4267 struct netdev_queue_stats *stats)
4268{
4269 int error;
4270 unsigned int handle;
4271 struct nlattr *nl_options;
4272
4273 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4274 if (error) {
4275 return error;
4276 }
4277
4278 if (queue_id) {
4279 unsigned int major, minor;
4280
4281 major = tc_get_major(handle);
4282 minor = tc_get_minor(handle);
4283 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4284 *queue_id = minor - 1;
4285 } else {
4286 return EPROTO;
4287 }
4288 }
4289
4290 if (options) {
4291 error = hfsc_parse_tca_options__(nl_options, options);
4292 }
4293
4294 return error;
4295}
4296
/* Queries the kernel for class 'handle' under 'parent' on 'netdev' and
 * parses the reply with hfsc_parse_tcmsg__(), passing along 'options' and
 * 'stats' (either may be null there).
 *
 * Returns 0 if successful, otherwise the error from the query or from
 * parsing the reply. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;

    int error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
4314
4315static void
73371c09 4316hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
4317 struct hfsc_class *class)
4318{
73371c09 4319 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 4320
13c1637f 4321 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 4322 if (!max_rate) {
a00ca915 4323 enum netdev_features current;
a339aa81 4324
73371c09
BP
4325 netdev_linux_read_features(netdev);
4326 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4327 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
4328 }
4329
4330 class->min_rate = max_rate;
4331 class->max_rate = max_rate;
4332}
4333
4334static int
4335hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 4336 const struct smap *details,
a339aa81
EJ
4337 struct hfsc_class * class)
4338{
4339 const struct hfsc *hfsc;
4340 uint32_t min_rate, max_rate;
a339aa81
EJ
4341
4342 hfsc = hfsc_get__(netdev);
a339aa81 4343
13c1637f 4344 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 4345 min_rate = MAX(min_rate, 1);
a339aa81
EJ
4346 min_rate = MIN(min_rate, hfsc->max_rate);
4347
13c1637f 4348 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
4349 max_rate = MAX(max_rate, min_rate);
4350 max_rate = MIN(max_rate, hfsc->max_rate);
4351
4352 class->min_rate = min_rate;
4353 class->max_rate = max_rate;
4354
4355 return 0;
4356}
4357
4358/* Create an HFSC qdisc.
4359 *
4360 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4361static int
4362hfsc_setup_qdisc__(struct netdev * netdev)
4363{
4364 struct tcmsg *tcmsg;
4365 struct ofpbuf request;
4366 struct tc_hfsc_qopt opt;
4367
4368 tc_del_qdisc(netdev);
4369
7874bdff
RD
4370 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4371 NLM_F_EXCL | NLM_F_CREATE, &request);
a339aa81
EJ
4372
4373 if (!tcmsg) {
4374 return ENODEV;
4375 }
4376
4377 tcmsg->tcm_handle = tc_make_handle(1, 0);
4378 tcmsg->tcm_parent = TC_H_ROOT;
4379
4380 memset(&opt, 0, sizeof opt);
4381 opt.defcls = 1;
4382
4383 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4384 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4385
4386 return tc_transact(&request, NULL);
4387}
4388
4389/* Create an HFSC class.
4390 *
4391 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4392 * sc rate <min_rate> ul rate <max_rate>" */
4393static int
4394hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
4395 unsigned int parent, struct hfsc_class *class)
4396{
4397 int error;
4398 size_t opt_offset;
4399 struct tcmsg *tcmsg;
4400 struct ofpbuf request;
4401 struct tc_service_curve min, max;
4402
7874bdff
RD
4403 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4404 &request);
a339aa81
EJ
4405
4406 if (!tcmsg) {
4407 return ENODEV;
4408 }
4409
4410 tcmsg->tcm_handle = handle;
4411 tcmsg->tcm_parent = parent;
4412
4413 min.m1 = 0;
4414 min.d = 0;
4415 min.m2 = class->min_rate;
4416
4417 max.m1 = 0;
4418 max.d = 0;
4419 max.m2 = class->max_rate;
4420
4421 nl_msg_put_string(&request, TCA_KIND, "hfsc");
4422 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4423 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
4424 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
4425 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
4426 nl_msg_end_nested(&request, opt_offset);
4427
4428 error = tc_transact(&request, NULL);
4429 if (error) {
4430 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4431 "min-rate %ubps, max-rate %ubps (%s)",
4432 netdev_get_name(netdev),
4433 tc_get_major(handle), tc_get_minor(handle),
4434 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 4435 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
4436 }
4437
4438 return error;
4439}
4440
4441static int
79f1cbe9 4442hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4443{
4444 int error;
4445 struct hfsc_class class;
4446
4447 error = hfsc_setup_qdisc__(netdev);
4448
4449 if (error) {
4450 return error;
4451 }
4452
4453 hfsc_parse_qdisc_details__(netdev, details, &class);
4454 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4455 tc_make_handle(1, 0), &class);
4456
4457 if (error) {
4458 return error;
4459 }
4460
4461 hfsc_install__(netdev, class.max_rate);
4462 return 0;
4463}
4464
4465static int
4466hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4467{
4468 struct ofpbuf msg;
d57695d7 4469 struct queue_dump_state state;
a339aa81
EJ
4470 struct hfsc_class hc;
4471
4472 hc.max_rate = 0;
4473 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4474 hfsc_install__(netdev, hc.max_rate);
a339aa81 4475
d57695d7 4476 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
4477 return ENODEV;
4478 }
4479
d57695d7 4480 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
4481 unsigned int queue_id;
4482
4483 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4484 hfsc_update_queue__(netdev, queue_id, &hc);
4485 }
4486 }
4487
d57695d7 4488 finish_queue_dump(&state);
a339aa81
EJ
4489 return 0;
4490}
4491
4492static void
4493hfsc_tc_destroy(struct tc *tc)
4494{
4495 struct hfsc *hfsc;
4496 struct hfsc_class *hc, *next;
4497
4498 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
4499
4500 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
4501 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4502 free(hc);
4503 }
4504
4505 tc_destroy(tc);
4506 free(hfsc);
4507}
4508
4509static int
79f1cbe9 4510hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
4511{
4512 const struct hfsc *hfsc;
4513 hfsc = hfsc_get__(netdev);
79f1cbe9 4514 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
4515 return 0;
4516}
4517
4518static int
79f1cbe9 4519hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
4520{
4521 int error;
4522 struct hfsc_class class;
4523
4524 hfsc_parse_qdisc_details__(netdev, details, &class);
4525 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4526 tc_make_handle(1, 0), &class);
4527
4528 if (!error) {
4529 hfsc_get__(netdev)->max_rate = class.max_rate;
4530 }
4531
4532 return error;
4533}
4534
4535static int
4536hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4537 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
4538{
4539 const struct hfsc_class *hc;
4540
4541 hc = hfsc_class_cast__(queue);
79f1cbe9 4542 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 4543 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4544 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
4545 }
4546 return 0;
4547}
4548
4549static int
4550hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4551 const struct smap *details)
a339aa81
EJ
4552{
4553 int error;
4554 struct hfsc_class class;
4555
4556 error = hfsc_parse_class_details__(netdev, details, &class);
4557 if (error) {
4558 return error;
4559 }
4560
4561 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
4562 tc_make_handle(1, 0xfffe), &class);
4563 if (error) {
4564 return error;
4565 }
4566
4567 hfsc_update_queue__(netdev, queue_id, &class);
4568 return 0;
4569}
4570
4571static int
4572hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
4573{
4574 int error;
4575 struct hfsc *hfsc;
4576 struct hfsc_class *hc;
4577
4578 hc = hfsc_class_cast__(queue);
4579 hfsc = hfsc_get__(netdev);
4580
4581 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
4582 if (!error) {
4583 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
4584 free(hc);
4585 }
4586 return error;
4587}
4588
4589static int
4590hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
4591 struct netdev_queue_stats *stats)
4592{
4593 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
4594 tc_make_handle(1, 0xfffe), NULL, stats);
4595}
4596
4597static int
4598hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4599 const struct ofpbuf *nlmsg,
4600 netdev_dump_queue_stats_cb *cb, void *aux)
4601{
4602 struct netdev_queue_stats stats;
4603 unsigned int handle, major, minor;
4604 int error;
4605
4606 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4607 if (error) {
4608 return error;
4609 }
4610
4611 major = tc_get_major(handle);
4612 minor = tc_get_minor(handle);
4613 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
4614 (*cb)(minor - 1, &stats, aux);
4615 }
4616 return 0;
4617}
4618
4619static const struct tc_ops tc_ops_hfsc = {
4620 "hfsc", /* linux_name */
4621 "linux-hfsc", /* ovs_name */
4622 HFSC_N_QUEUES, /* n_queues */
4623 hfsc_tc_install, /* tc_install */
4624 hfsc_tc_load, /* tc_load */
4625 hfsc_tc_destroy, /* tc_destroy */
4626 hfsc_qdisc_get, /* qdisc_get */
4627 hfsc_qdisc_set, /* qdisc_set */
4628 hfsc_class_get, /* class_get */
4629 hfsc_class_set, /* class_set */
4630 hfsc_class_delete, /* class_delete */
4631 hfsc_class_get_stats, /* class_get_stats */
4632 hfsc_class_dump_stats /* class_dump_stats */
4633};
4634\f
6cf888b8
BS
4635/* "linux-noop" traffic control class. */
4636
4637static void
4638noop_install__(struct netdev *netdev_)
4639{
4640 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4641 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
4642
4643 netdev->tc = CONST_CAST(struct tc *, &tc);
4644}
4645
4646static int
4647noop_tc_install(struct netdev *netdev,
4648 const struct smap *details OVS_UNUSED)
4649{
4650 noop_install__(netdev);
4651 return 0;
4652}
4653
4654static int
4655noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4656{
4657 noop_install__(netdev);
4658 return 0;
4659}
4660
4661static const struct tc_ops tc_ops_noop = {
4662 NULL, /* linux_name */
4663 "linux-noop", /* ovs_name */
4664 0, /* n_queues */
4665 noop_tc_install,
4666 noop_tc_load,
4667 NULL, /* tc_destroy */
4668 NULL, /* qdisc_get */
4669 NULL, /* qdisc_set */
4670 NULL, /* class_get */
4671 NULL, /* class_set */
4672 NULL, /* class_delete */
4673 NULL, /* class_get_stats */
4674 NULL /* class_dump_stats */
4675};
4676\f
c1c9c9c4
BP
4677/* "linux-default" traffic control class.
4678 *
4679 * This class represents the default, unnamed Linux qdisc. It corresponds to
4680 * the "" (empty string) QoS type in the OVS database. */
4681
4682static void
b5d57fc8 4683default_install__(struct netdev *netdev_)
c1c9c9c4 4684{
b5d57fc8 4685 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4686 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 4687
559eb230
BP
4688 /* Nothing but a tc class implementation is allowed to write to a tc. This
4689 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4690 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4691}
4692
4693static int
4694default_tc_install(struct netdev *netdev,
79f1cbe9 4695 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
4696{
4697 default_install__(netdev);
4698 return 0;
4699}
4700
4701static int
4702default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4703{
4704 default_install__(netdev);
4705 return 0;
4706}
4707
4708static const struct tc_ops tc_ops_default = {
4709 NULL, /* linux_name */
4710 "", /* ovs_name */
4711 0, /* n_queues */
4712 default_tc_install,
4713 default_tc_load,
4714 NULL, /* tc_destroy */
4715 NULL, /* qdisc_get */
4716 NULL, /* qdisc_set */
4717 NULL, /* class_get */
4718 NULL, /* class_set */
4719 NULL, /* class_delete */
4720 NULL, /* class_get_stats */
4721 NULL /* class_dump_stats */
4722};
4723\f
4724/* "linux-other" traffic control class.
4725 *
4726 * */
4727
4728static int
b5d57fc8 4729other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 4730{
b5d57fc8 4731 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 4732 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 4733
559eb230
BP
4734 /* Nothing but a tc class implementation is allowed to write to a tc. This
4735 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 4736 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
4737 return 0;
4738}
4739
4740static const struct tc_ops tc_ops_other = {
4741 NULL, /* linux_name */
4742 "linux-other", /* ovs_name */
4743 0, /* n_queues */
4744 NULL, /* tc_install */
4745 other_tc_load,
4746 NULL, /* tc_destroy */
4747 NULL, /* qdisc_get */
4748 NULL, /* qdisc_set */
4749 NULL, /* class_get */
4750 NULL, /* class_set */
4751 NULL, /* class_delete */
4752 NULL, /* class_get_stats */
4753 NULL /* class_dump_stats */
4754};
4755\f
4756/* Traffic control. */
4757
/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * The value falls into one of two cases:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in
 *      the approximate range of 100 to 1024.  Then the qdisc really must be
 *      able to buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  Then the kernel has finely
 *      granular timers and there is no need to fudge additional room for
 *      buffers.  No extra effort is needed to implement that: the large
 *      'buffer_hz' is used as a divisor, so practically any number will come
 *      out as 0 in the division.  Small integer results in the case of really
 *      high dividends won't have any real effect anyhow.
 */
static unsigned int buffer_hz;
4779
7874bdff
RD
/* Looks up the ifindex of 'netdev' and passes it, with 'type', 'flags', and
 * 'request', to tc_make_request().
 *
 * Returns what tc_make_request() returns, or NULL if the ifindex lookup
 * fails, in which case 'request' is not touched. */
static struct tcmsg *
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
                             unsigned int flags, struct ofpbuf *request)
{
    int ifindex;

    if (get_ifindex(netdev, &ifindex)) {
        return NULL;
    }
    return tc_make_request(ifindex, type, flags, request);
}
4794
f8500004
JP
4795/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4796 * of 'kbits_burst'.
4797 *
4798 * This function is equivalent to running:
4799 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4800 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4801 * mtu 65535 drop
4802 *
4803 * The configuration and stats may be seen with the following command:
c7952afb 4804 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
4805 *
4806 * Returns 0 if successful, otherwise a positive errno value.
4807 */
4808static int
c7952afb
BP
4809tc_add_policer(struct netdev *netdev,
4810 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
4811{
4812 struct tc_police tc_police;
4813 struct ofpbuf request;
4814 struct tcmsg *tcmsg;
4815 size_t basic_offset;
4816 size_t police_offset;
4817 int error;
4818 int mtu = 65535;
4819
4820 memset(&tc_police, 0, sizeof tc_police);
4821 tc_police.action = TC_POLICE_SHOT;
4822 tc_police.mtu = mtu;
1aca400c 4823 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 4824
79abacc8
MAA
4825 /* The following appears wrong in one way: In networking a kilobit is
4826 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
4827 *
4828 * However if you "fix" those problems then "tc filter show ..." shows
4829 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4830 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4831 * tc's point of view. Whatever. */
4832 tc_police.burst = tc_bytes_to_ticks(
79abacc8 4833 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 4834
7874bdff
RD
4835 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
4836 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
4837 if (!tcmsg) {
4838 return ENODEV;
4839 }
4840 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
4841 tcmsg->tcm_info = tc_make_handle(49,
4842 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
4843
4844 nl_msg_put_string(&request, TCA_KIND, "basic");
4845 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4846 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
4847 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
4848 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
4849 nl_msg_end_nested(&request, police_offset);
4850 nl_msg_end_nested(&request, basic_offset);
4851
4852 error = tc_transact(&request, NULL);
4853 if (error) {
4854 return error;
4855 }
4856
4857 return 0;
4858}
4859
c1c9c9c4
BP
4860static void
4861read_psched(void)
4862{
4863 /* The values in psched are not individually very meaningful, but they are
4864 * important. The tables below show some values seen in the wild.
4865 *
4866 * Some notes:
4867 *
4868 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4869 * (Before that, there are hints that it was 1000000000.)
4870 *
4871 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4872 * above.
4873 *
4874 * /proc/net/psched
4875 * -----------------------------------
4876 * [1] 000c8000 000f4240 000f4240 00000064
4877 * [2] 000003e8 00000400 000f4240 3b9aca00
4878 * [3] 000003e8 00000400 000f4240 3b9aca00
4879 * [4] 000003e8 00000400 000f4240 00000064
4880 * [5] 000003e8 00000040 000f4240 3b9aca00
4881 * [6] 000003e8 00000040 000f4240 000000f9
4882 *
4883 * a b c d ticks_per_s buffer_hz
4884 * ------- --------- ---------- ------------- ----------- -------------
4885 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4886 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4887 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4888 * [4] 1,000 1,024 1,000,000 100 976,562 100
4889 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4890 * [6] 1,000 64 1,000,000 249 15,625,000 249
4891 *
4892 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4893 * [2] 2.6.26-1-686-bigmem from Debian lenny
4894 * [3] 2.6.26-2-sparc64 from Debian lenny
4895 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4896 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4897 * [6] 2.6.34 from kernel.org on KVM
4898 */
23882115 4899 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
4900 static const char fn[] = "/proc/net/psched";
4901 unsigned int a, b, c, d;
4902 FILE *stream;
4903
23882115
BP
4904 if (!ovsthread_once_start(&once)) {
4905 return;
4906 }
4907
c1c9c9c4
BP
4908 ticks_per_s = 1.0;
4909 buffer_hz = 100;
4910
4911 stream = fopen(fn, "r");
4912 if (!stream) {
10a89ef0 4913 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 4914 goto exit;
c1c9c9c4
BP
4915 }
4916
4917 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
4918 VLOG_WARN("%s: read failed", fn);
4919 fclose(stream);
23882115 4920 goto exit;
c1c9c9c4
BP
4921 }
4922 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
4923 fclose(stream);
4924
4925 if (!a || !c) {
4926 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 4927 goto exit;
c1c9c9c4
BP
4928 }
4929
4930 ticks_per_s = (double) a * c / b;
4931 if (c == 1000000) {
4932 buffer_hz = d;
4933 } else {
4934 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4935 fn, a, b, c, d);
4936 }
4937 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
4938
4939exit:
4940 ovsthread_once_done(&once);
c1c9c9c4
BP
4941}
4942
4943/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4944 * rate of 'rate' bytes per second. */
4945static unsigned int
4946tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
4947{
23882115 4948 read_psched();
c1c9c9c4
BP
4949 return (rate * ticks) / ticks_per_s;
4950}
4951
4952/* Returns the number of ticks that it would take to transmit 'size' bytes at a
4953 * rate of 'rate' bytes per second. */
4954static unsigned int
4955tc_bytes_to_ticks(unsigned int rate, unsigned int size)
4956{
23882115 4957 read_psched();
015c93a4 4958 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
4959}
4960
4961/* Returns the number of bytes that need to be reserved for qdisc buffering at
4962 * a transmission rate of 'rate' bytes per second. */
4963static unsigned int
4964tc_buffer_per_jiffy(unsigned int rate)
4965{
23882115 4966 read_psched();
c1c9c9c4
BP
4967 return rate / buffer_hz;
4968}
4969
4970/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4971 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4972 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4973 * stores NULL into it if it is absent.
4974 *
4975 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4976 * 'msg'.
4977 *
4978 * Returns 0 if successful, otherwise a positive errno value. */
4979static int
4980tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
4981 struct nlattr **options)
4982{
4983 static const struct nl_policy tca_policy[] = {
4984 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
4985 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
4986 };
4987 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
4988
4989 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
4990 tca_policy, ta, ARRAY_SIZE(ta))) {
4991 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
4992 goto error;
4993 }
4994
4995 if (kind) {
4996 *kind = nl_attr_get_string(ta[TCA_KIND]);
4997 }
4998
4999 if (options) {
5000 *options = ta[TCA_OPTIONS];
5001 }
5002
5003 return 0;
5004
5005error:
5006 if (kind) {
5007 *kind = NULL;
5008 }
5009 if (options) {
5010 *options = NULL;
5011 }
5012 return EPROTO;
5013}
5014
5015/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5016 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5017 * into '*options', and its queue statistics into '*stats'. Any of the output
5018 * arguments may be null.
5019 *
5020 * Returns 0 if successful, otherwise a positive errno value. */
5021static int
5022tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5023 struct nlattr **options, struct netdev_queue_stats *stats)
5024{
5025 static const struct nl_policy tca_policy[] = {
5026 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5027 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5028 };
5029 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5030
5031 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5032 tca_policy, ta, ARRAY_SIZE(ta))) {
5033 VLOG_WARN_RL(&rl, "failed to parse class message");
5034 goto error;
5035 }
5036
5037 if (handlep) {
5038 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5039 *handlep = tc->tcm_handle;
5040 }
5041
5042 if (options) {
5043 *options = ta[TCA_OPTIONS];
5044 }
5045
5046 if (stats) {
5047 const struct gnet_stats_queue *gsq;
5048 struct gnet_stats_basic gsb;
5049
5050 static const struct nl_policy stats_policy[] = {
5051 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5052 .min_len = sizeof gsb },
5053 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5054 .min_len = sizeof *gsq },
5055 };
5056 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5057
5058 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5059 sa, ARRAY_SIZE(sa))) {
5060 VLOG_WARN_RL(&rl, "failed to parse class stats");
5061 goto error;
5062 }
5063
5064 /* Alignment issues screw up the length of struct gnet_stats_basic on
5065 * some arch/bitsize combinations. Newer versions of Linux have a
5066 * struct gnet_stats_basic_packed, but we can't depend on that. The
5067 * easiest thing to do is just to make a copy. */
5068 memset(&gsb, 0, sizeof gsb);
5069 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5070 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5071 stats->tx_bytes = gsb.bytes;
5072 stats->tx_packets = gsb.packets;
5073
5074 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5075 stats->tx_errors = gsq->drops;
5076 }
5077
5078 return 0;
5079
5080error:
5081 if (options) {
5082 *options = NULL;
5083 }
5084 if (stats) {
5085 memset(stats, 0, sizeof *stats);
5086 }
5087 return EPROTO;
5088}
5089
5090/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5091 * on 'netdev'. */
5092static int
5093tc_query_class(const struct netdev *netdev,
5094 unsigned int handle, unsigned int parent,
5095 struct ofpbuf **replyp)
5096{
5097 struct ofpbuf request;
5098 struct tcmsg *tcmsg;
5099 int error;
5100
7874bdff
RD
5101 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5102 &request);
23a98ffe
BP
5103 if (!tcmsg) {
5104 return ENODEV;
5105 }
c1c9c9c4
BP
5106 tcmsg->tcm_handle = handle;
5107 tcmsg->tcm_parent = parent;
5108
5109 error = tc_transact(&request, replyp);
5110 if (error) {
5111 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5112 netdev_get_name(netdev),
5113 tc_get_major(handle), tc_get_minor(handle),
5114 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5115 ovs_strerror(error));
c1c9c9c4
BP
5116 }
5117 return error;
5118}
5119
5120/* Equivalent to "tc class del dev <name> handle <handle>". */
5121static int
5122tc_delete_class(const struct netdev *netdev, unsigned int handle)
5123{
5124 struct ofpbuf request;
5125 struct tcmsg *tcmsg;
5126 int error;
5127
7874bdff 5128 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5129 if (!tcmsg) {
5130 return ENODEV;
5131 }
c1c9c9c4
BP
5132 tcmsg->tcm_handle = handle;
5133 tcmsg->tcm_parent = 0;
5134
5135 error = tc_transact(&request, NULL);
5136 if (error) {
5137 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5138 netdev_get_name(netdev),
5139 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5140 ovs_strerror(error));
c1c9c9c4
BP
5141 }
5142 return error;
5143}
5144
5145/* Equivalent to "tc qdisc del dev <name> root". */
5146static int
b5d57fc8 5147tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5148{
b5d57fc8 5149 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5150 struct ofpbuf request;
5151 struct tcmsg *tcmsg;
5152 int error;
5153
7874bdff 5154 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5155 if (!tcmsg) {
5156 return ENODEV;
5157 }
c1c9c9c4
BP
5158 tcmsg->tcm_handle = tc_make_handle(1, 0);
5159 tcmsg->tcm_parent = TC_H_ROOT;
5160
5161 error = tc_transact(&request, NULL);
5162 if (error == EINVAL) {
5163 /* EINVAL probably means that the default qdisc was in use, in which
5164 * case we've accomplished our purpose. */
5165 error = 0;
5166 }
b5d57fc8
BP
5167 if (!error && netdev->tc) {
5168 if (netdev->tc->ops->tc_destroy) {
5169 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5170 }
b5d57fc8 5171 netdev->tc = NULL;
c1c9c9c4
BP
5172 }
5173 return error;
5174}
5175
ac3e3aaa
BP
5176static bool
5177getqdisc_is_safe(void)
5178{
5179 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5180 static bool safe = false;
5181
5182 if (ovsthread_once_start(&once)) {
5183 struct utsname utsname;
5184 int major, minor;
5185
5186 if (uname(&utsname) == -1) {
5187 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5188 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5189 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5190 } else if (major < 2 || (major == 2 && minor < 35)) {
5191 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5192 utsname.release);
5193 } else {
5194 safe = true;
5195 }
5196 ovsthread_once_done(&once);
5197 }
5198 return safe;
5199}
5200
c1c9c9c4
BP
5201/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5202 * kernel to determine what they are. Returns 0 if successful, otherwise a
5203 * positive errno value. */
5204static int
b5d57fc8 5205tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5206{
b5d57fc8 5207 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5208 struct ofpbuf request, *qdisc;
5209 const struct tc_ops *ops;
5210 struct tcmsg *tcmsg;
5211 int load_error;
5212 int error;
5213
b5d57fc8 5214 if (netdev->tc) {
c1c9c9c4
BP
5215 return 0;
5216 }
5217
5218 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5219 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5220 * 2.6.35 without that fix backported to it.
5221 *
5222 * To avoid the OOPS, we must not make a request that would attempt to dump
5223 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5224 * few others. There are a few ways that I can see to do this, but most of
5225 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5226 * technique chosen here is to assume that any non-default qdisc that we
5227 * create will have a class with handle 1:0. The built-in qdiscs only have
5228 * a class with handle 0:0.
5229 *
ac3e3aaa
BP
5230 * On Linux 2.6.35+ we use the straightforward method because it allows us
5231 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5232 * in such a case we get no response at all from the kernel (!) if a
5233 * builtin qdisc is in use (which is later caught by "!error &&
5234 * !qdisc->size"). */
7874bdff
RD
5235 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5236 &request);
23a98ffe
BP
5237 if (!tcmsg) {
5238 return ENODEV;
5239 }
ac3e3aaa
BP
5240 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5241 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5242
5243 /* Figure out what tc class to instantiate. */
5244 error = tc_transact(&request, &qdisc);
ac3e3aaa 5245 if (!error && qdisc->size) {
c1c9c9c4
BP
5246 const char *kind;
5247
5248 error = tc_parse_qdisc(qdisc, &kind, NULL);
5249 if (error) {
5250 ops = &tc_ops_other;
5251 } else {
5252 ops = tc_lookup_linux_name(kind);
5253 if (!ops) {
5254 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5255 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5256
5257 ops = &tc_ops_other;
5258 }
5259 }
ac3e3aaa
BP
5260 } else if ((!error && !qdisc->size) || error == ENOENT) {
5261 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5262 * set up by some other entity that doesn't have a handle 1:0. We will
5263 * assume that it's the system default qdisc. */
c1c9c9c4
BP
5264 ops = &tc_ops_default;
5265 error = 0;
5266 } else {
5267 /* Who knows? Maybe the device got deleted. */
5268 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 5269 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
5270 ops = &tc_ops_other;
5271 }
5272
5273 /* Instantiate it. */
b5d57fc8
BP
5274 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
5275 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
5276 ofpbuf_delete(qdisc);
5277
5278 return error ? error : load_error;
5279}
5280
5281/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5282 approximate the time to transmit packets of various lengths. For an MTU of
5283 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5284 represents two possible packet lengths; for a MTU of 513 through 1024, four
5285 possible lengths; and so on.
5286
5287 Returns, for the specified 'mtu', the number of bits that packet lengths
5288 need to be shifted right to fit within such a 256-entry table. */
5289static int
5290tc_calc_cell_log(unsigned int mtu)
5291{
5292 int cell_log;
5293
5294 if (!mtu) {
5295 mtu = ETH_PAYLOAD_MAX;
5296 }
5297 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
5298
5299 for (cell_log = 0; mtu >= 256; cell_log++) {
5300 mtu >>= 1;
5301 }
5302
5303 return cell_log;
5304}
5305
5306/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5307 * of 'mtu'. */
5308static void
5309tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
5310{
5311 memset(rate, 0, sizeof *rate);
5312 rate->cell_log = tc_calc_cell_log(mtu);
5313 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5314 /* rate->cell_align = 0; */ /* distro headers. */
5315 rate->mpu = ETH_TOTAL_MIN;
5316 rate->rate = Bps;
5317}
5318
5319/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5320 * attribute of the specified "type".
5321 *
5322 * See tc_calc_cell_log() above for a description of "rtab"s. */
5323static void
5324tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
5325{
5326 uint32_t *rtab;
5327 unsigned int i;
5328
5329 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
5330 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
5331 unsigned packet_size = (i + 1) << rate->cell_log;
5332 if (packet_size < rate->mpu) {
5333 packet_size = rate->mpu;
5334 }
5335 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
5336 }
5337}
5338
/* Computes the value to use for 'buffer' or 'cbuffer' in HTB options, given
 * a rate of 'Bps' bytes per second, the 'mtu', and the burst size
 * 'burst_bytes' requested by the user (pass 0 when none was requested).
 *
 * The burst is never smaller than one jiffy's worth of data plus 'mtu'. */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes;

    if (burst < min_burst) {
        burst = min_burst;
    }
    return tc_bytes_to_ticks(Bps, burst);
}
d3980822 5349\f
aaf2fb1a
BP
5350/* Linux-only functions declared in netdev-linux.h */
5351
5352/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5353 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5354int
5355netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
5356 const char *flag_name, bool enable)
5357{
5358 const char *netdev_name = netdev_get_name(netdev);
5359 struct ethtool_value evalue;
5360 uint32_t new_flags;
5361 int error;
5362
ab985a77 5363 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5364 memset(&evalue, 0, sizeof evalue);
5365 error = netdev_linux_do_ethtool(netdev_name,
5366 (struct ethtool_cmd *)&evalue,
5367 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5368 if (error) {
5369 return error;
5370 }
5371
ab985a77 5372 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
5373 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
5374 if (new_flags == evalue.data) {
5375 return 0;
5376 }
5377 evalue.data = new_flags;
aaf2fb1a
BP
5378 error = netdev_linux_do_ethtool(netdev_name,
5379 (struct ethtool_cmd *)&evalue,
5380 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
5381 if (error) {
5382 return error;
5383 }
5384
ab985a77 5385 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
5386 memset(&evalue, 0, sizeof evalue);
5387 error = netdev_linux_do_ethtool(netdev_name,
5388 (struct ethtool_cmd *)&evalue,
5389 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
5390 if (error) {
5391 return error;
5392 }
5393
5394 if (new_flags != evalue.data) {
5395 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
5396 "device %s failed", enable ? "enable" : "disable",
5397 flag_name, netdev_name);
5398 return EOPNOTSUPP;
5399 }
5400
5401 return 0;
5402}
5403\f
5404/* Utility functions. */
5405
d3980822 5406/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 5407static void
d3980822
BP
5408netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
5409 const struct rtnl_link_stats *src)
5410{
f613a0d7
PS
5411 dst->rx_packets = src->rx_packets;
5412 dst->tx_packets = src->tx_packets;
5413 dst->rx_bytes = src->rx_bytes;
5414 dst->tx_bytes = src->tx_bytes;
5415 dst->rx_errors = src->rx_errors;
5416 dst->tx_errors = src->tx_errors;
5417 dst->rx_dropped = src->rx_dropped;
5418 dst->tx_dropped = src->tx_dropped;
5419 dst->multicast = src->multicast;
5420 dst->collisions = src->collisions;
5421 dst->rx_length_errors = src->rx_length_errors;
5422 dst->rx_over_errors = src->rx_over_errors;
5423 dst->rx_crc_errors = src->rx_crc_errors;
5424 dst->rx_frame_errors = src->rx_frame_errors;
5425 dst->rx_fifo_errors = src->rx_fifo_errors;
5426 dst->rx_missed_errors = src->rx_missed_errors;
5427 dst->tx_aborted_errors = src->tx_aborted_errors;
5428 dst->tx_carrier_errors = src->tx_carrier_errors;
5429 dst->tx_fifo_errors = src->tx_fifo_errors;
5430 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5431 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
5432}
5433
337c9b99
BP
5434/* Copies 'src' into 'dst', performing format conversion in the process. */
5435static void
5436netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
5437 const struct rtnl_link_stats64 *src)
5438{
5439 dst->rx_packets = src->rx_packets;
5440 dst->tx_packets = src->tx_packets;
5441 dst->rx_bytes = src->rx_bytes;
5442 dst->tx_bytes = src->tx_bytes;
5443 dst->rx_errors = src->rx_errors;
5444 dst->tx_errors = src->tx_errors;
5445 dst->rx_dropped = src->rx_dropped;
5446 dst->tx_dropped = src->tx_dropped;
5447 dst->multicast = src->multicast;
5448 dst->collisions = src->collisions;
5449 dst->rx_length_errors = src->rx_length_errors;
5450 dst->rx_over_errors = src->rx_over_errors;
5451 dst->rx_crc_errors = src->rx_crc_errors;
5452 dst->rx_frame_errors = src->rx_frame_errors;
5453 dst->rx_fifo_errors = src->rx_fifo_errors;
5454 dst->rx_missed_errors = src->rx_missed_errors;
5455 dst->tx_aborted_errors = src->tx_aborted_errors;
5456 dst->tx_carrier_errors = src->tx_carrier_errors;
5457 dst->tx_fifo_errors = src->tx_fifo_errors;
5458 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
5459 dst->tx_window_errors = src->tx_window_errors;
5460}
5461
c1c9c9c4 5462static int
35eef899 5463get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 5464{
c1c9c9c4
BP
5465 struct ofpbuf request;
5466 struct ofpbuf *reply;
c1c9c9c4
BP
5467 int error;
5468
d6e3feb5 5469 /* Filtering all counters by default */
5470 memset(stats, 0xFF, sizeof(struct netdev_stats));
5471
c1c9c9c4 5472 ofpbuf_init(&request, 0);
13a24df8
BP
5473 nl_msg_put_nlmsghdr(&request,
5474 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5475 RTM_GETLINK, NLM_F_REQUEST);
5476 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5477 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 5478 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
5479 ofpbuf_uninit(&request);
5480 if (error) {
5481 return error;
5482 }
5483
13a24df8 5484 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
5485 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
5486 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
5487 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
5488 error = 0;
5489 } else {
71f21279 5490 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
5491 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
5492 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
5493 error = 0;
5494 } else {
5495 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
5496 error = EPROTO;
5497 }
13a24df8
BP
5498 }
5499 } else {
5500 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
5501 error = EPROTO;
c1c9c9c4 5502 }
8b61709d 5503
8b61709d 5504
576e26d7 5505 ofpbuf_delete(reply);
35eef899 5506 return error;
8b61709d 5507}
c1c9c9c4 5508
3a183124 5509static int
b5d57fc8 5510get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
5511{
5512 struct ifreq ifr;
5513 int error;
5514
755be9ea 5515 *flags = 0;
259e0b1a 5516 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
5517 if (!error) {
5518 *flags = ifr.ifr_flags;
5519 }
8b61709d
BP
5520 return error;
5521}
5522
5523static int
4b609110 5524set_flags(const char *name, unsigned int flags)
8b61709d
BP
5525{
5526 struct ifreq ifr;
5527
5528 ifr.ifr_flags = flags;
259e0b1a 5529 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
5530}
5531
01b25786
PB
5532int
5533linux_get_ifindex(const char *netdev_name)
8b61709d
BP
5534{
5535 struct ifreq ifr;
259e0b1a 5536 int error;
8b61709d 5537
71d7c22f 5538 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5539 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
5540
5541 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
5542 if (error) {
580e1152
RD
5543 /* ENODEV probably means that a vif disappeared asynchronously and
5544 * hasn't been removed from the database yet, so reduce the log level
5545 * to INFO for that case. */
5546 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
5547 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5548 netdev_name, ovs_strerror(error));
259e0b1a 5549 return -error;
8b61709d
BP
5550 }
5551 return ifr.ifr_ifindex;
5552}
5553
5554static int
5555get_ifindex(const struct netdev *netdev_, int *ifindexp)
5556{
b5d57fc8 5557 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 5558
b5d57fc8 5559 if (!(netdev->cache_valid & VALID_IFINDEX)) {
756819dd
FL
5560 netdev_linux_update_via_netlink(netdev);
5561 }
5562
5563 if (!(netdev->cache_valid & VALID_IFINDEX)) {
5564 /* Fall back to ioctl if netlink fails */
01b25786 5565 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 5566
8b61709d 5567 if (ifindex < 0) {
b5d57fc8
BP
5568 netdev->get_ifindex_error = -ifindex;
5569 netdev->ifindex = 0;
c7b1b0a5 5570 } else {
b5d57fc8
BP
5571 netdev->get_ifindex_error = 0;
5572 netdev->ifindex = ifindex;
8b61709d 5573 }
b5d57fc8 5574 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 5575 }
c7b1b0a5 5576
b5d57fc8
BP
5577 *ifindexp = netdev->ifindex;
5578 return netdev->get_ifindex_error;
8b61709d
BP
5579}
5580
5581static int
756819dd
FL
5582netdev_linux_update_via_netlink(struct netdev_linux *netdev)
5583{
5584 struct ofpbuf request;
5585 struct ofpbuf *reply;
5586 struct rtnetlink_change chg;
5587 struct rtnetlink_change *change = &chg;
5588 int error;
5589
5590 ofpbuf_init(&request, 0);
5591 nl_msg_put_nlmsghdr(&request,
5592 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
5593 RTM_GETLINK, NLM_F_REQUEST);
5594 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
5595
5596 /* The correct identifiers for a Linux device are netnsid and ifindex,
5597 * but ifindex changes as the port is moved to another network namespace
5598 * and the interface name statically stored in ovsdb. */
5599 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
5600 if (netdev_linux_netnsid_is_remote(netdev)) {
5601 nl_msg_push_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
5602 }
5603 error = nl_transact(NETLINK_ROUTE, &request, &reply);
5604 ofpbuf_uninit(&request);
5605 if (error) {
5606 ofpbuf_delete(reply);
5607 return error;
5608 }
5609
5610 if (rtnetlink_parse(reply, change)
5611 && change->nlmsg_type == RTM_NEWLINK) {
5612 bool changed = false;
5613 error = 0;
5614
5615 /* Update netdev from rtnl msg and increment its seq if needed. */
5616 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
5617 netdev->carrier_resets++;
5618 changed = true;
5619 }
5620 if (change->ifi_flags != netdev->ifi_flags) {
5621 netdev->ifi_flags = change->ifi_flags;
5622 changed = true;
5623 }
5624 if (change->mtu && change->mtu != netdev->mtu) {
5625 netdev->mtu = change->mtu;
5626 netdev->cache_valid |= VALID_MTU;
5627 netdev->netdev_mtu_error = 0;
5628 changed = true;
5629 }
5630 if (!eth_addr_is_zero(change->mac)
5631 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
5632 netdev->etheraddr = change->mac;
5633 netdev->cache_valid |= VALID_ETHERADDR;
5634 netdev->ether_addr_error = 0;
5635 changed = true;
5636 }
5637 if (change->if_index != netdev->ifindex) {
5638 netdev->ifindex = change->if_index;
5639 netdev->cache_valid |= VALID_IFINDEX;
5640 netdev->get_ifindex_error = 0;
5641 changed = true;
5642 }
5643 if (changed) {
5644 netdev_change_seq_changed(&netdev->up);
5645 }
5646 } else {
5647 error = EINVAL;
5648 }
5649
5650 ofpbuf_delete(reply);
5651 return error;
5652}
5653
5654static int
74ff3298 5655get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
5656{
5657 struct ifreq ifr;
5658 int hwaddr_family;
259e0b1a 5659 int error;
8b61709d
BP
5660
5661 memset(&ifr, 0, sizeof ifr);
71d7c22f 5662 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 5663 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
5664 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
5665 if (error) {
78857dfb
BP
5666 /* ENODEV probably means that a vif disappeared asynchronously and
5667 * hasn't been removed from the database yet, so reduce the log level
5668 * to INFO for that case. */
259e0b1a 5669 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 5670 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
5671 netdev_name, ovs_strerror(error));
5672 return error;
8b61709d
BP
5673 }
5674 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
5675 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
5676 hwaddr_family != ARPHRD_NONE) {
c9697f35 5677 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 5678 netdev_name, hwaddr_family);
c9697f35 5679 return EINVAL;
8b61709d
BP
5680 }
5681 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
5682 return 0;
5683}
5684
5685static int
74ff3298 5686set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
5687{
5688 struct ifreq ifr;
259e0b1a 5689 int error;
8b61709d
BP
5690
5691 memset(&ifr, 0, sizeof ifr);
71d7c22f 5692 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 5693 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 5694 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 5695 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
5696 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
5697 if (error) {
8b61709d 5698 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 5699 netdev_name, ovs_strerror(error));
8b61709d 5700 }
259e0b1a 5701 return error;
8b61709d
BP
5702}
5703
5704static int
0b0544d7 5705netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
5706 int cmd, const char *cmd_name)
5707{
5708 struct ifreq ifr;
259e0b1a 5709 int error;
8b61709d
BP
5710
5711 memset(&ifr, 0, sizeof ifr);
71d7c22f 5712 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
5713 ifr.ifr_data = (caddr_t) ecmd;
5714
5715 ecmd->cmd = cmd;
259e0b1a
BP
5716 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
5717 if (error) {
5718 if (error != EOPNOTSUPP) {
8b61709d 5719 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 5720 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
5721 } else {
5722 /* The device doesn't support this operation. That's pretty
5723 * common, so there's no point in logging anything. */
5724 }
8b61709d 5725 }
259e0b1a 5726 return error;
8b61709d 5727}
f1acd62b 5728
488d734d
BP
5729/* Returns an AF_PACKET raw socket or a negative errno value. */
5730static int
5731af_packet_sock(void)
5732{
23882115
BP
5733 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5734 static int sock;
488d734d 5735
23882115 5736 if (ovsthread_once_start(&once)) {
488d734d
BP
5737 sock = socket(AF_PACKET, SOCK_RAW, 0);
5738 if (sock >= 0) {
8450059e
BP
5739 int error = set_nonblocking(sock);
5740 if (error) {
5741 close(sock);
5742 sock = -error;
5743 }
488d734d
BP
5744 } else {
5745 sock = -errno;
10a89ef0
BP
5746 VLOG_ERR("failed to create packet socket: %s",
5747 ovs_strerror(errno));
488d734d 5748 }
23882115 5749 ovsthread_once_done(&once);
488d734d
BP
5750 }
5751
5752 return sock;
5753}