]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/core/dev.c
Merge branch 'cgroup-bpf'
[mirror_ubuntu-jammy-kernel.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
1da177e4 76#include <linux/bitops.h>
4fc268d2 77#include <linux/capability.h>
1da177e4
LT
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
08e9897d 81#include <linux/hash.h>
5a0e3ad6 82#include <linux/slab.h>
1da177e4 83#include <linux/sched.h>
4a3e2f71 84#include <linux/mutex.h>
1da177e4
LT
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
0187bdfb 94#include <linux/ethtool.h>
1da177e4
LT
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
a7862b45 97#include <linux/bpf.h>
457c4cbc 98#include <net/net_namespace.h>
1da177e4 99#include <net/sock.h>
02d62e86 100#include <net/busy_poll.h>
1da177e4 101#include <linux/rtnetlink.h>
1da177e4 102#include <linux/stat.h>
1da177e4 103#include <net/dst.h>
fc4099f1 104#include <net/dst_metadata.h>
1da177e4
LT
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
44540960 107#include <net/xfrm.h>
1da177e4
LT
108#include <linux/highmem.h>
109#include <linux/init.h>
1da177e4 110#include <linux/module.h>
1da177e4
LT
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
1da177e4 114#include <net/iw_handler.h>
1da177e4 115#include <asm/current.h>
5bdb9886 116#include <linux/audit.h>
db217334 117#include <linux/dmaengine.h>
f6a78bfc 118#include <linux/err.h>
c7fa9d18 119#include <linux/ctype.h>
723e98b7 120#include <linux/if_arp.h>
6de329e2 121#include <linux/if_vlan.h>
8f0f2223 122#include <linux/ip.h>
ad55dcaf 123#include <net/ip.h>
25cd9ba0 124#include <net/mpls.h>
8f0f2223
DM
125#include <linux/ipv6.h>
126#include <linux/in.h>
b6b2fed1
DM
127#include <linux/jhash.h>
128#include <linux/random.h>
9cbc1cb8 129#include <trace/events/napi.h>
cf66ba58 130#include <trace/events/net.h>
07dc22e7 131#include <trace/events/skb.h>
5acbbd42 132#include <linux/pci.h>
caeda9b9 133#include <linux/inetdevice.h>
c445477d 134#include <linux/cpu_rmap.h>
c5905afb 135#include <linux/static_key.h>
af12fa6e 136#include <linux/hashtable.h>
60877a32 137#include <linux/vmalloc.h>
529d0489 138#include <linux/if_macvlan.h>
e7fd2885 139#include <linux/errqueue.h>
3b47d303 140#include <linux/hrtimer.h>
e687ad60 141#include <linux/netfilter_ingress.h>
40e4e713 142#include <linux/crash_dump.h>
1da177e4 143
342709ef
PE
144#include "net-sysfs.h"
145
d565b0a1
HX
146/* Instead of increasing this, you should create a hash table. */
147#define MAX_GRO_SKBS 8
148
5d38a079
HX
149/* This should be increased if a protocol with a bigger head is added. */
150#define GRO_MAX_HEAD (MAX_HEADER + 128)
151
1da177e4 152static DEFINE_SPINLOCK(ptype_lock);
62532da9 153static DEFINE_SPINLOCK(offload_lock);
900ff8c6
CW
154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155struct list_head ptype_all __read_mostly; /* Taps */
62532da9 156static struct list_head offload_base __read_mostly;
1da177e4 157
ae78dbfa 158static int netif_rx_internal(struct sk_buff *skb);
54951194
LP
159static int call_netdevice_notifiers_info(unsigned long val,
160 struct net_device *dev,
161 struct netdev_notifier_info *info);
ae78dbfa 162
1da177e4 163/*
7562f876 164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
165 * semaphore.
166 *
c6d14c84 167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
168 *
169 * Writers must hold the rtnl semaphore while they loop through the
7562f876 170 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
171 * actual updates. This allows pure readers to access the list even
172 * while a writer is preparing to update it.
173 *
174 * To put it another way, dev_base_lock is held for writing only to
175 * protect against pure readers; the rtnl semaphore provides the
176 * protection against other writers.
177 *
178 * See, for example usages, register_netdevice() and
179 * unregister_netdevice(), which must be called with the rtnl
180 * semaphore held.
181 */
1da177e4 182DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
183EXPORT_SYMBOL(dev_base_lock);
184
af12fa6e
ET
185/* protects napi_hash addition/deletion and napi_gen_id */
186static DEFINE_SPINLOCK(napi_hash_lock);
187
52bd2d62 188static unsigned int napi_gen_id = NR_CPUS;
6180d9de 189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
af12fa6e 190
18afa4b0 191static seqcount_t devnet_rename_seq;
c91f6df2 192
4e985ada
TG
193static inline void dev_base_seq_inc(struct net *net)
194{
195 while (++net->dev_base_seq == 0);
196}
197
881d966b 198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 199{
8387ff25 200 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
95c96174 201
08e9897d 202 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
203}
204
881d966b 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 206{
7c28bd0b 207 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
208}
209
/*
 * Take the spinlock of @sd's input packet queue.
 * Without CONFIG_RPS this function has an empty body.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_lock(queue_lock);
#endif
}
216
/*
 * Release the spinlock of @sd's input packet queue (counterpart of rps_lock()).
 * Without CONFIG_RPS this function has an empty body.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_unlock(queue_lock);
#endif
}
223
ce286d32 224/* Device list insertion */
53759be9 225static void list_netdevice(struct net_device *dev)
ce286d32 226{
c346dca1 227 struct net *net = dev_net(dev);
ce286d32
EB
228
229 ASSERT_RTNL();
230
231 write_lock_bh(&dev_base_lock);
c6d14c84 232 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 233 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
234 hlist_add_head_rcu(&dev->index_hlist,
235 dev_index_hash(net, dev->ifindex));
ce286d32 236 write_unlock_bh(&dev_base_lock);
4e985ada
TG
237
238 dev_base_seq_inc(net);
ce286d32
EB
239}
240
fb699dfd
ED
241/* Device list removal
242 * caller must respect a RCU grace period before freeing/reusing dev
243 */
ce286d32
EB
244static void unlist_netdevice(struct net_device *dev)
245{
246 ASSERT_RTNL();
247
248 /* Unlink dev from the device chain */
249 write_lock_bh(&dev_base_lock);
c6d14c84 250 list_del_rcu(&dev->dev_list);
72c9528b 251 hlist_del_rcu(&dev->name_hlist);
fb699dfd 252 hlist_del_rcu(&dev->index_hlist);
ce286d32 253 write_unlock_bh(&dev_base_lock);
4e985ada
TG
254
255 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
256}
257
1da177e4
LT
258/*
259 * Our notifier list
260 */
261
f07d5b94 262static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
263
264/*
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
267 */
bea3348e 268
9958da05 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 270EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 271
cf508b12 272#ifdef CONFIG_LOCKDEP
723e98b7 273/*
c773e847 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
275 * according to dev->type
276 */
277static const unsigned short netdev_lock_type[] =
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
290 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 293
36cbd3dc 294static const char *const netdev_lock_name[] =
723e98b7
JP
295 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
307 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
310
311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
313
314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315{
316 int i;
317
318 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 if (netdev_lock_type[i] == dev_type)
320 return i;
321 /* the last key is used by default */
322 return ARRAY_SIZE(netdev_lock_type) - 1;
323}
324
cf508b12
DM
325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 unsigned short dev_type)
723e98b7
JP
327{
328 int i;
329
330 i = netdev_lock_pos(dev_type);
331 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 netdev_lock_name[i]);
333}
cf508b12
DM
334
335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336{
337 int i;
338
339 i = netdev_lock_pos(dev->type);
340 lockdep_set_class_and_name(&dev->addr_list_lock,
341 &netdev_addr_lock_key[i],
342 netdev_lock_name[i]);
343}
723e98b7 344#else
cf508b12
DM
345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 unsigned short dev_type)
347{
348}
349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
350{
351}
352#endif
1da177e4
LT
353
354/*******************************************************************************
355
356 Protocol management and registration routines
357
358*******************************************************************************/
359
1da177e4
LT
360/*
361 * Add a protocol ID to the list. Now that the input handler is
362 * smarter we can dispense with all the messy stuff that used to be
363 * here.
364 *
365 * BEWARE!!! Protocol handlers, mangling input packets,
366 * MUST BE last in hash buckets and checking protocol handlers
367 * MUST start from promiscuous ptype_all chain in net_bh.
368 * It is true now, do not change it.
369 * Explanation follows: if protocol handler, mangling packet, will
370 * be the first on list, it is not able to sense, that packet
371 * is cloned and should be copied-on-write, so that it will
372 * change it and subsequent readers will get broken packet.
373 * --ANK (980803)
374 */
375
c07b68e8
ED
376static inline struct list_head *ptype_head(const struct packet_type *pt)
377{
378 if (pt->type == htons(ETH_P_ALL))
7866a621 379 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
c07b68e8 380 else
7866a621
SN
381 return pt->dev ? &pt->dev->ptype_specific :
382 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
c07b68e8
ED
383}
384
1da177e4
LT
385/**
386 * dev_add_pack - add packet handler
387 * @pt: packet type declaration
388 *
389 * Add a protocol handler to the networking stack. The passed &packet_type
390 * is linked into kernel lists and may not be freed until it has been
391 * removed from the kernel lists.
392 *
4ec93edb 393 * This call does not sleep therefore it can not
1da177e4
LT
394 * guarantee all CPU's that are in middle of receiving packets
395 * will see the new packet type (until the next received packet).
396 */
397
398void dev_add_pack(struct packet_type *pt)
399{
c07b68e8 400 struct list_head *head = ptype_head(pt);
1da177e4 401
c07b68e8
ED
402 spin_lock(&ptype_lock);
403 list_add_rcu(&pt->list, head);
404 spin_unlock(&ptype_lock);
1da177e4 405}
d1b19dff 406EXPORT_SYMBOL(dev_add_pack);
1da177e4 407
1da177e4
LT
408/**
409 * __dev_remove_pack - remove packet handler
410 * @pt: packet type declaration
411 *
412 * Remove a protocol handler that was previously added to the kernel
413 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
414 * from the kernel lists and can be freed or reused once this function
4ec93edb 415 * returns.
1da177e4
LT
416 *
417 * The packet type might still be in use by receivers
418 * and must not be freed until after all the CPU's have gone
419 * through a quiescent state.
420 */
421void __dev_remove_pack(struct packet_type *pt)
422{
c07b68e8 423 struct list_head *head = ptype_head(pt);
1da177e4
LT
424 struct packet_type *pt1;
425
c07b68e8 426 spin_lock(&ptype_lock);
1da177e4
LT
427
428 list_for_each_entry(pt1, head, list) {
429 if (pt == pt1) {
430 list_del_rcu(&pt->list);
431 goto out;
432 }
433 }
434
7b6cd1ce 435 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 436out:
c07b68e8 437 spin_unlock(&ptype_lock);
1da177e4 438}
d1b19dff
ED
439EXPORT_SYMBOL(__dev_remove_pack);
440
1da177e4
LT
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was registered with dev_add_pack()
 *	by calling __dev_remove_pack(), then call synchronize_net(). Once
 *	this function returns, @pt can be freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait before handing @pt back to the caller. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
1da177e4 460
62532da9
VY
461
462/**
463 * dev_add_offload - register offload handlers
464 * @po: protocol offload declaration
465 *
466 * Add protocol offload handlers to the networking stack. The passed
467 * &proto_offload is linked into kernel lists and may not be freed until
468 * it has been removed from the kernel lists.
469 *
470 * This call does not sleep therefore it can not
471 * guarantee all CPU's that are in middle of receiving packets
472 * will see the new offload handlers (until the next received packet).
473 */
474void dev_add_offload(struct packet_offload *po)
475{
bdef7de4 476 struct packet_offload *elem;
62532da9
VY
477
478 spin_lock(&offload_lock);
bdef7de4
DM
479 list_for_each_entry(elem, &offload_base, list) {
480 if (po->priority < elem->priority)
481 break;
482 }
483 list_add_rcu(&po->list, elem->list.prev);
62532da9
VY
484 spin_unlock(&offload_lock);
485}
486EXPORT_SYMBOL(dev_add_offload);
487
488/**
489 * __dev_remove_offload - remove offload handler
490 * @po: packet offload declaration
491 *
492 * Remove a protocol offload handler that was previously added to the
493 * kernel offload handlers by dev_add_offload(). The passed &offload_type
494 * is removed from the kernel lists and can be freed or reused once this
495 * function returns.
496 *
497 * The packet type might still be in use by receivers
498 * and must not be freed until after all the CPU's have gone
499 * through a quiescent state.
500 */
1d143d9f 501static void __dev_remove_offload(struct packet_offload *po)
62532da9
VY
502{
503 struct list_head *head = &offload_base;
504 struct packet_offload *po1;
505
c53aa505 506 spin_lock(&offload_lock);
62532da9
VY
507
508 list_for_each_entry(po1, head, list) {
509 if (po == po1) {
510 list_del_rcu(&po->list);
511 goto out;
512 }
513 }
514
515 pr_warn("dev_remove_offload: %p not found\n", po);
516out:
c53aa505 517 spin_unlock(&offload_lock);
62532da9 518}
62532da9
VY
519
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was registered with
 *	dev_add_offload() by calling __dev_remove_offload(), then call
 *	synchronize_net(). Once this function returns, @po can be freed or
 *	reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* Unlink first ... */
	__dev_remove_offload(po);

	/* ... then wait before handing @po back to the caller. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
539
1da177e4
LT
540/******************************************************************************
541
542 Device Boot-time Settings Routines
543
544*******************************************************************************/
545
546/* Boot time configuration table */
547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549/**
550 * netdev_boot_setup_add - add new setup entry
551 * @name: name of the device
552 * @map: configured settings for the device
553 *
554 * Adds new setup entry to the dev_boot_setup list. The function
555 * returns 0 on error and 1 on success. This is a generic routine to
556 * all netdevices.
557 */
558static int netdev_boot_setup_add(char *name, struct ifmap *map)
559{
560 struct netdev_boot_setup *s;
561 int i;
562
563 s = dev_boot_setup;
564 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 567 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
568 memcpy(&s[i].map, map, sizeof(s[i].map));
569 break;
570 }
571 }
572
573 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574}
575
576/**
577 * netdev_boot_setup_check - check boot time settings
578 * @dev: the netdevice
579 *
580 * Check boot time settings for the device.
581 * The found settings are set for the device to be used
582 * later in the device probing.
583 * Returns 0 if no settings found, 1 if they are.
584 */
585int netdev_boot_setup_check(struct net_device *dev)
586{
587 struct netdev_boot_setup *s = dev_boot_setup;
588 int i;
589
590 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 592 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
593 dev->irq = s[i].map.irq;
594 dev->base_addr = s[i].map.base_addr;
595 dev->mem_start = s[i].map.mem_start;
596 dev->mem_end = s[i].map.mem_end;
597 return 1;
598 }
599 }
600 return 0;
601}
d1b19dff 602EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
603
604
605/**
606 * netdev_boot_base - get address from boot time settings
607 * @prefix: prefix for network device
608 * @unit: id for network device
609 *
610 * Check boot time settings for the base address of device.
611 * The found settings are set for the device to be used
612 * later in the device probing.
613 * Returns 0 if no settings found.
614 */
615unsigned long netdev_boot_base(const char *prefix, int unit)
616{
617 const struct netdev_boot_setup *s = dev_boot_setup;
618 char name[IFNAMSIZ];
619 int i;
620
621 sprintf(name, "%s%d", prefix, unit);
622
623 /*
624 * If device already registered then return base of 1
625 * to indicate not to probe for this interface
626 */
881d966b 627 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
628 return 1;
629
630 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 if (!strcmp(name, s[i].name))
632 return s[i].map.base_addr;
633 return 0;
634}
635
636/*
637 * Saves at boot time configured settings for any netdevice.
638 */
639int __init netdev_boot_setup(char *str)
640{
641 int ints[5];
642 struct ifmap map;
643
644 str = get_options(str, ARRAY_SIZE(ints), ints);
645 if (!str || !*str)
646 return 0;
647
648 /* Save settings */
649 memset(&map, 0, sizeof(map));
650 if (ints[0] > 0)
651 map.irq = ints[1];
652 if (ints[0] > 1)
653 map.base_addr = ints[2];
654 if (ints[0] > 2)
655 map.mem_start = ints[3];
656 if (ints[0] > 3)
657 map.mem_end = ints[4];
658
659 /* Add new entry to the list */
660 return netdev_boot_setup_add(str, &map);
661}
662
663__setup("netdev=", netdev_boot_setup);
664
665/*******************************************************************************
666
667 Device Interface Subroutines
668
669*******************************************************************************/
670
a54acb3a
ND
671/**
672 * dev_get_iflink - get 'iflink' value of a interface
673 * @dev: targeted interface
674 *
675 * Indicates the ifindex the interface is linked to.
676 * Physical interfaces have the same 'ifindex' and 'iflink' values.
677 */
678
679int dev_get_iflink(const struct net_device *dev)
680{
681 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 return dev->netdev_ops->ndo_get_iflink(dev);
683
7a66bbc9 684 return dev->ifindex;
a54acb3a
ND
685}
686EXPORT_SYMBOL(dev_get_iflink);
687
fc4099f1
PS
688/**
689 * dev_fill_metadata_dst - Retrieve tunnel egress information.
690 * @dev: targeted interface
691 * @skb: The packet.
692 *
693 * For better visibility of tunnel traffic OVS needs to retrieve
694 * egress tunnel information for a packet. Following API allows
695 * user to get this info.
696 */
697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698{
699 struct ip_tunnel_info *info;
700
701 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
702 return -EINVAL;
703
704 info = skb_tunnel_info_unclone(skb);
705 if (!info)
706 return -ENOMEM;
707 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 return -EINVAL;
709
710 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711}
712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
1da177e4
LT
714/**
715 * __dev_get_by_name - find a device by its name
c4ea43c5 716 * @net: the applicable net namespace
1da177e4
LT
717 * @name: name to find
718 *
719 * Find an interface by name. Must be called under RTNL semaphore
720 * or @dev_base_lock. If the name is found a pointer to the device
721 * is returned. If the name is not found then %NULL is returned. The
722 * reference counters are not incremented so the caller must be
723 * careful with locks.
724 */
725
881d966b 726struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4 727{
0bd8d536
ED
728 struct net_device *dev;
729 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 730
b67bfe0d 731 hlist_for_each_entry(dev, head, name_hlist)
1da177e4
LT
732 if (!strncmp(dev->name, name, IFNAMSIZ))
733 return dev;
0bd8d536 734
1da177e4
LT
735 return NULL;
736}
d1b19dff 737EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 738
72c9528b
ED
739/**
740 * dev_get_by_name_rcu - find a device by its name
741 * @net: the applicable net namespace
742 * @name: name to find
743 *
744 * Find an interface by name.
745 * If the name is found a pointer to the device is returned.
746 * If the name is not found then %NULL is returned.
747 * The reference counters are not incremented so the caller must be
748 * careful with locks. The caller must hold RCU lock.
749 */
750
751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752{
72c9528b
ED
753 struct net_device *dev;
754 struct hlist_head *head = dev_name_hash(net, name);
755
b67bfe0d 756 hlist_for_each_entry_rcu(dev, head, name_hlist)
72c9528b
ED
757 if (!strncmp(dev->name, name, IFNAMSIZ))
758 return dev;
759
760 return NULL;
761}
762EXPORT_SYMBOL(dev_get_by_name_rcu);
763
1da177e4
LT
764/**
765 * dev_get_by_name - find a device by its name
c4ea43c5 766 * @net: the applicable net namespace
1da177e4
LT
767 * @name: name to find
768 *
769 * Find an interface by name. This can be called from any
770 * context and does its own locking. The returned handle has
771 * the usage count incremented and the caller must use dev_put() to
772 * release it when it is no longer needed. %NULL is returned if no
773 * matching device is found.
774 */
775
881d966b 776struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
777{
778 struct net_device *dev;
779
72c9528b
ED
780 rcu_read_lock();
781 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
782 if (dev)
783 dev_hold(dev);
72c9528b 784 rcu_read_unlock();
1da177e4
LT
785 return dev;
786}
d1b19dff 787EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
788
789/**
790 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 791 * @net: the applicable net namespace
1da177e4
LT
792 * @ifindex: index of device
793 *
794 * Search for an interface by index. Returns %NULL if the device
795 * is not found or a pointer to the device. The device has not
796 * had its reference counter increased so the caller must be careful
797 * about locking. The caller must hold either the RTNL semaphore
798 * or @dev_base_lock.
799 */
800
881d966b 801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4 802{
0bd8d536
ED
803 struct net_device *dev;
804 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 805
b67bfe0d 806 hlist_for_each_entry(dev, head, index_hlist)
1da177e4
LT
807 if (dev->ifindex == ifindex)
808 return dev;
0bd8d536 809
1da177e4
LT
810 return NULL;
811}
d1b19dff 812EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 813
fb699dfd
ED
814/**
815 * dev_get_by_index_rcu - find a device by its ifindex
816 * @net: the applicable net namespace
817 * @ifindex: index of device
818 *
819 * Search for an interface by index. Returns %NULL if the device
820 * is not found or a pointer to the device. The device has not
821 * had its reference counter increased so the caller must be careful
822 * about locking. The caller must hold RCU lock.
823 */
824
825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826{
fb699dfd
ED
827 struct net_device *dev;
828 struct hlist_head *head = dev_index_hash(net, ifindex);
829
b67bfe0d 830 hlist_for_each_entry_rcu(dev, head, index_hlist)
fb699dfd
ED
831 if (dev->ifindex == ifindex)
832 return dev;
833
834 return NULL;
835}
836EXPORT_SYMBOL(dev_get_by_index_rcu);
837
1da177e4
LT
838
839/**
840 * dev_get_by_index - find a device by its ifindex
c4ea43c5 841 * @net: the applicable net namespace
1da177e4
LT
842 * @ifindex: index of device
843 *
844 * Search for an interface by index. Returns NULL if the device
845 * is not found or a pointer to the device. The device returned has
846 * had a reference added and the pointer is safe until the user calls
847 * dev_put to indicate they have finished with it.
848 */
849
881d966b 850struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
851{
852 struct net_device *dev;
853
fb699dfd
ED
854 rcu_read_lock();
855 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
856 if (dev)
857 dev_hold(dev);
fb699dfd 858 rcu_read_unlock();
1da177e4
LT
859 return dev;
860}
d1b19dff 861EXPORT_SYMBOL(dev_get_by_index);
1da177e4 862
5dbe7c17
NS
863/**
864 * netdev_get_name - get a netdevice name, knowing its ifindex.
865 * @net: network namespace
866 * @name: a pointer to the buffer where the name will be stored.
867 * @ifindex: the ifindex of the interface to get the name from.
868 *
869 * The use of raw_seqcount_begin() and cond_resched() before
870 * retrying is required as we want to give the writers a chance
871 * to complete when CONFIG_PREEMPT is not set.
872 */
873int netdev_get_name(struct net *net, char *name, int ifindex)
874{
875 struct net_device *dev;
876 unsigned int seq;
877
878retry:
879 seq = raw_seqcount_begin(&devnet_rename_seq);
880 rcu_read_lock();
881 dev = dev_get_by_index_rcu(net, ifindex);
882 if (!dev) {
883 rcu_read_unlock();
884 return -ENODEV;
885 }
886
887 strcpy(name, dev->name);
888 rcu_read_unlock();
889 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 cond_resched();
891 goto retry;
892 }
893
894 return 0;
895}
896
1da177e4 897/**
941666c2 898 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 899 * @net: the applicable net namespace
1da177e4
LT
900 * @type: media type of device
901 * @ha: hardware address
902 *
903 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
904 * is not found or a pointer to the device.
905 * The caller must hold RCU or RTNL.
941666c2 906 * The returned device has not had its ref count increased
1da177e4
LT
907 * and the caller must therefore be careful about locking
908 *
1da177e4
LT
909 */
910
941666c2
ED
911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 const char *ha)
1da177e4
LT
913{
914 struct net_device *dev;
915
941666c2 916 for_each_netdev_rcu(net, dev)
1da177e4
LT
917 if (dev->type == type &&
918 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
919 return dev;
920
921 return NULL;
1da177e4 922}
941666c2 923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 924
881d966b 925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
926{
927 struct net_device *dev;
928
4e9cac2b 929 ASSERT_RTNL();
881d966b 930 for_each_netdev(net, dev)
4e9cac2b 931 if (dev->type == type)
7562f876
PE
932 return dev;
933
934 return NULL;
4e9cac2b 935}
4e9cac2b
PM
936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
881d966b 938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 939{
99fe3c39 940 struct net_device *dev, *ret = NULL;
4e9cac2b 941
99fe3c39
ED
942 rcu_read_lock();
943 for_each_netdev_rcu(net, dev)
944 if (dev->type == type) {
945 dev_hold(dev);
946 ret = dev;
947 break;
948 }
949 rcu_read_unlock();
950 return ret;
1da177e4 951}
1da177e4
LT
952EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954/**
6c555490 955 * __dev_get_by_flags - find any device with given flags
c4ea43c5 956 * @net: the applicable net namespace
1da177e4
LT
957 * @if_flags: IFF_* values
958 * @mask: bitmask of bits in if_flags to check
959 *
960 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04 961 * is not found or a pointer to the device. Must be called inside
6c555490 962 * rtnl_lock(), and result refcount is unchanged.
1da177e4
LT
963 */
964
6c555490
WC
965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 unsigned short mask)
1da177e4 967{
7562f876 968 struct net_device *dev, *ret;
1da177e4 969
6c555490
WC
970 ASSERT_RTNL();
971
7562f876 972 ret = NULL;
6c555490 973 for_each_netdev(net, dev) {
1da177e4 974 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 975 ret = dev;
1da177e4
LT
976 break;
977 }
978 }
7562f876 979 return ret;
1da177e4 980}
6c555490 981EXPORT_SYMBOL(__dev_get_by_flags);
1da177e4
LT
982
983/**
984 * dev_valid_name - check if name is okay for network device
985 * @name: name string
986 *
987 * Network device names need to be valid file names to
c7fa9d18
DM
988 * to allow sysfs to work. We also disallow any kind of
989 * whitespace.
1da177e4 990 */
95f050bf 991bool dev_valid_name(const char *name)
1da177e4 992{
c7fa9d18 993 if (*name == '\0')
95f050bf 994 return false;
b6fe17d6 995 if (strlen(name) >= IFNAMSIZ)
95f050bf 996 return false;
c7fa9d18 997 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 998 return false;
c7fa9d18
DM
999
1000 while (*name) {
a4176a93 1001 if (*name == '/' || *name == ':' || isspace(*name))
95f050bf 1002 return false;
c7fa9d18
DM
1003 name++;
1004 }
95f050bf 1005 return true;
1da177e4 1006}
d1b19dff 1007EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
1008
1009/**
b267b179
EB
1010 * __dev_alloc_name - allocate a name for a device
1011 * @net: network namespace to allocate the device name in
1da177e4 1012 * @name: name format string
b267b179 1013 * @buf: scratch buffer and result name string
1da177e4
LT
1014 *
1015 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
1016 * id. It scans list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid
1019 * duplicates.
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
1022 */
1023
b267b179 1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
1025{
1026 int i = 0;
1da177e4
LT
1027 const char *p;
1028 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 1029 unsigned long *inuse;
1da177e4
LT
1030 struct net_device *d;
1031
1032 p = strnchr(name, IFNAMSIZ-1, '%');
1033 if (p) {
1034 /*
1035 * Verify the string as this thing may have come from
1036 * the user. There must be either one "%d" and no other "%"
1037 * characters.
1038 */
1039 if (p[1] != 'd' || strchr(p + 2, '%'))
1040 return -EINVAL;
1041
1042 /* Use one page as a bit array of possible slots */
cfcabdcc 1043 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
1044 if (!inuse)
1045 return -ENOMEM;
1046
881d966b 1047 for_each_netdev(net, d) {
1da177e4
LT
1048 if (!sscanf(d->name, name, &i))
1049 continue;
1050 if (i < 0 || i >= max_netdevices)
1051 continue;
1052
1053 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 1054 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
1055 if (!strncmp(buf, d->name, IFNAMSIZ))
1056 set_bit(i, inuse);
1057 }
1058
1059 i = find_first_zero_bit(inuse, max_netdevices);
1060 free_page((unsigned long) inuse);
1061 }
1062
d9031024
OP
1063 if (buf != name)
1064 snprintf(buf, IFNAMSIZ, name, i);
b267b179 1065 if (!__dev_get_by_name(net, buf))
1da177e4 1066 return i;
1da177e4
LT
1067
1068 /* It is possible to run out of possible slots
1069 * when the name is long and there isn't enough space left
1070 * for the digits, or if all bits are used.
1071 */
1072 return -ENFILE;
1073}
1074
b267b179
EB
1075/**
1076 * dev_alloc_name - allocate a name for a device
1077 * @dev: device
1078 * @name: name format string
1079 *
1080 * Passed a format string - eg "lt%d" it will try and find a suitable
1081 * id. It scans list of devices to build up a free map, then chooses
1082 * the first empty slot. The caller must hold the dev_base or rtnl lock
1083 * while allocating the name and adding the device in order to avoid
1084 * duplicates.
1085 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 * Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091 char buf[IFNAMSIZ];
1092 struct net *net;
1093 int ret;
1094
c346dca1
YH
1095 BUG_ON(!dev_net(dev));
1096 net = dev_net(dev);
b267b179
EB
1097 ret = __dev_alloc_name(net, name, buf);
1098 if (ret >= 0)
1099 strlcpy(dev->name, buf, IFNAMSIZ);
1100 return ret;
1101}
d1b19dff 1102EXPORT_SYMBOL(dev_alloc_name);
b267b179 1103
828de4f6
G
1104static int dev_alloc_name_ns(struct net *net,
1105 struct net_device *dev,
1106 const char *name)
d9031024 1107{
828de4f6
G
1108 char buf[IFNAMSIZ];
1109 int ret;
8ce6cebc 1110
828de4f6
G
1111 ret = __dev_alloc_name(net, name, buf);
1112 if (ret >= 0)
1113 strlcpy(dev->name, buf, IFNAMSIZ);
1114 return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118 struct net_device *dev,
1119 const char *name)
1120{
1121 BUG_ON(!net);
8ce6cebc 1122
d9031024
OP
1123 if (!dev_valid_name(name))
1124 return -EINVAL;
1125
1c5cae81 1126 if (strchr(name, '%'))
828de4f6 1127 return dev_alloc_name_ns(net, dev, name);
d9031024
OP
1128 else if (__dev_get_by_name(net, name))
1129 return -EEXIST;
8ce6cebc
DL
1130 else if (dev->name != name)
1131 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
1132
1133 return 0;
1134}
1da177e4
LT
1135
1136/**
1137 * dev_change_name - change name of a device
1138 * @dev: device
1139 * @newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141 * Change name of a device, can pass format strings "eth%d".
1142 * for wildcarding.
1143 */
cf04a4c7 1144int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 1145{
238fa362 1146 unsigned char old_assign_type;
fcc5a03a 1147 char oldname[IFNAMSIZ];
1da177e4 1148 int err = 0;
fcc5a03a 1149 int ret;
881d966b 1150 struct net *net;
1da177e4
LT
1151
1152 ASSERT_RTNL();
c346dca1 1153 BUG_ON(!dev_net(dev));
1da177e4 1154
c346dca1 1155 net = dev_net(dev);
1da177e4
LT
1156 if (dev->flags & IFF_UP)
1157 return -EBUSY;
1158
30e6c9fa 1159 write_seqcount_begin(&devnet_rename_seq);
c91f6df2
BH
1160
1161 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30e6c9fa 1162 write_seqcount_end(&devnet_rename_seq);
c8d90dca 1163 return 0;
c91f6df2 1164 }
c8d90dca 1165
fcc5a03a
HX
1166 memcpy(oldname, dev->name, IFNAMSIZ);
1167
828de4f6 1168 err = dev_get_valid_name(net, dev, newname);
c91f6df2 1169 if (err < 0) {
30e6c9fa 1170 write_seqcount_end(&devnet_rename_seq);
d9031024 1171 return err;
c91f6df2 1172 }
1da177e4 1173
6fe82a39
VF
1174 if (oldname[0] && !strchr(oldname, '%'))
1175 netdev_info(dev, "renamed from %s\n", oldname);
1176
238fa362
TG
1177 old_assign_type = dev->name_assign_type;
1178 dev->name_assign_type = NET_NAME_RENAMED;
1179
fcc5a03a 1180rollback:
a1b3f594
EB
1181 ret = device_rename(&dev->dev, dev->name);
1182 if (ret) {
1183 memcpy(dev->name, oldname, IFNAMSIZ);
238fa362 1184 dev->name_assign_type = old_assign_type;
30e6c9fa 1185 write_seqcount_end(&devnet_rename_seq);
a1b3f594 1186 return ret;
dcc99773 1187 }
7f988eab 1188
30e6c9fa 1189 write_seqcount_end(&devnet_rename_seq);
c91f6df2 1190
5bb025fa
VF
1191 netdev_adjacent_rename_links(dev, oldname);
1192
7f988eab 1193 write_lock_bh(&dev_base_lock);
372b2312 1194 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1195 write_unlock_bh(&dev_base_lock);
1196
1197 synchronize_rcu();
1198
1199 write_lock_bh(&dev_base_lock);
1200 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1201 write_unlock_bh(&dev_base_lock);
1202
056925ab 1203 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1204 ret = notifier_to_errno(ret);
1205
1206 if (ret) {
91e9c07b
ED
1207 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208 if (err >= 0) {
fcc5a03a 1209 err = ret;
30e6c9fa 1210 write_seqcount_begin(&devnet_rename_seq);
fcc5a03a 1211 memcpy(dev->name, oldname, IFNAMSIZ);
5bb025fa 1212 memcpy(oldname, newname, IFNAMSIZ);
238fa362
TG
1213 dev->name_assign_type = old_assign_type;
1214 old_assign_type = NET_NAME_RENAMED;
fcc5a03a 1215 goto rollback;
91e9c07b 1216 } else {
7b6cd1ce 1217 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1218 dev->name, ret);
fcc5a03a
HX
1219 }
1220 }
1da177e4
LT
1221
1222 return err;
1223}
1224
0b815a1a
SH
1225/**
1226 * dev_set_alias - change ifalias of a device
1227 * @dev: device
1228 * @alias: name up to IFALIASZ
f0db275a 1229 * @len: limit of bytes to copy from info
0b815a1a
SH
1230 *
1231 * Set ifalias for a device,
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
7364e445
AK
1235 char *new_ifalias;
1236
0b815a1a
SH
1237 ASSERT_RTNL();
1238
1239 if (len >= IFALIASZ)
1240 return -EINVAL;
1241
96ca4a2c 1242 if (!len) {
388dfc2d
SK
1243 kfree(dev->ifalias);
1244 dev->ifalias = NULL;
96ca4a2c
OH
1245 return 0;
1246 }
1247
7364e445
AK
1248 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 if (!new_ifalias)
0b815a1a 1250 return -ENOMEM;
7364e445 1251 dev->ifalias = new_ifalias;
0b815a1a
SH
1252
1253 strlcpy(dev->ifalias, alias, len+1);
1254 return len;
1255}
1256
1257
d8a33ac4 1258/**
3041a069 1259 * netdev_features_change - device changes features
d8a33ac4
SH
1260 * @dev: device to cause notification
1261 *
1262 * Called to indicate a device has changed features.
1263 */
1264void netdev_features_change(struct net_device *dev)
1265{
056925ab 1266 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1267}
1268EXPORT_SYMBOL(netdev_features_change);
1269
1da177e4
LT
1270/**
1271 * netdev_state_change - device changes state
1272 * @dev: device to cause notification
1273 *
1274 * Called to indicate a device has changed state. This function calls
1275 * the notifier chains for netdev_chain and sends a NEWLINK message
1276 * to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280 if (dev->flags & IFF_UP) {
54951194
LP
1281 struct netdev_notifier_change_info change_info;
1282
1283 change_info.flags_changed = 0;
1284 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 &change_info.info);
7f294054 1286 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1da177e4
LT
1287 }
1288}
d1b19dff 1289EXPORT_SYMBOL(netdev_state_change);
1da177e4 1290
ee89bab1
AW
1291/**
1292 * netdev_notify_peers - notify network peers about existence of @dev
1293 * @dev: network device
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301void netdev_notify_peers(struct net_device *dev)
c1da4ac7 1302{
ee89bab1
AW
1303 rtnl_lock();
1304 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 rtnl_unlock();
c1da4ac7 1306}
ee89bab1 1307EXPORT_SYMBOL(netdev_notify_peers);
c1da4ac7 1308
bd380811 1309static int __dev_open(struct net_device *dev)
1da177e4 1310{
d314774c 1311 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1312 int ret;
1da177e4 1313
e46b66bc
BH
1314 ASSERT_RTNL();
1315
1da177e4
LT
1316 if (!netif_device_present(dev))
1317 return -ENODEV;
1318
ca99ca14
NH
1319 /* Block netpoll from trying to do any rx path servicing.
1320 * If we don't do this there is a chance ndo_poll_controller
1321 * or ndo_poll may be running while we open the device
1322 */
66b5552f 1323 netpoll_poll_disable(dev);
ca99ca14 1324
3b8bcfd5
JB
1325 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 ret = notifier_to_errno(ret);
1327 if (ret)
1328 return ret;
1329
1da177e4 1330 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1331
d314774c
SH
1332 if (ops->ndo_validate_addr)
1333 ret = ops->ndo_validate_addr(dev);
bada339b 1334
d314774c
SH
1335 if (!ret && ops->ndo_open)
1336 ret = ops->ndo_open(dev);
1da177e4 1337
66b5552f 1338 netpoll_poll_enable(dev);
ca99ca14 1339
bada339b
JG
1340 if (ret)
1341 clear_bit(__LINK_STATE_START, &dev->state);
1342 else {
1da177e4 1343 dev->flags |= IFF_UP;
4417da66 1344 dev_set_rx_mode(dev);
1da177e4 1345 dev_activate(dev);
7bf23575 1346 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 1347 }
bada339b 1348
1da177e4
LT
1349 return ret;
1350}
1351
1352/**
bd380811
PM
1353 * dev_open - prepare an interface for use.
1354 * @dev: device to open
1da177e4 1355 *
bd380811
PM
1356 * Takes a device from down to up state. The device's private open
1357 * function is invoked and then the multicast lists are loaded. Finally
1358 * the device is moved into the up state and a %NETDEV_UP message is
1359 * sent to the netdev notifier chain.
1360 *
1361 * Calling this function on an active interface is a nop. On a failure
1362 * a negative errno code is returned.
1da177e4 1363 */
bd380811
PM
1364int dev_open(struct net_device *dev)
1365{
1366 int ret;
1367
bd380811
PM
1368 if (dev->flags & IFF_UP)
1369 return 0;
1370
bd380811
PM
1371 ret = __dev_open(dev);
1372 if (ret < 0)
1373 return ret;
1374
7f294054 1375 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
bd380811
PM
1376 call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378 return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
1381
44345724 1382static int __dev_close_many(struct list_head *head)
1da177e4 1383{
44345724 1384 struct net_device *dev;
e46b66bc 1385
bd380811 1386 ASSERT_RTNL();
9d5010db
DM
1387 might_sleep();
1388
5cde2829 1389 list_for_each_entry(dev, head, close_list) {
3f4df206 1390 /* Temporarily disable netpoll until the interface is down */
66b5552f 1391 netpoll_poll_disable(dev);
3f4df206 1392
44345724 1393 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1394
44345724 1395 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1396
44345724
OP
1397 /* Synchronize to scheduled poll. We cannot touch poll list, it
1398 * can be even on different cpu. So just clear netif_running().
1399 *
1400 * dev->stop() will invoke napi_disable() on all of it's
1401 * napi_struct instances on this device.
1402 */
4e857c58 1403 smp_mb__after_atomic(); /* Commit netif_running(). */
44345724 1404 }
1da177e4 1405
44345724 1406 dev_deactivate_many(head);
d8b2a4d2 1407
5cde2829 1408 list_for_each_entry(dev, head, close_list) {
44345724 1409 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1410
44345724
OP
1411 /*
1412 * Call the device specific close. This cannot fail.
1413 * Only if device is UP
1414 *
1415 * We allow it to be called even after a DETACH hot-plug
1416 * event.
1417 */
1418 if (ops->ndo_stop)
1419 ops->ndo_stop(dev);
1420
44345724 1421 dev->flags &= ~IFF_UP;
66b5552f 1422 netpoll_poll_enable(dev);
44345724
OP
1423 }
1424
1425 return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
f87e6f47 1430 int retval;
44345724
OP
1431 LIST_HEAD(single);
1432
5cde2829 1433 list_add(&dev->close_list, &single);
f87e6f47
LT
1434 retval = __dev_close_many(&single);
1435 list_del(&single);
ca99ca14 1436
f87e6f47 1437 return retval;
44345724
OP
1438}
1439
99c4a26a 1440int dev_close_many(struct list_head *head, bool unlink)
44345724
OP
1441{
1442 struct net_device *dev, *tmp;
1da177e4 1443
5cde2829
EB
1444 /* Remove the devices that don't need to be closed */
1445 list_for_each_entry_safe(dev, tmp, head, close_list)
44345724 1446 if (!(dev->flags & IFF_UP))
5cde2829 1447 list_del_init(&dev->close_list);
44345724
OP
1448
1449 __dev_close_many(head);
1da177e4 1450
5cde2829 1451 list_for_each_entry_safe(dev, tmp, head, close_list) {
7f294054 1452 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
44345724 1453 call_netdevice_notifiers(NETDEV_DOWN, dev);
99c4a26a
DM
1454 if (unlink)
1455 list_del_init(&dev->close_list);
44345724 1456 }
bd380811
PM
1457
1458 return 0;
1459}
99c4a26a 1460EXPORT_SYMBOL(dev_close_many);
bd380811
PM
1461
1462/**
1463 * dev_close - shutdown an interface.
1464 * @dev: device to shutdown
1465 *
1466 * This function moves an active device into down state. A
1467 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 * chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
e14a5993
ED
1473 if (dev->flags & IFF_UP) {
1474 LIST_HEAD(single);
1da177e4 1475
5cde2829 1476 list_add(&dev->close_list, &single);
99c4a26a 1477 dev_close_many(&single, true);
e14a5993
ED
1478 list_del(&single);
1479 }
da6e378b 1480 return 0;
1da177e4 1481}
d1b19dff 1482EXPORT_SYMBOL(dev_close);
1da177e4
LT
1483
1484
0187bdfb
BH
1485/**
1486 * dev_disable_lro - disable Large Receive Offload on a device
1487 * @dev: device
1488 *
1489 * Disable Large Receive Offload (LRO) on a net device. Must be
1490 * called under RTNL. This is needed if received packets may be
1491 * forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
fbe168ba
MK
1495 struct net_device *lower_dev;
1496 struct list_head *iter;
529d0489 1497
bc5787c6
MM
1498 dev->wanted_features &= ~NETIF_F_LRO;
1499 netdev_update_features(dev);
27660515 1500
22d5969f
MM
1501 if (unlikely(dev->features & NETIF_F_LRO))
1502 netdev_WARN(dev, "failed to disable LRO!\n");
fbe168ba
MK
1503
1504 netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 dev_disable_lro(lower_dev);
0187bdfb
BH
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
351638e7
JP
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 struct net_device *dev)
1511{
1512 struct netdev_notifier_info info;
1513
1514 netdev_notifier_info_init(&info, dev);
1515 return nb->notifier_call(nb, val, &info);
1516}
0187bdfb 1517
881d966b
EB
/* Starts out non-zero; while set, register_netdevice_notifier() skips replay */
static int dev_boot_phase = 1;
1519
1da177e4
LT
1520/**
1521 * register_netdevice_notifier - register a network notifier block
1522 * @nb: notifier
1523 *
1524 * Register a notifier to be called when network device events occur.
1525 * The notifier passed is linked into the kernel structures and must
1526 * not be reused until it has been unregistered. A negative errno code
1527 * is returned on a failure.
1528 *
1529 * When registered all registration and up events are replayed
4ec93edb 1530 * to the new notifier to allow device to have a race free
1da177e4
LT
1531 * view of the network device list.
1532 */
1533
1534int register_netdevice_notifier(struct notifier_block *nb)
1535{
1536 struct net_device *dev;
fcc5a03a 1537 struct net_device *last;
881d966b 1538 struct net *net;
1da177e4
LT
1539 int err;
1540
1541 rtnl_lock();
f07d5b94 1542 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1543 if (err)
1544 goto unlock;
881d966b
EB
1545 if (dev_boot_phase)
1546 goto unlock;
1547 for_each_net(net) {
1548 for_each_netdev(net, dev) {
351638e7 1549 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
881d966b
EB
1550 err = notifier_to_errno(err);
1551 if (err)
1552 goto rollback;
1553
1554 if (!(dev->flags & IFF_UP))
1555 continue;
1da177e4 1556
351638e7 1557 call_netdevice_notifier(nb, NETDEV_UP, dev);
881d966b 1558 }
1da177e4 1559 }
fcc5a03a
HX
1560
1561unlock:
1da177e4
LT
1562 rtnl_unlock();
1563 return err;
fcc5a03a
HX
1564
1565rollback:
1566 last = dev;
881d966b
EB
1567 for_each_net(net) {
1568 for_each_netdev(net, dev) {
1569 if (dev == last)
8f891489 1570 goto outroll;
fcc5a03a 1571
881d966b 1572 if (dev->flags & IFF_UP) {
351638e7
JP
1573 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 dev);
1575 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
881d966b 1576 }
351638e7 1577 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1578 }
fcc5a03a 1579 }
c67625a1 1580
8f891489 1581outroll:
c67625a1 1582 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1583 goto unlock;
1da177e4 1584}
d1b19dff 1585EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1586
1587/**
1588 * unregister_netdevice_notifier - unregister a network notifier block
1589 * @nb: notifier
1590 *
1591 * Unregister a notifier previously registered by
1592 * register_netdevice_notifier(). The notifier is unlinked into the
1593 * kernel structures and may then be reused. A negative errno code
1594 * is returned on a failure.
7d3d43da
EB
1595 *
1596 * After unregistering unregister and down device events are synthesized
1597 * for all devices on the device list to the removed notifier to remove
1598 * the need for special case cleanup code.
1da177e4
LT
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
7d3d43da
EB
1603 struct net_device *dev;
1604 struct net *net;
9f514950
HX
1605 int err;
1606
1607 rtnl_lock();
f07d5b94 1608 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1609 if (err)
1610 goto unlock;
1611
1612 for_each_net(net) {
1613 for_each_netdev(net, dev) {
1614 if (dev->flags & IFF_UP) {
351638e7
JP
1615 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 dev);
1617 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
7d3d43da 1618 }
351638e7 1619 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
7d3d43da
EB
1620 }
1621 }
1622unlock:
9f514950
HX
1623 rtnl_unlock();
1624 return err;
1da177e4 1625}
d1b19dff 1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4 1627
351638e7
JP
1628/**
1629 * call_netdevice_notifiers_info - call all network notifier blocks
1630 * @val: value passed unmodified to notifier function
1631 * @dev: net_device pointer passed unmodified to notifier function
1632 * @info: notifier information data
1633 *
1634 * Call all network notifier blocks. Parameters and return value
1635 * are as for raw_notifier_call_chain().
1636 */
1637
1d143d9f 1638static int call_netdevice_notifiers_info(unsigned long val,
1639 struct net_device *dev,
1640 struct netdev_notifier_info *info)
351638e7
JP
1641{
1642 ASSERT_RTNL();
1643 netdev_notifier_info_init(info, dev);
1644 return raw_notifier_call_chain(&netdev_chain, val, info);
1645}
351638e7 1646
1da177e4
LT
1647/**
1648 * call_netdevice_notifiers - call all network notifier blocks
1649 * @val: value passed unmodified to notifier function
c4ea43c5 1650 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1651 *
1652 * Call all network notifier blocks. Parameters and return value
f07d5b94 1653 * are as for raw_notifier_call_chain().
1da177e4
LT
1654 */
1655
ad7379d4 1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1657{
351638e7
JP
1658 struct netdev_notifier_info info;
1659
1660 return call_netdevice_notifiers_info(val, dev, &info);
1da177e4 1661}
edf947f1 1662EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1663
#ifdef CONFIG_NET_INGRESS
/* Static key counted up and down by the two helpers below */
static struct static_key ingress_needed __read_mostly;

/* Take one count on the ingress_needed static key */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one count on the ingress_needed static key */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1679
1f211a1b
DB
#ifdef CONFIG_NET_EGRESS
/* Static key counted up and down by the two helpers below */
static struct static_key egress_needed __read_mostly;

/* Take one count on the egress_needed static key */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one count on the egress_needed static key */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1695
c5905afb 1696static struct static_key netstamp_needed __read_mostly;
b90e5794 1697#ifdef HAVE_JUMP_LABEL
c5905afb 1698/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1699 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1700 * static_key_slow_dec() calls.
b90e5794
ED
1701 */
1702static atomic_t netstamp_needed_deferred;
1703#endif
1da177e4
LT
1704
1705void net_enable_timestamp(void)
1706{
b90e5794
ED
1707#ifdef HAVE_JUMP_LABEL
1708 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709
1710 if (deferred) {
1711 while (--deferred)
c5905afb 1712 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1713 return;
1714 }
1715#endif
c5905afb 1716 static_key_slow_inc(&netstamp_needed);
1da177e4 1717}
d1b19dff 1718EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1719
1720void net_disable_timestamp(void)
1721{
b90e5794
ED
1722#ifdef HAVE_JUMP_LABEL
1723 if (in_interrupt()) {
1724 atomic_inc(&netstamp_needed_deferred);
1725 return;
1726 }
1727#endif
c5905afb 1728 static_key_slow_dec(&netstamp_needed);
1da177e4 1729}
d1b19dff 1730EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1731
3b098e2d 1732static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1733{
588f0330 1734 skb->tstamp.tv64 = 0;
c5905afb 1735 if (static_key_false(&netstamp_needed))
a61bbcf2 1736 __net_timestamp(skb);
1da177e4
LT
1737}
1738
/*
 * Stamp SKB when timestamping is enabled, COND holds and SKB carries no
 * timestamp yet.  Wrapped in do { } while (0) so the expansion is a single
 * statement that cannot capture a following "else"; use it as an ordinary
 * statement terminated by a semicolon.
 */
#define net_timestamp_check(COND, SKB)			\
	do {						\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}					\
	} while (0)

f4b05d27 1745bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
79b569f0
DL
1746{
1747 unsigned int len;
1748
1749 if (!(dev->flags & IFF_UP))
1750 return false;
1751
1752 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753 if (skb->len <= len)
1754 return true;
1755
1756 /* if TSO is enabled, we don't care about the length as the packet
1757 * could be forwarded without being segmented before
1758 */
1759 if (skb_is_gso(skb))
1760 return true;
1761
1762 return false;
1763}
1ee481fb 1764EXPORT_SYMBOL_GPL(is_skb_forwardable);
79b569f0 1765
a0265d28
HX
1766int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767{
4e3264d2 1768 int ret = ____dev_forward_skb(dev, skb);
a0265d28 1769
4e3264d2
MKL
1770 if (likely(!ret)) {
1771 skb->protocol = eth_type_trans(skb, dev);
1772 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1773 }
a0265d28 1774
4e3264d2 1775 return ret;
a0265d28
HX
1776}
1777EXPORT_SYMBOL_GPL(__dev_forward_skb);
1778
44540960
AB
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1802
71d9dec2
CG
1803static inline int deliver_skb(struct sk_buff *skb,
1804 struct packet_type *pt_prev,
1805 struct net_device *orig_dev)
1806{
1080e512
MT
1807 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1808 return -ENOMEM;
71d9dec2
CG
1809 atomic_inc(&skb->users);
1810 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1811}
1812
7866a621
SN
1813static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1814 struct packet_type **pt,
fbcb2170
JP
1815 struct net_device *orig_dev,
1816 __be16 type,
7866a621
SN
1817 struct list_head *ptype_list)
1818{
1819 struct packet_type *ptype, *pt_prev = *pt;
1820
1821 list_for_each_entry_rcu(ptype, ptype_list, list) {
1822 if (ptype->type != type)
1823 continue;
1824 if (pt_prev)
fbcb2170 1825 deliver_skb(skb, pt_prev, orig_dev);
7866a621
SN
1826 pt_prev = ptype;
1827 }
1828 *pt = pt_prev;
1829}
1830
c0de08d0
EL
1831static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1832{
a3d744e9 1833 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1834 return false;
1835
1836 if (ptype->id_match)
1837 return ptype->id_match(ptype, skb->sk);
1838 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1839 return true;
1840
1841 return false;
1842}
1843
1da177e4
LT
1844/*
1845 * Support routine. Sends outgoing frames to any network
1846 * taps currently in use.
1847 */
1848
74b20582 1849void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1850{
1851 struct packet_type *ptype;
71d9dec2
CG
1852 struct sk_buff *skb2 = NULL;
1853 struct packet_type *pt_prev = NULL;
7866a621 1854 struct list_head *ptype_list = &ptype_all;
a61bbcf2 1855
1da177e4 1856 rcu_read_lock();
7866a621
SN
1857again:
1858 list_for_each_entry_rcu(ptype, ptype_list, list) {
1da177e4
LT
1859 /* Never send packets back to the socket
1860 * they originated from - MvS (miquels@drinkel.ow.org)
1861 */
7866a621
SN
1862 if (skb_loop_sk(ptype, skb))
1863 continue;
71d9dec2 1864
7866a621
SN
1865 if (pt_prev) {
1866 deliver_skb(skb2, pt_prev, skb->dev);
1867 pt_prev = ptype;
1868 continue;
1869 }
1da177e4 1870
7866a621
SN
1871 /* need to clone skb, done only once */
1872 skb2 = skb_clone(skb, GFP_ATOMIC);
1873 if (!skb2)
1874 goto out_unlock;
70978182 1875
7866a621 1876 net_timestamp_set(skb2);
1da177e4 1877
7866a621
SN
1878 /* skb->nh should be correctly
1879 * set by sender, so that the second statement is
1880 * just protection against buggy protocols.
1881 */
1882 skb_reset_mac_header(skb2);
1883
1884 if (skb_network_header(skb2) < skb2->data ||
1885 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1886 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1887 ntohs(skb2->protocol),
1888 dev->name);
1889 skb_reset_network_header(skb2);
1da177e4 1890 }
7866a621
SN
1891
1892 skb2->transport_header = skb2->network_header;
1893 skb2->pkt_type = PACKET_OUTGOING;
1894 pt_prev = ptype;
1895 }
1896
1897 if (ptype_list == &ptype_all) {
1898 ptype_list = &dev->ptype_all;
1899 goto again;
1da177e4 1900 }
7866a621 1901out_unlock:
71d9dec2
CG
1902 if (pt_prev)
1903 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1904 rcu_read_unlock();
1905}
74b20582 1906EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1da177e4 1907
2c53040f
BH
1908/**
1909 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1910 * @dev: Network device
1911 * @txq: number of queues available
1912 *
1913 * If real_num_tx_queues is changed the tc mappings may no longer be
1914 * valid. To resolve this verify the tc mapping remains valid and if
1915 * not NULL the mapping. With no priorities mapping to this
1916 * offset/count pair it will no longer be used. In the worst case TC0
1917 * is invalid nothing can be done so disable priority mappings. It is
1918 * expected that drivers will fix this mapping if they can before
1919 * calling netif_set_real_num_tx_queues.
1920 */
bb134d22 1921static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1922{
1923 int i;
1924 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1925
1926 /* If TC0 is invalidated disable TC mapping */
1927 if (tc->offset + tc->count > txq) {
7b6cd1ce 1928 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1929 dev->num_tc = 0;
1930 return;
1931 }
1932
1933 /* Invalidated prio to tc mappings set to TC0 */
1934 for (i = 1; i < TC_BITMASK + 1; i++) {
1935 int q = netdev_get_prio_tc_map(dev, i);
1936
1937 tc = &dev->tc_to_txq[q];
1938 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1939 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1940 i, q);
4f57c087
JF
1941 netdev_set_prio_tc_map(dev, i, 0);
1942 }
1943 }
1944}
1945
8d059b0f
AD
1946int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1947{
1948 if (dev->num_tc) {
1949 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1950 int i;
1951
1952 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1953 if ((txq - tc->offset) < tc->count)
1954 return i;
1955 }
1956
1957 return -1;
1958 }
1959
1960 return 0;
1961}
1962
537c00de
AD
1963#ifdef CONFIG_XPS
1964static DEFINE_MUTEX(xps_map_mutex);
1965#define xmap_dereference(P) \
1966 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1967
6234f874
AD
1968static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1969 int tci, u16 index)
537c00de 1970{
10cdc3f3
AD
1971 struct xps_map *map = NULL;
1972 int pos;
537c00de 1973
10cdc3f3 1974 if (dev_maps)
6234f874
AD
1975 map = xmap_dereference(dev_maps->cpu_map[tci]);
1976 if (!map)
1977 return false;
537c00de 1978
6234f874
AD
1979 for (pos = map->len; pos--;) {
1980 if (map->queues[pos] != index)
1981 continue;
1982
1983 if (map->len > 1) {
1984 map->queues[pos] = map->queues[--map->len];
10cdc3f3 1985 break;
537c00de 1986 }
6234f874
AD
1987
1988 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1989 kfree_rcu(map, rcu);
1990 return false;
537c00de
AD
1991 }
1992
6234f874 1993 return true;
10cdc3f3
AD
1994}
1995
6234f874
AD
1996static bool remove_xps_queue_cpu(struct net_device *dev,
1997 struct xps_dev_maps *dev_maps,
1998 int cpu, u16 offset, u16 count)
1999{
184c449f
AD
2000 int num_tc = dev->num_tc ? : 1;
2001 bool active = false;
2002 int tci;
6234f874 2003
184c449f
AD
2004 for (tci = cpu * num_tc; num_tc--; tci++) {
2005 int i, j;
2006
2007 for (i = count, j = offset; i--; j++) {
2008 if (!remove_xps_queue(dev_maps, cpu, j))
2009 break;
2010 }
2011
2012 active |= i < 0;
6234f874
AD
2013 }
2014
184c449f 2015 return active;
6234f874
AD
2016}
2017
2018static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2019 u16 count)
10cdc3f3
AD
2020{
2021 struct xps_dev_maps *dev_maps;
024e9679 2022 int cpu, i;
10cdc3f3
AD
2023 bool active = false;
2024
2025 mutex_lock(&xps_map_mutex);
2026 dev_maps = xmap_dereference(dev->xps_maps);
2027
2028 if (!dev_maps)
2029 goto out_no_maps;
2030
6234f874
AD
2031 for_each_possible_cpu(cpu)
2032 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2033 offset, count);
10cdc3f3
AD
2034
2035 if (!active) {
537c00de
AD
2036 RCU_INIT_POINTER(dev->xps_maps, NULL);
2037 kfree_rcu(dev_maps, rcu);
2038 }
2039
6234f874 2040 for (i = offset + (count - 1); count--; i--)
024e9679
AD
2041 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2042 NUMA_NO_NODE);
2043
537c00de
AD
2044out_no_maps:
2045 mutex_unlock(&xps_map_mutex);
2046}
2047
6234f874
AD
2048static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2049{
2050 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2051}
2052
01c5f864
AD
2053static struct xps_map *expand_xps_map(struct xps_map *map,
2054 int cpu, u16 index)
2055{
2056 struct xps_map *new_map;
2057 int alloc_len = XPS_MIN_MAP_ALLOC;
2058 int i, pos;
2059
2060 for (pos = 0; map && pos < map->len; pos++) {
2061 if (map->queues[pos] != index)
2062 continue;
2063 return map;
2064 }
2065
2066 /* Need to add queue to this CPU's existing map */
2067 if (map) {
2068 if (pos < map->alloc_len)
2069 return map;
2070
2071 alloc_len = map->alloc_len * 2;
2072 }
2073
2074 /* Need to allocate new map to store queue on this CPU's map */
2075 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2076 cpu_to_node(cpu));
2077 if (!new_map)
2078 return NULL;
2079
2080 for (i = 0; i < pos; i++)
2081 new_map->queues[i] = map->queues[i];
2082 new_map->alloc_len = alloc_len;
2083 new_map->len = pos;
2084
2085 return new_map;
2086}
2087
3573540c
MT
2088int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2089 u16 index)
537c00de 2090{
01c5f864 2091 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
184c449f
AD
2092 int i, cpu, tci, numa_node_id = -2;
2093 int maps_sz, num_tc = 1, tc = 0;
537c00de 2094 struct xps_map *map, *new_map;
01c5f864 2095 bool active = false;
537c00de 2096
184c449f
AD
2097 if (dev->num_tc) {
2098 num_tc = dev->num_tc;
2099 tc = netdev_txq_to_tc(dev, index);
2100 if (tc < 0)
2101 return -EINVAL;
2102 }
2103
2104 maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2105 if (maps_sz < L1_CACHE_BYTES)
2106 maps_sz = L1_CACHE_BYTES;
2107
537c00de
AD
2108 mutex_lock(&xps_map_mutex);
2109
2110 dev_maps = xmap_dereference(dev->xps_maps);
2111
01c5f864 2112 /* allocate memory for queue storage */
184c449f 2113 for_each_cpu_and(cpu, cpu_online_mask, mask) {
01c5f864
AD
2114 if (!new_dev_maps)
2115 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
2116 if (!new_dev_maps) {
2117 mutex_unlock(&xps_map_mutex);
01c5f864 2118 return -ENOMEM;
2bb60cb9 2119 }
01c5f864 2120
184c449f
AD
2121 tci = cpu * num_tc + tc;
2122 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
01c5f864
AD
2123 NULL;
2124
2125 map = expand_xps_map(map, cpu, index);
2126 if (!map)
2127 goto error;
2128
184c449f 2129 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
01c5f864
AD
2130 }
2131
2132 if (!new_dev_maps)
2133 goto out_no_new_maps;
2134
537c00de 2135 for_each_possible_cpu(cpu) {
184c449f
AD
2136 /* copy maps belonging to foreign traffic classes */
2137 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2138 /* fill in the new device map from the old device map */
2139 map = xmap_dereference(dev_maps->cpu_map[tci]);
2140 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2141 }
2142
2143 /* We need to explicitly update tci as previous loop
2144 * could break out early if dev_maps is NULL.
2145 */
2146 tci = cpu * num_tc + tc;
2147
01c5f864
AD
2148 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2149 /* add queue to CPU maps */
2150 int pos = 0;
2151
184c449f 2152 map = xmap_dereference(new_dev_maps->cpu_map[tci]);
01c5f864
AD
2153 while ((pos < map->len) && (map->queues[pos] != index))
2154 pos++;
2155
2156 if (pos == map->len)
2157 map->queues[map->len++] = index;
537c00de 2158#ifdef CONFIG_NUMA
537c00de
AD
2159 if (numa_node_id == -2)
2160 numa_node_id = cpu_to_node(cpu);
2161 else if (numa_node_id != cpu_to_node(cpu))
2162 numa_node_id = -1;
537c00de 2163#endif
01c5f864
AD
2164 } else if (dev_maps) {
2165 /* fill in the new device map from the old device map */
184c449f
AD
2166 map = xmap_dereference(dev_maps->cpu_map[tci]);
2167 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
537c00de 2168 }
01c5f864 2169
184c449f
AD
2170 /* copy maps belonging to foreign traffic classes */
2171 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2172 /* fill in the new device map from the old device map */
2173 map = xmap_dereference(dev_maps->cpu_map[tci]);
2174 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2175 }
537c00de
AD
2176 }
2177
01c5f864
AD
2178 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2179
537c00de 2180 /* Cleanup old maps */
184c449f
AD
2181 if (!dev_maps)
2182 goto out_no_old_maps;
2183
2184 for_each_possible_cpu(cpu) {
2185 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2186 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2187 map = xmap_dereference(dev_maps->cpu_map[tci]);
01c5f864
AD
2188 if (map && map != new_map)
2189 kfree_rcu(map, rcu);
2190 }
537c00de
AD
2191 }
2192
184c449f
AD
2193 kfree_rcu(dev_maps, rcu);
2194
2195out_no_old_maps:
01c5f864
AD
2196 dev_maps = new_dev_maps;
2197 active = true;
537c00de 2198
01c5f864
AD
2199out_no_new_maps:
2200 /* update Tx queue numa node */
537c00de
AD
2201 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2202 (numa_node_id >= 0) ? numa_node_id :
2203 NUMA_NO_NODE);
2204
01c5f864
AD
2205 if (!dev_maps)
2206 goto out_no_maps;
2207
2208 /* removes queue from unused CPUs */
2209 for_each_possible_cpu(cpu) {
184c449f
AD
2210 for (i = tc, tci = cpu * num_tc; i--; tci++)
2211 active |= remove_xps_queue(dev_maps, tci, index);
2212 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2213 active |= remove_xps_queue(dev_maps, tci, index);
2214 for (i = num_tc - tc, tci++; --i; tci++)
2215 active |= remove_xps_queue(dev_maps, tci, index);
01c5f864
AD
2216 }
2217
2218 /* free map if not active */
2219 if (!active) {
2220 RCU_INIT_POINTER(dev->xps_maps, NULL);
2221 kfree_rcu(dev_maps, rcu);
2222 }
2223
2224out_no_maps:
537c00de
AD
2225 mutex_unlock(&xps_map_mutex);
2226
2227 return 0;
2228error:
01c5f864
AD
2229 /* remove any maps that we added */
2230 for_each_possible_cpu(cpu) {
184c449f
AD
2231 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2232 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2233 map = dev_maps ?
2234 xmap_dereference(dev_maps->cpu_map[tci]) :
2235 NULL;
2236 if (new_map && new_map != map)
2237 kfree(new_map);
2238 }
01c5f864
AD
2239 }
2240
537c00de
AD
2241 mutex_unlock(&xps_map_mutex);
2242
537c00de
AD
2243 kfree(new_dev_maps);
2244 return -ENOMEM;
2245}
2246EXPORT_SYMBOL(netif_set_xps_queue);
2247
2248#endif
9cf1f6a8
AD
2249void netdev_reset_tc(struct net_device *dev)
2250{
6234f874
AD
2251#ifdef CONFIG_XPS
2252 netif_reset_xps_queues_gt(dev, 0);
2253#endif
9cf1f6a8
AD
2254 dev->num_tc = 0;
2255 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2256 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2257}
2258EXPORT_SYMBOL(netdev_reset_tc);
2259
2260int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2261{
2262 if (tc >= dev->num_tc)
2263 return -EINVAL;
2264
6234f874
AD
2265#ifdef CONFIG_XPS
2266 netif_reset_xps_queues(dev, offset, count);
2267#endif
9cf1f6a8
AD
2268 dev->tc_to_txq[tc].count = count;
2269 dev->tc_to_txq[tc].offset = offset;
2270 return 0;
2271}
2272EXPORT_SYMBOL(netdev_set_tc_queue);
2273
2274int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2275{
2276 if (num_tc > TC_MAX_QUEUE)
2277 return -EINVAL;
2278
6234f874
AD
2279#ifdef CONFIG_XPS
2280 netif_reset_xps_queues_gt(dev, 0);
2281#endif
9cf1f6a8
AD
2282 dev->num_tc = num_tc;
2283 return 0;
2284}
2285EXPORT_SYMBOL(netdev_set_num_tc);
2286
f0796d5c
JF
2287/*
2288 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2289 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2290 */
e6484930 2291int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2292{
1d24eb48
TH
2293 int rc;
2294
e6484930
TH
2295 if (txq < 1 || txq > dev->num_tx_queues)
2296 return -EINVAL;
f0796d5c 2297
5c56580b
BH
2298 if (dev->reg_state == NETREG_REGISTERED ||
2299 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2300 ASSERT_RTNL();
2301
1d24eb48
TH
2302 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2303 txq);
bf264145
TH
2304 if (rc)
2305 return rc;
2306
4f57c087
JF
2307 if (dev->num_tc)
2308 netif_setup_tc(dev, txq);
2309
024e9679 2310 if (txq < dev->real_num_tx_queues) {
e6484930 2311 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2312#ifdef CONFIG_XPS
2313 netif_reset_xps_queues_gt(dev, txq);
2314#endif
2315 }
f0796d5c 2316 }
e6484930
TH
2317
2318 dev->real_num_tx_queues = txq;
2319 return 0;
f0796d5c
JF
2320}
2321EXPORT_SYMBOL(netif_set_real_num_tx_queues);
56079431 2322
a953be53 2323#ifdef CONFIG_SYSFS
62fe0b40
BH
2324/**
2325 * netif_set_real_num_rx_queues - set actual number of RX queues used
2326 * @dev: Network device
2327 * @rxq: Actual number of RX queues
2328 *
2329 * This must be called either with the rtnl_lock held or before
2330 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2331 * negative error code. If called before registration, it always
2332 * succeeds.
62fe0b40
BH
2333 */
2334int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2335{
2336 int rc;
2337
bd25fa7b
TH
2338 if (rxq < 1 || rxq > dev->num_rx_queues)
2339 return -EINVAL;
2340
62fe0b40
BH
2341 if (dev->reg_state == NETREG_REGISTERED) {
2342 ASSERT_RTNL();
2343
62fe0b40
BH
2344 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2345 rxq);
2346 if (rc)
2347 return rc;
62fe0b40
BH
2348 }
2349
2350 dev->real_num_rx_queues = rxq;
2351 return 0;
2352}
2353EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2354#endif
2355
2c53040f
BH
2356/**
2357 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2358 *
2359 * This routine should set an upper limit on the number of RSS queues
2360 * used by default by multiqueue devices.
2361 */
a55b138b 2362int netif_get_num_default_rss_queues(void)
16917b87 2363{
40e4e713
HS
2364 return is_kdump_kernel() ?
2365 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
16917b87
YM
2366}
2367EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2368
3bcb846c 2369static void __netif_reschedule(struct Qdisc *q)
56079431 2370{
def82a1d
JP
2371 struct softnet_data *sd;
2372 unsigned long flags;
56079431 2373
def82a1d 2374 local_irq_save(flags);
903ceff7 2375 sd = this_cpu_ptr(&softnet_data);
a9cbd588
CG
2376 q->next_sched = NULL;
2377 *sd->output_queue_tailp = q;
2378 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2379 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2380 local_irq_restore(flags);
2381}
2382
2383void __netif_schedule(struct Qdisc *q)
2384{
2385 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2386 __netif_reschedule(q);
56079431
DV
2387}
2388EXPORT_SYMBOL(__netif_schedule);
2389
e6247027
ED
2390struct dev_kfree_skb_cb {
2391 enum skb_free_reason reason;
2392};
2393
2394static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
56079431 2395{
e6247027
ED
2396 return (struct dev_kfree_skb_cb *)skb->cb;
2397}
2398
46e5da40
JF
2399void netif_schedule_queue(struct netdev_queue *txq)
2400{
2401 rcu_read_lock();
2402 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2403 struct Qdisc *q = rcu_dereference(txq->qdisc);
2404
2405 __netif_schedule(q);
2406 }
2407 rcu_read_unlock();
2408}
2409EXPORT_SYMBOL(netif_schedule_queue);
2410
2411/**
2412 * netif_wake_subqueue - allow sending packets on subqueue
2413 * @dev: network device
2414 * @queue_index: sub queue index
2415 *
2416 * Resume individual transmit queue of a device with multiple transmit queues.
2417 */
2418void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2419{
2420 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2421
2422 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2423 struct Qdisc *q;
2424
2425 rcu_read_lock();
2426 q = rcu_dereference(txq->qdisc);
2427 __netif_schedule(q);
2428 rcu_read_unlock();
2429 }
2430}
2431EXPORT_SYMBOL(netif_wake_subqueue);
2432
2433void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2434{
2435 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2436 struct Qdisc *q;
2437
2438 rcu_read_lock();
2439 q = rcu_dereference(dev_queue->qdisc);
2440 __netif_schedule(q);
2441 rcu_read_unlock();
2442 }
2443}
2444EXPORT_SYMBOL(netif_tx_wake_queue);
2445
e6247027 2446void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
56079431 2447{
e6247027 2448 unsigned long flags;
56079431 2449
e6247027
ED
2450 if (likely(atomic_read(&skb->users) == 1)) {
2451 smp_rmb();
2452 atomic_set(&skb->users, 0);
2453 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2454 return;
bea3348e 2455 }
e6247027
ED
2456 get_kfree_skb_cb(skb)->reason = reason;
2457 local_irq_save(flags);
2458 skb->next = __this_cpu_read(softnet_data.completion_queue);
2459 __this_cpu_write(softnet_data.completion_queue, skb);
2460 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2461 local_irq_restore(flags);
56079431 2462}
e6247027 2463EXPORT_SYMBOL(__dev_kfree_skb_irq);
56079431 2464
e6247027 2465void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
56079431
DV
2466{
2467 if (in_irq() || irqs_disabled())
e6247027 2468 __dev_kfree_skb_irq(skb, reason);
56079431
DV
2469 else
2470 dev_kfree_skb(skb);
2471}
e6247027 2472EXPORT_SYMBOL(__dev_kfree_skb_any);
56079431
DV
2473
2474
bea3348e
SH
2475/**
2476 * netif_device_detach - mark device as removed
2477 * @dev: network device
2478 *
2479 * Mark device as removed from system and therefore no longer available.
2480 */
56079431
DV
2481void netif_device_detach(struct net_device *dev)
2482{
2483 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2484 netif_running(dev)) {
d543103a 2485 netif_tx_stop_all_queues(dev);
56079431
DV
2486 }
2487}
2488EXPORT_SYMBOL(netif_device_detach);
2489
bea3348e
SH
2490/**
2491 * netif_device_attach - mark device as attached
2492 * @dev: network device
2493 *
2494 * Mark device as attached from system and restart if needed.
2495 */
56079431
DV
2496void netif_device_attach(struct net_device *dev)
2497{
2498 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2499 netif_running(dev)) {
d543103a 2500 netif_tx_wake_all_queues(dev);
4ec93edb 2501 __netdev_watchdog_up(dev);
56079431
DV
2502 }
2503}
2504EXPORT_SYMBOL(netif_device_attach);
2505
5605c762
JP
2506/*
2507 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2508 * to be used as a distribution range.
2509 */
2510u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2511 unsigned int num_tx_queues)
2512{
2513 u32 hash;
2514 u16 qoffset = 0;
2515 u16 qcount = num_tx_queues;
2516
2517 if (skb_rx_queue_recorded(skb)) {
2518 hash = skb_get_rx_queue(skb);
2519 while (unlikely(hash >= num_tx_queues))
2520 hash -= num_tx_queues;
2521 return hash;
2522 }
2523
2524 if (dev->num_tc) {
2525 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2526 qoffset = dev->tc_to_txq[tc].offset;
2527 qcount = dev->tc_to_txq[tc].count;
2528 }
2529
2530 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2531}
2532EXPORT_SYMBOL(__skb_tx_hash);
2533
36c92474
BH
2534static void skb_warn_bad_offload(const struct sk_buff *skb)
2535{
84d15ae5 2536 static const netdev_features_t null_features;
36c92474 2537 struct net_device *dev = skb->dev;
88ad4175 2538 const char *name = "";
36c92474 2539
c846ad9b
BG
2540 if (!net_ratelimit())
2541 return;
2542
88ad4175
BM
2543 if (dev) {
2544 if (dev->dev.parent)
2545 name = dev_driver_string(dev->dev.parent);
2546 else
2547 name = netdev_name(dev);
2548 }
36c92474
BH
2549 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2550 "gso_type=%d ip_summed=%d\n",
88ad4175 2551 name, dev ? &dev->features : &null_features,
65e9d2fa 2552 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2553 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2554 skb_shinfo(skb)->gso_type, skb->ip_summed);
2555}
2556
1da177e4
LT
2557/*
2558 * Invalidate hardware checksum when packet is to be mangled, and
2559 * complete checksum manually on outgoing path.
2560 */
84fa7933 2561int skb_checksum_help(struct sk_buff *skb)
1da177e4 2562{
d3bc23e7 2563 __wsum csum;
663ead3b 2564 int ret = 0, offset;
1da177e4 2565
84fa7933 2566 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
2567 goto out_set_summed;
2568
2569 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
2570 skb_warn_bad_offload(skb);
2571 return -EINVAL;
1da177e4
LT
2572 }
2573
cef401de
ED
2574 /* Before computing a checksum, we should make sure no frag could
2575 * be modified by an external entity : checksum could be wrong.
2576 */
2577 if (skb_has_shared_frag(skb)) {
2578 ret = __skb_linearize(skb);
2579 if (ret)
2580 goto out;
2581 }
2582
55508d60 2583 offset = skb_checksum_start_offset(skb);
a030847e
HX
2584 BUG_ON(offset >= skb_headlen(skb));
2585 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2586
2587 offset += skb->csum_offset;
2588 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2589
2590 if (skb_cloned(skb) &&
2591 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
2592 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2593 if (ret)
2594 goto out;
2595 }
2596
4f2e4ad5 2597 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
a430a43d 2598out_set_summed:
1da177e4 2599 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 2600out:
1da177e4
LT
2601 return ret;
2602}
d1b19dff 2603EXPORT_SYMBOL(skb_checksum_help);
1da177e4 2604
53d6471c 2605__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
f6a78bfc 2606{
252e3346 2607 __be16 type = skb->protocol;
f6a78bfc 2608
19acc327
PS
2609 /* Tunnel gso handlers can set protocol to ethernet. */
2610 if (type == htons(ETH_P_TEB)) {
2611 struct ethhdr *eth;
2612
2613 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2614 return 0;
2615
2616 eth = (struct ethhdr *)skb_mac_header(skb);
2617 type = eth->h_proto;
2618 }
2619
d4bcef3f 2620 return __vlan_get_protocol(skb, type, depth);
ec5f0615
PS
2621}
2622
2623/**
2624 * skb_mac_gso_segment - mac layer segmentation handler.
2625 * @skb: buffer to segment
2626 * @features: features for the output path (see dev->features)
2627 */
2628struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2629 netdev_features_t features)
2630{
2631 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2632 struct packet_offload *ptype;
53d6471c
VY
2633 int vlan_depth = skb->mac_len;
2634 __be16 type = skb_network_protocol(skb, &vlan_depth);
ec5f0615
PS
2635
2636 if (unlikely(!type))
2637 return ERR_PTR(-EINVAL);
2638
53d6471c 2639 __skb_pull(skb, vlan_depth);
f6a78bfc
HX
2640
2641 rcu_read_lock();
22061d80 2642 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2643 if (ptype->type == type && ptype->callbacks.gso_segment) {
f191a1d1 2644 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2645 break;
2646 }
2647 }
2648 rcu_read_unlock();
2649
98e399f8 2650 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2651
f6a78bfc
HX
2652 return segs;
2653}
05e8ef4a
PS
2654EXPORT_SYMBOL(skb_mac_gso_segment);
2655
2656
2657/* openvswitch calls this on rx path, so we need a different check.
2658 */
2659static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2660{
2661 if (tx_path)
2662 return skb->ip_summed != CHECKSUM_PARTIAL;
2663 else
2664 return skb->ip_summed == CHECKSUM_NONE;
2665}
2666
2667/**
2668 * __skb_gso_segment - Perform segmentation on skb.
2669 * @skb: buffer to segment
2670 * @features: features for the output path (see dev->features)
2671 * @tx_path: whether it is called in TX path
2672 *
2673 * This function segments the given skb and returns a list of segments.
2674 *
2675 * It may return NULL if the skb requires no segmentation. This is
2676 * only possible when GSO is used for verifying header integrity.
9207f9d4
KK
2677 *
2678 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
05e8ef4a
PS
2679 */
2680struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2681 netdev_features_t features, bool tx_path)
2682{
2683 if (unlikely(skb_needs_check(skb, tx_path))) {
2684 int err;
2685
2686 skb_warn_bad_offload(skb);
2687
a40e0a66 2688 err = skb_cow_head(skb, 0);
2689 if (err < 0)
05e8ef4a
PS
2690 return ERR_PTR(err);
2691 }
2692
802ab55a
AD
2693 /* Only report GSO partial support if it will enable us to
2694 * support segmentation on this frame without needing additional
2695 * work.
2696 */
2697 if (features & NETIF_F_GSO_PARTIAL) {
2698 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2699 struct net_device *dev = skb->dev;
2700
2701 partial_features |= dev->features & dev->gso_partial_features;
2702 if (!skb_gso_ok(skb, features | partial_features))
2703 features &= ~NETIF_F_GSO_PARTIAL;
2704 }
2705
9207f9d4
KK
2706 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2707 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2708
68c33163 2709 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3347c960
ED
2710 SKB_GSO_CB(skb)->encap_level = 0;
2711
05e8ef4a
PS
2712 skb_reset_mac_header(skb);
2713 skb_reset_mac_len(skb);
2714
2715 return skb_mac_gso_segment(skb, features);
2716}
12b0004d 2717EXPORT_SYMBOL(__skb_gso_segment);
f6a78bfc 2718
fb286bb2
HX
2719/* Take action when hardware reception checksum errors are detected. */
2720#ifdef CONFIG_BUG
2721void netdev_rx_csum_fault(struct net_device *dev)
2722{
2723 if (net_ratelimit()) {
7b6cd1ce 2724 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2725 dump_stack();
2726 }
2727}
2728EXPORT_SYMBOL(netdev_rx_csum_fault);
2729#endif
2730
1da177e4
LT
2731/* Actually, we should eliminate this check as soon as we know, that:
2732 * 1. IOMMU is present and allows to map all the memory.
2733 * 2. No high memory really exists on this machine.
2734 */
2735
c1e756bf 2736static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2737{
3d3a8533 2738#ifdef CONFIG_HIGHMEM
1da177e4 2739 int i;
5acbbd42 2740 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2741 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2742 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2743 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2744 return 1;
ea2ab693 2745 }
5acbbd42 2746 }
1da177e4 2747
5acbbd42
FT
2748 if (PCI_DMA_BUS_IS_PHYS) {
2749 struct device *pdev = dev->dev.parent;
1da177e4 2750
9092c658
ED
2751 if (!pdev)
2752 return 0;
5acbbd42 2753 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2754 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2755 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2756 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2757 return 1;
2758 }
2759 }
3d3a8533 2760#endif
1da177e4
LT
2761 return 0;
2762}
1da177e4 2763
3b392ddb
SH
2764/* If MPLS offload request, verify we are testing hardware MPLS features
2765 * instead of standard features for the netdev.
2766 */
d0edc7bf 2767#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3b392ddb
SH
2768static netdev_features_t net_mpls_features(struct sk_buff *skb,
2769 netdev_features_t features,
2770 __be16 type)
2771{
25cd9ba0 2772 if (eth_p_mpls(type))
3b392ddb
SH
2773 features &= skb->dev->mpls_features;
2774
2775 return features;
2776}
2777#else
2778static netdev_features_t net_mpls_features(struct sk_buff *skb,
2779 netdev_features_t features,
2780 __be16 type)
2781{
2782 return features;
2783}
2784#endif
2785
c8f44aff 2786static netdev_features_t harmonize_features(struct sk_buff *skb,
c1e756bf 2787 netdev_features_t features)
f01a5236 2788{
53d6471c 2789 int tmp;
3b392ddb
SH
2790 __be16 type;
2791
2792 type = skb_network_protocol(skb, &tmp);
2793 features = net_mpls_features(skb, features, type);
53d6471c 2794
c0d680e5 2795 if (skb->ip_summed != CHECKSUM_NONE &&
3b392ddb 2796 !can_checksum_protocol(features, type)) {
996e8021 2797 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
c1e756bf 2798 } else if (illegal_highdma(skb->dev, skb)) {
f01a5236
JG
2799 features &= ~NETIF_F_SG;
2800 }
2801
2802 return features;
2803}
2804
e38f3025
TM
2805netdev_features_t passthru_features_check(struct sk_buff *skb,
2806 struct net_device *dev,
2807 netdev_features_t features)
2808{
2809 return features;
2810}
2811EXPORT_SYMBOL(passthru_features_check);
2812
8cb65d00
TM
2813static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2814 struct net_device *dev,
2815 netdev_features_t features)
2816{
2817 return vlan_features_check(skb, features);
2818}
2819
cbc53e08
AD
2820static netdev_features_t gso_features_check(const struct sk_buff *skb,
2821 struct net_device *dev,
2822 netdev_features_t features)
2823{
2824 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2825
2826 if (gso_segs > dev->gso_max_segs)
2827 return features & ~NETIF_F_GSO_MASK;
2828
802ab55a
AD
2829 /* Support for GSO partial features requires software
2830 * intervention before we can actually process the packets
2831 * so we need to strip support for any partial features now
2832 * and we can pull them back in after we have partially
2833 * segmented the frame.
2834 */
2835 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2836 features &= ~dev->gso_partial_features;
2837
2838 /* Make sure to clear the IPv4 ID mangling feature if the
2839 * IPv4 header has the potential to be fragmented.
cbc53e08
AD
2840 */
2841 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2842 struct iphdr *iph = skb->encapsulation ?
2843 inner_ip_hdr(skb) : ip_hdr(skb);
2844
2845 if (!(iph->frag_off & htons(IP_DF)))
2846 features &= ~NETIF_F_TSO_MANGLEID;
2847 }
2848
2849 return features;
2850}
2851
c1e756bf 2852netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6 2853{
5f35227e 2854 struct net_device *dev = skb->dev;
fcbeb976 2855 netdev_features_t features = dev->features;
58e998c6 2856
cbc53e08
AD
2857 if (skb_is_gso(skb))
2858 features = gso_features_check(skb, dev, features);
30b678d8 2859
5f35227e
JG
2860 /* If encapsulation offload request, verify we are testing
2861 * hardware encapsulation features instead of standard
2862 * features for the netdev
2863 */
2864 if (skb->encapsulation)
2865 features &= dev->hw_enc_features;
2866
f5a7fb88
TM
2867 if (skb_vlan_tagged(skb))
2868 features = netdev_intersect_features(features,
2869 dev->vlan_features |
2870 NETIF_F_HW_VLAN_CTAG_TX |
2871 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2872
5f35227e
JG
2873 if (dev->netdev_ops->ndo_features_check)
2874 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2875 features);
8cb65d00
TM
2876 else
2877 features &= dflt_features_check(skb, dev, features);
5f35227e 2878
c1e756bf 2879 return harmonize_features(skb, features);
58e998c6 2880}
c1e756bf 2881EXPORT_SYMBOL(netif_skb_features);
58e998c6 2882
2ea25513 2883static int xmit_one(struct sk_buff *skb, struct net_device *dev,
95f6b3dd 2884 struct netdev_queue *txq, bool more)
f6a78bfc 2885{
2ea25513
DM
2886 unsigned int len;
2887 int rc;
00829823 2888
7866a621 2889 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2ea25513 2890 dev_queue_xmit_nit(skb, dev);
fc741216 2891
2ea25513
DM
2892 len = skb->len;
2893 trace_net_dev_start_xmit(skb, dev);
95f6b3dd 2894 rc = netdev_start_xmit(skb, dev, txq, more);
2ea25513 2895 trace_net_dev_xmit(skb, rc, dev, len);
adf30907 2896
2ea25513
DM
2897 return rc;
2898}
7b9c6090 2899
8dcda22a
DM
2900struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2901 struct netdev_queue *txq, int *ret)
7f2e870f
DM
2902{
2903 struct sk_buff *skb = first;
2904 int rc = NETDEV_TX_OK;
7b9c6090 2905
7f2e870f
DM
2906 while (skb) {
2907 struct sk_buff *next = skb->next;
fc70fb64 2908
7f2e870f 2909 skb->next = NULL;
95f6b3dd 2910 rc = xmit_one(skb, dev, txq, next != NULL);
7f2e870f
DM
2911 if (unlikely(!dev_xmit_complete(rc))) {
2912 skb->next = next;
2913 goto out;
2914 }
6afff0ca 2915
7f2e870f
DM
2916 skb = next;
2917 if (netif_xmit_stopped(txq) && skb) {
2918 rc = NETDEV_TX_BUSY;
2919 break;
9ccb8975 2920 }
7f2e870f 2921 }
9ccb8975 2922
7f2e870f
DM
2923out:
2924 *ret = rc;
2925 return skb;
2926}
b40863c6 2927
1ff0dc94
ED
2928static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2929 netdev_features_t features)
f6a78bfc 2930{
df8a39de 2931 if (skb_vlan_tag_present(skb) &&
5968250c
JP
2932 !vlan_hw_offload_capable(features, skb->vlan_proto))
2933 skb = __vlan_hwaccel_push_inside(skb);
eae3f88e
DM
2934 return skb;
2935}
f6a78bfc 2936
55a93b3e 2937static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
eae3f88e
DM
2938{
2939 netdev_features_t features;
f6a78bfc 2940
eae3f88e
DM
2941 features = netif_skb_features(skb);
2942 skb = validate_xmit_vlan(skb, features);
2943 if (unlikely(!skb))
2944 goto out_null;
7b9c6090 2945
8b86a61d 2946 if (netif_needs_gso(skb, features)) {
ce93718f
DM
2947 struct sk_buff *segs;
2948
2949 segs = skb_gso_segment(skb, features);
cecda693 2950 if (IS_ERR(segs)) {
af6dabc9 2951 goto out_kfree_skb;
cecda693
JW
2952 } else if (segs) {
2953 consume_skb(skb);
2954 skb = segs;
f6a78bfc 2955 }
eae3f88e
DM
2956 } else {
2957 if (skb_needs_linearize(skb, features) &&
2958 __skb_linearize(skb))
2959 goto out_kfree_skb;
4ec93edb 2960
eae3f88e
DM
2961 /* If packet is not checksummed and device does not
2962 * support checksumming for this protocol, complete
2963 * checksumming here.
2964 */
2965 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2966 if (skb->encapsulation)
2967 skb_set_inner_transport_header(skb,
2968 skb_checksum_start_offset(skb));
2969 else
2970 skb_set_transport_header(skb,
2971 skb_checksum_start_offset(skb));
a188222b 2972 if (!(features & NETIF_F_CSUM_MASK) &&
eae3f88e
DM
2973 skb_checksum_help(skb))
2974 goto out_kfree_skb;
7b9c6090 2975 }
0c772159 2976 }
7b9c6090 2977
eae3f88e 2978 return skb;
fc70fb64 2979
f6a78bfc
HX
2980out_kfree_skb:
2981 kfree_skb(skb);
eae3f88e 2982out_null:
d21fd63e 2983 atomic_long_inc(&dev->tx_dropped);
eae3f88e
DM
2984 return NULL;
2985}
6afff0ca 2986
55a93b3e
ED
2987struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2988{
2989 struct sk_buff *next, *head = NULL, *tail;
2990
bec3cfdc 2991 for (; skb != NULL; skb = next) {
55a93b3e
ED
2992 next = skb->next;
2993 skb->next = NULL;
bec3cfdc
ED
2994
2995 /* in case skb wont be segmented, point to itself */
2996 skb->prev = skb;
2997
55a93b3e 2998 skb = validate_xmit_skb(skb, dev);
bec3cfdc
ED
2999 if (!skb)
3000 continue;
55a93b3e 3001
bec3cfdc
ED
3002 if (!head)
3003 head = skb;
3004 else
3005 tail->next = skb;
3006 /* If skb was segmented, skb->prev points to
3007 * the last segment. If not, it still contains skb.
3008 */
3009 tail = skb->prev;
55a93b3e
ED
3010 }
3011 return head;
f6a78bfc 3012}
104ba78c 3013EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
f6a78bfc 3014
1def9238
ED
3015static void qdisc_pkt_len_init(struct sk_buff *skb)
3016{
3017 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3018
3019 qdisc_skb_cb(skb)->pkt_len = skb->len;
3020
3021 /* To get more precise estimation of bytes sent on wire,
3022 * we add to pkt_len the headers size of all segments
3023 */
3024 if (shinfo->gso_size) {
757b8b1d 3025 unsigned int hdr_len;
15e5a030 3026 u16 gso_segs = shinfo->gso_segs;
1def9238 3027
757b8b1d
ED
3028 /* mac layer + network layer */
3029 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3030
3031 /* + transport layer */
1def9238
ED
3032 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3033 hdr_len += tcp_hdrlen(skb);
3034 else
3035 hdr_len += sizeof(struct udphdr);
15e5a030
JW
3036
3037 if (shinfo->gso_type & SKB_GSO_DODGY)
3038 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3039 shinfo->gso_size);
3040
3041 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
3042 }
3043}
3044
bbd8a0d3
KK
3045static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3046 struct net_device *dev,
3047 struct netdev_queue *txq)
3048{
3049 spinlock_t *root_lock = qdisc_lock(q);
520ac30f 3050 struct sk_buff *to_free = NULL;
a2da570d 3051 bool contended;
bbd8a0d3
KK
3052 int rc;
3053
a2da570d 3054 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
3055 /*
3056 * Heuristic to force contended enqueues to serialize on a
3057 * separate lock before trying to get qdisc main lock.
f9eb8aea 3058 * This permits qdisc->running owner to get the lock more
9bf2b8c2 3059 * often and dequeue packets faster.
79640a4c 3060 */
a2da570d 3061 contended = qdisc_is_running(q);
79640a4c
ED
3062 if (unlikely(contended))
3063 spin_lock(&q->busylock);
3064
bbd8a0d3
KK
3065 spin_lock(root_lock);
3066 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
520ac30f 3067 __qdisc_drop(skb, &to_free);
bbd8a0d3
KK
3068 rc = NET_XMIT_DROP;
3069 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 3070 qdisc_run_begin(q)) {
bbd8a0d3
KK
3071 /*
3072 * This is a work-conserving queue; there are no old skbs
3073 * waiting to be sent out; and the qdisc is not running -
3074 * xmit the skb directly.
3075 */
bfe0d029 3076
bfe0d029
ED
3077 qdisc_bstats_update(q, skb);
3078
55a93b3e 3079 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
79640a4c
ED
3080 if (unlikely(contended)) {
3081 spin_unlock(&q->busylock);
3082 contended = false;
3083 }
bbd8a0d3 3084 __qdisc_run(q);
79640a4c 3085 } else
bc135b23 3086 qdisc_run_end(q);
bbd8a0d3
KK
3087
3088 rc = NET_XMIT_SUCCESS;
3089 } else {
520ac30f 3090 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
79640a4c
ED
3091 if (qdisc_run_begin(q)) {
3092 if (unlikely(contended)) {
3093 spin_unlock(&q->busylock);
3094 contended = false;
3095 }
3096 __qdisc_run(q);
3097 }
bbd8a0d3
KK
3098 }
3099 spin_unlock(root_lock);
520ac30f
ED
3100 if (unlikely(to_free))
3101 kfree_skb_list(to_free);
79640a4c
ED
3102 if (unlikely(contended))
3103 spin_unlock(&q->busylock);
bbd8a0d3
KK
3104 return rc;
3105}
3106
86f8515f 3107#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
5bc1421e
NH
3108static void skb_update_prio(struct sk_buff *skb)
3109{
6977a79d 3110 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 3111
91c68ce2 3112 if (!skb->priority && skb->sk && map) {
2a56a1fe
TH
3113 unsigned int prioidx =
3114 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
91c68ce2
ED
3115
3116 if (prioidx < map->priomap_len)
3117 skb->priority = map->priomap[prioidx];
3118 }
5bc1421e
NH
3119}
3120#else
3121#define skb_update_prio(skb)
3122#endif
3123
f60e5990 3124DEFINE_PER_CPU(int, xmit_recursion);
3125EXPORT_SYMBOL(xmit_recursion);
3126
95603e22
MM
3127/**
3128 * dev_loopback_xmit - loop back @skb
0c4b51f0
EB
3129 * @net: network namespace this loopback is happening in
3130 * @sk: sk needed to be a netfilter okfn
95603e22
MM
3131 * @skb: buffer to transmit
3132 */
0c4b51f0 3133int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
95603e22
MM
3134{
3135 skb_reset_mac_header(skb);
3136 __skb_pull(skb, skb_network_offset(skb));
3137 skb->pkt_type = PACKET_LOOPBACK;
3138 skb->ip_summed = CHECKSUM_UNNECESSARY;
3139 WARN_ON(!skb_dst(skb));
3140 skb_dst_force(skb);
3141 netif_rx_ni(skb);
3142 return 0;
3143}
3144EXPORT_SYMBOL(dev_loopback_xmit);
3145
1f211a1b
DB
3146#ifdef CONFIG_NET_EGRESS
3147static struct sk_buff *
3148sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3149{
3150 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3151 struct tcf_result cl_res;
3152
3153 if (!cl)
3154 return skb;
3155
3156 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3157 * earlier by the caller.
3158 */
3159 qdisc_bstats_cpu_update(cl->q, skb);
3160
3161 switch (tc_classify(skb, cl, &cl_res, false)) {
3162 case TC_ACT_OK:
3163 case TC_ACT_RECLASSIFY:
3164 skb->tc_index = TC_H_MIN(cl_res.classid);
3165 break;
3166 case TC_ACT_SHOT:
3167 qdisc_qstats_cpu_drop(cl->q);
3168 *ret = NET_XMIT_DROP;
7e2c3aea
DB
3169 kfree_skb(skb);
3170 return NULL;
1f211a1b
DB
3171 case TC_ACT_STOLEN:
3172 case TC_ACT_QUEUED:
3173 *ret = NET_XMIT_SUCCESS;
7e2c3aea 3174 consume_skb(skb);
1f211a1b
DB
3175 return NULL;
3176 case TC_ACT_REDIRECT:
3177 /* No need to push/pop skb's mac_header here on egress! */
3178 skb_do_redirect(skb);
3179 *ret = NET_XMIT_SUCCESS;
3180 return NULL;
3181 default:
3182 break;
3183 }
3184
3185 return skb;
3186}
3187#endif /* CONFIG_NET_EGRESS */
3188
/* Look up a transmit queue for @skb in the device's XPS maps, keyed by
 * skb->sender_cpu (and by traffic class when the device has num_tc set).
 *
 * Returns the queue index, or -1 when XPS is compiled out, no map entry
 * applies, or the mapped queue is not below real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *entry;
	unsigned int tci;
	int txq_idx = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	/* sender_cpu is stored off by one */
	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	entry = rcu_dereference(maps->cpu_map[tci]);
	if (!entry)
		goto unlock;

	if (entry->len == 1)
		txq_idx = entry->queues[0];
	else
		txq_idx = entry->queues[reciprocal_scale(skb_get_hash(skb),
							 entry->len)];

	if (unlikely(txq_idx >= dev->real_num_tx_queues))
		txq_idx = -1;

unlock:
	rcu_read_unlock();

	return txq_idx;
#else
	return -1;
#endif
}
3224
3225static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3226{
3227 struct sock *sk = skb->sk;
3228 int queue_index = sk_tx_queue_get(sk);
3229
3230 if (queue_index < 0 || skb->ooo_okay ||
3231 queue_index >= dev->real_num_tx_queues) {
3232 int new_index = get_xps_queue(dev, skb);
3233 if (new_index < 0)
3234 new_index = skb_tx_hash(dev, skb);
3235
3236 if (queue_index != new_index && sk &&
004a5d01 3237 sk_fullsock(sk) &&
638b2a69
JP
3238 rcu_access_pointer(sk->sk_dst_cache))
3239 sk_tx_queue_set(sk, new_index);
3240
3241 queue_index = new_index;
3242 }
3243
3244 return queue_index;
3245}
3246
3247struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3248 struct sk_buff *skb,
3249 void *accel_priv)
3250{
3251 int queue_index = 0;
3252
3253#ifdef CONFIG_XPS
52bd2d62
ED
3254 u32 sender_cpu = skb->sender_cpu - 1;
3255
3256 if (sender_cpu >= (u32)NR_CPUS)
638b2a69
JP
3257 skb->sender_cpu = raw_smp_processor_id() + 1;
3258#endif
3259
3260 if (dev->real_num_tx_queues != 1) {
3261 const struct net_device_ops *ops = dev->netdev_ops;
3262 if (ops->ndo_select_queue)
3263 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3264 __netdev_pick_tx);
3265 else
3266 queue_index = __netdev_pick_tx(dev, skb);
3267
3268 if (!accel_priv)
3269 queue_index = netdev_cap_txqueue(dev, queue_index);
3270 }
3271
3272 skb_set_queue_mapping(skb, queue_index);
3273 return netdev_get_tx_queue(dev, queue_index);
3274}
3275
d29f749e 3276/**
9d08dd3d 3277 * __dev_queue_xmit - transmit a buffer
d29f749e 3278 * @skb: buffer to transmit
9d08dd3d 3279 * @accel_priv: private data used for L2 forwarding offload
d29f749e
DJ
3280 *
3281 * Queue a buffer for transmission to a network device. The caller must
3282 * have set the device and priority and built the buffer before calling
3283 * this function. The function can be called from an interrupt.
3284 *
3285 * A negative errno code is returned on a failure. A success does not
3286 * guarantee the frame will be transmitted as it may be dropped due
3287 * to congestion or traffic shaping.
3288 *
3289 * -----------------------------------------------------------------------------------
3290 * I notice this method can also return errors from the queue disciplines,
3291 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3292 * be positive.
3293 *
3294 * Regardless of the return value, the skb is consumed, so it is currently
3295 * difficult to retry a send to this method. (You can bump the ref count
3296 * before sending to hold a reference for retry if you are careful.)
3297 *
3298 * When calling this method, interrupts MUST be enabled. This is because
3299 * the BH enable code must have IRQs enabled so that it will not deadlock.
3300 * --BLG
3301 */
0a59f3a9 3302static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
1da177e4
LT
3303{
3304 struct net_device *dev = skb->dev;
dc2b4847 3305 struct netdev_queue *txq;
1da177e4
LT
3306 struct Qdisc *q;
3307 int rc = -ENOMEM;
3308
6d1ccff6
ED
3309 skb_reset_mac_header(skb);
3310
e7fd2885
WB
3311 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3312 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3313
4ec93edb
YH
3314 /* Disable soft irqs for various locks below. Also
3315 * stops preemption for RCU.
1da177e4 3316 */
4ec93edb 3317 rcu_read_lock_bh();
1da177e4 3318
5bc1421e
NH
3319 skb_update_prio(skb);
3320
1f211a1b
DB
3321 qdisc_pkt_len_init(skb);
3322#ifdef CONFIG_NET_CLS_ACT
3323 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3324# ifdef CONFIG_NET_EGRESS
3325 if (static_key_false(&egress_needed)) {
3326 skb = sch_handle_egress(skb, &rc, dev);
3327 if (!skb)
3328 goto out;
3329 }
3330# endif
3331#endif
02875878
ED
3332 /* If device/qdisc don't need skb->dst, release it right now while
3333 * its hot in this cpu cache.
3334 */
3335 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3336 skb_dst_drop(skb);
3337 else
3338 skb_dst_force(skb);
3339
f663dd9a 3340 txq = netdev_pick_tx(dev, skb, accel_priv);
a898def2 3341 q = rcu_dereference_bh(txq->qdisc);
37437bb2 3342
cf66ba58 3343 trace_net_dev_queue(skb);
1da177e4 3344 if (q->enqueue) {
bbd8a0d3 3345 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 3346 goto out;
1da177e4
LT
3347 }
3348
3349 /* The device has no queue. Common case for software devices:
3350 loopback, all the sorts of tunnels...
3351
932ff279
HX
3352 Really, it is unlikely that netif_tx_lock protection is necessary
3353 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1da177e4
LT
3354 counters.)
3355 However, it is possible, that they rely on protection
3356 made by us here.
3357
3358 Check this and shot the lock. It is not prone from deadlocks.
3359 Either shot noqueue qdisc, it is even simpler 8)
3360 */
3361 if (dev->flags & IFF_UP) {
3362 int cpu = smp_processor_id(); /* ok because BHs are off */
3363
c773e847 3364 if (txq->xmit_lock_owner != cpu) {
a70b506e
DB
3365 if (unlikely(__this_cpu_read(xmit_recursion) >
3366 XMIT_RECURSION_LIMIT))
745e20f1
ED
3367 goto recursion_alert;
3368
1f59533f
JDB
3369 skb = validate_xmit_skb(skb, dev);
3370 if (!skb)
d21fd63e 3371 goto out;
1f59533f 3372
c773e847 3373 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 3374
73466498 3375 if (!netif_xmit_stopped(txq)) {
745e20f1 3376 __this_cpu_inc(xmit_recursion);
ce93718f 3377 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
745e20f1 3378 __this_cpu_dec(xmit_recursion);
572a9d7b 3379 if (dev_xmit_complete(rc)) {
c773e847 3380 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
3381 goto out;
3382 }
3383 }
c773e847 3384 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
3385 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3386 dev->name);
1da177e4
LT
3387 } else {
3388 /* Recursion is detected! It is possible,
745e20f1
ED
3389 * unfortunately
3390 */
3391recursion_alert:
e87cc472
JP
3392 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3393 dev->name);
1da177e4
LT
3394 }
3395 }
3396
3397 rc = -ENETDOWN;
d4828d85 3398 rcu_read_unlock_bh();
1da177e4 3399
015f0688 3400 atomic_long_inc(&dev->tx_dropped);
1f59533f 3401 kfree_skb_list(skb);
1da177e4
LT
3402 return rc;
3403out:
d4828d85 3404 rcu_read_unlock_bh();
1da177e4
LT
3405 return rc;
3406}
f663dd9a 3407
2b4aa3ce 3408int dev_queue_xmit(struct sk_buff *skb)
f663dd9a
JW
3409{
3410 return __dev_queue_xmit(skb, NULL);
3411}
2b4aa3ce 3412EXPORT_SYMBOL(dev_queue_xmit);
1da177e4 3413
/* Same as dev_queue_xmit(), but forwards @accel_priv (private data used
 * for L2 forwarding offload) to __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3419
1da177e4
LT
3420
3421/*=======================================================================
3422 Receiver routines
3423 =======================================================================*/
3424
6b2bedc3 3425int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
3426EXPORT_SYMBOL(netdev_max_backlog);
3427
3b098e2d 3428int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
3429int netdev_budget __read_mostly = 300;
3430int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 3431
eecfd7c4
ED
3432/* Called with irq disabled */
3433static inline void ____napi_schedule(struct softnet_data *sd,
3434 struct napi_struct *napi)
3435{
3436 list_add_tail(&napi->poll_list, &sd->poll_list);
3437 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3438}
3439
bfb564e7
KK
3440#ifdef CONFIG_RPS
3441
3442/* One global table that all flow-based protocols share. */
6e3f7faf 3443struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7 3444EXPORT_SYMBOL(rps_sock_flow_table);
567e4b79
ED
3445u32 rps_cpu_mask __read_mostly;
3446EXPORT_SYMBOL(rps_cpu_mask);
bfb564e7 3447
c5905afb 3448struct static_key rps_needed __read_mostly;
3df97ba8 3449EXPORT_SYMBOL(rps_needed);
adc9300e 3450
c445477d
BH
3451static struct rps_dev_flow *
3452set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3453 struct rps_dev_flow *rflow, u16 next_cpu)
3454{
a31196b0 3455 if (next_cpu < nr_cpu_ids) {
c445477d
BH
3456#ifdef CONFIG_RFS_ACCEL
3457 struct netdev_rx_queue *rxqueue;
3458 struct rps_dev_flow_table *flow_table;
3459 struct rps_dev_flow *old_rflow;
3460 u32 flow_id;
3461 u16 rxq_index;
3462 int rc;
3463
3464 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
3465 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3466 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
3467 goto out;
3468 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3469 if (rxq_index == skb_get_rx_queue(skb))
3470 goto out;
3471
3472 rxqueue = dev->_rx + rxq_index;
3473 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3474 if (!flow_table)
3475 goto out;
61b905da 3476 flow_id = skb_get_hash(skb) & flow_table->mask;
c445477d
BH
3477 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3478 rxq_index, flow_id);
3479 if (rc < 0)
3480 goto out;
3481 old_rflow = rflow;
3482 rflow = &flow_table->flows[flow_id];
c445477d
BH
3483 rflow->filter = rc;
3484 if (old_rflow->filter == rflow->filter)
3485 old_rflow->filter = RPS_NO_FILTER;
3486 out:
3487#endif
3488 rflow->last_qtail =
09994d1b 3489 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
3490 }
3491
09994d1b 3492 rflow->cpu = next_cpu;
c445477d
BH
3493 return rflow;
3494}
3495
bfb564e7
KK
3496/*
3497 * get_rps_cpu is called from netif_receive_skb and returns the target
3498 * CPU from the RPS map of the receiving queue for a given skb.
3499 * rcu_read_lock must be held on entry.
3500 */
3501static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3502 struct rps_dev_flow **rflowp)
3503{
567e4b79
ED
3504 const struct rps_sock_flow_table *sock_flow_table;
3505 struct netdev_rx_queue *rxqueue = dev->_rx;
bfb564e7 3506 struct rps_dev_flow_table *flow_table;
567e4b79 3507 struct rps_map *map;
bfb564e7 3508 int cpu = -1;
567e4b79 3509 u32 tcpu;
61b905da 3510 u32 hash;
bfb564e7
KK
3511
3512 if (skb_rx_queue_recorded(skb)) {
3513 u16 index = skb_get_rx_queue(skb);
567e4b79 3514
62fe0b40
BH
3515 if (unlikely(index >= dev->real_num_rx_queues)) {
3516 WARN_ONCE(dev->real_num_rx_queues > 1,
3517 "%s received packet on queue %u, but number "
3518 "of RX queues is %u\n",
3519 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
3520 goto done;
3521 }
567e4b79
ED
3522 rxqueue += index;
3523 }
bfb564e7 3524
567e4b79
ED
3525 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3526
3527 flow_table = rcu_dereference(rxqueue->rps_flow_table);
6e3f7faf 3528 map = rcu_dereference(rxqueue->rps_map);
567e4b79 3529 if (!flow_table && !map)
bfb564e7
KK
3530 goto done;
3531
2d47b459 3532 skb_reset_network_header(skb);
61b905da
TH
3533 hash = skb_get_hash(skb);
3534 if (!hash)
bfb564e7
KK
3535 goto done;
3536
fec5e652
TH
3537 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3538 if (flow_table && sock_flow_table) {
fec5e652 3539 struct rps_dev_flow *rflow;
567e4b79
ED
3540 u32 next_cpu;
3541 u32 ident;
3542
3543 /* First check into global flow table if there is a match */
3544 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3545 if ((ident ^ hash) & ~rps_cpu_mask)
3546 goto try_rps;
fec5e652 3547
567e4b79
ED
3548 next_cpu = ident & rps_cpu_mask;
3549
3550 /* OK, now we know there is a match,
3551 * we can look at the local (per receive queue) flow table
3552 */
61b905da 3553 rflow = &flow_table->flows[hash & flow_table->mask];
fec5e652
TH
3554 tcpu = rflow->cpu;
3555
fec5e652
TH
3556 /*
3557 * If the desired CPU (where last recvmsg was done) is
3558 * different from current CPU (one in the rx-queue flow
3559 * table entry), switch if one of the following holds:
a31196b0 3560 * - Current CPU is unset (>= nr_cpu_ids).
fec5e652
TH
3561 * - Current CPU is offline.
3562 * - The current CPU's queue tail has advanced beyond the
3563 * last packet that was enqueued using this table entry.
3564 * This guarantees that all previous packets for the flow
3565 * have been dequeued, thus preserving in order delivery.
3566 */
3567 if (unlikely(tcpu != next_cpu) &&
a31196b0 3568 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
fec5e652 3569 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
3570 rflow->last_qtail)) >= 0)) {
3571 tcpu = next_cpu;
c445477d 3572 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 3573 }
c445477d 3574
a31196b0 3575 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
fec5e652
TH
3576 *rflowp = rflow;
3577 cpu = tcpu;
3578 goto done;
3579 }
3580 }
3581
567e4b79
ED
3582try_rps:
3583
0a9627f2 3584 if (map) {
8fc54f68 3585 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
0a9627f2
TH
3586 if (cpu_online(tcpu)) {
3587 cpu = tcpu;
3588 goto done;
3589 }
3590 }
3591
3592done:
0a9627f2
TH
3593 return cpu;
3594}
3595
c445477d
BH
3596#ifdef CONFIG_RFS_ACCEL
3597
3598/**
3599 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3600 * @dev: Device on which the filter was set
3601 * @rxq_index: RX queue index
3602 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3603 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3604 *
3605 * Drivers that implement ndo_rx_flow_steer() should periodically call
3606 * this function for each installed filter and remove the filters for
3607 * which it returns %true.
3608 */
3609bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3610 u32 flow_id, u16 filter_id)
3611{
3612 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3613 struct rps_dev_flow_table *flow_table;
3614 struct rps_dev_flow *rflow;
3615 bool expire = true;
a31196b0 3616 unsigned int cpu;
c445477d
BH
3617
3618 rcu_read_lock();
3619 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3620 if (flow_table && flow_id <= flow_table->mask) {
3621 rflow = &flow_table->flows[flow_id];
3622 cpu = ACCESS_ONCE(rflow->cpu);
a31196b0 3623 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
c445477d
BH
3624 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3625 rflow->last_qtail) <
3626 (int)(10 * flow_table->mask)))
3627 expire = false;
3628 }
3629 rcu_read_unlock();
3630 return expire;
3631}
3632EXPORT_SYMBOL(rps_may_expire_flow);
3633
3634#endif /* CONFIG_RFS_ACCEL */
3635
0a9627f2 3636/* Called from hardirq (IPI) context */
e36fa2f7 3637static void rps_trigger_softirq(void *data)
0a9627f2 3638{
e36fa2f7
ED
3639 struct softnet_data *sd = data;
3640
eecfd7c4 3641 ____napi_schedule(sd, &sd->backlog);
dee42870 3642 sd->received_rps++;
0a9627f2 3643}
e36fa2f7 3644
fec5e652 3645#endif /* CONFIG_RPS */
0a9627f2 3646
/*
 * Test whether @sd belongs to another CPU.
 * If it does, chain it onto this CPU's IPI list, raise NET_RX_SOFTIRQ
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* Push @sd on the front of our pending-IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3667
99bbc707
WB
3668#ifdef CONFIG_NET_FLOW_LIMIT
3669int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3670#endif
3671
3672static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3673{
3674#ifdef CONFIG_NET_FLOW_LIMIT
3675 struct sd_flow_limit *fl;
3676 struct softnet_data *sd;
3677 unsigned int old_flow, new_flow;
3678
3679 if (qlen < (netdev_max_backlog >> 1))
3680 return false;
3681
903ceff7 3682 sd = this_cpu_ptr(&softnet_data);
99bbc707
WB
3683
3684 rcu_read_lock();
3685 fl = rcu_dereference(sd->flow_limit);
3686 if (fl) {
3958afa1 3687 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
99bbc707
WB
3688 old_flow = fl->history[fl->history_head];
3689 fl->history[fl->history_head] = new_flow;
3690
3691 fl->history_head++;
3692 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3693
3694 if (likely(fl->buckets[old_flow]))
3695 fl->buckets[old_flow]--;
3696
3697 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3698 fl->count++;
3699 rcu_read_unlock();
3700 return true;
3701 }
3702 }
3703 rcu_read_unlock();
3704#endif
3705 return false;
3706}
3707
0a9627f2
TH
3708/*
3709 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3710 * queue (may be a remote CPU queue).
3711 */
fec5e652
TH
3712static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3713 unsigned int *qtail)
0a9627f2 3714{
e36fa2f7 3715 struct softnet_data *sd;
0a9627f2 3716 unsigned long flags;
99bbc707 3717 unsigned int qlen;
0a9627f2 3718
e36fa2f7 3719 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3720
3721 local_irq_save(flags);
0a9627f2 3722
e36fa2f7 3723 rps_lock(sd);
e9e4dd32
JA
3724 if (!netif_running(skb->dev))
3725 goto drop;
99bbc707
WB
3726 qlen = skb_queue_len(&sd->input_pkt_queue);
3727 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
e008f3f0 3728 if (qlen) {
0a9627f2 3729enqueue:
e36fa2f7 3730 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3731 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3732 rps_unlock(sd);
152102c7 3733 local_irq_restore(flags);
0a9627f2
TH
3734 return NET_RX_SUCCESS;
3735 }
3736
ebda37c2
ED
3737 /* Schedule NAPI for backlog device
3738 * We can use non atomic operation since we own the queue lock
3739 */
3740 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3741 if (!rps_ipi_queued(sd))
eecfd7c4 3742 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3743 }
3744 goto enqueue;
3745 }
3746
e9e4dd32 3747drop:
dee42870 3748 sd->dropped++;
e36fa2f7 3749 rps_unlock(sd);
0a9627f2 3750
0a9627f2
TH
3751 local_irq_restore(flags);
3752
caf586e5 3753 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3754 kfree_skb(skb);
3755 return NET_RX_DROP;
3756}
1da177e4 3757
ae78dbfa 3758static int netif_rx_internal(struct sk_buff *skb)
1da177e4 3759{
b0e28f1e 3760 int ret;
1da177e4 3761
588f0330 3762 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3763
cf66ba58 3764 trace_netif_rx(skb);
df334545 3765#ifdef CONFIG_RPS
c5905afb 3766 if (static_key_false(&rps_needed)) {
fec5e652 3767 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3768 int cpu;
3769
cece1945 3770 preempt_disable();
b0e28f1e 3771 rcu_read_lock();
fec5e652
TH
3772
3773 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3774 if (cpu < 0)
3775 cpu = smp_processor_id();
fec5e652
TH
3776
3777 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3778
b0e28f1e 3779 rcu_read_unlock();
cece1945 3780 preempt_enable();
adc9300e
ED
3781 } else
3782#endif
fec5e652
TH
3783 {
3784 unsigned int qtail;
3785 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3786 put_cpu();
3787 }
b0e28f1e 3788 return ret;
1da177e4 3789}
ae78dbfa
BH
3790
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	return ret;
}
EXPORT_SYMBOL(netif_rx);
1da177e4
LT
3813
/* Like netif_rx(), but runs any pending softirqs itself before
 * returning, with preemption disabled for the duration.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3829
0766f788 3830static __latent_entropy void net_tx_action(struct softirq_action *h)
1da177e4 3831{
903ceff7 3832 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
1da177e4
LT
3833
3834 if (sd->completion_queue) {
3835 struct sk_buff *clist;
3836
3837 local_irq_disable();
3838 clist = sd->completion_queue;
3839 sd->completion_queue = NULL;
3840 local_irq_enable();
3841
3842 while (clist) {
3843 struct sk_buff *skb = clist;
3844 clist = clist->next;
3845
547b792c 3846 WARN_ON(atomic_read(&skb->users));
e6247027
ED
3847 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3848 trace_consume_skb(skb);
3849 else
3850 trace_kfree_skb(skb, net_tx_action);
15fad714
JDB
3851
3852 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3853 __kfree_skb(skb);
3854 else
3855 __kfree_skb_defer(skb);
1da177e4 3856 }
15fad714
JDB
3857
3858 __kfree_skb_flush();
1da177e4
LT
3859 }
3860
3861 if (sd->output_queue) {
37437bb2 3862 struct Qdisc *head;
1da177e4
LT
3863
3864 local_irq_disable();
3865 head = sd->output_queue;
3866 sd->output_queue = NULL;
a9cbd588 3867 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3868 local_irq_enable();
3869
3870 while (head) {
37437bb2
DM
3871 struct Qdisc *q = head;
3872 spinlock_t *root_lock;
3873
1da177e4
LT
3874 head = head->next_sched;
3875
5fb66229 3876 root_lock = qdisc_lock(q);
3bcb846c
ED
3877 spin_lock(root_lock);
3878 /* We need to make sure head->next_sched is read
3879 * before clearing __QDISC_STATE_SCHED
3880 */
3881 smp_mb__before_atomic();
3882 clear_bit(__QDISC_STATE_SCHED, &q->state);
3883 qdisc_run(q);
3884 spin_unlock(root_lock);
1da177e4
LT
3885 }
3886 }
3887}
3888
181402a5 3889#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
da678292
MM
3890/* This hook is defined here for ATM LANE */
3891int (*br_fdb_test_addr_hook)(struct net_device *dev,
3892 unsigned char *addr) __read_mostly;
4fb019a0 3893EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3894#endif
1da177e4 3895
/* Run the ingress classifier chain of skb->dev over @skb.
 *
 * A pending *@pt_prev handler is delivered to first (its result goes to
 * *@ret) and then cleared.  Returns @skb when receive processing should
 * continue, or NULL when the verdict disposed of the skb (shot, stolen,
 * queued or redirected).  Without CONFIG_NET_CLS_ACT this is a no-op
 * returning @skb.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *chain = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result res;
	int verdict;

	/* We get here through an enabled static key as soon as at least
	 * one ingress is present somewhere; devices that have no ingress
	 * qdisc configured bail out at this point.
	 */
	if (!chain)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(chain->q, skb);

	verdict = tc_classify(skb, chain, &res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(chain->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* cls/act_bpf already checked skb_mac_header, so the L2
		 * header can safely be pushed back before redirecting to
		 * another netdev.
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
1da177e4 3947
24b27fc4
MB
3948/**
3949 * netdev_is_rx_handler_busy - check if receive handler is registered
3950 * @dev: device to check
3951 *
3952 * Check if a receive handler is already registered for a given device.
3953 * Return true if there one.
3954 *
3955 * The caller must hold the rtnl_mutex.
3956 */
3957bool netdev_is_rx_handler_busy(struct net_device *dev)
3958{
3959 ASSERT_RTNL();
3960 return dev && rtnl_dereference(dev->rx_handler);
3961}
3962EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3963
ab95bfe0
JP
3964/**
3965 * netdev_rx_handler_register - register receive handler
3966 * @dev: device to register a handler for
3967 * @rx_handler: receive handler to register
93e2c32b 3968 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0 3969 *
e227867f 3970 * Register a receive handler for a device. This handler will then be
ab95bfe0
JP
3971 * called from __netif_receive_skb. A negative errno code is returned
3972 * on a failure.
3973 *
3974 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3975 *
3976 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3977 */
3978int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3979 rx_handler_func_t *rx_handler,
3980 void *rx_handler_data)
ab95bfe0
JP
3981{
3982 ASSERT_RTNL();
3983
3984 if (dev->rx_handler)
3985 return -EBUSY;
3986
00cfec37 3987 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3988 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3989 rcu_assign_pointer(dev->rx_handler, rx_handler);
3990
3991 return 0;
3992}
3993EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3994
3995/**
3996 * netdev_rx_handler_unregister - unregister receive handler
3997 * @dev: device to unregister a handler from
3998 *
166ec369 3999 * Unregister a receive handler from a device.
ab95bfe0
JP
4000 *
4001 * The caller must hold the rtnl_mutex.
4002 */
4003void netdev_rx_handler_unregister(struct net_device *dev)
4004{
4005
4006 ASSERT_RTNL();
a9b3cd7f 4007 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
4008 /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4009 * section has a guarantee to see a non NULL rx_handler_data
4010 * as well.
4011 */
4012 synchronize_net();
a9b3cd7f 4013 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
4014}
4015EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4016
b4b9e355
MG
4017/*
4018 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4019 * the special handling of PFMEMALLOC skbs.
4020 */
4021static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4022{
4023 switch (skb->protocol) {
2b8837ae
JP
4024 case htons(ETH_P_ARP):
4025 case htons(ETH_P_IP):
4026 case htons(ETH_P_IPV6):
4027 case htons(ETH_P_8021Q):
4028 case htons(ETH_P_8021AD):
b4b9e355
MG
4029 return true;
4030 default:
4031 return false;
4032 }
4033}
4034
e687ad60
PN
/*
 * Run the netfilter ingress hook on @skb when it is active.
 *
 * A pending *@pt_prev handler is delivered to (and cleared) first, with
 * the delivery result stored in *@ret.  Returns the value of
 * nf_hook_ingress(), or 0 when the hook is inactive or
 * CONFIG_NETFILTER_INGRESS is not set.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
e687ad60 4055
9754e293 4056static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
4057{
4058 struct packet_type *ptype, *pt_prev;
ab95bfe0 4059 rx_handler_func_t *rx_handler;
f2ccd8fa 4060 struct net_device *orig_dev;
8a4eb573 4061 bool deliver_exact = false;
1da177e4 4062 int ret = NET_RX_DROP;
252e3346 4063 __be16 type;
1da177e4 4064
588f0330 4065 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 4066
cf66ba58 4067 trace_netif_receive_skb(skb);
9b22ea56 4068
cc9bd5ce 4069 orig_dev = skb->dev;
8f903c70 4070
c1d2bbe1 4071 skb_reset_network_header(skb);
fda55eca
ED
4072 if (!skb_transport_header_was_set(skb))
4073 skb_reset_transport_header(skb);
0b5c9db1 4074 skb_reset_mac_len(skb);
1da177e4
LT
4075
4076 pt_prev = NULL;
4077
63d8ea7f 4078another_round:
b6858177 4079 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
4080
4081 __this_cpu_inc(softnet_data.processed);
4082
8ad227ff
PM
4083 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4084 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
0d5501c1 4085 skb = skb_vlan_untag(skb);
bcc6d479 4086 if (unlikely(!skb))
2c17d27c 4087 goto out;
bcc6d479
JP
4088 }
4089
1da177e4
LT
4090#ifdef CONFIG_NET_CLS_ACT
4091 if (skb->tc_verd & TC_NCLS) {
4092 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4093 goto ncls;
4094 }
4095#endif
4096
9754e293 4097 if (pfmemalloc)
b4b9e355
MG
4098 goto skip_taps;
4099
1da177e4 4100 list_for_each_entry_rcu(ptype, &ptype_all, list) {
7866a621
SN
4101 if (pt_prev)
4102 ret = deliver_skb(skb, pt_prev, orig_dev);
4103 pt_prev = ptype;
4104 }
4105
4106 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4107 if (pt_prev)
4108 ret = deliver_skb(skb, pt_prev, orig_dev);
4109 pt_prev = ptype;
1da177e4
LT
4110 }
4111
b4b9e355 4112skip_taps:
1cf51900 4113#ifdef CONFIG_NET_INGRESS
4577139b 4114 if (static_key_false(&ingress_needed)) {
1f211a1b 4115 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4577139b 4116 if (!skb)
2c17d27c 4117 goto out;
e687ad60
PN
4118
4119 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
2c17d27c 4120 goto out;
4577139b 4121 }
1cf51900
PN
4122#endif
4123#ifdef CONFIG_NET_CLS_ACT
4577139b 4124 skb->tc_verd = 0;
1da177e4
LT
4125ncls:
4126#endif
9754e293 4127 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
4128 goto drop;
4129
df8a39de 4130 if (skb_vlan_tag_present(skb)) {
2425717b
JF
4131 if (pt_prev) {
4132 ret = deliver_skb(skb, pt_prev, orig_dev);
4133 pt_prev = NULL;
4134 }
48cc32d3 4135 if (vlan_do_receive(&skb))
2425717b
JF
4136 goto another_round;
4137 else if (unlikely(!skb))
2c17d27c 4138 goto out;
2425717b
JF
4139 }
4140
48cc32d3 4141 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
4142 if (rx_handler) {
4143 if (pt_prev) {
4144 ret = deliver_skb(skb, pt_prev, orig_dev);
4145 pt_prev = NULL;
4146 }
8a4eb573
JP
4147 switch (rx_handler(&skb)) {
4148 case RX_HANDLER_CONSUMED:
3bc1b1ad 4149 ret = NET_RX_SUCCESS;
2c17d27c 4150 goto out;
8a4eb573 4151 case RX_HANDLER_ANOTHER:
63d8ea7f 4152 goto another_round;
8a4eb573
JP
4153 case RX_HANDLER_EXACT:
4154 deliver_exact = true;
4155 case RX_HANDLER_PASS:
4156 break;
4157 default:
4158 BUG();
4159 }
ab95bfe0 4160 }
1da177e4 4161
df8a39de
JP
4162 if (unlikely(skb_vlan_tag_present(skb))) {
4163 if (skb_vlan_tag_get_id(skb))
d4b812de
ED
4164 skb->pkt_type = PACKET_OTHERHOST;
4165 /* Note: we might in the future use prio bits
4166 * and set skb->priority like in vlan_do_receive()
4167 * For the time being, just ignore Priority Code Point
4168 */
4169 skb->vlan_tci = 0;
4170 }
48cc32d3 4171
7866a621
SN
4172 type = skb->protocol;
4173
63d8ea7f 4174 /* deliver only exact match when indicated */
7866a621
SN
4175 if (likely(!deliver_exact)) {
4176 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4177 &ptype_base[ntohs(type) &
4178 PTYPE_HASH_MASK]);
4179 }
1f3c8804 4180
7866a621
SN
4181 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4182 &orig_dev->ptype_specific);
4183
4184 if (unlikely(skb->dev != orig_dev)) {
4185 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4186 &skb->dev->ptype_specific);
1da177e4
LT
4187 }
4188
4189 if (pt_prev) {
1080e512 4190 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 4191 goto drop;
1080e512
MT
4192 else
4193 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 4194 } else {
b4b9e355 4195drop:
6e7333d3
JW
4196 if (!deliver_exact)
4197 atomic_long_inc(&skb->dev->rx_dropped);
4198 else
4199 atomic_long_inc(&skb->dev->rx_nohandler);
1da177e4
LT
4200 kfree_skb(skb);
4201 /* Jamal, now you will not able to escape explaining
4202 * me how you were going to use this. :-)
4203 */
4204 ret = NET_RX_DROP;
4205 }
4206
2c17d27c 4207out:
9754e293
DM
4208 return ret;
4209}
4210
4211static int __netif_receive_skb(struct sk_buff *skb)
4212{
4213 int ret;
4214
4215 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4216 unsigned long pflags = current->flags;
4217
4218 /*
4219 * PFMEMALLOC skbs are special, they should
4220 * - be delivered to SOCK_MEMALLOC sockets only
4221 * - stay away from userspace
4222 * - have bounded memory usage
4223 *
4224 * Use PF_MEMALLOC as this saves us from propagating the allocation
4225 * context down to all allocation sites.
4226 */
4227 current->flags |= PF_MEMALLOC;
4228 ret = __netif_receive_skb_core(skb, true);
4229 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4230 } else
4231 ret = __netif_receive_skb_core(skb, false);
4232
1da177e4
LT
4233 return ret;
4234}
0a9627f2 4235
ae78dbfa 4236static int netif_receive_skb_internal(struct sk_buff *skb)
0a9627f2 4237{
2c17d27c
JA
4238 int ret;
4239
588f0330 4240 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 4241
c1f19b51
RC
4242 if (skb_defer_rx_timestamp(skb))
4243 return NET_RX_SUCCESS;
4244
2c17d27c
JA
4245 rcu_read_lock();
4246
df334545 4247#ifdef CONFIG_RPS
c5905afb 4248 if (static_key_false(&rps_needed)) {
3b098e2d 4249 struct rps_dev_flow voidflow, *rflow = &voidflow;
2c17d27c 4250 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 4251
3b098e2d
ED
4252 if (cpu >= 0) {
4253 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4254 rcu_read_unlock();
adc9300e 4255 return ret;
3b098e2d 4256 }
fec5e652 4257 }
1e94d72f 4258#endif
2c17d27c
JA
4259 ret = __netif_receive_skb(skb);
4260 rcu_read_unlock();
4261 return ret;
0a9627f2 4262}
ae78dbfa
BH
4263
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds.  The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and
 *	interrupts should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	/* Emit the entry tracepoint, then defer to the internal path. */
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
1da177e4 4286
41852497 4287DEFINE_PER_CPU(struct work_struct, flush_works);
145dd5f9
PA
4288
4289/* Network device is going away, flush any packets still pending */
4290static void flush_backlog(struct work_struct *work)
6e583ce5 4291{
6e583ce5 4292 struct sk_buff *skb, *tmp;
145dd5f9
PA
4293 struct softnet_data *sd;
4294
4295 local_bh_disable();
4296 sd = this_cpu_ptr(&softnet_data);
6e583ce5 4297
145dd5f9 4298 local_irq_disable();
e36fa2f7 4299 rps_lock(sd);
6e7676c1 4300 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
41852497 4301 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
e36fa2f7 4302 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 4303 kfree_skb(skb);
76cc8b13 4304 input_queue_head_incr(sd);
6e583ce5 4305 }
6e7676c1 4306 }
e36fa2f7 4307 rps_unlock(sd);
145dd5f9 4308 local_irq_enable();
6e7676c1
CG
4309
4310 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
41852497 4311 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
6e7676c1
CG
4312 __skb_unlink(skb, &sd->process_queue);
4313 kfree_skb(skb);
76cc8b13 4314 input_queue_head_incr(sd);
6e7676c1
CG
4315 }
4316 }
145dd5f9
PA
4317 local_bh_enable();
4318}
4319
41852497 4320static void flush_all_backlogs(void)
145dd5f9
PA
4321{
4322 unsigned int cpu;
4323
4324 get_online_cpus();
4325
41852497
ED
4326 for_each_online_cpu(cpu)
4327 queue_work_on(cpu, system_highpri_wq,
4328 per_cpu_ptr(&flush_works, cpu));
145dd5f9
PA
4329
4330 for_each_online_cpu(cpu)
41852497 4331 flush_work(per_cpu_ptr(&flush_works, cpu));
145dd5f9
PA
4332
4333 put_online_cpus();
6e583ce5
SH
4334}
4335
d565b0a1
HX
4336static int napi_gro_complete(struct sk_buff *skb)
4337{
22061d80 4338 struct packet_offload *ptype;
d565b0a1 4339 __be16 type = skb->protocol;
22061d80 4340 struct list_head *head = &offload_base;
d565b0a1
HX
4341 int err = -ENOENT;
4342
c3c7c254
ED
4343 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4344
fc59f9a3
HX
4345 if (NAPI_GRO_CB(skb)->count == 1) {
4346 skb_shinfo(skb)->gso_size = 0;
d565b0a1 4347 goto out;
fc59f9a3 4348 }
d565b0a1
HX
4349
4350 rcu_read_lock();
4351 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4352 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
4353 continue;
4354
299603e8 4355 err = ptype->callbacks.gro_complete(skb, 0);
d565b0a1
HX
4356 break;
4357 }
4358 rcu_read_unlock();
4359
4360 if (err) {
4361 WARN_ON(&ptype->list == head);
4362 kfree_skb(skb);
4363 return NET_RX_SUCCESS;
4364 }
4365
4366out:
ae78dbfa 4367 return netif_receive_skb_internal(skb);
d565b0a1
HX
4368}
4369
2e71a6f8
ED
4370/* napi->gro_list contains packets ordered by age.
4371 * youngest packets at the head of it.
4372 * Complete skbs in reverse order to reduce latencies.
4373 */
4374void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 4375{
2e71a6f8 4376 struct sk_buff *skb, *prev = NULL;
d565b0a1 4377
2e71a6f8
ED
4378 /* scan list and build reverse chain */
4379 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4380 skb->prev = prev;
4381 prev = skb;
4382 }
4383
4384 for (skb = prev; skb; skb = prev) {
d565b0a1 4385 skb->next = NULL;
2e71a6f8
ED
4386
4387 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4388 return;
4389
4390 prev = skb->prev;
d565b0a1 4391 napi_gro_complete(skb);
2e71a6f8 4392 napi->gro_count--;
d565b0a1
HX
4393 }
4394
4395 napi->gro_list = NULL;
4396}
86cac58b 4397EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 4398
89c5fa33
ED
4399static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4400{
4401 struct sk_buff *p;
4402 unsigned int maclen = skb->dev->hard_header_len;
0b4cec8c 4403 u32 hash = skb_get_hash_raw(skb);
89c5fa33
ED
4404
4405 for (p = napi->gro_list; p; p = p->next) {
4406 unsigned long diffs;
4407
0b4cec8c
TH
4408 NAPI_GRO_CB(p)->flush = 0;
4409
4410 if (hash != skb_get_hash_raw(p)) {
4411 NAPI_GRO_CB(p)->same_flow = 0;
4412 continue;
4413 }
4414
89c5fa33
ED
4415 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4416 diffs |= p->vlan_tci ^ skb->vlan_tci;
ce87fc6c 4417 diffs |= skb_metadata_dst_cmp(p, skb);
89c5fa33
ED
4418 if (maclen == ETH_HLEN)
4419 diffs |= compare_ether_header(skb_mac_header(p),
a50e233c 4420 skb_mac_header(skb));
89c5fa33
ED
4421 else if (!diffs)
4422 diffs = memcmp(skb_mac_header(p),
a50e233c 4423 skb_mac_header(skb),
89c5fa33
ED
4424 maclen);
4425 NAPI_GRO_CB(p)->same_flow = !diffs;
89c5fa33
ED
4426 }
4427}
4428
299603e8
JC
4429static void skb_gro_reset_offset(struct sk_buff *skb)
4430{
4431 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4432 const skb_frag_t *frag0 = &pinfo->frags[0];
4433
4434 NAPI_GRO_CB(skb)->data_offset = 0;
4435 NAPI_GRO_CB(skb)->frag0 = NULL;
4436 NAPI_GRO_CB(skb)->frag0_len = 0;
4437
4438 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4439 pinfo->nr_frags &&
4440 !PageHighMem(skb_frag_page(frag0))) {
4441 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4442 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
89c5fa33
ED
4443 }
4444}
4445
a50e233c
ED
4446static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4447{
4448 struct skb_shared_info *pinfo = skb_shinfo(skb);
4449
4450 BUG_ON(skb->end - skb->tail < grow);
4451
4452 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4453
4454 skb->data_len -= grow;
4455 skb->tail += grow;
4456
4457 pinfo->frags[0].page_offset += grow;
4458 skb_frag_size_sub(&pinfo->frags[0], grow);
4459
4460 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4461 skb_frag_unref(skb, 0);
4462 memmove(pinfo->frags, pinfo->frags + 1,
4463 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4464 }
4465}
4466
bb728820 4467static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
4468{
4469 struct sk_buff **pp = NULL;
22061d80 4470 struct packet_offload *ptype;
d565b0a1 4471 __be16 type = skb->protocol;
22061d80 4472 struct list_head *head = &offload_base;
0da2afd5 4473 int same_flow;
5b252f0c 4474 enum gro_result ret;
a50e233c 4475 int grow;
d565b0a1 4476
9c62a68d 4477 if (!(skb->dev->features & NETIF_F_GRO))
d565b0a1
HX
4478 goto normal;
4479
d61d072e 4480 if (skb->csum_bad)
f17f5c91
HX
4481 goto normal;
4482
89c5fa33
ED
4483 gro_list_prepare(napi, skb);
4484
d565b0a1
HX
4485 rcu_read_lock();
4486 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4487 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
4488 continue;
4489
86911732 4490 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 4491 skb_reset_mac_len(skb);
d565b0a1 4492 NAPI_GRO_CB(skb)->same_flow = 0;
d61d072e 4493 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5d38a079 4494 NAPI_GRO_CB(skb)->free = 0;
fac8e0f5 4495 NAPI_GRO_CB(skb)->encap_mark = 0;
fcd91dd4 4496 NAPI_GRO_CB(skb)->recursion_counter = 0;
a0ca153f 4497 NAPI_GRO_CB(skb)->is_fou = 0;
1530545e 4498 NAPI_GRO_CB(skb)->is_atomic = 1;
15e2396d 4499 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
d565b0a1 4500
662880f4
TH
4501 /* Setup for GRO checksum validation */
4502 switch (skb->ip_summed) {
4503 case CHECKSUM_COMPLETE:
4504 NAPI_GRO_CB(skb)->csum = skb->csum;
4505 NAPI_GRO_CB(skb)->csum_valid = 1;
4506 NAPI_GRO_CB(skb)->csum_cnt = 0;
4507 break;
4508 case CHECKSUM_UNNECESSARY:
4509 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4510 NAPI_GRO_CB(skb)->csum_valid = 0;
4511 break;
4512 default:
4513 NAPI_GRO_CB(skb)->csum_cnt = 0;
4514 NAPI_GRO_CB(skb)->csum_valid = 0;
4515 }
d565b0a1 4516
f191a1d1 4517 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
4518 break;
4519 }
4520 rcu_read_unlock();
4521
4522 if (&ptype->list == head)
4523 goto normal;
4524
0da2afd5 4525 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 4526 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 4527
d565b0a1
HX
4528 if (pp) {
4529 struct sk_buff *nskb = *pp;
4530
4531 *pp = nskb->next;
4532 nskb->next = NULL;
4533 napi_gro_complete(nskb);
4ae5544f 4534 napi->gro_count--;
d565b0a1
HX
4535 }
4536
0da2afd5 4537 if (same_flow)
d565b0a1
HX
4538 goto ok;
4539
600adc18 4540 if (NAPI_GRO_CB(skb)->flush)
d565b0a1 4541 goto normal;
d565b0a1 4542
600adc18
ED
4543 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4544 struct sk_buff *nskb = napi->gro_list;
4545
4546 /* locate the end of the list to select the 'oldest' flow */
4547 while (nskb->next) {
4548 pp = &nskb->next;
4549 nskb = *pp;
4550 }
4551 *pp = NULL;
4552 nskb->next = NULL;
4553 napi_gro_complete(nskb);
4554 } else {
4555 napi->gro_count++;
4556 }
d565b0a1 4557 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 4558 NAPI_GRO_CB(skb)->age = jiffies;
29e98242 4559 NAPI_GRO_CB(skb)->last = skb;
86911732 4560 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
4561 skb->next = napi->gro_list;
4562 napi->gro_list = skb;
5d0d9be8 4563 ret = GRO_HELD;
d565b0a1 4564
ad0f9904 4565pull:
a50e233c
ED
4566 grow = skb_gro_offset(skb) - skb_headlen(skb);
4567 if (grow > 0)
4568 gro_pull_from_frag0(skb, grow);
d565b0a1 4569ok:
5d0d9be8 4570 return ret;
d565b0a1
HX
4571
4572normal:
ad0f9904
HX
4573 ret = GRO_NORMAL;
4574 goto pull;
5d38a079 4575}
96e93eab 4576
bf5a755f
JC
4577struct packet_offload *gro_find_receive_by_type(__be16 type)
4578{
4579 struct list_head *offload_head = &offload_base;
4580 struct packet_offload *ptype;
4581
4582 list_for_each_entry_rcu(ptype, offload_head, list) {
4583 if (ptype->type != type || !ptype->callbacks.gro_receive)
4584 continue;
4585 return ptype;
4586 }
4587 return NULL;
4588}
e27a2f83 4589EXPORT_SYMBOL(gro_find_receive_by_type);
bf5a755f
JC
4590
4591struct packet_offload *gro_find_complete_by_type(__be16 type)
4592{
4593 struct list_head *offload_head = &offload_base;
4594 struct packet_offload *ptype;
4595
4596 list_for_each_entry_rcu(ptype, offload_head, list) {
4597 if (ptype->type != type || !ptype->callbacks.gro_complete)
4598 continue;
4599 return ptype;
4600 }
4601 return NULL;
4602}
e27a2f83 4603EXPORT_SYMBOL(gro_find_complete_by_type);
5d38a079 4604
bb728820 4605static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 4606{
5d0d9be8
HX
4607 switch (ret) {
4608 case GRO_NORMAL:
ae78dbfa 4609 if (netif_receive_skb_internal(skb))
c7c4b3b6
BH
4610 ret = GRO_DROP;
4611 break;
5d38a079 4612
5d0d9be8 4613 case GRO_DROP:
5d38a079
HX
4614 kfree_skb(skb);
4615 break;
5b252f0c 4616
daa86548 4617 case GRO_MERGED_FREE:
ce87fc6c
JG
4618 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4619 skb_dst_drop(skb);
d7e8883c 4620 kmem_cache_free(skbuff_head_cache, skb);
ce87fc6c 4621 } else {
d7e8883c 4622 __kfree_skb(skb);
ce87fc6c 4623 }
daa86548
ED
4624 break;
4625
5b252f0c
BH
4626 case GRO_HELD:
4627 case GRO_MERGED:
4628 break;
5d38a079
HX
4629 }
4630
c7c4b3b6 4631 return ret;
5d0d9be8 4632}
5d0d9be8 4633
c7c4b3b6 4634gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 4635{
93f93a44 4636 skb_mark_napi_id(skb, napi);
ae78dbfa 4637 trace_napi_gro_receive_entry(skb);
86911732 4638
a50e233c
ED
4639 skb_gro_reset_offset(skb);
4640
89c5fa33 4641 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
4642}
4643EXPORT_SYMBOL(napi_gro_receive);
4644
d0c2b0d2 4645static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 4646{
93a35f59
ED
4647 if (unlikely(skb->pfmemalloc)) {
4648 consume_skb(skb);
4649 return;
4650 }
96e93eab 4651 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
4652 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4653 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 4654 skb->vlan_tci = 0;
66c46d74 4655 skb->dev = napi->dev;
6d152e23 4656 skb->skb_iif = 0;
c3caf119
JC
4657 skb->encapsulation = 0;
4658 skb_shinfo(skb)->gso_type = 0;
e33d0ba8 4659 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
96e93eab
HX
4660
4661 napi->skb = skb;
4662}
96e93eab 4663
76620aaf 4664struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 4665{
5d38a079 4666 struct sk_buff *skb = napi->skb;
5d38a079
HX
4667
4668 if (!skb) {
fd11a83d 4669 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
e2f9dc3b
ED
4670 if (skb) {
4671 napi->skb = skb;
4672 skb_mark_napi_id(skb, napi);
4673 }
80595d59 4674 }
96e93eab
HX
4675 return skb;
4676}
76620aaf 4677EXPORT_SYMBOL(napi_get_frags);
96e93eab 4678
a50e233c
ED
4679static gro_result_t napi_frags_finish(struct napi_struct *napi,
4680 struct sk_buff *skb,
4681 gro_result_t ret)
96e93eab 4682{
5d0d9be8
HX
4683 switch (ret) {
4684 case GRO_NORMAL:
a50e233c
ED
4685 case GRO_HELD:
4686 __skb_push(skb, ETH_HLEN);
4687 skb->protocol = eth_type_trans(skb, skb->dev);
4688 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
c7c4b3b6 4689 ret = GRO_DROP;
86911732 4690 break;
5d38a079 4691
5d0d9be8 4692 case GRO_DROP:
5d0d9be8
HX
4693 case GRO_MERGED_FREE:
4694 napi_reuse_skb(napi, skb);
4695 break;
5b252f0c
BH
4696
4697 case GRO_MERGED:
4698 break;
5d0d9be8 4699 }
5d38a079 4700
c7c4b3b6 4701 return ret;
5d38a079 4702}
5d0d9be8 4703
a50e233c
ED
4704/* Upper GRO stack assumes network header starts at gro_offset=0
4705 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4706 * We copy ethernet header into skb->data to have a common layout.
4707 */
4adb9c4a 4708static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
4709{
4710 struct sk_buff *skb = napi->skb;
a50e233c
ED
4711 const struct ethhdr *eth;
4712 unsigned int hlen = sizeof(*eth);
76620aaf
HX
4713
4714 napi->skb = NULL;
4715
a50e233c
ED
4716 skb_reset_mac_header(skb);
4717 skb_gro_reset_offset(skb);
4718
4719 eth = skb_gro_header_fast(skb, 0);
4720 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4721 eth = skb_gro_header_slow(skb, hlen, 0);
4722 if (unlikely(!eth)) {
4da46ceb
AC
4723 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4724 __func__, napi->dev->name);
a50e233c
ED
4725 napi_reuse_skb(napi, skb);
4726 return NULL;
4727 }
4728 } else {
4729 gro_pull_from_frag0(skb, hlen);
4730 NAPI_GRO_CB(skb)->frag0 += hlen;
4731 NAPI_GRO_CB(skb)->frag0_len -= hlen;
76620aaf 4732 }
a50e233c
ED
4733 __skb_pull(skb, hlen);
4734
4735 /*
4736 * This works because the only protocols we care about don't require
4737 * special handling.
4738 * We'll fix it up properly in napi_frags_finish()
4739 */
4740 skb->protocol = eth->h_proto;
76620aaf 4741
76620aaf
HX
4742 return skb;
4743}
76620aaf 4744
c7c4b3b6 4745gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 4746{
76620aaf 4747 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
4748
4749 if (!skb)
c7c4b3b6 4750 return GRO_DROP;
5d0d9be8 4751
ae78dbfa
BH
4752 trace_napi_gro_frags_entry(skb);
4753
89c5fa33 4754 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 4755}
5d38a079
HX
4756EXPORT_SYMBOL(napi_gro_frags);
4757
573e8fca
TH
4758/* Compute the checksum from gro_offset and return the folded value
4759 * after adding in any pseudo checksum.
4760 */
4761__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4762{
4763 __wsum wsum;
4764 __sum16 sum;
4765
4766 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4767
4768 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4769 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4770 if (likely(!sum)) {
4771 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4772 !skb->csum_complete_sw)
4773 netdev_rx_csum_fault(skb->dev);
4774 }
4775
4776 NAPI_GRO_CB(skb)->csum = wsum;
4777 NAPI_GRO_CB(skb)->csum_valid = 1;
4778
4779 return sum;
4780}
4781EXPORT_SYMBOL(__skb_gro_checksum_complete);
4782
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remote = sd->rps_ipi_list;

	if (remote) {
		/* Detach the list while irqs are still off. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		do {
			struct softnet_data *next = remote->rps_ipi_next;

			if (cpu_online(remote->cpu))
				smp_call_function_single_async(remote->cpu,
							       &remote->csd);
			remote = next;
		} while (remote);

		return;
	}
#endif
	local_irq_enable();
}
4810
d75b1ade
ED
4811static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4812{
4813#ifdef CONFIG_RPS
4814 return sd->rps_ipi_list != NULL;
4815#else
4816 return false;
4817#endif
4818}
4819
bea3348e 4820static int process_backlog(struct napi_struct *napi, int quota)
1da177e4 4821{
eecfd7c4 4822 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
145dd5f9
PA
4823 bool again = true;
4824 int work = 0;
1da177e4 4825
e326bed2
ED
4826 /* Check if we have pending ipi, its better to send them now,
4827 * not waiting net_rx_action() end.
4828 */
d75b1ade 4829 if (sd_has_rps_ipi_waiting(sd)) {
e326bed2
ED
4830 local_irq_disable();
4831 net_rps_action_and_irq_enable(sd);
4832 }
d75b1ade 4833
bea3348e 4834 napi->weight = weight_p;
145dd5f9 4835 while (again) {
1da177e4 4836 struct sk_buff *skb;
6e7676c1
CG
4837
4838 while ((skb = __skb_dequeue(&sd->process_queue))) {
2c17d27c 4839 rcu_read_lock();
6e7676c1 4840 __netif_receive_skb(skb);
2c17d27c 4841 rcu_read_unlock();
76cc8b13 4842 input_queue_head_incr(sd);
145dd5f9 4843 if (++work >= quota)
76cc8b13 4844 return work;
145dd5f9 4845
6e7676c1 4846 }
1da177e4 4847
145dd5f9 4848 local_irq_disable();
e36fa2f7 4849 rps_lock(sd);
11ef7a89 4850 if (skb_queue_empty(&sd->input_pkt_queue)) {
eecfd7c4
ED
4851 /*
4852 * Inline a custom version of __napi_complete().
4853 * only current cpu owns and manipulates this napi,
11ef7a89
TH
4854 * and NAPI_STATE_SCHED is the only possible flag set
4855 * on backlog.
4856 * We can use a plain write instead of clear_bit(),
eecfd7c4
ED
4857 * and we dont need an smp_mb() memory barrier.
4858 */
eecfd7c4 4859 napi->state = 0;
145dd5f9
PA
4860 again = false;
4861 } else {
4862 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4863 &sd->process_queue);
bea3348e 4864 }
e36fa2f7 4865 rps_unlock(sd);
145dd5f9 4866 local_irq_enable();
6e7676c1 4867 }
1da177e4 4868
bea3348e
SH
4869 return work;
4870}
1da177e4 4871
bea3348e
SH
4872/**
4873 * __napi_schedule - schedule for receive
c4ea43c5 4874 * @n: entry to schedule
bea3348e 4875 *
bc9ad166
ED
4876 * The entry's receive function will be scheduled to run.
4877 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
bea3348e 4878 */
b5606c2d 4879void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4880{
4881 unsigned long flags;
1da177e4 4882
bea3348e 4883 local_irq_save(flags);
903ceff7 4884 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
bea3348e 4885 local_irq_restore(flags);
1da177e4 4886}
bea3348e
SH
4887EXPORT_SYMBOL(__napi_schedule);
4888
bc9ad166
ED
4889/**
4890 * __napi_schedule_irqoff - schedule for receive
4891 * @n: entry to schedule
4892 *
4893 * Variant of __napi_schedule() assuming hard irqs are masked
4894 */
4895void __napi_schedule_irqoff(struct napi_struct *n)
4896{
4897 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4898}
4899EXPORT_SYMBOL(__napi_schedule_irqoff);
4900
364b6055 4901bool __napi_complete(struct napi_struct *n)
d565b0a1
HX
4902{
4903 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
d565b0a1 4904
217f6974
ED
4905 /* Some drivers call us directly, instead of calling
4906 * napi_complete_done().
4907 */
4908 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
364b6055 4909 return false;
217f6974 4910
d75b1ade 4911 list_del_init(&n->poll_list);
4e857c58 4912 smp_mb__before_atomic();
d565b0a1 4913 clear_bit(NAPI_STATE_SCHED, &n->state);
364b6055 4914 return true;
d565b0a1
HX
4915}
4916EXPORT_SYMBOL(__napi_complete);
4917
364b6055 4918bool napi_complete_done(struct napi_struct *n, int work_done)
d565b0a1
HX
4919{
4920 unsigned long flags;
4921
4922 /*
217f6974
ED
4923 * 1) Don't let napi dequeue from the cpu poll list
4924 * just in case its running on a different cpu.
4925 * 2) If we are busy polling, do nothing here, we have
4926 * the guarantee we will be called later.
d565b0a1 4927 */
217f6974
ED
4928 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4929 NAPIF_STATE_IN_BUSY_POLL)))
364b6055 4930 return false;
d565b0a1 4931
3b47d303
ED
4932 if (n->gro_list) {
4933 unsigned long timeout = 0;
d75b1ade 4934
3b47d303
ED
4935 if (work_done)
4936 timeout = n->dev->gro_flush_timeout;
4937
4938 if (timeout)
4939 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4940 HRTIMER_MODE_REL_PINNED);
4941 else
4942 napi_gro_flush(n, false);
4943 }
d75b1ade
ED
4944 if (likely(list_empty(&n->poll_list))) {
4945 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4946 } else {
4947 /* If n->poll_list is not empty, we need to mask irqs */
4948 local_irq_save(flags);
4949 __napi_complete(n);
4950 local_irq_restore(flags);
4951 }
364b6055 4952 return true;
d565b0a1 4953}
3b47d303 4954EXPORT_SYMBOL(napi_complete_done);
d565b0a1 4955
af12fa6e 4956/* must be called under rcu_read_lock(), as we dont take a reference */
02d62e86 4957static struct napi_struct *napi_by_id(unsigned int napi_id)
af12fa6e
ET
4958{
4959 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4960 struct napi_struct *napi;
4961
4962 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4963 if (napi->napi_id == napi_id)
4964 return napi;
4965
4966 return NULL;
4967}
02d62e86
ED
4968
4969#if defined(CONFIG_NET_RX_BUSY_POLL)
217f6974 4970
ce6aea93 4971#define BUSY_POLL_BUDGET 8
217f6974
ED
4972
4973static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4974{
4975 int rc;
4976
4977 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4978
4979 local_bh_disable();
4980
4981 /* All we really want here is to re-enable device interrupts.
4982 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
4983 */
4984 rc = napi->poll(napi, BUSY_POLL_BUDGET);
4985 netpoll_poll_unlock(have_poll_lock);
4986 if (rc == BUSY_POLL_BUDGET)
4987 __napi_schedule(napi);
4988 local_bh_enable();
4989 if (local_softirq_pending())
4990 do_softirq();
4991}
4992
02d62e86
ED
4993bool sk_busy_loop(struct sock *sk, int nonblock)
4994{
4995 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
217f6974 4996 int (*napi_poll)(struct napi_struct *napi, int budget);
ce6aea93 4997 int (*busy_poll)(struct napi_struct *dev);
217f6974 4998 void *have_poll_lock = NULL;
02d62e86 4999 struct napi_struct *napi;
217f6974
ED
5000 int rc;
5001
5002restart:
5003 rc = false;
5004 napi_poll = NULL;
02d62e86 5005
2a028ecb 5006 rcu_read_lock();
02d62e86
ED
5007
5008 napi = napi_by_id(sk->sk_napi_id);
5009 if (!napi)
5010 goto out;
5011
ce6aea93
ED
5012 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5013 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
02d62e86 5014
217f6974
ED
5015 preempt_disable();
5016 for (;;) {
ce6aea93 5017 rc = 0;
2a028ecb 5018 local_bh_disable();
ce6aea93
ED
5019 if (busy_poll) {
5020 rc = busy_poll(napi);
217f6974 5021 goto count;
ce6aea93 5022 }
217f6974
ED
5023 if (!napi_poll) {
5024 unsigned long val = READ_ONCE(napi->state);
5025
5026 /* If multiple threads are competing for this napi,
5027 * we avoid dirtying napi->state as much as we can.
5028 */
5029 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5030 NAPIF_STATE_IN_BUSY_POLL))
5031 goto count;
5032 if (cmpxchg(&napi->state, val,
5033 val | NAPIF_STATE_IN_BUSY_POLL |
5034 NAPIF_STATE_SCHED) != val)
5035 goto count;
5036 have_poll_lock = netpoll_poll_lock(napi);
5037 napi_poll = napi->poll;
5038 }
5039 rc = napi_poll(napi, BUSY_POLL_BUDGET);
5040 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5041count:
2a028ecb 5042 if (rc > 0)
02a1d6e7
ED
5043 __NET_ADD_STATS(sock_net(sk),
5044 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
2a028ecb 5045 local_bh_enable();
02d62e86
ED
5046
5047 if (rc == LL_FLUSH_FAILED)
5048 break; /* permanent failure */
5049
217f6974
ED
5050 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5051 busy_loop_timeout(end_time))
5052 break;
02d62e86 5053
217f6974
ED
5054 if (unlikely(need_resched())) {
5055 if (napi_poll)
5056 busy_poll_stop(napi, have_poll_lock);
5057 preempt_enable();
5058 rcu_read_unlock();
5059 cond_resched();
5060 rc = !skb_queue_empty(&sk->sk_receive_queue);
5061 if (rc || busy_loop_timeout(end_time))
5062 return rc;
5063 goto restart;
5064 }
5065 cpu_relax_lowlatency();
5066 }
5067 if (napi_poll)
5068 busy_poll_stop(napi, have_poll_lock);
5069 preempt_enable();
02d62e86
ED
5070 rc = !skb_queue_empty(&sk->sk_receive_queue);
5071out:
2a028ecb 5072 rcu_read_unlock();
02d62e86
ED
5073 return rc;
5074}
5075EXPORT_SYMBOL(sk_busy_loop);
5076
5077#endif /* CONFIG_NET_RX_BUSY_POLL */
af12fa6e 5078
149d6ad8 5079static void napi_hash_add(struct napi_struct *napi)
af12fa6e 5080{
d64b5e85
ED
5081 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5082 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
52bd2d62 5083 return;
af12fa6e 5084
52bd2d62 5085 spin_lock(&napi_hash_lock);
af12fa6e 5086
52bd2d62
ED
5087 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5088 do {
5089 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5090 napi_gen_id = NR_CPUS + 1;
5091 } while (napi_by_id(napi_gen_id));
5092 napi->napi_id = napi_gen_id;
af12fa6e 5093
52bd2d62
ED
5094 hlist_add_head_rcu(&napi->napi_hash_node,
5095 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
af12fa6e 5096
52bd2d62 5097 spin_unlock(&napi_hash_lock);
af12fa6e 5098}
af12fa6e
ET
5099
5100/* Warning : caller is responsible to make sure rcu grace period
5101 * is respected before freeing memory containing @napi
5102 */
34cbe27e 5103bool napi_hash_del(struct napi_struct *napi)
af12fa6e 5104{
34cbe27e
ED
5105 bool rcu_sync_needed = false;
5106
af12fa6e
ET
5107 spin_lock(&napi_hash_lock);
5108
34cbe27e
ED
5109 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5110 rcu_sync_needed = true;
af12fa6e 5111 hlist_del_rcu(&napi->napi_hash_node);
34cbe27e 5112 }
af12fa6e 5113 spin_unlock(&napi_hash_lock);
34cbe27e 5114 return rcu_sync_needed;
af12fa6e
ET
5115}
5116EXPORT_SYMBOL_GPL(napi_hash_del);
5117
3b47d303
ED
5118static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5119{
5120 struct napi_struct *napi;
5121
5122 napi = container_of(timer, struct napi_struct, timer);
5123 if (napi->gro_list)
5124 napi_schedule(napi);
5125
5126 return HRTIMER_NORESTART;
5127}
5128
d565b0a1
HX
5129void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5130 int (*poll)(struct napi_struct *, int), int weight)
5131{
5132 INIT_LIST_HEAD(&napi->poll_list);
3b47d303
ED
5133 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5134 napi->timer.function = napi_watchdog;
4ae5544f 5135 napi->gro_count = 0;
d565b0a1 5136 napi->gro_list = NULL;
5d38a079 5137 napi->skb = NULL;
d565b0a1 5138 napi->poll = poll;
82dc3c63
ED
5139 if (weight > NAPI_POLL_WEIGHT)
5140 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5141 weight, dev->name);
d565b0a1
HX
5142 napi->weight = weight;
5143 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 5144 napi->dev = dev;
5d38a079 5145#ifdef CONFIG_NETPOLL
d565b0a1
HX
5146 napi->poll_owner = -1;
5147#endif
5148 set_bit(NAPI_STATE_SCHED, &napi->state);
93d05d4a 5149 napi_hash_add(napi);
d565b0a1
HX
5150}
5151EXPORT_SYMBOL(netif_napi_add);
5152
3b47d303
ED
5153void napi_disable(struct napi_struct *n)
5154{
5155 might_sleep();
5156 set_bit(NAPI_STATE_DISABLE, &n->state);
5157
5158 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5159 msleep(1);
2d8bff12
NH
5160 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5161 msleep(1);
3b47d303
ED
5162
5163 hrtimer_cancel(&n->timer);
5164
5165 clear_bit(NAPI_STATE_DISABLE, &n->state);
5166}
5167EXPORT_SYMBOL(napi_disable);
5168
93d05d4a 5169/* Must be called in process context */
d565b0a1
HX
5170void netif_napi_del(struct napi_struct *napi)
5171{
93d05d4a
ED
5172 might_sleep();
5173 if (napi_hash_del(napi))
5174 synchronize_net();
d7b06636 5175 list_del_init(&napi->dev_list);
76620aaf 5176 napi_free_frags(napi);
d565b0a1 5177
289dccbe 5178 kfree_skb_list(napi->gro_list);
d565b0a1 5179 napi->gro_list = NULL;
4ae5544f 5180 napi->gro_count = 0;
d565b0a1
HX
5181}
5182EXPORT_SYMBOL(netif_napi_del);
5183
726ce70e
HX
5184static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5185{
5186 void *have;
5187 int work, weight;
5188
5189 list_del_init(&n->poll_list);
5190
5191 have = netpoll_poll_lock(n);
5192
5193 weight = n->weight;
5194
5195 /* This NAPI_STATE_SCHED test is for avoiding a race
5196 * with netpoll's poll_napi(). Only the entity which
5197 * obtains the lock and sees NAPI_STATE_SCHED set will
5198 * actually make the ->poll() call. Therefore we avoid
5199 * accidentally calling ->poll() when NAPI is not scheduled.
5200 */
5201 work = 0;
5202 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5203 work = n->poll(n, weight);
1db19db7 5204 trace_napi_poll(n, work, weight);
726ce70e
HX
5205 }
5206
5207 WARN_ON_ONCE(work > weight);
5208
5209 if (likely(work < weight))
5210 goto out_unlock;
5211
5212 /* Drivers must not modify the NAPI state if they
5213 * consume the entire weight. In such cases this code
5214 * still "owns" the NAPI instance and therefore can
5215 * move the instance around on the list at-will.
5216 */
5217 if (unlikely(napi_disable_pending(n))) {
5218 napi_complete(n);
5219 goto out_unlock;
5220 }
5221
5222 if (n->gro_list) {
5223 /* flush too old packets
5224 * If HZ < 1000, flush all packets.
5225 */
5226 napi_gro_flush(n, HZ >= 1000);
5227 }
5228
001ce546
HX
5229 /* Some drivers may have called napi_schedule
5230 * prior to exhausting their budget.
5231 */
5232 if (unlikely(!list_empty(&n->poll_list))) {
5233 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5234 n->dev ? n->dev->name : "backlog");
5235 goto out_unlock;
5236 }
5237
726ce70e
HX
5238 list_add_tail(&n->poll_list, repoll);
5239
5240out_unlock:
5241 netpoll_poll_unlock(have);
5242
5243 return work;
5244}
5245
0766f788 5246static __latent_entropy void net_rx_action(struct softirq_action *h)
1da177e4 5247{
903ceff7 5248 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
24f8b238 5249 unsigned long time_limit = jiffies + 2;
51b0bded 5250 int budget = netdev_budget;
d75b1ade
ED
5251 LIST_HEAD(list);
5252 LIST_HEAD(repoll);
53fb95d3 5253
1da177e4 5254 local_irq_disable();
d75b1ade
ED
5255 list_splice_init(&sd->poll_list, &list);
5256 local_irq_enable();
1da177e4 5257
ceb8d5bf 5258 for (;;) {
bea3348e 5259 struct napi_struct *n;
1da177e4 5260
ceb8d5bf
HX
5261 if (list_empty(&list)) {
5262 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5263 return;
5264 break;
5265 }
5266
6bd373eb
HX
5267 n = list_first_entry(&list, struct napi_struct, poll_list);
5268 budget -= napi_poll(n, &repoll);
5269
d75b1ade 5270 /* If softirq window is exhausted then punt.
24f8b238
SH
5271 * Allow this to run for 2 jiffies since which will allow
5272 * an average latency of 1.5/HZ.
bea3348e 5273 */
ceb8d5bf
HX
5274 if (unlikely(budget <= 0 ||
5275 time_after_eq(jiffies, time_limit))) {
5276 sd->time_squeeze++;
5277 break;
5278 }
1da177e4 5279 }
d75b1ade 5280
795bb1c0 5281 __kfree_skb_flush();
d75b1ade
ED
5282 local_irq_disable();
5283
5284 list_splice_tail_init(&sd->poll_list, &list);
5285 list_splice_tail(&repoll, &list);
5286 list_splice(&list, &sd->poll_list);
5287 if (!list_empty(&sd->poll_list))
5288 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5289
e326bed2 5290 net_rps_action_and_irq_enable(sd);
1da177e4
LT
5291}
5292
aa9d8560 5293struct netdev_adjacent {
9ff162a8 5294 struct net_device *dev;
5d261913
VF
5295
5296 /* upper master flag, there can only be one master device per list */
9ff162a8 5297 bool master;
5d261913 5298
5d261913
VF
5299 /* counter for the number of times this device was added to us */
5300 u16 ref_nr;
5301
402dae96
VF
5302 /* private field for the users */
5303 void *private;
5304
9ff162a8
JP
5305 struct list_head list;
5306 struct rcu_head rcu;
9ff162a8
JP
5307};
5308
6ea29da1 5309static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
2f268f12 5310 struct list_head *adj_list)
9ff162a8 5311{
5d261913 5312 struct netdev_adjacent *adj;
5d261913 5313
2f268f12 5314 list_for_each_entry(adj, adj_list, list) {
5d261913
VF
5315 if (adj->dev == adj_dev)
5316 return adj;
9ff162a8
JP
5317 }
5318 return NULL;
5319}
5320
f1170fd4
DA
/* Walk callback: non-zero when @upper_dev is the device passed in @data. */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *wanted = data;

	return wanted == upper_dev;
}
5327
9ff162a8
JP
5328/**
5329 * netdev_has_upper_dev - Check if device is linked to an upper device
5330 * @dev: device
5331 * @upper_dev: upper device to check
5332 *
5333 * Find out if a device is linked to specified upper device and return true
5334 * in case it is. Note that this checks only immediate upper device,
5335 * not through a complete stack of devices. The caller must hold the RTNL lock.
5336 */
5337bool netdev_has_upper_dev(struct net_device *dev,
5338 struct net_device *upper_dev)
5339{
5340 ASSERT_RTNL();
5341
f1170fd4
DA
5342 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5343 upper_dev);
9ff162a8
JP
5344}
5345EXPORT_SYMBOL(netdev_has_upper_dev);
5346
1a3f060c
DA
5347/**
5348 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5349 * @dev: device
5350 * @upper_dev: upper device to check
5351 *
5352 * Find out if a device is linked to specified upper device and return true
5353 * in case it is. Note that this checks the entire upper device chain.
5354 * The caller must hold rcu lock.
5355 */
5356
1a3f060c
DA
5357bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5358 struct net_device *upper_dev)
5359{
5360 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5361 upper_dev);
5362}
5363EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5364
9ff162a8
JP
5365/**
5366 * netdev_has_any_upper_dev - Check if device is linked to some device
5367 * @dev: device
5368 *
5369 * Find out if a device is linked to an upper device and return true in case
5370 * it is. The caller must hold the RTNL lock.
5371 */
1d143d9f 5372static bool netdev_has_any_upper_dev(struct net_device *dev)
9ff162a8
JP
5373{
5374 ASSERT_RTNL();
5375
f1170fd4 5376 return !list_empty(&dev->adj_list.upper);
9ff162a8 5377}
9ff162a8
JP
5378
5379/**
5380 * netdev_master_upper_dev_get - Get master upper device
5381 * @dev: device
5382 *
5383 * Find a master upper device and return pointer to it or NULL in case
5384 * it's not there. The caller must hold the RTNL lock.
5385 */
5386struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5387{
aa9d8560 5388 struct netdev_adjacent *upper;
9ff162a8
JP
5389
5390 ASSERT_RTNL();
5391
2f268f12 5392 if (list_empty(&dev->adj_list.upper))
9ff162a8
JP
5393 return NULL;
5394
2f268f12 5395 upper = list_first_entry(&dev->adj_list.upper,
aa9d8560 5396 struct netdev_adjacent, list);
9ff162a8
JP
5397 if (likely(upper->master))
5398 return upper->dev;
5399 return NULL;
5400}
5401EXPORT_SYMBOL(netdev_master_upper_dev_get);
5402
0f524a80
DA
5403/**
5404 * netdev_has_any_lower_dev - Check if device is linked to some device
5405 * @dev: device
5406 *
5407 * Find out if a device is linked to a lower device and return true in case
5408 * it is. The caller must hold the RTNL lock.
5409 */
5410static bool netdev_has_any_lower_dev(struct net_device *dev)
5411{
5412 ASSERT_RTNL();
5413
5414 return !list_empty(&dev->adj_list.lower);
5415}
5416
b6ccba4c
VF
5417void *netdev_adjacent_get_private(struct list_head *adj_list)
5418{
5419 struct netdev_adjacent *adj;
5420
5421 adj = list_entry(adj_list, struct netdev_adjacent, list);
5422
5423 return adj->private;
5424}
5425EXPORT_SYMBOL(netdev_adjacent_get_private);
5426
44a40855
VY
5427/**
5428 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5429 * @dev: device
5430 * @iter: list_head ** of the current position
5431 *
5432 * Gets the next device from the dev's upper list, starting from iter
5433 * position. The caller must hold RCU read lock.
5434 */
5435struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5436 struct list_head **iter)
5437{
5438 struct netdev_adjacent *upper;
5439
5440 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5441
5442 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5443
5444 if (&upper->list == &dev->adj_list.upper)
5445 return NULL;
5446
5447 *iter = &upper->list;
5448
5449 return upper->dev;
5450}
5451EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5452
1a3f060c
DA
5453static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5454 struct list_head **iter)
5455{
5456 struct netdev_adjacent *upper;
5457
5458 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5459
5460 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5461
5462 if (&upper->list == &dev->adj_list.upper)
5463 return NULL;
5464
5465 *iter = &upper->list;
5466
5467 return upper->dev;
5468}
5469
5470int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5471 int (*fn)(struct net_device *dev,
5472 void *data),
5473 void *data)
5474{
5475 struct net_device *udev;
5476 struct list_head *iter;
5477 int ret;
5478
5479 for (iter = &dev->adj_list.upper,
5480 udev = netdev_next_upper_dev_rcu(dev, &iter);
5481 udev;
5482 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5483 /* first is the upper device itself */
5484 ret = fn(udev, data);
5485 if (ret)
5486 return ret;
5487
5488 /* then look at all of its upper devices */
5489 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5490 if (ret)
5491 return ret;
5492 }
5493
5494 return 0;
5495}
5496EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5497
31088a11
VF
5498/**
5499 * netdev_lower_get_next_private - Get the next ->private from the
5500 * lower neighbour list
5501 * @dev: device
5502 * @iter: list_head ** of the current position
5503 *
5504 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5505 * list, starting from iter position. The caller must hold either hold the
5506 * RTNL lock or its own locking that guarantees that the neighbour lower
b469139e 5507 * list will remain unchanged.
31088a11
VF
5508 */
5509void *netdev_lower_get_next_private(struct net_device *dev,
5510 struct list_head **iter)
5511{
5512 struct netdev_adjacent *lower;
5513
5514 lower = list_entry(*iter, struct netdev_adjacent, list);
5515
5516 if (&lower->list == &dev->adj_list.lower)
5517 return NULL;
5518
6859e7df 5519 *iter = lower->list.next;
31088a11
VF
5520
5521 return lower->private;
5522}
5523EXPORT_SYMBOL(netdev_lower_get_next_private);
5524
5525/**
5526 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5527 * lower neighbour list, RCU
5528 * variant
5529 * @dev: device
5530 * @iter: list_head ** of the current position
5531 *
5532 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5533 * list, starting from iter position. The caller must hold RCU read lock.
5534 */
5535void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5536 struct list_head **iter)
5537{
5538 struct netdev_adjacent *lower;
5539
5540 WARN_ON_ONCE(!rcu_read_lock_held());
5541
5542 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5543
5544 if (&lower->list == &dev->adj_list.lower)
5545 return NULL;
5546
6859e7df 5547 *iter = &lower->list;
31088a11
VF
5548
5549 return lower->private;
5550}
5551EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5552
4085ebe8
VY
5553/**
5554 * netdev_lower_get_next - Get the next device from the lower neighbour
5555 * list
5556 * @dev: device
5557 * @iter: list_head ** of the current position
5558 *
5559 * Gets the next netdev_adjacent from the dev's lower neighbour
5560 * list, starting from iter position. The caller must hold RTNL lock or
5561 * its own locking that guarantees that the neighbour lower
b469139e 5562 * list will remain unchanged.
4085ebe8
VY
5563 */
5564void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5565{
5566 struct netdev_adjacent *lower;
5567
cfdd28be 5568 lower = list_entry(*iter, struct netdev_adjacent, list);
4085ebe8
VY
5569
5570 if (&lower->list == &dev->adj_list.lower)
5571 return NULL;
5572
cfdd28be 5573 *iter = lower->list.next;
4085ebe8
VY
5574
5575 return lower->dev;
5576}
5577EXPORT_SYMBOL(netdev_lower_get_next);
5578
1a3f060c
DA
5579static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5580 struct list_head **iter)
5581{
5582 struct netdev_adjacent *lower;
5583
46b5ab1a 5584 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
1a3f060c
DA
5585
5586 if (&lower->list == &dev->adj_list.lower)
5587 return NULL;
5588
46b5ab1a 5589 *iter = &lower->list;
1a3f060c
DA
5590
5591 return lower->dev;
5592}
5593
5594int netdev_walk_all_lower_dev(struct net_device *dev,
5595 int (*fn)(struct net_device *dev,
5596 void *data),
5597 void *data)
5598{
5599 struct net_device *ldev;
5600 struct list_head *iter;
5601 int ret;
5602
5603 for (iter = &dev->adj_list.lower,
5604 ldev = netdev_next_lower_dev(dev, &iter);
5605 ldev;
5606 ldev = netdev_next_lower_dev(dev, &iter)) {
5607 /* first is the lower device itself */
5608 ret = fn(ldev, data);
5609 if (ret)
5610 return ret;
5611
5612 /* then look at all of its lower devices */
5613 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5614 if (ret)
5615 return ret;
5616 }
5617
5618 return 0;
5619}
5620EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5621
1a3f060c
DA
5622static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5623 struct list_head **iter)
5624{
5625 struct netdev_adjacent *lower;
5626
5627 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5628 if (&lower->list == &dev->adj_list.lower)
5629 return NULL;
5630
5631 *iter = &lower->list;
5632
5633 return lower->dev;
5634}
5635
5636int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5637 int (*fn)(struct net_device *dev,
5638 void *data),
5639 void *data)
5640{
5641 struct net_device *ldev;
5642 struct list_head *iter;
5643 int ret;
5644
5645 for (iter = &dev->adj_list.lower,
5646 ldev = netdev_next_lower_dev_rcu(dev, &iter);
5647 ldev;
5648 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5649 /* first is the lower device itself */
5650 ret = fn(ldev, data);
5651 if (ret)
5652 return ret;
5653
5654 /* then look at all of its lower devices */
5655 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5656 if (ret)
5657 return ret;
5658 }
5659
5660 return 0;
5661}
5662EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5663
e001bfad 5664/**
5665 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5666 * lower neighbour list, RCU
5667 * variant
5668 * @dev: device
5669 *
5670 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5671 * list. The caller must hold RCU read lock.
5672 */
5673void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5674{
5675 struct netdev_adjacent *lower;
5676
5677 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5678 struct netdev_adjacent, list);
5679 if (lower)
5680 return lower->private;
5681 return NULL;
5682}
5683EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5684
9ff162a8
JP
5685/**
5686 * netdev_master_upper_dev_get_rcu - Get master upper device
5687 * @dev: device
5688 *
5689 * Find a master upper device and return pointer to it or NULL in case
5690 * it's not there. The caller must hold the RCU read lock.
5691 */
5692struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5693{
aa9d8560 5694 struct netdev_adjacent *upper;
9ff162a8 5695
2f268f12 5696 upper = list_first_or_null_rcu(&dev->adj_list.upper,
aa9d8560 5697 struct netdev_adjacent, list);
9ff162a8
JP
5698 if (upper && likely(upper->master))
5699 return upper->dev;
5700 return NULL;
5701}
5702EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5703
0a59f3a9 5704static int netdev_adjacent_sysfs_add(struct net_device *dev,
3ee32707
VF
5705 struct net_device *adj_dev,
5706 struct list_head *dev_list)
5707{
5708 char linkname[IFNAMSIZ+7];
5709 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5710 "upper_%s" : "lower_%s", adj_dev->name);
5711 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5712 linkname);
5713}
0a59f3a9 5714static void netdev_adjacent_sysfs_del(struct net_device *dev,
3ee32707
VF
5715 char *name,
5716 struct list_head *dev_list)
5717{
5718 char linkname[IFNAMSIZ+7];
5719 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5720 "upper_%s" : "lower_%s", name);
5721 sysfs_remove_link(&(dev->dev.kobj), linkname);
5722}
5723
7ce64c79
AF
5724static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5725 struct net_device *adj_dev,
5726 struct list_head *dev_list)
5727{
5728 return (dev_list == &dev->adj_list.upper ||
5729 dev_list == &dev->adj_list.lower) &&
5730 net_eq(dev_net(dev), dev_net(adj_dev));
5731}
3ee32707 5732
5d261913
VF
5733static int __netdev_adjacent_dev_insert(struct net_device *dev,
5734 struct net_device *adj_dev,
7863c054 5735 struct list_head *dev_list,
402dae96 5736 void *private, bool master)
5d261913
VF
5737{
5738 struct netdev_adjacent *adj;
842d67a7 5739 int ret;
5d261913 5740
6ea29da1 5741 adj = __netdev_find_adj(adj_dev, dev_list);
5d261913
VF
5742
5743 if (adj) {
790510d9 5744 adj->ref_nr += 1;
67b62f98
DA
5745 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5746 dev->name, adj_dev->name, adj->ref_nr);
5747
5d261913
VF
5748 return 0;
5749 }
5750
5751 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5752 if (!adj)
5753 return -ENOMEM;
5754
5755 adj->dev = adj_dev;
5756 adj->master = master;
790510d9 5757 adj->ref_nr = 1;
402dae96 5758 adj->private = private;
5d261913 5759 dev_hold(adj_dev);
2f268f12 5760
67b62f98
DA
5761 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5762 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5d261913 5763
7ce64c79 5764 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
3ee32707 5765 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5831d66e
VF
5766 if (ret)
5767 goto free_adj;
5768 }
5769
7863c054 5770 /* Ensure that master link is always the first item in list. */
842d67a7
VF
5771 if (master) {
5772 ret = sysfs_create_link(&(dev->dev.kobj),
5773 &(adj_dev->dev.kobj), "master");
5774 if (ret)
5831d66e 5775 goto remove_symlinks;
842d67a7 5776
7863c054 5777 list_add_rcu(&adj->list, dev_list);
842d67a7 5778 } else {
7863c054 5779 list_add_tail_rcu(&adj->list, dev_list);
842d67a7 5780 }
5d261913
VF
5781
5782 return 0;
842d67a7 5783
5831d66e 5784remove_symlinks:
7ce64c79 5785 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5786 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
842d67a7
VF
5787free_adj:
5788 kfree(adj);
974daef7 5789 dev_put(adj_dev);
842d67a7
VF
5790
5791 return ret;
5d261913
VF
5792}
5793
1d143d9f 5794static void __netdev_adjacent_dev_remove(struct net_device *dev,
5795 struct net_device *adj_dev,
93409033 5796 u16 ref_nr,
1d143d9f 5797 struct list_head *dev_list)
5d261913
VF
5798{
5799 struct netdev_adjacent *adj;
5800
67b62f98
DA
5801 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5802 dev->name, adj_dev->name, ref_nr);
5803
6ea29da1 5804 adj = __netdev_find_adj(adj_dev, dev_list);
5d261913 5805
2f268f12 5806 if (!adj) {
67b62f98 5807 pr_err("Adjacency does not exist for device %s from %s\n",
2f268f12 5808 dev->name, adj_dev->name);
67b62f98
DA
5809 WARN_ON(1);
5810 return;
2f268f12 5811 }
5d261913 5812
93409033 5813 if (adj->ref_nr > ref_nr) {
67b62f98
DA
5814 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5815 dev->name, adj_dev->name, ref_nr,
5816 adj->ref_nr - ref_nr);
93409033 5817 adj->ref_nr -= ref_nr;
5d261913
VF
5818 return;
5819 }
5820
842d67a7
VF
5821 if (adj->master)
5822 sysfs_remove_link(&(dev->dev.kobj), "master");
5823
7ce64c79 5824 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5825 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5831d66e 5826
5d261913 5827 list_del_rcu(&adj->list);
67b62f98 5828 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
2f268f12 5829 adj_dev->name, dev->name, adj_dev->name);
5d261913
VF
5830 dev_put(adj_dev);
5831 kfree_rcu(adj, rcu);
5832}
5833
1d143d9f 5834static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5835 struct net_device *upper_dev,
5836 struct list_head *up_list,
5837 struct list_head *down_list,
5838 void *private, bool master)
5d261913
VF
5839{
5840 int ret;
5841
790510d9 5842 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
93409033 5843 private, master);
5d261913
VF
5844 if (ret)
5845 return ret;
5846
790510d9 5847 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
93409033 5848 private, false);
5d261913 5849 if (ret) {
790510d9 5850 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5d261913
VF
5851 return ret;
5852 }
5853
5854 return 0;
5855}
5856
1d143d9f 5857static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5858 struct net_device *upper_dev,
93409033 5859 u16 ref_nr,
1d143d9f 5860 struct list_head *up_list,
5861 struct list_head *down_list)
5d261913 5862{
93409033
AC
5863 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5864 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5d261913
VF
5865}
5866
1d143d9f 5867static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5868 struct net_device *upper_dev,
5869 void *private, bool master)
2f268f12 5870{
f1170fd4
DA
5871 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5872 &dev->adj_list.upper,
5873 &upper_dev->adj_list.lower,
5874 private, master);
5d261913
VF
5875}
5876
1d143d9f 5877static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5878 struct net_device *upper_dev)
2f268f12 5879{
93409033 5880 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
2f268f12
VF
5881 &dev->adj_list.upper,
5882 &upper_dev->adj_list.lower);
5883}
5d261913 5884
9ff162a8 5885static int __netdev_upper_dev_link(struct net_device *dev,
402dae96 5886 struct net_device *upper_dev, bool master,
29bf24af 5887 void *upper_priv, void *upper_info)
9ff162a8 5888{
0e4ead9d 5889 struct netdev_notifier_changeupper_info changeupper_info;
5d261913 5890 int ret = 0;
9ff162a8
JP
5891
5892 ASSERT_RTNL();
5893
5894 if (dev == upper_dev)
5895 return -EBUSY;
5896
5897 /* To prevent loops, check if dev is not upper device to upper_dev. */
f1170fd4 5898 if (netdev_has_upper_dev(upper_dev, dev))
9ff162a8
JP
5899 return -EBUSY;
5900
f1170fd4 5901 if (netdev_has_upper_dev(dev, upper_dev))
9ff162a8
JP
5902 return -EEXIST;
5903
5904 if (master && netdev_master_upper_dev_get(dev))
5905 return -EBUSY;
5906
0e4ead9d
JP
5907 changeupper_info.upper_dev = upper_dev;
5908 changeupper_info.master = master;
5909 changeupper_info.linking = true;
29bf24af 5910 changeupper_info.upper_info = upper_info;
0e4ead9d 5911
573c7ba0
JP
5912 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5913 &changeupper_info.info);
5914 ret = notifier_to_errno(ret);
5915 if (ret)
5916 return ret;
5917
6dffb044 5918 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
402dae96 5919 master);
5d261913
VF
5920 if (ret)
5921 return ret;
9ff162a8 5922
b03804e7
IS
5923 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5924 &changeupper_info.info);
5925 ret = notifier_to_errno(ret);
5926 if (ret)
f1170fd4 5927 goto rollback;
b03804e7 5928
9ff162a8 5929 return 0;
5d261913 5930
f1170fd4 5931rollback:
2f268f12 5932 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5933
5934 return ret;
9ff162a8
JP
5935}
5936
5937/**
5938 * netdev_upper_dev_link - Add a link to the upper device
5939 * @dev: device
5940 * @upper_dev: new upper device
5941 *
5942 * Adds a link to device which is upper to this one. The caller must hold
5943 * the RTNL lock. On a failure a negative errno code is returned.
5944 * On success the reference counts are adjusted and the function
5945 * returns zero.
5946 */
5947int netdev_upper_dev_link(struct net_device *dev,
5948 struct net_device *upper_dev)
5949{
29bf24af 5950 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
9ff162a8
JP
5951}
5952EXPORT_SYMBOL(netdev_upper_dev_link);
5953
5954/**
5955 * netdev_master_upper_dev_link - Add a master link to the upper device
5956 * @dev: device
5957 * @upper_dev: new upper device
6dffb044 5958 * @upper_priv: upper device private
29bf24af 5959 * @upper_info: upper info to be passed down via notifier
9ff162a8
JP
5960 *
5961 * Adds a link to device which is upper to this one. In this case, only
5962 * one master upper device can be linked, although other non-master devices
5963 * might be linked as well. The caller must hold the RTNL lock.
5964 * On a failure a negative errno code is returned. On success the reference
5965 * counts are adjusted and the function returns zero.
5966 */
5967int netdev_master_upper_dev_link(struct net_device *dev,
6dffb044 5968 struct net_device *upper_dev,
29bf24af 5969 void *upper_priv, void *upper_info)
9ff162a8 5970{
29bf24af
JP
5971 return __netdev_upper_dev_link(dev, upper_dev, true,
5972 upper_priv, upper_info);
9ff162a8
JP
5973}
5974EXPORT_SYMBOL(netdev_master_upper_dev_link);
5975
5976/**
5977 * netdev_upper_dev_unlink - Removes a link to upper device
5978 * @dev: device
5979 * @upper_dev: new upper device
5980 *
5981 * Removes a link to device which is upper to this one. The caller must hold
5982 * the RTNL lock.
5983 */
5984void netdev_upper_dev_unlink(struct net_device *dev,
5985 struct net_device *upper_dev)
5986{
0e4ead9d 5987 struct netdev_notifier_changeupper_info changeupper_info;
9ff162a8
JP
5988 ASSERT_RTNL();
5989
0e4ead9d
JP
5990 changeupper_info.upper_dev = upper_dev;
5991 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5992 changeupper_info.linking = false;
5993
573c7ba0
JP
5994 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5995 &changeupper_info.info);
5996
2f268f12 5997 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913 5998
0e4ead9d
JP
5999 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6000 &changeupper_info.info);
9ff162a8
JP
6001}
6002EXPORT_SYMBOL(netdev_upper_dev_unlink);
6003
61bd3857
MS
6004/**
6005 * netdev_bonding_info_change - Dispatch event about slave change
6006 * @dev: device
4a26e453 6007 * @bonding_info: info to dispatch
61bd3857
MS
6008 *
6009 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6010 * The caller must hold the RTNL lock.
6011 */
6012void netdev_bonding_info_change(struct net_device *dev,
6013 struct netdev_bonding_info *bonding_info)
6014{
6015 struct netdev_notifier_bonding_info info;
6016
6017 memcpy(&info.bonding_info, bonding_info,
6018 sizeof(struct netdev_bonding_info));
6019 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6020 &info.info);
6021}
6022EXPORT_SYMBOL(netdev_bonding_info_change);
6023
2ce1ee17 6024static void netdev_adjacent_add_links(struct net_device *dev)
4c75431a
AF
6025{
6026 struct netdev_adjacent *iter;
6027
6028 struct net *net = dev_net(dev);
6029
6030 list_for_each_entry(iter, &dev->adj_list.upper, list) {
be4da0e3 6031 if (!net_eq(net, dev_net(iter->dev)))
4c75431a
AF
6032 continue;
6033 netdev_adjacent_sysfs_add(iter->dev, dev,
6034 &iter->dev->adj_list.lower);
6035 netdev_adjacent_sysfs_add(dev, iter->dev,
6036 &dev->adj_list.upper);
6037 }
6038
6039 list_for_each_entry(iter, &dev->adj_list.lower, list) {
be4da0e3 6040 if (!net_eq(net, dev_net(iter->dev)))
4c75431a
AF
6041 continue;
6042 netdev_adjacent_sysfs_add(iter->dev, dev,
6043 &iter->dev->adj_list.upper);
6044 netdev_adjacent_sysfs_add(dev, iter->dev,
6045 &dev->adj_list.lower);
6046 }
6047}
6048
2ce1ee17 6049static void netdev_adjacent_del_links(struct net_device *dev)
4c75431a
AF
6050{
6051 struct netdev_adjacent *iter;
6052
6053 struct net *net = dev_net(dev);
6054
6055 list_for_each_entry(iter, &dev->adj_list.upper, list) {
be4da0e3 6056 if (!net_eq(net, dev_net(iter->dev)))
4c75431a
AF
6057 continue;
6058 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6059 &iter->dev->adj_list.lower);
6060 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6061 &dev->adj_list.upper);
6062 }
6063
6064 list_for_each_entry(iter, &dev->adj_list.lower, list) {
be4da0e3 6065 if (!net_eq(net, dev_net(iter->dev)))
4c75431a
AF
6066 continue;
6067 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6068 &iter->dev->adj_list.upper);
6069 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6070 &dev->adj_list.lower);
6071 }
6072}
6073
5bb025fa 6074void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
402dae96 6075{
5bb025fa 6076 struct netdev_adjacent *iter;
402dae96 6077
4c75431a
AF
6078 struct net *net = dev_net(dev);
6079
5bb025fa 6080 list_for_each_entry(iter, &dev->adj_list.upper, list) {
be4da0e3 6081 if (!net_eq(net, dev_net(iter->dev)))
4c75431a 6082 continue;
5bb025fa
VF
6083 netdev_adjacent_sysfs_del(iter->dev, oldname,
6084 &iter->dev->adj_list.lower);
6085 netdev_adjacent_sysfs_add(iter->dev, dev,
6086 &iter->dev->adj_list.lower);
6087 }
402dae96 6088
5bb025fa 6089 list_for_each_entry(iter, &dev->adj_list.lower, list) {
be4da0e3 6090 if (!net_eq(net, dev_net(iter->dev)))
4c75431a 6091 continue;
5bb025fa
VF
6092 netdev_adjacent_sysfs_del(iter->dev, oldname,
6093 &iter->dev->adj_list.upper);
6094 netdev_adjacent_sysfs_add(iter->dev, dev,
6095 &iter->dev->adj_list.upper);
6096 }
402dae96 6097}
402dae96
VF
6098
6099void *netdev_lower_dev_get_private(struct net_device *dev,
6100 struct net_device *lower_dev)
6101{
6102 struct netdev_adjacent *lower;
6103
6104 if (!lower_dev)
6105 return NULL;
6ea29da1 6106 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
402dae96
VF
6107 if (!lower)
6108 return NULL;
6109
6110 return lower->private;
6111}
6112EXPORT_SYMBOL(netdev_lower_dev_get_private);
6113
4085ebe8 6114
952fcfd0 6115int dev_get_nest_level(struct net_device *dev)
4085ebe8
VY
6116{
6117 struct net_device *lower = NULL;
6118 struct list_head *iter;
6119 int max_nest = -1;
6120 int nest;
6121
6122 ASSERT_RTNL();
6123
6124 netdev_for_each_lower_dev(dev, lower, iter) {
952fcfd0 6125 nest = dev_get_nest_level(lower);
4085ebe8
VY
6126 if (max_nest < nest)
6127 max_nest = nest;
6128 }
6129
952fcfd0 6130 return max_nest + 1;
4085ebe8
VY
6131}
6132EXPORT_SYMBOL(dev_get_nest_level);
6133
04d48266
JP
6134/**
6135 * netdev_lower_change - Dispatch event about lower device state change
6136 * @lower_dev: device
6137 * @lower_state_info: state to dispatch
6138 *
6139 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6140 * The caller must hold the RTNL lock.
6141 */
6142void netdev_lower_state_changed(struct net_device *lower_dev,
6143 void *lower_state_info)
6144{
6145 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6146
6147 ASSERT_RTNL();
6148 changelowerstate_info.lower_state_info = lower_state_info;
6149 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6150 &changelowerstate_info.info);
6151}
6152EXPORT_SYMBOL(netdev_lower_state_changed);
6153
18bfb924
JP
6154int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6155 struct neighbour *n)
6156{
6157 struct net_device *lower_dev, *stop_dev;
6158 struct list_head *iter;
6159 int err;
6160
6161 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6162 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6163 continue;
6164 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6165 if (err) {
6166 stop_dev = lower_dev;
6167 goto rollback;
6168 }
6169 }
6170 return 0;
6171
6172rollback:
6173 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6174 if (lower_dev == stop_dev)
6175 break;
6176 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6177 continue;
6178 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6179 }
6180 return err;
6181}
6182EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6183
6184void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6185 struct neighbour *n)
6186{
6187 struct net_device *lower_dev;
6188 struct list_head *iter;
6189
6190 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6191 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6192 continue;
6193 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6194 }
6195}
6196EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6197
b6c40d68
PM
6198static void dev_change_rx_flags(struct net_device *dev, int flags)
6199{
d314774c
SH
6200 const struct net_device_ops *ops = dev->netdev_ops;
6201
d2615bf4 6202 if (ops->ndo_change_rx_flags)
d314774c 6203 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
6204}
6205
991fb3f7 6206static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
1da177e4 6207{
b536db93 6208 unsigned int old_flags = dev->flags;
d04a48b0
EB
6209 kuid_t uid;
6210 kgid_t gid;
1da177e4 6211
24023451
PM
6212 ASSERT_RTNL();
6213
dad9b335
WC
6214 dev->flags |= IFF_PROMISC;
6215 dev->promiscuity += inc;
6216 if (dev->promiscuity == 0) {
6217 /*
6218 * Avoid overflow.
6219 * If inc causes overflow, untouch promisc and return error.
6220 */
6221 if (inc < 0)
6222 dev->flags &= ~IFF_PROMISC;
6223 else {
6224 dev->promiscuity -= inc;
7b6cd1ce
JP
6225 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6226 dev->name);
dad9b335
WC
6227 return -EOVERFLOW;
6228 }
6229 }
52609c0b 6230 if (dev->flags != old_flags) {
7b6cd1ce
JP
6231 pr_info("device %s %s promiscuous mode\n",
6232 dev->name,
6233 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
6234 if (audit_enabled) {
6235 current_uid_gid(&uid, &gid);
7759db82
KHK
6236 audit_log(current->audit_context, GFP_ATOMIC,
6237 AUDIT_ANOM_PROMISCUOUS,
6238 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6239 dev->name, (dev->flags & IFF_PROMISC),
6240 (old_flags & IFF_PROMISC),
e1760bd5 6241 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
6242 from_kuid(&init_user_ns, uid),
6243 from_kgid(&init_user_ns, gid),
7759db82 6244 audit_get_sessionid(current));
8192b0c4 6245 }
24023451 6246
b6c40d68 6247 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 6248 }
991fb3f7
ND
6249 if (notify)
6250 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
dad9b335 6251 return 0;
1da177e4
LT
6252}
6253
4417da66
PM
6254/**
6255 * dev_set_promiscuity - update promiscuity count on a device
6256 * @dev: device
6257 * @inc: modifier
6258 *
6259 * Add or remove promiscuity from a device. While the count in the device
6260 * remains above zero the interface remains promiscuous. Once it hits zero
6261 * the device reverts back to normal filtering operation. A negative inc
6262 * value is used to drop promiscuity on the device.
dad9b335 6263 * Return 0 if successful or a negative errno code on error.
4417da66 6264 */
dad9b335 6265int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 6266{
b536db93 6267 unsigned int old_flags = dev->flags;
dad9b335 6268 int err;
4417da66 6269
991fb3f7 6270 err = __dev_set_promiscuity(dev, inc, true);
4b5a698e 6271 if (err < 0)
dad9b335 6272 return err;
4417da66
PM
6273 if (dev->flags != old_flags)
6274 dev_set_rx_mode(dev);
dad9b335 6275 return err;
4417da66 6276}
d1b19dff 6277EXPORT_SYMBOL(dev_set_promiscuity);
4417da66 6278
991fb3f7 6279static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
1da177e4 6280{
991fb3f7 6281 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
1da177e4 6282
24023451
PM
6283 ASSERT_RTNL();
6284
1da177e4 6285 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
6286 dev->allmulti += inc;
6287 if (dev->allmulti == 0) {
6288 /*
6289 * Avoid overflow.
6290 * If inc causes overflow, untouch allmulti and return error.
6291 */
6292 if (inc < 0)
6293 dev->flags &= ~IFF_ALLMULTI;
6294 else {
6295 dev->allmulti -= inc;
7b6cd1ce
JP
6296 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6297 dev->name);
dad9b335
WC
6298 return -EOVERFLOW;
6299 }
6300 }
24023451 6301 if (dev->flags ^ old_flags) {
b6c40d68 6302 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 6303 dev_set_rx_mode(dev);
991fb3f7
ND
6304 if (notify)
6305 __dev_notify_flags(dev, old_flags,
6306 dev->gflags ^ old_gflags);
24023451 6307 }
dad9b335 6308 return 0;
4417da66 6309}
991fb3f7
ND
6310
6311/**
6312 * dev_set_allmulti - update allmulti count on a device
6313 * @dev: device
6314 * @inc: modifier
6315 *
6316 * Add or remove reception of all multicast frames to a device. While the
6317 * count in the device remains above zero the interface remains listening
6318 * to all interfaces. Once it hits zero the device reverts back to normal
6319 * filtering operation. A negative @inc value is used to drop the counter
6320 * when releasing a resource needing all multicasts.
6321 * Return 0 if successful or a negative errno code on error.
6322 */
6323
6324int dev_set_allmulti(struct net_device *dev, int inc)
6325{
6326 return __dev_set_allmulti(dev, inc, true);
6327}
d1b19dff 6328EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
6329
6330/*
6331 * Upload unicast and multicast address lists to device and
6332 * configure RX filtering. When the device doesn't support unicast
53ccaae1 6333 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
6334 * are present.
6335 */
6336void __dev_set_rx_mode(struct net_device *dev)
6337{
d314774c
SH
6338 const struct net_device_ops *ops = dev->netdev_ops;
6339
4417da66
PM
6340 /* dev_open will call this function so the list will stay sane. */
6341 if (!(dev->flags&IFF_UP))
6342 return;
6343
6344 if (!netif_device_present(dev))
40b77c94 6345 return;
4417da66 6346
01789349 6347 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
6348 /* Unicast addresses changes may only happen under the rtnl,
6349 * therefore calling __dev_set_promiscuity here is safe.
6350 */
32e7bfc4 6351 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
991fb3f7 6352 __dev_set_promiscuity(dev, 1, false);
2d348d1f 6353 dev->uc_promisc = true;
32e7bfc4 6354 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
991fb3f7 6355 __dev_set_promiscuity(dev, -1, false);
2d348d1f 6356 dev->uc_promisc = false;
4417da66 6357 }
4417da66 6358 }
01789349
JP
6359
6360 if (ops->ndo_set_rx_mode)
6361 ops->ndo_set_rx_mode(dev);
4417da66
PM
6362}
6363
/*
 * Locked wrapper: runs __dev_set_rx_mode() between netif_addr_lock_bh() and
 * netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6370
f0db275a
SH
6371/**
6372 * dev_get_flags - get flags reported to userspace
6373 * @dev: device
6374 *
6375 * Get the combination of flag bits exported through APIs to userspace.
6376 */
95c96174 6377unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 6378{
95c96174 6379 unsigned int flags;
1da177e4
LT
6380
6381 flags = (dev->flags & ~(IFF_PROMISC |
6382 IFF_ALLMULTI |
b00055aa
SR
6383 IFF_RUNNING |
6384 IFF_LOWER_UP |
6385 IFF_DORMANT)) |
1da177e4
LT
6386 (dev->gflags & (IFF_PROMISC |
6387 IFF_ALLMULTI));
6388
b00055aa
SR
6389 if (netif_running(dev)) {
6390 if (netif_oper_up(dev))
6391 flags |= IFF_RUNNING;
6392 if (netif_carrier_ok(dev))
6393 flags |= IFF_LOWER_UP;
6394 if (netif_dormant(dev))
6395 flags |= IFF_DORMANT;
6396 }
1da177e4
LT
6397
6398 return flags;
6399}
d1b19dff 6400EXPORT_SYMBOL(dev_get_flags);
1da177e4 6401
bd380811 6402int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 6403{
b536db93 6404 unsigned int old_flags = dev->flags;
bd380811 6405 int ret;
1da177e4 6406
24023451
PM
6407 ASSERT_RTNL();
6408
1da177e4
LT
6409 /*
6410 * Set the flags on our device.
6411 */
6412
6413 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6414 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6415 IFF_AUTOMEDIA)) |
6416 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6417 IFF_ALLMULTI));
6418
6419 /*
6420 * Load in the correct multicast list now the flags have changed.
6421 */
6422
b6c40d68
PM
6423 if ((old_flags ^ flags) & IFF_MULTICAST)
6424 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 6425
4417da66 6426 dev_set_rx_mode(dev);
1da177e4
LT
6427
6428 /*
6429 * Have we downed the interface. We handle IFF_UP ourselves
6430 * according to user attempts to set it, rather than blindly
6431 * setting it.
6432 */
6433
6434 ret = 0;
d215d10f 6435 if ((old_flags ^ flags) & IFF_UP)
bd380811 6436 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4 6437
1da177e4 6438 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff 6439 int inc = (flags & IFF_PROMISC) ? 1 : -1;
991fb3f7 6440 unsigned int old_flags = dev->flags;
d1b19dff 6441
1da177e4 6442 dev->gflags ^= IFF_PROMISC;
991fb3f7
ND
6443
6444 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6445 if (dev->flags != old_flags)
6446 dev_set_rx_mode(dev);
1da177e4
LT
6447 }
6448
6449 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6450 is important. Some (broken) drivers set IFF_PROMISC, when
6451 IFF_ALLMULTI is requested not asking us and not reporting.
6452 */
6453 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
6454 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6455
1da177e4 6456 dev->gflags ^= IFF_ALLMULTI;
991fb3f7 6457 __dev_set_allmulti(dev, inc, false);
1da177e4
LT
6458 }
6459
bd380811
PM
6460 return ret;
6461}
6462
a528c219
ND
6463void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6464 unsigned int gchanges)
bd380811
PM
6465{
6466 unsigned int changes = dev->flags ^ old_flags;
6467
a528c219 6468 if (gchanges)
7f294054 6469 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
a528c219 6470
bd380811
PM
6471 if (changes & IFF_UP) {
6472 if (dev->flags & IFF_UP)
6473 call_netdevice_notifiers(NETDEV_UP, dev);
6474 else
6475 call_netdevice_notifiers(NETDEV_DOWN, dev);
6476 }
6477
6478 if (dev->flags & IFF_UP &&
be9efd36
JP
6479 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6480 struct netdev_notifier_change_info change_info;
6481
6482 change_info.flags_changed = changes;
6483 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6484 &change_info.info);
6485 }
bd380811
PM
6486}
6487
6488/**
6489 * dev_change_flags - change device settings
6490 * @dev: device
6491 * @flags: device state flags
6492 *
6493 * Change settings on device based state flags. The flags are
6494 * in the userspace exported format.
6495 */
b536db93 6496int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 6497{
b536db93 6498 int ret;
991fb3f7 6499 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
bd380811
PM
6500
6501 ret = __dev_change_flags(dev, flags);
6502 if (ret < 0)
6503 return ret;
6504
991fb3f7 6505 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
a528c219 6506 __dev_notify_flags(dev, old_flags, changes);
1da177e4
LT
6507 return ret;
6508}
d1b19dff 6509EXPORT_SYMBOL(dev_change_flags);
1da177e4 6510
2315dc91
VF
6511static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6512{
6513 const struct net_device_ops *ops = dev->netdev_ops;
6514
6515 if (ops->ndo_change_mtu)
6516 return ops->ndo_change_mtu(dev, new_mtu);
6517
6518 dev->mtu = new_mtu;
6519 return 0;
6520}
6521
f0db275a
SH
6522/**
6523 * dev_set_mtu - Change maximum transfer unit
6524 * @dev: device
6525 * @new_mtu: new transfer unit
6526 *
6527 * Change the maximum transfer size of the network device.
6528 */
1da177e4
LT
6529int dev_set_mtu(struct net_device *dev, int new_mtu)
6530{
2315dc91 6531 int err, orig_mtu;
1da177e4
LT
6532
6533 if (new_mtu == dev->mtu)
6534 return 0;
6535
61e84623
JW
6536 /* MTU must be positive, and in range */
6537 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6538 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6539 dev->name, new_mtu, dev->min_mtu);
1da177e4 6540 return -EINVAL;
61e84623
JW
6541 }
6542
6543 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6544 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
a0e65de7 6545 dev->name, new_mtu, dev->max_mtu);
61e84623
JW
6546 return -EINVAL;
6547 }
1da177e4
LT
6548
6549 if (!netif_device_present(dev))
6550 return -ENODEV;
6551
1d486bfb
VF
6552 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6553 err = notifier_to_errno(err);
6554 if (err)
6555 return err;
d314774c 6556
2315dc91
VF
6557 orig_mtu = dev->mtu;
6558 err = __dev_set_mtu(dev, new_mtu);
d314774c 6559
2315dc91
VF
6560 if (!err) {
6561 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6562 err = notifier_to_errno(err);
6563 if (err) {
6564 /* setting mtu back and notifying everyone again,
6565 * so that they have a chance to revert changes.
6566 */
6567 __dev_set_mtu(dev, orig_mtu);
6568 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6569 }
6570 }
1da177e4
LT
6571 return err;
6572}
d1b19dff 6573EXPORT_SYMBOL(dev_set_mtu);
1da177e4 6574
cbda10fa
VD
6575/**
6576 * dev_set_group - Change group this device belongs to
6577 * @dev: device
6578 * @new_group: group this device should belong to
6579 */
6580void dev_set_group(struct net_device *dev, int new_group)
6581{
6582 dev->group = new_group;
6583}
6584EXPORT_SYMBOL(dev_set_group);
6585
f0db275a
SH
6586/**
6587 * dev_set_mac_address - Change Media Access Control Address
6588 * @dev: device
6589 * @sa: new address
6590 *
6591 * Change the hardware (MAC) address of the device
6592 */
1da177e4
LT
6593int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6594{
d314774c 6595 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
6596 int err;
6597
d314774c 6598 if (!ops->ndo_set_mac_address)
1da177e4
LT
6599 return -EOPNOTSUPP;
6600 if (sa->sa_family != dev->type)
6601 return -EINVAL;
6602 if (!netif_device_present(dev))
6603 return -ENODEV;
d314774c 6604 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
6605 if (err)
6606 return err;
fbdeca2d 6607 dev->addr_assign_type = NET_ADDR_SET;
f6521516 6608 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 6609 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 6610 return 0;
1da177e4 6611}
d1b19dff 6612EXPORT_SYMBOL(dev_set_mac_address);
1da177e4 6613
4bf84c35
JP
6614/**
6615 * dev_change_carrier - Change device carrier
6616 * @dev: device
691b3b7e 6617 * @new_carrier: new value
4bf84c35
JP
6618 *
6619 * Change device carrier
6620 */
6621int dev_change_carrier(struct net_device *dev, bool new_carrier)
6622{
6623 const struct net_device_ops *ops = dev->netdev_ops;
6624
6625 if (!ops->ndo_change_carrier)
6626 return -EOPNOTSUPP;
6627 if (!netif_device_present(dev))
6628 return -ENODEV;
6629 return ops->ndo_change_carrier(dev, new_carrier);
6630}
6631EXPORT_SYMBOL(dev_change_carrier);
6632
66b52b0d
JP
6633/**
6634 * dev_get_phys_port_id - Get device physical port ID
6635 * @dev: device
6636 * @ppid: port ID
6637 *
6638 * Get device physical port ID
6639 */
6640int dev_get_phys_port_id(struct net_device *dev,
02637fce 6641 struct netdev_phys_item_id *ppid)
66b52b0d
JP
6642{
6643 const struct net_device_ops *ops = dev->netdev_ops;
6644
6645 if (!ops->ndo_get_phys_port_id)
6646 return -EOPNOTSUPP;
6647 return ops->ndo_get_phys_port_id(dev, ppid);
6648}
6649EXPORT_SYMBOL(dev_get_phys_port_id);
6650
db24a904
DA
6651/**
6652 * dev_get_phys_port_name - Get device physical port name
6653 * @dev: device
6654 * @name: port name
ed49e650 6655 * @len: limit of bytes to copy to name
db24a904
DA
6656 *
6657 * Get device physical port name
6658 */
6659int dev_get_phys_port_name(struct net_device *dev,
6660 char *name, size_t len)
6661{
6662 const struct net_device_ops *ops = dev->netdev_ops;
6663
6664 if (!ops->ndo_get_phys_port_name)
6665 return -EOPNOTSUPP;
6666 return ops->ndo_get_phys_port_name(dev, name, len);
6667}
6668EXPORT_SYMBOL(dev_get_phys_port_name);
6669
d746d707
AK
6670/**
6671 * dev_change_proto_down - update protocol port state information
6672 * @dev: device
6673 * @proto_down: new value
6674 *
6675 * This info can be used by switch drivers to set the phys state of the
6676 * port.
6677 */
6678int dev_change_proto_down(struct net_device *dev, bool proto_down)
6679{
6680 const struct net_device_ops *ops = dev->netdev_ops;
6681
6682 if (!ops->ndo_change_proto_down)
6683 return -EOPNOTSUPP;
6684 if (!netif_device_present(dev))
6685 return -ENODEV;
6686 return ops->ndo_change_proto_down(dev, proto_down);
6687}
6688EXPORT_SYMBOL(dev_change_proto_down);
6689
a7862b45
BB
6690/**
6691 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6692 * @dev: device
6693 * @fd: new program fd or negative value to clear
6694 *
6695 * Set or clear a bpf program for a device
6696 */
6697int dev_change_xdp_fd(struct net_device *dev, int fd)
6698{
6699 const struct net_device_ops *ops = dev->netdev_ops;
6700 struct bpf_prog *prog = NULL;
6701 struct netdev_xdp xdp = {};
6702 int err;
6703
6704 if (!ops->ndo_xdp)
6705 return -EOPNOTSUPP;
6706 if (fd >= 0) {
6707 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6708 if (IS_ERR(prog))
6709 return PTR_ERR(prog);
6710 }
6711
6712 xdp.command = XDP_SETUP_PROG;
6713 xdp.prog = prog;
6714 err = ops->ndo_xdp(dev, &xdp);
6715 if (err < 0 && prog)
6716 bpf_prog_put(prog);
6717
6718 return err;
6719}
6720EXPORT_SYMBOL(dev_change_xdp_fd);
6721
1da177e4
LT
6722/**
6723 * dev_new_index - allocate an ifindex
c4ea43c5 6724 * @net: the applicable net namespace
1da177e4
LT
6725 *
6726 * Returns a suitable unique value for a new device interface
6727 * number. The caller must hold the rtnl semaphore or the
6728 * dev_base_lock to be sure it remains unique.
6729 */
881d966b 6730static int dev_new_index(struct net *net)
1da177e4 6731{
aa79e66e 6732 int ifindex = net->ifindex;
1da177e4
LT
6733 for (;;) {
6734 if (++ifindex <= 0)
6735 ifindex = 1;
881d966b 6736 if (!__dev_get_by_index(net, ifindex))
aa79e66e 6737 return net->ifindex = ifindex;
1da177e4
LT
6738 }
6739}
6740
1da177e4 6741/* Delayed registration/unregistration */
/* Devices queued by net_set_todo() are appended to this list. */
3b5b34fd 6742static LIST_HEAD(net_todo_list);
/* Wait queue head; NOTE(review): its waiters/wakers are outside this chunk. */
200b916f 6743DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
1da177e4 6744
6f05f629 6745static void net_set_todo(struct net_device *dev)
1da177e4 6746{
1da177e4 6747 list_add_tail(&dev->todo_list, &net_todo_list);
50624c93 6748 dev_net(dev)->dev_unreg_count++;
1da177e4
LT
6749}
6750
9b5e383c 6751static void rollback_registered_many(struct list_head *head)
93ee31f1 6752{
e93737b0 6753 struct net_device *dev, *tmp;
5cde2829 6754 LIST_HEAD(close_head);
9b5e383c 6755
93ee31f1
DL
6756 BUG_ON(dev_boot_phase);
6757 ASSERT_RTNL();
6758
e93737b0 6759 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 6760 /* Some devices call without registering
e93737b0
KK
6761 * for initialization unwind. Remove those
6762 * devices and proceed with the remaining.
9b5e383c
ED
6763 */
6764 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
6765 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6766 dev->name, dev);
93ee31f1 6767
9b5e383c 6768 WARN_ON(1);
e93737b0
KK
6769 list_del(&dev->unreg_list);
6770 continue;
9b5e383c 6771 }
449f4544 6772 dev->dismantle = true;
9b5e383c 6773 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 6774 }
93ee31f1 6775
44345724 6776 /* If device is running, close it first. */
5cde2829
EB
6777 list_for_each_entry(dev, head, unreg_list)
6778 list_add_tail(&dev->close_list, &close_head);
99c4a26a 6779 dev_close_many(&close_head, true);
93ee31f1 6780
44345724 6781 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
6782 /* And unlink it from device chain. */
6783 unlist_netdevice(dev);
93ee31f1 6784
9b5e383c
ED
6785 dev->reg_state = NETREG_UNREGISTERING;
6786 }
41852497 6787 flush_all_backlogs();
93ee31f1
DL
6788
6789 synchronize_net();
6790
9b5e383c 6791 list_for_each_entry(dev, head, unreg_list) {
395eea6c
MB
6792 struct sk_buff *skb = NULL;
6793
9b5e383c
ED
6794 /* Shutdown queueing discipline. */
6795 dev_shutdown(dev);
93ee31f1
DL
6796
6797
9b5e383c
ED
6798 /* Notify protocols, that we are about to destroy
6799 this device. They should clean all the things.
6800 */
6801 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 6802
395eea6c
MB
6803 if (!dev->rtnl_link_ops ||
6804 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6805 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6806 GFP_KERNEL);
6807
9b5e383c
ED
6808 /*
6809 * Flush the unicast and multicast chains
6810 */
a748ee24 6811 dev_uc_flush(dev);
22bedad3 6812 dev_mc_flush(dev);
93ee31f1 6813
9b5e383c
ED
6814 if (dev->netdev_ops->ndo_uninit)
6815 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 6816
395eea6c
MB
6817 if (skb)
6818 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
56bfa7ee 6819
9ff162a8
JP
6820 /* Notifier chain MUST detach us all upper devices. */
6821 WARN_ON(netdev_has_any_upper_dev(dev));
0f524a80 6822 WARN_ON(netdev_has_any_lower_dev(dev));
93ee31f1 6823
9b5e383c
ED
6824 /* Remove entries from kobject tree */
6825 netdev_unregister_kobject(dev);
024e9679
AD
6826#ifdef CONFIG_XPS
6827 /* Remove XPS queueing entries */
6828 netif_reset_xps_queues_gt(dev, 0);
6829#endif
9b5e383c 6830 }
93ee31f1 6831
850a545b 6832 synchronize_net();
395264d5 6833
a5ee1551 6834 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
6835 dev_put(dev);
6836}
6837
6838static void rollback_registered(struct net_device *dev)
6839{
6840 LIST_HEAD(single);
6841
6842 list_add(&dev->unreg_list, &single);
6843 rollback_registered_many(&single);
ceaaec98 6844 list_del(&single);
93ee31f1
DL
6845}
6846
fd867d51
JW
6847static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6848 struct net_device *upper, netdev_features_t features)
6849{
6850 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6851 netdev_features_t feature;
5ba3f7d6 6852 int feature_bit;
fd867d51 6853
5ba3f7d6
JW
6854 for_each_netdev_feature(&upper_disables, feature_bit) {
6855 feature = __NETIF_F_BIT(feature_bit);
fd867d51
JW
6856 if (!(upper->wanted_features & feature)
6857 && (features & feature)) {
6858 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6859 &feature, upper->name);
6860 features &= ~feature;
6861 }
6862 }
6863
6864 return features;
6865}
6866
6867static void netdev_sync_lower_features(struct net_device *upper,
6868 struct net_device *lower, netdev_features_t features)
6869{
6870 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6871 netdev_features_t feature;
5ba3f7d6 6872 int feature_bit;
fd867d51 6873
5ba3f7d6
JW
6874 for_each_netdev_feature(&upper_disables, feature_bit) {
6875 feature = __NETIF_F_BIT(feature_bit);
fd867d51
JW
6876 if (!(features & feature) && (lower->features & feature)) {
6877 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6878 &feature, lower->name);
6879 lower->wanted_features &= ~feature;
6880 netdev_update_features(lower);
6881
6882 if (unlikely(lower->features & feature))
6883 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6884 &feature, lower->name);
6885 }
6886 }
6887}
6888
c8f44aff
MM
6889static netdev_features_t netdev_fix_features(struct net_device *dev,
6890 netdev_features_t features)
b63365a2 6891{
57422dc5
MM
6892 /* Fix illegal checksum combinations */
6893 if ((features & NETIF_F_HW_CSUM) &&
6894 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 6895 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
6896 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6897 }
6898
b63365a2 6899 /* TSO requires that SG is present as well. */
ea2d3688 6900 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 6901 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 6902 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
6903 }
6904
ec5f0615
PS
6905 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6906 !(features & NETIF_F_IP_CSUM)) {
6907 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6908 features &= ~NETIF_F_TSO;
6909 features &= ~NETIF_F_TSO_ECN;
6910 }
6911
6912 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6913 !(features & NETIF_F_IPV6_CSUM)) {
6914 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6915 features &= ~NETIF_F_TSO6;
6916 }
6917
b1dc497b
AD
6918 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6919 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6920 features &= ~NETIF_F_TSO_MANGLEID;
6921
31d8b9e0
BH
6922 /* TSO ECN requires that TSO is present as well. */
6923 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6924 features &= ~NETIF_F_TSO_ECN;
6925
212b573f
MM
6926 /* Software GSO depends on SG. */
6927 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 6928 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
6929 features &= ~NETIF_F_GSO;
6930 }
6931
acd1130e 6932 /* UFO needs SG and checksumming */
b63365a2 6933 if (features & NETIF_F_UFO) {
79032644 6934 /* maybe split UFO into V4 and V6? */
c8cd0989
TH
6935 if (!(features & NETIF_F_HW_CSUM) &&
6936 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6937 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6f404e44 6938 netdev_dbg(dev,
acd1130e 6939 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
6940 features &= ~NETIF_F_UFO;
6941 }
6942
6943 if (!(features & NETIF_F_SG)) {
6f404e44 6944 netdev_dbg(dev,
acd1130e 6945 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
6946 features &= ~NETIF_F_UFO;
6947 }
6948 }
6949
802ab55a
AD
6950 /* GSO partial features require GSO partial be set */
6951 if ((features & dev->gso_partial_features) &&
6952 !(features & NETIF_F_GSO_PARTIAL)) {
6953 netdev_dbg(dev,
6954 "Dropping partially supported GSO features since no GSO partial.\n");
6955 features &= ~dev->gso_partial_features;
6956 }
6957
d0290214
JP
6958#ifdef CONFIG_NET_RX_BUSY_POLL
6959 if (dev->netdev_ops->ndo_busy_poll)
6960 features |= NETIF_F_BUSY_POLL;
6961 else
6962#endif
6963 features &= ~NETIF_F_BUSY_POLL;
6964
b63365a2
HX
6965 return features;
6966}
b63365a2 6967
6cb6a27c 6968int __netdev_update_features(struct net_device *dev)
5455c699 6969{
fd867d51 6970 struct net_device *upper, *lower;
c8f44aff 6971 netdev_features_t features;
fd867d51 6972 struct list_head *iter;
e7868a85 6973 int err = -1;
5455c699 6974
87267485
MM
6975 ASSERT_RTNL();
6976
5455c699
MM
6977 features = netdev_get_wanted_features(dev);
6978
6979 if (dev->netdev_ops->ndo_fix_features)
6980 features = dev->netdev_ops->ndo_fix_features(dev, features);
6981
6982 /* driver might be less strict about feature dependencies */
6983 features = netdev_fix_features(dev, features);
6984
fd867d51
JW
6985 /* some features can't be enabled if they're off an an upper device */
6986 netdev_for_each_upper_dev_rcu(dev, upper, iter)
6987 features = netdev_sync_upper_features(dev, upper, features);
6988
5455c699 6989 if (dev->features == features)
e7868a85 6990 goto sync_lower;
5455c699 6991
c8f44aff
MM
6992 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6993 &dev->features, &features);
5455c699
MM
6994
6995 if (dev->netdev_ops->ndo_set_features)
6996 err = dev->netdev_ops->ndo_set_features(dev, features);
5f8dc33e
NA
6997 else
6998 err = 0;
5455c699 6999
6cb6a27c 7000 if (unlikely(err < 0)) {
5455c699 7001 netdev_err(dev,
c8f44aff
MM
7002 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7003 err, &features, &dev->features);
17b85d29
NA
7004 /* return non-0 since some features might have changed and
7005 * it's better to fire a spurious notification than miss it
7006 */
7007 return -1;
6cb6a27c
MM
7008 }
7009
e7868a85 7010sync_lower:
fd867d51
JW
7011 /* some features must be disabled on lower devices when disabled
7012 * on an upper device (think: bonding master or bridge)
7013 */
7014 netdev_for_each_lower_dev(dev, lower, iter)
7015 netdev_sync_lower_features(dev, lower, features);
7016
6cb6a27c
MM
7017 if (!err)
7018 dev->features = features;
7019
e7868a85 7020 return err < 0 ? 0 : 1;
6cb6a27c
MM
7021}
7022
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set and notify listeners only when
 *	__netdev_update_features() reports a non-zero result. Call this
 *	after driver or hardware dependent conditions that influence the
 *	features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (!changed)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7037
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set and always send the feature
 *	change notification, whether or not the set differs from before.
 *	Use this rather than netdev_update_features() when
 *	dev->vlan_features might have changed too, so that the change is
 *	propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we notify unconditionally. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7054
fc4a7489
PM
7055/**
7056 * netif_stacked_transfer_operstate - transfer operstate
7057 * @rootdev: the root or lower level device to transfer state from
7058 * @dev: the device to transfer operstate to
7059 *
7060 * Transfer operational state from root to device. This is normally
7061 * called when a stacking relationship exists between the root
7062 * device and the device(a leaf device).
7063 */
7064void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7065 struct net_device *dev)
7066{
7067 if (rootdev->operstate == IF_OPER_DORMANT)
7068 netif_dormant_on(dev);
7069 else
7070 netif_dormant_off(dev);
7071
7072 if (netif_carrier_ok(rootdev)) {
7073 if (!netif_carrier_ok(dev))
7074 netif_carrier_on(dev);
7075 } else {
7076 if (netif_carrier_ok(dev))
7077 netif_carrier_off(dev);
7078 }
7079}
7080EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7081
a953be53 7082#ifdef CONFIG_SYSFS
1b4bf461
ED
7083static int netif_alloc_rx_queues(struct net_device *dev)
7084{
1b4bf461 7085 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 7086 struct netdev_rx_queue *rx;
10595902 7087 size_t sz = count * sizeof(*rx);
1b4bf461 7088
bd25fa7b 7089 BUG_ON(count < 1);
1b4bf461 7090
10595902
PG
7091 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7092 if (!rx) {
7093 rx = vzalloc(sz);
7094 if (!rx)
7095 return -ENOMEM;
7096 }
bd25fa7b
TH
7097 dev->_rx = rx;
7098
bd25fa7b 7099 for (i = 0; i < count; i++)
fe822240 7100 rx[i].dev = dev;
1b4bf461
ED
7101 return 0;
7102}
bf264145 7103#endif
1b4bf461 7104
aa942104
CG
7105static void netdev_init_one_queue(struct net_device *dev,
7106 struct netdev_queue *queue, void *_unused)
7107{
7108 /* Initialize queue lock */
7109 spin_lock_init(&queue->_xmit_lock);
7110 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7111 queue->xmit_lock_owner = -1;
b236da69 7112 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 7113 queue->dev = dev;
114cf580
TH
7114#ifdef CONFIG_BQL
7115 dql_init(&queue->dql, HZ);
7116#endif
aa942104
CG
7117}
7118
60877a32
ED
7119static void netif_free_tx_queues(struct net_device *dev)
7120{
4cb28970 7121 kvfree(dev->_tx);
60877a32
ED
7122}
7123
e6484930
TH
7124static int netif_alloc_netdev_queues(struct net_device *dev)
7125{
7126 unsigned int count = dev->num_tx_queues;
7127 struct netdev_queue *tx;
60877a32 7128 size_t sz = count * sizeof(*tx);
e6484930 7129
d339727c
ED
7130 if (count < 1 || count > 0xffff)
7131 return -EINVAL;
62b5942a 7132
60877a32
ED
7133 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7134 if (!tx) {
7135 tx = vzalloc(sz);
7136 if (!tx)
7137 return -ENOMEM;
7138 }
e6484930 7139 dev->_tx = tx;
1d24eb48 7140
e6484930
TH
7141 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7142 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
7143
7144 return 0;
e6484930
TH
7145}
7146
a2029240
DV
7147void netif_tx_stop_all_queues(struct net_device *dev)
7148{
7149 unsigned int i;
7150
7151 for (i = 0; i < dev->num_tx_queues; i++) {
7152 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7153 netif_tx_stop_queue(txq);
7154 }
7155}
7156EXPORT_SYMBOL(netif_tx_stop_all_queues);
7157
1da177e4
LT
7158/**
7159 * register_netdevice - register a network device
7160 * @dev: device to register
7161 *
7162 * Take a completed network device structure and add it to the kernel
7163 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7164 * chain. 0 is returned on success. A negative errno code is returned
7165 * on a failure to set up the device, or if the name is a duplicate.
7166 *
7167 * Callers must hold the rtnl semaphore. You may want
7168 * register_netdev() instead of this.
7169 *
7170 * BUGS:
7171 * The locking appears insufficient to guarantee two parallel registers
7172 * will not get the same name.
7173 */
7174
7175int register_netdevice(struct net_device *dev)
7176{
1da177e4 7177 int ret;
d314774c 7178 struct net *net = dev_net(dev);
1da177e4
LT
7179
7180 BUG_ON(dev_boot_phase);
7181 ASSERT_RTNL();
7182
b17a7c17
SH
7183 might_sleep();
7184
1da177e4
LT
7185 /* When net_device's are persistent, this will be fatal. */
7186 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 7187 BUG_ON(!net);
1da177e4 7188
f1f28aa3 7189 spin_lock_init(&dev->addr_list_lock);
cf508b12 7190 netdev_set_addr_lockdep_class(dev);
1da177e4 7191
828de4f6 7192 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
7193 if (ret < 0)
7194 goto out;
7195
1da177e4 7196 /* Init, if this function is available */
d314774c
SH
7197 if (dev->netdev_ops->ndo_init) {
7198 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
7199 if (ret) {
7200 if (ret > 0)
7201 ret = -EIO;
90833aa4 7202 goto out;
1da177e4
LT
7203 }
7204 }
4ec93edb 7205
f646968f
PM
7206 if (((dev->hw_features | dev->features) &
7207 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
7208 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7209 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7210 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7211 ret = -EINVAL;
7212 goto err_uninit;
7213 }
7214
9c7dafbf
PE
7215 ret = -EBUSY;
7216 if (!dev->ifindex)
7217 dev->ifindex = dev_new_index(net);
7218 else if (__dev_get_by_index(net, dev->ifindex))
7219 goto err_uninit;
7220
5455c699
MM
7221 /* Transfer changeable features to wanted_features and enable
7222 * software offloads (GSO and GRO).
7223 */
7224 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
7225 dev->features |= NETIF_F_SOFT_FEATURES;
7226 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 7227
cbc53e08 7228 if (!(dev->flags & IFF_LOOPBACK))
34324dc2 7229 dev->hw_features |= NETIF_F_NOCACHE_COPY;
cbc53e08 7230
7f348a60
AD
7231 /* If IPv4 TCP segmentation offload is supported we should also
7232 * allow the device to enable segmenting the frame with the option
7233 * of ignoring a static IP ID value. This doesn't enable the
7234 * feature itself but allows the user to enable it later.
7235 */
cbc53e08
AD
7236 if (dev->hw_features & NETIF_F_TSO)
7237 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7f348a60
AD
7238 if (dev->vlan_features & NETIF_F_TSO)
7239 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7240 if (dev->mpls_features & NETIF_F_TSO)
7241 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7242 if (dev->hw_enc_features & NETIF_F_TSO)
7243 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
c6e1a0d1 7244
1180e7d6 7245 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 7246 */
1180e7d6 7247 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 7248
ee579677
PS
7249 /* Make NETIF_F_SG inheritable to tunnel devices.
7250 */
802ab55a 7251 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
ee579677 7252
0d89d203
SH
7253 /* Make NETIF_F_SG inheritable to MPLS.
7254 */
7255 dev->mpls_features |= NETIF_F_SG;
7256
7ffbe3fd
JB
7257 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7258 ret = notifier_to_errno(ret);
7259 if (ret)
7260 goto err_uninit;
7261
8b41d188 7262 ret = netdev_register_kobject(dev);
b17a7c17 7263 if (ret)
7ce1b0ed 7264 goto err_uninit;
b17a7c17
SH
7265 dev->reg_state = NETREG_REGISTERED;
7266
6cb6a27c 7267 __netdev_update_features(dev);
8e9b59b2 7268
1da177e4
LT
7269 /*
7270 * Default initial state at registry is that the
7271 * device is present.
7272 */
7273
7274 set_bit(__LINK_STATE_PRESENT, &dev->state);
7275
8f4cccbb
BH
7276 linkwatch_init_dev(dev);
7277
1da177e4 7278 dev_init_scheduler(dev);
1da177e4 7279 dev_hold(dev);
ce286d32 7280 list_netdevice(dev);
7bf23575 7281 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 7282
948b337e
JP
7283 /* If the device has permanent device address, driver should
7284 * set dev_addr and also addr_assign_type should be set to
7285 * NET_ADDR_PERM (default value).
7286 */
7287 if (dev->addr_assign_type == NET_ADDR_PERM)
7288 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7289
1da177e4 7290 /* Notify protocols, that a new device appeared. */
056925ab 7291 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 7292 ret = notifier_to_errno(ret);
93ee31f1
DL
7293 if (ret) {
7294 rollback_registered(dev);
7295 dev->reg_state = NETREG_UNREGISTERED;
7296 }
d90a909e
EB
7297 /*
7298 * Prevent userspace races by waiting until the network
7299 * device is fully setup before sending notifications.
7300 */
a2835763
PM
7301 if (!dev->rtnl_link_ops ||
7302 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7f294054 7303 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
1da177e4
LT
7304
7305out:
7306 return ret;
7ce1b0ed
HX
7307
7308err_uninit:
d314774c
SH
7309 if (dev->netdev_ops->ndo_uninit)
7310 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 7311 goto out;
1da177e4 7312}
d1b19dff 7313EXPORT_SYMBOL(register_netdevice);
1da177e4 7314
937f1ba5
BH
7315/**
7316 * init_dummy_netdev - init a dummy network device for NAPI
7317 * @dev: device to init
7318 *
7319 * This takes a network device structure and initialize the minimum
7320 * amount of fields so it can be used to schedule NAPI polls without
7321 * registering a full blown interface. This is to be used by drivers
7322 * that need to tie several hardware interfaces to a single NAPI
7323 * poll scheduler due to HW limitations.
7324 */
7325int init_dummy_netdev(struct net_device *dev)
7326{
7327 /* Clear everything. Note we don't initialize spinlocks
7328 * are they aren't supposed to be taken by any of the
7329 * NAPI code and this dummy netdev is supposed to be
7330 * only ever used for NAPI polls
7331 */
7332 memset(dev, 0, sizeof(struct net_device));
7333
7334 /* make sure we BUG if trying to hit standard
7335 * register/unregister code path
7336 */
7337 dev->reg_state = NETREG_DUMMY;
7338
937f1ba5
BH
7339 /* NAPI wants this */
7340 INIT_LIST_HEAD(&dev->napi_list);
7341
7342 /* a dummy interface is started by default */
7343 set_bit(__LINK_STATE_PRESENT, &dev->state);
7344 set_bit(__LINK_STATE_START, &dev->state);
7345
29b4433d
ED
7346 /* Note : We dont allocate pcpu_refcnt for dummy devices,
7347 * because users of this 'device' dont need to change
7348 * its refcount.
7349 */
7350
937f1ba5
BH
7351 return 0;
7352}
7353EXPORT_SYMBOL_GPL(init_dummy_netdev);
7354
7355
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore and expands the device name if you passed a format string
 *	to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7379
29b4433d
ED
7380int netdev_refcnt_read(const struct net_device *dev)
7381{
7382 int i, refcnt = 0;
7383
7384 for_each_possible_cpu(i)
7385 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7386 return refcnt;
7387}
7388EXPORT_SYMBOL(netdev_refcnt_read);
7389
2c53040f 7390/**
1da177e4 7391 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 7392 * @dev: target net_device
1da177e4
LT
7393 *
7394 * This is called when unregistering network devices.
7395 *
7396 * Any protocol or device that holds a reference should register
7397 * for netdevice notification, and cleanup and put back the
7398 * reference if they receive an UNREGISTER event.
7399 * We can get stuck here if buggy protocols don't correctly
4ec93edb 7400 * call dev_put.
1da177e4
LT
7401 */
7402static void netdev_wait_allrefs(struct net_device *dev)
7403{
7404 unsigned long rebroadcast_time, warning_time;
29b4433d 7405 int refcnt;
1da177e4 7406
e014debe
ED
7407 linkwatch_forget_dev(dev);
7408
1da177e4 7409 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
7410 refcnt = netdev_refcnt_read(dev);
7411
7412 while (refcnt != 0) {
1da177e4 7413 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 7414 rtnl_lock();
1da177e4
LT
7415
7416 /* Rebroadcast unregister notification */
056925ab 7417 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 7418
748e2d93 7419 __rtnl_unlock();
0115e8e3 7420 rcu_barrier();
748e2d93
ED
7421 rtnl_lock();
7422
0115e8e3 7423 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
7424 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7425 &dev->state)) {
7426 /* We must not have linkwatch events
7427 * pending on unregister. If this
7428 * happens, we simply run the queue
7429 * unscheduled, resulting in a noop
7430 * for this device.
7431 */
7432 linkwatch_run_queue();
7433 }
7434
6756ae4b 7435 __rtnl_unlock();
1da177e4
LT
7436
7437 rebroadcast_time = jiffies;
7438 }
7439
7440 msleep(250);
7441
29b4433d
ED
7442 refcnt = netdev_refcnt_read(dev);
7443
1da177e4 7444 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
7445 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7446 dev->name, refcnt);
1da177e4
LT
7447 warning_time = jiffies;
7448 }
7449 }
7450}
7451
7452/* The sequence is:
7453 *
7454 * rtnl_lock();
7455 * ...
7456 * register_netdevice(x1);
7457 * register_netdevice(x2);
7458 * ...
7459 * unregister_netdevice(y1);
7460 * unregister_netdevice(y2);
7461 * ...
7462 * rtnl_unlock();
7463 * free_netdev(y1);
7464 * free_netdev(y2);
7465 *
58ec3b4d 7466 * We are invoked by rtnl_unlock().
1da177e4 7467 * This allows us to deal with problems:
b17a7c17 7468 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
7469 * without deadlocking with linkwatch via keventd.
7470 * 2) Since we run with the RTNL semaphore not held, we can sleep
7471 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
7472 *
7473 * We must not return until all unregister events added during
7474 * the interval the lock was held have been completed.
1da177e4 7475 */
1da177e4
LT
7476void netdev_run_todo(void)
7477{
626ab0e6 7478 struct list_head list;
1da177e4 7479
1da177e4 7480 /* Snapshot list, allow later requests */
626ab0e6 7481 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
7482
7483 __rtnl_unlock();
626ab0e6 7484
0115e8e3
ED
7485
7486 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
7487 if (!list_empty(&list))
7488 rcu_barrier();
7489
1da177e4
LT
7490 while (!list_empty(&list)) {
7491 struct net_device *dev
e5e26d75 7492 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
7493 list_del(&dev->todo_list);
7494
748e2d93 7495 rtnl_lock();
0115e8e3 7496 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 7497 __rtnl_unlock();
0115e8e3 7498
b17a7c17 7499 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 7500 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
7501 dev->name, dev->reg_state);
7502 dump_stack();
7503 continue;
7504 }
1da177e4 7505
b17a7c17 7506 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 7507
b17a7c17 7508 netdev_wait_allrefs(dev);
1da177e4 7509
b17a7c17 7510 /* paranoia */
29b4433d 7511 BUG_ON(netdev_refcnt_read(dev));
7866a621
SN
7512 BUG_ON(!list_empty(&dev->ptype_all));
7513 BUG_ON(!list_empty(&dev->ptype_specific));
33d480ce
ED
7514 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7515 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 7516 WARN_ON(dev->dn_ptr);
1da177e4 7517
b17a7c17
SH
7518 if (dev->destructor)
7519 dev->destructor(dev);
9093bbb2 7520
50624c93
EB
7521 /* Report a network device has been unregistered */
7522 rtnl_lock();
7523 dev_net(dev)->dev_unreg_count--;
7524 __rtnl_unlock();
7525 wake_up(&netdev_unregistering_wq);
7526
9093bbb2
SH
7527 /* Free network device */
7528 kobject_put(&dev->dev.kobj);
1da177e4 7529 }
1da177e4
LT
7530}
7531
9256645a
JW
7532/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7533 * all the same fields in the same order as net_device_stats, with only
7534 * the type differing, but rtnl_link_stats64 may have additional fields
7535 * at the end for newer counters.
3cfde79c 7536 */
77a1abf5
ED
7537void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7538 const struct net_device_stats *netdev_stats)
3cfde79c
BH
7539{
7540#if BITS_PER_LONG == 64
9256645a 7541 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
77a1abf5 7542 memcpy(stats64, netdev_stats, sizeof(*stats64));
9256645a
JW
7543 /* zero out counters that only exist in rtnl_link_stats64 */
7544 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7545 sizeof(*stats64) - sizeof(*netdev_stats));
3cfde79c 7546#else
9256645a 7547 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
3cfde79c
BH
7548 const unsigned long *src = (const unsigned long *)netdev_stats;
7549 u64 *dst = (u64 *)stats64;
7550
9256645a 7551 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
3cfde79c
BH
7552 for (i = 0; i < n; i++)
7553 dst[i] = src[i];
9256645a
JW
7554 /* zero out counters that only exist in rtnl_link_stats64 */
7555 memset((char *)stats64 + n * sizeof(u64), 0,
7556 sizeof(*stats64) - n * sizeof(u64));
3cfde79c
BH
7557#endif
7558}
77a1abf5 7559EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 7560
eeda3fd6
SH
7561/**
7562 * dev_get_stats - get network device statistics
7563 * @dev: device to get statistics from
28172739 7564 * @storage: place to store stats
eeda3fd6 7565 *
d7753516
BH
7566 * Get network statistics from device. Return @storage.
7567 * The device driver may provide its own method by setting
7568 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7569 * otherwise the internal statistics structure is used.
eeda3fd6 7570 */
d7753516
BH
7571struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7572 struct rtnl_link_stats64 *storage)
7004bf25 7573{
eeda3fd6
SH
7574 const struct net_device_ops *ops = dev->netdev_ops;
7575
28172739
ED
7576 if (ops->ndo_get_stats64) {
7577 memset(storage, 0, sizeof(*storage));
caf586e5
ED
7578 ops->ndo_get_stats64(dev, storage);
7579 } else if (ops->ndo_get_stats) {
3cfde79c 7580 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
7581 } else {
7582 netdev_stats_to_stats64(storage, &dev->stats);
28172739 7583 }
caf586e5 7584 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
015f0688 7585 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6e7333d3 7586 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
28172739 7587 return storage;
c45d286e 7588}
eeda3fd6 7589EXPORT_SYMBOL(dev_get_stats);
c45d286e 7590
/* Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published through
 * dev->ingress_queue; NULL is returned when that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current queue (possibly NULL) is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
7608
2c60db03
ED
7609static const struct ethtool_ops default_ethtool_ops;
7610
d07d7507
SG
7611void netdev_set_default_ethtool_ops(struct net_device *dev,
7612 const struct ethtool_ops *ops)
7613{
7614 if (dev->ethtool_ops == &default_ethtool_ops)
7615 dev->ethtool_ops = ops;
7616}
7617EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7618
74d332c1
ED
7619void netdev_freemem(struct net_device *dev)
7620{
7621 char *addr = (char *)dev - dev->padded;
7622
4cb28970 7623 kvfree(addr);
74d332c1
ED
7624}
7625
1da177e4 7626/**
36909ea4 7627 * alloc_netdev_mqs - allocate network device
c835a677
TG
7628 * @sizeof_priv: size of private data to allocate space for
7629 * @name: device name format string
7630 * @name_assign_type: origin of device name
7631 * @setup: callback to initialize device
7632 * @txqs: the number of TX subqueues to allocate
7633 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
7634 *
7635 * Allocates a struct net_device with private data area for driver use
90e51adf 7636 * and performs basic initialization. Also allocates subqueue structs
36909ea4 7637 * for each queue on the device.
1da177e4 7638 */
36909ea4 7639struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
c835a677 7640 unsigned char name_assign_type,
36909ea4
TH
7641 void (*setup)(struct net_device *),
7642 unsigned int txqs, unsigned int rxqs)
1da177e4 7643{
1da177e4 7644 struct net_device *dev;
7943986c 7645 size_t alloc_size;
1ce8e7b5 7646 struct net_device *p;
1da177e4 7647
b6fe17d6
SH
7648 BUG_ON(strlen(name) >= sizeof(dev->name));
7649
36909ea4 7650 if (txqs < 1) {
7b6cd1ce 7651 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
7652 return NULL;
7653 }
7654
a953be53 7655#ifdef CONFIG_SYSFS
36909ea4 7656 if (rxqs < 1) {
7b6cd1ce 7657 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
7658 return NULL;
7659 }
7660#endif
7661
fd2ea0a7 7662 alloc_size = sizeof(struct net_device);
d1643d24
AD
7663 if (sizeof_priv) {
7664 /* ensure 32-byte alignment of private area */
1ce8e7b5 7665 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
7666 alloc_size += sizeof_priv;
7667 }
7668 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 7669 alloc_size += NETDEV_ALIGN - 1;
1da177e4 7670
74d332c1
ED
7671 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7672 if (!p)
7673 p = vzalloc(alloc_size);
62b5942a 7674 if (!p)
1da177e4 7675 return NULL;
1da177e4 7676
1ce8e7b5 7677 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 7678 dev->padded = (char *)dev - (char *)p;
ab9c73cc 7679
29b4433d
ED
7680 dev->pcpu_refcnt = alloc_percpu(int);
7681 if (!dev->pcpu_refcnt)
74d332c1 7682 goto free_dev;
ab9c73cc 7683
ab9c73cc 7684 if (dev_addr_init(dev))
29b4433d 7685 goto free_pcpu;
ab9c73cc 7686
22bedad3 7687 dev_mc_init(dev);
a748ee24 7688 dev_uc_init(dev);
ccffad25 7689
c346dca1 7690 dev_net_set(dev, &init_net);
1da177e4 7691
8d3bdbd5 7692 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 7693 dev->gso_max_segs = GSO_MAX_SEGS;
8d3bdbd5 7694
8d3bdbd5
DM
7695 INIT_LIST_HEAD(&dev->napi_list);
7696 INIT_LIST_HEAD(&dev->unreg_list);
5cde2829 7697 INIT_LIST_HEAD(&dev->close_list);
8d3bdbd5 7698 INIT_LIST_HEAD(&dev->link_watch_list);
2f268f12
VF
7699 INIT_LIST_HEAD(&dev->adj_list.upper);
7700 INIT_LIST_HEAD(&dev->adj_list.lower);
7866a621
SN
7701 INIT_LIST_HEAD(&dev->ptype_all);
7702 INIT_LIST_HEAD(&dev->ptype_specific);
59cc1f61
JK
7703#ifdef CONFIG_NET_SCHED
7704 hash_init(dev->qdisc_hash);
7705#endif
02875878 7706 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8d3bdbd5
DM
7707 setup(dev);
7708
a813104d 7709 if (!dev->tx_queue_len) {
f84bb1ea 7710 dev->priv_flags |= IFF_NO_QUEUE;
11597084 7711 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
a813104d 7712 }
906470c1 7713
36909ea4
TH
7714 dev->num_tx_queues = txqs;
7715 dev->real_num_tx_queues = txqs;
ed9af2e8 7716 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 7717 goto free_all;
e8a0464c 7718
a953be53 7719#ifdef CONFIG_SYSFS
36909ea4
TH
7720 dev->num_rx_queues = rxqs;
7721 dev->real_num_rx_queues = rxqs;
fe822240 7722 if (netif_alloc_rx_queues(dev))
8d3bdbd5 7723 goto free_all;
df334545 7724#endif
0a9627f2 7725
1da177e4 7726 strcpy(dev->name, name);
c835a677 7727 dev->name_assign_type = name_assign_type;
cbda10fa 7728 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
7729 if (!dev->ethtool_ops)
7730 dev->ethtool_ops = &default_ethtool_ops;
e687ad60
PN
7731
7732 nf_hook_ingress_init(dev);
7733
1da177e4 7734 return dev;
ab9c73cc 7735
8d3bdbd5
DM
7736free_all:
7737 free_netdev(dev);
7738 return NULL;
7739
29b4433d
ED
7740free_pcpu:
7741 free_percpu(dev->pcpu_refcnt);
74d332c1
ED
7742free_dev:
7743 netdev_freemem(dev);
ab9c73cc 7744 return NULL;
1da177e4 7745}
36909ea4 7746EXPORT_SYMBOL(alloc_netdev_mqs);
1da177e4
LT
7747
7748/**
7749 * free_netdev - free network device
7750 * @dev: device
7751 *
4ec93edb
YH
7752 * This function does the last stage of destroying an allocated device
7753 * interface. The reference to the device object is released.
1da177e4 7754 * If this is the last reference then it will be freed.
93d05d4a 7755 * Must be called in process context.
1da177e4
LT
7756 */
7757void free_netdev(struct net_device *dev)
7758{
d565b0a1
HX
7759 struct napi_struct *p, *n;
7760
93d05d4a 7761 might_sleep();
60877a32 7762 netif_free_tx_queues(dev);
a953be53 7763#ifdef CONFIG_SYSFS
10595902 7764 kvfree(dev->_rx);
fe822240 7765#endif
e8a0464c 7766
33d480ce 7767 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 7768
f001fde5
JP
7769 /* Flush device addresses */
7770 dev_addr_flush(dev);
7771
d565b0a1
HX
7772 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7773 netif_napi_del(p);
7774
29b4433d
ED
7775 free_percpu(dev->pcpu_refcnt);
7776 dev->pcpu_refcnt = NULL;
7777
3041a069 7778 /* Compatibility with error handling in drivers */
1da177e4 7779 if (dev->reg_state == NETREG_UNINITIALIZED) {
74d332c1 7780 netdev_freemem(dev);
1da177e4
LT
7781 return;
7782 }
7783
7784 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7785 dev->reg_state = NETREG_RELEASED;
7786
43cb76d9
GKH
7787 /* will free via device release */
7788 put_device(&dev->dev);
1da177e4 7789}
d1b19dff 7790EXPORT_SYMBOL(free_netdev);
4ec93edb 7791
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *	The expedited RCU grace period is used when RTNL is locked.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
7807
7808/**
44a0873d 7809 * unregister_netdevice_queue - remove device from the kernel
1da177e4 7810 * @dev: device
44a0873d 7811 * @head: list
6ebfbc06 7812 *
1da177e4 7813 * This function shuts down a device interface and removes it
d59b54b1 7814 * from the kernel tables.
44a0873d 7815 * If head not NULL, device is queued to be unregistered later.
1da177e4
LT
7816 *
7817 * Callers must hold the rtnl semaphore. You may want
7818 * unregister_netdev() instead of this.
7819 */
7820
44a0873d 7821void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 7822{
a6620712
HX
7823 ASSERT_RTNL();
7824
44a0873d 7825 if (head) {
9fdce099 7826 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
7827 } else {
7828 rollback_registered(dev);
7829 /* Finish processing unregister after unlock */
7830 net_set_todo(dev);
7831 }
1da177e4 7832}
44a0873d 7833EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 7834
9b5e383c
ED
7835/**
7836 * unregister_netdevice_many - unregister many devices
7837 * @head: list of devices
87757a91
ED
7838 *
7839 * Note: As most callers use a stack allocated list_head,
7840 * we force a list_del() to make sure stack wont be corrupted later.
9b5e383c
ED
7841 */
7842void unregister_netdevice_many(struct list_head *head)
7843{
7844 struct net_device *dev;
7845
7846 if (!list_empty(head)) {
7847 rollback_registered_many(head);
7848 list_for_each_entry(dev, head, unreg_list)
7849 net_set_todo(dev);
87757a91 7850 list_del(head);
9b5e383c
ED
7851 }
7852}
63c8099d 7853EXPORT_SYMBOL(unregister_netdevice_many);
9b5e383c 7854
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	/* unregister_netdevice() must run under RTNL */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7873
ce286d32
EB
7874/**
7875 * dev_change_net_namespace - move device to different nethost namespace
7876 * @dev: device
7877 * @net: network namespace
7878 * @pat: If not NULL name pattern to try if the current device name
7879 * is already taken in the destination network namespace.
7880 *
7881 * This function shuts down a device interface and moves it
7882 * to a new network namespace. On success 0 is returned, on
7883 * a failure a netagive errno code is returned.
7884 *
7885 * Callers must hold the rtnl semaphore.
7886 */
7887
7888int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7889{
ce286d32
EB
7890 int err;
7891
7892 ASSERT_RTNL();
7893
7894 /* Don't allow namespace local devices to be moved. */
7895 err = -EINVAL;
7896 if (dev->features & NETIF_F_NETNS_LOCAL)
7897 goto out;
7898
7899 /* Ensure the device has been registrered */
ce286d32
EB
7900 if (dev->reg_state != NETREG_REGISTERED)
7901 goto out;
7902
7903 /* Get out if there is nothing todo */
7904 err = 0;
878628fb 7905 if (net_eq(dev_net(dev), net))
ce286d32
EB
7906 goto out;
7907
7908 /* Pick the destination device name, and ensure
7909 * we can use it in the destination network namespace.
7910 */
7911 err = -EEXIST;
d9031024 7912 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
7913 /* We get here if we can't use the current device name */
7914 if (!pat)
7915 goto out;
828de4f6 7916 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
7917 goto out;
7918 }
7919
7920 /*
7921 * And now a mini version of register_netdevice unregister_netdevice.
7922 */
7923
7924 /* If device is running close it first. */
9b772652 7925 dev_close(dev);
ce286d32
EB
7926
7927 /* And unlink it from device chain */
7928 err = -ENODEV;
7929 unlist_netdevice(dev);
7930
7931 synchronize_net();
7932
7933 /* Shutdown queueing discipline. */
7934 dev_shutdown(dev);
7935
7936 /* Notify protocols, that we are about to destroy
7937 this device. They should clean all the things.
3b27e105
DL
7938
7939 Note that dev->reg_state stays at NETREG_REGISTERED.
7940 This is wanted because this way 8021q and macvlan know
7941 the device is just moving and can keep their slaves up.
ce286d32
EB
7942 */
7943 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
7944 rcu_barrier();
7945 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7f294054 7946 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
ce286d32
EB
7947
7948 /*
7949 * Flush the unicast and multicast chains
7950 */
a748ee24 7951 dev_uc_flush(dev);
22bedad3 7952 dev_mc_flush(dev);
ce286d32 7953
4e66ae2e
SH
7954 /* Send a netdev-removed uevent to the old namespace */
7955 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
4c75431a 7956 netdev_adjacent_del_links(dev);
4e66ae2e 7957
ce286d32 7958 /* Actually switch the network namespace */
c346dca1 7959 dev_net_set(dev, net);
ce286d32 7960
ce286d32 7961 /* If there is an ifindex conflict assign a new one */
7a66bbc9 7962 if (__dev_get_by_index(net, dev->ifindex))
ce286d32 7963 dev->ifindex = dev_new_index(net);
ce286d32 7964
4e66ae2e
SH
7965 /* Send a netdev-add uevent to the new namespace */
7966 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
4c75431a 7967 netdev_adjacent_add_links(dev);
4e66ae2e 7968
8b41d188 7969 /* Fixup kobjects */
a1b3f594 7970 err = device_rename(&dev->dev, dev->name);
8b41d188 7971 WARN_ON(err);
ce286d32
EB
7972
7973 /* Add the device back in the hashes */
7974 list_netdevice(dev);
7975
7976 /* Notify protocols, that a new device appeared. */
7977 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7978
d90a909e
EB
7979 /*
7980 * Prevent userspace races by waiting until the network
7981 * device is fully setup before sending notifications.
7982 */
7f294054 7983 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
d90a909e 7984
ce286d32
EB
7985 synchronize_net();
7986 err = 0;
7987out:
7988 return err;
7989}
463d0183 7990EXPORT_SYMBOL_GPL(dev_change_net_namespace);
ce286d32 7991
1da177e4
LT
7992static int dev_cpu_callback(struct notifier_block *nfb,
7993 unsigned long action,
7994 void *ocpu)
7995{
7996 struct sk_buff **list_skb;
1da177e4
LT
7997 struct sk_buff *skb;
7998 unsigned int cpu, oldcpu = (unsigned long)ocpu;
7999 struct softnet_data *sd, *oldsd;
8000
8bb78442 8001 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
8002 return NOTIFY_OK;
8003
8004 local_irq_disable();
8005 cpu = smp_processor_id();
8006 sd = &per_cpu(softnet_data, cpu);
8007 oldsd = &per_cpu(softnet_data, oldcpu);
8008
8009 /* Find end of our completion_queue. */
8010 list_skb = &sd->completion_queue;
8011 while (*list_skb)
8012 list_skb = &(*list_skb)->next;
8013 /* Append completion queue from offline CPU. */
8014 *list_skb = oldsd->completion_queue;
8015 oldsd->completion_queue = NULL;
8016
1da177e4 8017 /* Append output queue from offline CPU. */
a9cbd588
CG
8018 if (oldsd->output_queue) {
8019 *sd->output_queue_tailp = oldsd->output_queue;
8020 sd->output_queue_tailp = oldsd->output_queue_tailp;
8021 oldsd->output_queue = NULL;
8022 oldsd->output_queue_tailp = &oldsd->output_queue;
8023 }
ac64da0b
ED
8024 /* Append NAPI poll list from offline CPU, with one exception :
8025 * process_backlog() must be called by cpu owning percpu backlog.
8026 * We properly handle process_queue & input_pkt_queue later.
8027 */
8028 while (!list_empty(&oldsd->poll_list)) {
8029 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8030 struct napi_struct,
8031 poll_list);
8032
8033 list_del_init(&napi->poll_list);
8034 if (napi->poll == process_backlog)
8035 napi->state = 0;
8036 else
8037 ____napi_schedule(sd, napi);
264524d5 8038 }
1da177e4
LT
8039
8040 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8041 local_irq_enable();
8042
8043 /* Process offline CPU's input_pkt_queue */
76cc8b13 8044 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
91e83133 8045 netif_rx_ni(skb);
76cc8b13 8046 input_queue_head_incr(oldsd);
fec5e652 8047 }
ac64da0b 8048 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
91e83133 8049 netif_rx_ni(skb);
76cc8b13
TH
8050 input_queue_head_incr(oldsd);
8051 }
1da177e4
LT
8052
8053 return NOTIFY_OK;
8054}
1da177e4
LT
8055
8056
7f353bf2 8057/**
b63365a2
HX
8058 * netdev_increment_features - increment feature set by one
8059 * @all: current feature set
8060 * @one: new feature set
8061 * @mask: mask feature set
7f353bf2
HX
8062 *
8063 * Computes a new feature set after adding a device with feature set
b63365a2
HX
8064 * @one to the master device with current feature set @all. Will not
8065 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 8066 */
c8f44aff
MM
8067netdev_features_t netdev_increment_features(netdev_features_t all,
8068 netdev_features_t one, netdev_features_t mask)
b63365a2 8069{
c8cd0989 8070 if (mask & NETIF_F_HW_CSUM)
a188222b 8071 mask |= NETIF_F_CSUM_MASK;
1742f183 8072 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 8073
a188222b 8074 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
1742f183 8075 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 8076
1742f183 8077 /* If one device supports hw checksumming, set for all. */
c8cd0989
TH
8078 if (all & NETIF_F_HW_CSUM)
8079 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7f353bf2
HX
8080
8081 return all;
8082}
b63365a2 8083EXPORT_SYMBOL(netdev_increment_features);
7f353bf2 8084
430f03cd 8085static struct hlist_head * __net_init netdev_create_hash(void)
30d97d35
PE
8086{
8087 int i;
8088 struct hlist_head *hash;
8089
8090 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8091 if (hash != NULL)
8092 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8093 INIT_HLIST_HEAD(&hash[i]);
8094
8095 return hash;
8096}
8097
881d966b 8098/* Initialize per network namespace state */
4665079c 8099static int __net_init netdev_init(struct net *net)
881d966b 8100{
734b6541
RM
8101 if (net != &init_net)
8102 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 8103
30d97d35
PE
8104 net->dev_name_head = netdev_create_hash();
8105 if (net->dev_name_head == NULL)
8106 goto err_name;
881d966b 8107
30d97d35
PE
8108 net->dev_index_head = netdev_create_hash();
8109 if (net->dev_index_head == NULL)
8110 goto err_idx;
881d966b
EB
8111
8112 return 0;
30d97d35
PE
8113
8114err_idx:
8115 kfree(net->dev_name_head);
8116err_name:
8117 return -ENOMEM;
881d966b
EB
8118}
8119
f0db275a
SH
8120/**
8121 * netdev_drivername - network driver for the device
8122 * @dev: network device
f0db275a
SH
8123 *
8124 * Determine network driver for device.
8125 */
3019de12 8126const char *netdev_drivername(const struct net_device *dev)
6579e57b 8127{
cf04a4c7
SH
8128 const struct device_driver *driver;
8129 const struct device *parent;
3019de12 8130 const char *empty = "";
6579e57b
AV
8131
8132 parent = dev->dev.parent;
6579e57b 8133 if (!parent)
3019de12 8134 return empty;
6579e57b
AV
8135
8136 driver = parent->driver;
8137 if (driver && driver->name)
3019de12
DM
8138 return driver->name;
8139 return empty;
6579e57b
AV
8140}
8141
6ea754eb
JP
8142static void __netdev_printk(const char *level, const struct net_device *dev,
8143 struct va_format *vaf)
256df2f3 8144{
b004ff49 8145 if (dev && dev->dev.parent) {
6ea754eb
JP
8146 dev_printk_emit(level[1] - '0',
8147 dev->dev.parent,
8148 "%s %s %s%s: %pV",
8149 dev_driver_string(dev->dev.parent),
8150 dev_name(dev->dev.parent),
8151 netdev_name(dev), netdev_reg_state(dev),
8152 vaf);
b004ff49 8153 } else if (dev) {
6ea754eb
JP
8154 printk("%s%s%s: %pV",
8155 level, netdev_name(dev), netdev_reg_state(dev), vaf);
b004ff49 8156 } else {
6ea754eb 8157 printk("%s(NULL net_device): %pV", level, vaf);
b004ff49 8158 }
256df2f3
JP
8159}
8160
6ea754eb
JP
8161void netdev_printk(const char *level, const struct net_device *dev,
8162 const char *format, ...)
256df2f3
JP
8163{
8164 struct va_format vaf;
8165 va_list args;
256df2f3
JP
8166
8167 va_start(args, format);
8168
8169 vaf.fmt = format;
8170 vaf.va = &args;
8171
6ea754eb 8172 __netdev_printk(level, dev, &vaf);
b004ff49 8173
256df2f3 8174 va_end(args);
256df2f3
JP
8175}
8176EXPORT_SYMBOL(netdev_printk);
8177
8178#define define_netdev_printk_level(func, level) \
6ea754eb 8179void func(const struct net_device *dev, const char *fmt, ...) \
256df2f3 8180{ \
256df2f3
JP
8181 struct va_format vaf; \
8182 va_list args; \
8183 \
8184 va_start(args, fmt); \
8185 \
8186 vaf.fmt = fmt; \
8187 vaf.va = &args; \
8188 \
6ea754eb 8189 __netdev_printk(level, dev, &vaf); \
b004ff49 8190 \
256df2f3 8191 va_end(args); \
256df2f3
JP
8192} \
8193EXPORT_SYMBOL(func);
8194
8195define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8196define_netdev_printk_level(netdev_alert, KERN_ALERT);
8197define_netdev_printk_level(netdev_crit, KERN_CRIT);
8198define_netdev_printk_level(netdev_err, KERN_ERR);
8199define_netdev_printk_level(netdev_warn, KERN_WARNING);
8200define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8201define_netdev_printk_level(netdev_info, KERN_INFO);
8202
4665079c 8203static void __net_exit netdev_exit(struct net *net)
881d966b
EB
8204{
8205 kfree(net->dev_name_head);
8206 kfree(net->dev_index_head);
8207}
8208
022cbae6 8209static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
8210 .init = netdev_init,
8211 .exit = netdev_exit,
8212};
8213
4665079c 8214static void __net_exit default_device_exit(struct net *net)
ce286d32 8215{
e008b5fc 8216 struct net_device *dev, *aux;
ce286d32 8217 /*
e008b5fc 8218 * Push all migratable network devices back to the
ce286d32
EB
8219 * initial network namespace
8220 */
8221 rtnl_lock();
e008b5fc 8222 for_each_netdev_safe(net, dev, aux) {
ce286d32 8223 int err;
aca51397 8224 char fb_name[IFNAMSIZ];
ce286d32
EB
8225
8226 /* Ignore unmoveable devices (i.e. loopback) */
8227 if (dev->features & NETIF_F_NETNS_LOCAL)
8228 continue;
8229
e008b5fc
EB
8230 /* Leave virtual devices for the generic cleanup */
8231 if (dev->rtnl_link_ops)
8232 continue;
d0c082ce 8233
25985edc 8234 /* Push remaining network devices to init_net */
aca51397
PE
8235 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8236 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 8237 if (err) {
7b6cd1ce
JP
8238 pr_emerg("%s: failed to move %s to init_net: %d\n",
8239 __func__, dev->name, err);
aca51397 8240 BUG();
ce286d32
EB
8241 }
8242 }
8243 rtnl_unlock();
8244}
8245
50624c93
EB
8246static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8247{
8248 /* Return with the rtnl_lock held when there are no network
8249 * devices unregistering in any network namespace in net_list.
8250 */
8251 struct net *net;
8252 bool unregistering;
ff960a73 8253 DEFINE_WAIT_FUNC(wait, woken_wake_function);
50624c93 8254
ff960a73 8255 add_wait_queue(&netdev_unregistering_wq, &wait);
50624c93 8256 for (;;) {
50624c93
EB
8257 unregistering = false;
8258 rtnl_lock();
8259 list_for_each_entry(net, net_list, exit_list) {
8260 if (net->dev_unreg_count > 0) {
8261 unregistering = true;
8262 break;
8263 }
8264 }
8265 if (!unregistering)
8266 break;
8267 __rtnl_unlock();
ff960a73
PZ
8268
8269 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
50624c93 8270 }
ff960a73 8271 remove_wait_queue(&netdev_unregistering_wq, &wait);
50624c93
EB
8272}
8273
04dc7f6b
EB
8274static void __net_exit default_device_exit_batch(struct list_head *net_list)
8275{
8276 /* At exit all network devices most be removed from a network
b595076a 8277 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
8278 * Do this across as many network namespaces as possible to
8279 * improve batching efficiency.
8280 */
8281 struct net_device *dev;
8282 struct net *net;
8283 LIST_HEAD(dev_kill_list);
8284
50624c93
EB
8285 /* To prevent network device cleanup code from dereferencing
8286 * loopback devices or network devices that have been freed
8287 * wait here for all pending unregistrations to complete,
8288 * before unregistring the loopback device and allowing the
8289 * network namespace be freed.
8290 *
8291 * The netdev todo list containing all network devices
8292 * unregistrations that happen in default_device_exit_batch
8293 * will run in the rtnl_unlock() at the end of
8294 * default_device_exit_batch.
8295 */
8296 rtnl_lock_unregistering(net_list);
04dc7f6b
EB
8297 list_for_each_entry(net, net_list, exit_list) {
8298 for_each_netdev_reverse(net, dev) {
b0ab2fab 8299 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
04dc7f6b
EB
8300 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8301 else
8302 unregister_netdevice_queue(dev, &dev_kill_list);
8303 }
8304 }
8305 unregister_netdevice_many(&dev_kill_list);
8306 rtnl_unlock();
8307}
8308
022cbae6 8309static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 8310 .exit = default_device_exit,
04dc7f6b 8311 .exit_batch = default_device_exit_batch,
ce286d32
EB
8312};
8313
1da177e4
LT
8314/*
8315 * Initialize the DEV module. At boot time this walks the device list and
8316 * unhooks any devices that fail to initialise (normally hardware not
8317 * present) and leaves us with a valid list of present and active devices.
8318 *
8319 */
8320
8321/*
8322 * This is called single threaded during boot, so no need
8323 * to take the rtnl semaphore.
8324 */
8325static int __init net_dev_init(void)
8326{
8327 int i, rc = -ENOMEM;
8328
8329 BUG_ON(!dev_boot_phase);
8330
1da177e4
LT
8331 if (dev_proc_init())
8332 goto out;
8333
8b41d188 8334 if (netdev_kobject_init())
1da177e4
LT
8335 goto out;
8336
8337 INIT_LIST_HEAD(&ptype_all);
82d8a867 8338 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
8339 INIT_LIST_HEAD(&ptype_base[i]);
8340
62532da9
VY
8341 INIT_LIST_HEAD(&offload_base);
8342
881d966b
EB
8343 if (register_pernet_subsys(&netdev_net_ops))
8344 goto out;
1da177e4
LT
8345
8346 /*
8347 * Initialise the packet receive queues.
8348 */
8349
6f912042 8350 for_each_possible_cpu(i) {
41852497 8351 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
e36fa2f7 8352 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 8353
41852497
ED
8354 INIT_WORK(flush, flush_backlog);
8355
e36fa2f7 8356 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 8357 skb_queue_head_init(&sd->process_queue);
e36fa2f7 8358 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588 8359 sd->output_queue_tailp = &sd->output_queue;
df334545 8360#ifdef CONFIG_RPS
e36fa2f7
ED
8361 sd->csd.func = rps_trigger_softirq;
8362 sd->csd.info = sd;
e36fa2f7 8363 sd->cpu = i;
1e94d72f 8364#endif
0a9627f2 8365
e36fa2f7
ED
8366 sd->backlog.poll = process_backlog;
8367 sd->backlog.weight = weight_p;
1da177e4
LT
8368 }
8369
1da177e4
LT
8370 dev_boot_phase = 0;
8371
505d4f73
EB
8372 /* The loopback device is special if any other network devices
8373 * is present in a network namespace the loopback device must
8374 * be present. Since we now dynamically allocate and free the
8375 * loopback device ensure this invariant is maintained by
8376 * keeping the loopback device as the first device on the
8377 * list of network devices. Ensuring the loopback devices
8378 * is the first device that appears and the last network device
8379 * that disappears.
8380 */
8381 if (register_pernet_device(&loopback_net_ops))
8382 goto out;
8383
8384 if (register_pernet_device(&default_device_ops))
8385 goto out;
8386
962cf36c
CM
8387 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8388 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
8389
8390 hotcpu_notifier(dev_cpu_callback, 0);
f38a9eb1 8391 dst_subsys_init();
1da177e4
LT
8392 rc = 0;
8393out:
8394 return rc;
8395}
8396
8397subsys_initcall(net_dev_init);