/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0)
		;
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
	spin_lock(&sd->input_pkt_queue.lock);
}

static inline void rps_unlock(struct softnet_data *sd)
{
	spin_unlock(&sd->input_pkt_queue.lock);
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}
static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************
 *
 *		Protocol management and registration routines
 *
 *******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
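
/*
 * Illustrative sketch (not part of the original file): a minimal tap
 * registering for all protocols with dev_add_pack(). The "example_tap"
 * names are hypothetical.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* Taps receive a clone of each frame; just free it here. */
	consume_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = htons(ETH_P_ALL),	/* lands on the ptype_all chain */
	.func = example_tap_rcv,
};

static void __maybe_unused example_tap_attach(void)
{
	dev_add_pack(&example_tap);	/* pair with dev_remove_pack() */
}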
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************
 *
 *		      Device Boot-time Settings Routines
 *
 ******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}
/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
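
/*
 * Example (illustrative, not part of the original file): booting with
 *	netdev=5,0x340,0,0,eth1
 * makes get_options() above fill ints[] with {4, 5, 0x340, 0, 0} and
 * leave str pointing at "eth1", so the entry maps IRQ 5 and I/O base
 * 0x340 to the name "eth1".
 */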
/*******************************************************************************
 *
 *			    Device Interface Subroutines
 *
 *******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);
/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
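
/*
 * Illustrative sketch (not part of the original file): the usual
 * hold/use/put pattern around dev_get_by_name(). "eth0" is only an
 * example name.
 */
static void __maybe_unused example_print_mtu(struct net *net)
{
	struct net_device *dev = dev_get_by_name(net, "eth0");

	if (!dev)
		return;
	pr_info("%s: mtu %u\n", dev->name, dev->mtu);
	dev_put(dev);		/* drop the reference taken above */
}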
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
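
/*
 * Illustrative sketch (not part of the original file): an RCU-side
 * lookup that copies the name out before the read section ends, since
 * the device may go away once rcu_read_unlock() runs.
 */
static int __maybe_unused example_name_by_index(struct net *net,
						int ifindex, char *buf)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		strlcpy(buf, dev->name, IFNAMSIZ);
	rcu_read_unlock();

	return dev ? 0 : -ENODEV;
}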
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}
/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
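
/*
 * Illustrative sketch (not part of the original file): a driver probe
 * path would typically pick a unit like this; "foo%d" is a made-up
 * format string.
 */
static int __maybe_unused example_assign_name(struct net_device *dev)
{
	/* Assigns the lowest free unit - "foo0", "foo1", ... - and
	 * returns the unit number, or a negative errno on failure.
	 */
	return dev_alloc_name(dev, "foo%d");
}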
static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return 0;
}
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);
/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}
/**
 *	dev_open	- prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
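
/*
 * Illustrative sketch (not part of the original file): dev_open() must
 * run under the rtnl semaphore, so out-of-line callers wrap it like
 * this.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);
	rtnl_unlock();

	return err;
}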
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	might_sleep();
	ASSERT_RTNL();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}
static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}
int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);
/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
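
/*
 * Illustrative sketch (not part of the original file): a minimal
 * notifier that logs NETDEV_UP events. Thanks to the replay above, it
 * also sees devices that were already registered and up.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		pr_info("%s: now up\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block __maybe_unused example_netdev_nb = {
	.notifier_call = example_netdev_event,
};
/* ... register_netdevice_notifier(&example_netdev_nb) ... */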
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;

void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_key_enable(&netstamp_needed);
	else
		static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_inc(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_dec(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp)		\
			__net_timestamp(SKB);		\
	}						\
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
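
/*
 * Illustrative sketch (not part of the original file): a veth-style
 * ndo_start_xmit that hands each frame to its peer with
 * dev_forward_skb(). "example_priv" and its peer pointer are made up.
 */
struct example_priv {
	struct net_device __rcu *peer;
};

static netdev_tx_t __maybe_unused example_peer_xmit(struct sk_buff *skb,
						    struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (likely(peer))
		dev_forward_skb(peer, skb);	/* scrubs and queues via netif_rx */
	else
		kfree_skb(skb);
	rcu_read_unlock();

	return NETDEV_TX_OK;
}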
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
{
	if (dev->num_tc) {
		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
		int i;

		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
			if ((txq - tc->offset) < tc->count)
				return i;
		}

		return -1;
	}

	return 0;
}
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
			     int tci, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[tci]);
	if (!map)
		return false;

	for (pos = map->len; pos--;) {
		if (map->queues[pos] != index)
			continue;

		if (map->len > 1) {
			map->queues[pos] = map->queues[--map->len];
			break;
		}

		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
		kfree_rcu(map, rcu);
		return false;
	}

	return true;
}
static bool remove_xps_queue_cpu(struct net_device *dev,
				 struct xps_dev_maps *dev_maps,
				 int cpu, u16 offset, u16 count)
{
	int num_tc = dev->num_tc ? : 1;
	bool active = false;
	int tci;

	for (tci = cpu * num_tc; num_tc--; tci++) {
		int i, j;

		for (i = count, j = offset; i--; j++) {
			if (!remove_xps_queue(dev_maps, cpu, j))
				break;
		}

		active |= i < 0;
	}

	return active;
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
				   u16 count)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu)
		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
					       offset, count);

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = offset + (count - 1); count--; i--)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}

static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	int i, cpu, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	bool active = false;

	if (dev->num_tc) {
		num_tc = dev->num_tc;
		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_cpu_and(cpu, cpu_online_mask, mask) {
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		tci = cpu * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		/* copy maps belonging to foreign traffic classes */
		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* We need to explicitly update tci as previous loop
		 * could break out early if dev_maps is NULL.
		 */
		tci = cpu * num_tc + tc;

		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;

			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}
	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
	}

	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		for (i = tc, tci = cpu * num_tc; i--; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
			active |= remove_xps_queue(dev_maps, tci, index);
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = dev_maps ?
			      xmap_dereference(dev_maps->cpu_map[tci]) :
			      NULL;
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
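
/*
 * Illustrative sketch (not part of the original file): pinning transmit
 * queue 0 to CPUs 0-1. Drivers usually derive the mask from their IRQ
 * affinity instead of hard-coding it as done here.
 */
static int __maybe_unused example_pin_queue0(struct net_device *dev)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(1, mask);
	err = netif_set_xps_queue(dev, mask, 0);

	free_cpumask_var(mask);
	return err;
}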
void netdev_reset_tc(struct net_device *dev)
{
	netif_reset_xps_queues_gt(dev, 0);

	dev->num_tc = 0;
	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
	if (tc >= dev->num_tc)
		return -EINVAL;

	netif_reset_xps_queues(dev, offset, count);

	dev->tc_to_txq[tc].count = count;
	dev->tc_to_txq[tc].offset = offset;
	return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);
2296 int netdev_set_num_tc(struct net_device
*dev
, u8 num_tc
)
2298 if (num_tc
> TC_MAX_QUEUE
)
2302 netif_reset_xps_queues_gt(dev
, 0);
2304 dev
->num_tc
= num_tc
;
2307 EXPORT_SYMBOL(netdev_set_num_tc
);
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	int rc;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		if (txq < dev->real_num_tx_queues) {
			qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
			netif_reset_xps_queues_gt(dev, txq);
#endif
		}
	}

	dev->real_num_tx_queues = txq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
	return is_kdump_kernel() ?
		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}

void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/**
 *	netif_wake_subqueue - allow sending packets on subqueue
 *	@dev: network device
 *	@queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(txq->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_wake_subqueue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
	if (in_irq() || irqs_disabled())
		__dev_kfree_skb_irq(skb, reason);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);

/**
 *	netif_device_detach - mark device as removed
 *	@dev: network device
 *
 *	Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 *	netif_device_attach - mark device as attached
 *	@dev: network device
 *
 *	Mark device as attached to the system and restart it if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
/*
 * Returns a Tx hash based on the given packet descriptor and a Tx queues'
 * number to be used as a distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
		  unsigned int num_tx_queues)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = num_tx_queues;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= num_tx_queues))
			hash -= num_tx_queues;
		return hash;
	}

	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

		qoffset = dev->tc_to_txq[tc].offset;
		qcount = dev->tc_to_txq[tc].count;
	}

	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
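/* Worked example of the scaling above (illustrative values only):
 * reciprocal_scale(val, ep_ro) computes (u32)(((u64)val * ep_ro) >> 32),
 * which maps a 32-bit hash uniformly onto [0, ep_ro) without a division.
 * With a flow hash of 0x80000000 and a traffic class spanning qcount = 8
 * queues at qoffset = 16:
 *
 *	(0x80000000ULL * 8) >> 32 = 4,  so the selected queue is 16 + 4 = 20.
 */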
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features;
	struct net_device *dev = skb->dev;
	const char *name = "";

	if (!net_ratelimit())
		return;

	if (dev) {
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
	     "gso_type=%d ip_summed=%d\n",
	     name, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
	     skb_shinfo(skb)->gso_type, skb->ip_summed);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
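/* Worked example for the final store above (illustrative values only):
 * csum_fold() folds the 32-bit accumulator into 16 bits and complements it,
 * e.g. a running sum of 0x1abcd folds to ~(0xabcd + 0x1) = ~0xabce = 0x5431.
 * A fold that yields 0x0000 is replaced by CSUM_MANGLED_0 (0xffff), an
 * equivalent value in ones'-complement arithmetic; protocols such as UDP
 * reserve the all-zero checksum to mean "no checksum computed".
 */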
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb_mac_header(skb);
		type = eth->h_proto;
	}

	return __vlan_get_protocol(skb, type, depth);
}

/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);

/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL;

	return skb->ip_summed == CHECKSUM_NONE;
}
/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *segs;

	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		/* We're going to init ->check field in TCP or UDP header */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	/* Only report GSO partial support if it will enable us to
	 * support segmentation on this frame without needing additional
	 * work.
	 */
	if (features & NETIF_F_GSO_PARTIAL) {
		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
		struct net_device *dev = skb->dev;

		partial_features |= dev->features & dev->gso_partial_features;
		if (!skb_gso_ok(skb, features | partial_features))
			features &= ~NETIF_F_GSO_PARTIAL;
	}

	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	segs = skb_mac_gso_segment(skb, features);

	if (unlikely(skb_needs_check(skb, tx_path)))
		skb_warn_bad_offload(skb);

	return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif

static netdev_features_t harmonize_features(struct sk_buff *skb,
					    netdev_features_t features)
{
	int tmp;
	__be16 type;

	type = skb_network_protocol(skb, &tmp);
	features = net_mpls_features(skb, features, type);

	if (skb->ip_summed != CHECKSUM_NONE &&
	    !can_checksum_protocol(features, type)) {
		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
	}
	if (illegal_highdma(skb->dev, skb))
		features &= ~NETIF_F_SG;

	return features;
}

netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);

static netdev_features_t dflt_features_check(const struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
static netdev_features_t gso_features_check(const struct sk_buff *skb,
					    struct net_device *dev,
					    netdev_features_t features)
{
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	if (gso_segs > dev->gso_max_segs)
		return features & ~NETIF_F_GSO_MASK;

	/* Support for GSO partial features requires software
	 * intervention before we can actually process the packets
	 * so we need to strip support for any partial features now
	 * and we can pull them back in after we have partially
	 * segmented the frame.
	 */
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
		features &= ~dev->gso_partial_features;

	/* Make sure to clear the IPv4 ID mangling feature if the
	 * IPv4 header has the potential to be fragmented.
	 */
	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
		struct iphdr *iph = skb->encapsulation ?
				    inner_ip_hdr(skb) : ip_hdr(skb);

		if (!(iph->frag_off & htons(IP_DF)))
			features &= ~NETIF_F_TSO_MANGLEID;
	}

	return features;
}
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;

	if (skb_is_gso(skb))
		features = gso_features_check(skb, dev, features);

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
		dev_queue_xmit_nit(skb, dev);

	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}

struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_xmit_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (skb_vlan_tag_present(skb) &&
	    !vlan_hw_offload_capable(features, skb->vlan_proto))
		skb = __vlan_hwaccel_push_inside(skb);
	return skb;
}

static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_CSUM_MASK) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	atomic_long_inc(&dev->tx_dropped);
	return NULL;
}

struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb->next = NULL;

		/* in case skb won't be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size) {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
			hdr_len += tcp_hdrlen(skb);
		else
			hdr_len += sizeof(struct udphdr);

		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}
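/* Worked example (hypothetical frame, not from the source): a TCP GSO skb
 * with skb->len = 2962, gso_size = 1448 and 66 bytes of headers
 * (14 Ethernet + 20 IPv4 + 32 TCP with timestamps) carries gso_segs = 2.
 * On the wire this becomes two packets, each with its own 66-byte header,
 * so the estimate above is
 *
 *	pkt_len = 2962 + (2 - 1) * 66 = 3028 bytes
 *
 * which matches 2 * 66 + 2 * 1448. For SKB_GSO_DODGY sources the untrusted
 * gso_segs is recomputed from the payload length instead.
 */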
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
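/* Sketch of the fast path taken above (a summary, not additional logic):
 * when the qdisc is work-conserving (TCQ_F_CAN_BYPASS, e.g. pfifo_fast),
 * has no backlog and is not already running, the skb skips the queue
 * entirely and goes straight to sch_direct_xmit(). Everyone else first
 * serializes on q->busylock, so the CPU that owns qdisc->running keeps
 * reacquiring the root lock cheaply and drains packets faster than if
 * all senders contended on the root lock directly.
 */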
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);

	if (!skb->priority && skb->sk && map) {
		unsigned int prioidx =
			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);

		if (prioidx < map->priomap_len)
			skb->priority = map->priomap[prioidx];
	}
}
#else
#define skb_update_prio(skb)
#endif

DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);

/**
 *	dev_loopback_xmit - loop back @skb
 *	@net: network namespace this loopback is happening in
 *	@sk:  sk needed to be a netfilter okfn
 *	@skb: buffer to transmit
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
#ifdef CONFIG_NET_EGRESS
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result cl_res;

	if (!cl)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		unsigned int tci = skb->sender_cpu - 1;

		if (dev->num_tc) {
			tci *= dev->num_tc;
			tci += netdev_get_prio_tc_map(dev, skb->priority);
		}

		map = rcu_dereference(dev_maps->cpu_map[tci]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else
				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
									   map->len)];
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
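/* Worked example (hypothetical values): skb->sender_cpu is stored with a
 * +1 bias so that 0 means "unset"; a packet sent from CPU 2 therefore has
 * sender_cpu = 3 and tci starts at 2. With num_tc = 4 and a priority that
 * maps to traffic class 1, the lookup index becomes
 *
 *	tci = 2 * 4 + 1 = 9
 *
 * and the queue is picked from dev_maps->cpu_map[9], either directly
 * (map->len == 1) or by hashing into the map as above.
 */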
static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);

		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}

struct netdev_queue *netdev_pick_tx(struct net_device *dev,
				    struct sk_buff *skb,
				    void *accel_priv)
{
	int queue_index = 0;

#ifdef CONFIG_XPS
	u32 sender_cpu = skb->sender_cpu - 1;

	if (sender_cpu >= (u32)NR_CPUS)
		skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
							    __netdev_pick_tx);
		else
			queue_index = __netdev_pick_tx(dev, skb);

		if (!accel_priv)
			queue_index = netdev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
# ifdef CONFIG_NET_EGRESS
	if (static_key_false(&egress_needed)) {
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible, that they rely on protection
	 * made by us here.
	 *
	 * Check this and shot the lock. It is not prone to deadlocks.
	 * Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {
			if (unlikely(__this_cpu_read(xmit_recursion) >
				     XMIT_RECURSION_LIMIT))
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);

int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);


/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
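/* Worked example of the match above (hypothetical sizes): with 64 possible
 * CPUs, rps_cpu_mask = 0x3f and each sock flow table entry packs the
 * desired CPU into the low 6 bits and the upper bits of the flow hash into
 * the rest. For hash = 0xabcd1234, the entry at 0xabcd1234 & table->mask
 * is consulted; ident = 0xabcd1211 matches because
 * (ident ^ hash) & ~0x3f == 0, giving next_cpu = 0x11 = CPU 17. A mismatch
 * in the upper bits means the entry belongs to a different flow, and plain
 * RPS is used instead.
 */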
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	unsigned int cpu;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id <= flow_table->mask) {
		rflow = &flow_table->flows[flow_id];
		cpu = ACCESS_ONCE(rflow->cpu);
		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
			   rflow->last_qtail) <
		     (int)(10 * flow_table->mask)))
			expire = false;
	}
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */
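/* Worked example of the expiry test above (hypothetical sizes): with a
 * flow table of mask = 511, a filter is kept while fewer than
 * 10 * 511 = 5110 packets have passed through the destination CPU's
 * backlog since this flow last enqueued one. Beyond that grace window the
 * flow is presumed idle and the hardware filter may be reclaimed.
 */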
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}

#endif /* CONFIG_RPS */

/*
 * Check if this softnet_data structure is another cpu one
 * If yes, queue it to our IPI list and return 1
 * If no, return 0
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd != mysd) {
		sd->rps_ipi_next = mysd->rps_ipi_list;
		mysd->rps_ipi_list = sd;

		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return 1;
	}
#endif /* CONFIG_RPS */
	return 0;
}
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}
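/* Worked example (illustrative): FLOW_LIMIT_HISTORY is 128, so fl->history
 * is a sliding window of the last 128 enqueued flow hashes and
 * fl->buckets[] counts how many of those fall into each bucket. A new
 * packet is dropped once its bucket exceeds half the window (> 64), i.e.
 * when a single flow is responsible for most of a nearly-full backlog,
 * which keeps one elephant flow from starving the other flows' queue space.
 */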
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}

/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */
int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);

int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);

			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
				__kfree_skb(skb);
			else
				__kfree_skb_defer(skb);
		}

		__kfree_skb_flush();
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			spin_lock(root_lock);
			/* We need to make sure head->next_sched is read
			 * before clearing __QDISC_STATE_SCHED
			 */
			smp_mb__before_atomic();
			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			spin_unlock(root_lock);
		}
	}
}

#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
/**
 * netdev_is_rx_handler_busy - check if receive handler is registered
 * @dev: device to check
 *
 * Check if a receive handler is already registered for a given device.
 * Return true if there is one.
 *
 * The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb. A negative errno code is returned
 * on a failure.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 * netdev_rx_handler_unregister - unregister receive handler
 * @dev: device to unregister a handler from
 *
 * Unregister a receive handler from a device.
 *
 * The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_8021Q):
	case htons(ETH_P_8021AD):
		return true;
	default:
		return false;
	}
}

static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (nf_hook_ingress_active(skb)) {
		int ingress_retval;

		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		}

		rcu_read_lock();
		ingress_retval = nf_hook_ingress(skb);
		rcu_read_unlock();
		return ingress_retval;
	}
#endif /* CONFIG_NETFILTER_INGRESS */
	return 0;
}
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		if (!deliver_exact)
			atomic_long_inc(&skb->dev->rx_dropped);
		else
			atomic_long_inc(&skb->dev->rx_nohandler);
		kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}
static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}

static int netif_receive_skb_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();

#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {
			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			rcu_read_unlock();
			return ret;
		}
	}
#endif
	ret = __netif_receive_skb(skb);
	rcu_read_unlock();
	return ret;
}
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
DEFINE_PER_CPU(struct work_struct, flush_works);

/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
	struct sk_buff *skb, *tmp;
	struct softnet_data *sd;

	local_bh_disable();
	sd = this_cpu_ptr(&softnet_data);

	local_irq_disable();
	rps_lock(sd);
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
			__skb_unlink(skb, &sd->input_pkt_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	rps_unlock(sd);
	local_irq_enable();

	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	local_bh_enable();
}

static void flush_all_backlogs(void)
{
	unsigned int cpu;

	get_online_cpus();

	for_each_online_cpu(cpu)
		queue_work_on(cpu, system_highpri_wq,
			      per_cpu_ptr(&flush_works, cpu));

	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(&flush_works, cpu));

	put_online_cpus();
}
static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int err = -ENOENT;

	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

	if (NAPI_GRO_CB(skb)->count == 1) {
		skb_shinfo(skb)->gso_size = 0;
		goto out;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = ptype->callbacks.gro_complete(skb, 0);
		break;
	}
	rcu_read_unlock();

	if (err) {
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	return netif_receive_skb_internal(skb);
}

/* napi->gro_list contains packets ordered by age.
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	struct sk_buff *skb, *prev = NULL;

	/* scan list and build reverse chain */
	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
		skb->prev = prev;
		prev = skb;
	}

	for (skb = prev; skb; skb = prev) {
		skb->next = NULL;

		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;

		prev = skb->prev;
		napi_gro_complete(skb);
		napi->gro_count--;
	}

	napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);
static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		diffs |= skb_metadata_dst_cmp(p, skb);
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}
static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
						    skb_frag_size(frag0),
						    skb->end - skb->tail);
	}
}

static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	skb->data_len -= grow;
	skb->tail += grow;

	pinfo->frags[0].page_offset += grow;
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->encap_mark = 0;
		NAPI_GRO_CB(skb)->recursion_counter = 0;
		NAPI_GRO_CB(skb)->is_fou = 0;
		NAPI_GRO_CB(skb)->is_atomic = 1;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
struct packet_offload *gro_find_receive_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);

struct packet_offload *gro_find_complete_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);

static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
			skb_dst_drop(skb);
			kmem_cache_free(skbuff_head_cache, skb);
		} else {
			__kfree_skb(skb);
		}
		break;

	case GRO_HELD:
	case GRO_MERGED:
		break;
	}

	return ret;
}
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	skb_mark_napi_id(skb, napi);
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);

static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
	__skb_pull(skb, skb_headlen(skb));
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
	skb->vlan_tci = 0;
	skb->dev = napi->dev;
	skb->skb_iif = 0;
	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	napi->skb = skb;
}
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;

	if (!skb) {
		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
		if (skb) {
			napi->skb = skb;
			skb_mark_napi_id(skb, napi);
		}
	}
	return skb;
}
EXPORT_SYMBOL(napi_get_frags);
static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
	case GRO_MERGED_FREE:
		napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED:
		break;
	}

	return ret;
}
/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
					     __func__, napi->dev->name);
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi_frags_skb(napi);

	if (!skb)
		return GRO_DROP;

	trace_napi_gro_frags_entry(skb);

	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
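/*
 * Worked example for the fold above (illustrative values): if the pseudo
 * checksum plus the payload sum yields the 32-bit value 0x0001fffe,
 * csum_fold() adds the carry half back in (0x0001 + 0xfffe = 0xffff) and
 * complements it, so the returned __sum16 is 0x0000, the "checksum is
 * valid" result that callers test against zero.
 */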
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				smp_call_function_single_async(remsd->cpu,
							       &remsd->csd);
			remsd = next;
		}
	} else
#endif
		local_irq_enable();
}

static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	return sd->rps_ipi_list != NULL;
#else
	return false;
#endif
}

static int process_backlog(struct napi_struct *napi, int quota)
{
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
	bool again = true;
	int work = 0;

	/* Check if we have pending ipi, it's better to send them now,
	 * not waiting for net_rx_action() to end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	while (again) {
		struct sk_buff *skb;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			input_queue_head_incr(sd);
			if (++work >= quota)
				return work;
		}

		local_irq_disable();
		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we don't need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			again = false;
		} else {
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
		}
		rps_unlock(sd);
		local_irq_enable();
	}

	return work;
}
/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);

/**
 * napi_schedule_prep - check if napi can be scheduled
 * @n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 */
bool napi_schedule_prep(struct napi_struct *n)
{
	unsigned long val, new;

	do {
		val = READ_ONCE(n->state);
		if (unlikely(val & NAPIF_STATE_DISABLE))
			return false;
		new = val | NAPIF_STATE_SCHED;

		/* Sets STATE_MISSED bit if STATE_SCHED was already set
		 * This was suggested by Alexander Duyck, as compiler
		 * emits better code than :
		 * if (val & NAPIF_STATE_SCHED)
		 *     new |= NAPIF_STATE_MISSED;
		 */
		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
						   NAPIF_STATE_MISSED;
	} while (cmpxchg(&n->state, val, new) != val);

	return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);

/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() assuming hard irqs are masked
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);

bool __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

	/* Some drivers call us directly, instead of calling
	 * napi_complete_done().
	 */
	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
		return false;

	list_del_init(&n->poll_list);
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
	return true;
}
EXPORT_SYMBOL(__napi_complete);
bool napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags, val, new;

	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case it's running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
		return false;

	if (n->gro_list) {
		unsigned long timeout = 0;

		if (work_done)
			timeout = n->dev->gro_flush_timeout;

		if (timeout)
			hrtimer_start(&n->timer, ns_to_ktime(timeout),
				      HRTIMER_MODE_REL_PINNED);
		else
			napi_gro_flush(n, false);
	}
	if (unlikely(!list_empty(&n->poll_list))) {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		list_del_init(&n->poll_list);
		local_irq_restore(flags);
	}

	do {
		val = READ_ONCE(n->state);

		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (cmpxchg(&n->state, val, new) != val);

	if (unlikely(val & NAPIF_STATE_MISSED)) {
		__napi_schedule(n);
		return false;
	}

	return true;
}
EXPORT_SYMBOL(napi_complete_done);
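/*
 * Illustrative sketch (not part of this file): the contract the completion
 * helpers above rely on.  A driver's poll routine only completes NAPI when
 * it did strictly less work than its budget; the helper names are
 * hypothetical.
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = mydrv_clean_rx(napi, budget);	// hypothetical
 *
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			mydrv_enable_rx_irq(napi);		// hypothetical
 *		}
 *		return work;
 *	}
 */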
/* must be called under rcu_read_lock(), as we don't take a reference */
static struct napi_struct *napi_by_id(unsigned int napi_id)
{
	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
	struct napi_struct *napi;

	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
		if (napi->napi_id == napi_id)
			return napi;

	return NULL;
}

#if defined(CONFIG_NET_RX_BUSY_POLL)

#define BUSY_POLL_BUDGET 8

static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
{
	int rc;

	/* Busy polling means there is a high chance device driver hard irq
	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
	 * set in napi_schedule_prep().
	 * Since we are about to call napi->poll() once more, we can safely
	 * clear NAPI_STATE_MISSED.
	 *
	 * Note: x86 could use a single "lock and ..." instruction
	 * to perform these two clear_bit()
	 */
	clear_bit(NAPI_STATE_MISSED, &napi->state);
	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

	local_bh_disable();

	/* All we really want here is to re-enable device interrupts.
	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
	 */
	rc = napi->poll(napi, BUSY_POLL_BUDGET);
	netpoll_poll_unlock(have_poll_lock);
	if (rc == BUSY_POLL_BUDGET)
		__napi_schedule(napi);
	local_bh_enable();
	if (local_softirq_pending())
		do_softirq();
}
bool sk_busy_loop(struct sock *sk, int nonblock)
{
	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
	int (*napi_poll)(struct napi_struct *napi, int budget);
	int (*busy_poll)(struct napi_struct *dev);
	void *have_poll_lock = NULL;
	struct napi_struct *napi;
	int rc;

restart:
	rc = false;
	napi_poll = NULL;

	rcu_read_lock();

	napi = napi_by_id(sk->sk_napi_id);
	if (!napi)
		goto out;

	/* Note: ndo_busy_poll method is optional in linux-4.5 */
	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;

	preempt_disable();
	for (;;) {
		rc = 0;
		local_bh_disable();
		if (busy_poll) {
			rc = busy_poll(napi);
			goto count;
		}
		if (!napi_poll) {
			unsigned long val = READ_ONCE(napi->state);

			/* If multiple threads are competing for this napi,
			 * we avoid dirtying napi->state as much as we can.
			 */
			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
				   NAPIF_STATE_IN_BUSY_POLL))
				goto count;
			if (cmpxchg(&napi->state, val,
				    val | NAPIF_STATE_IN_BUSY_POLL |
					  NAPIF_STATE_SCHED) != val)
				goto count;
			have_poll_lock = netpoll_poll_lock(napi);
			napi_poll = napi->poll;
		}
		rc = napi_poll(napi, BUSY_POLL_BUDGET);
		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
count:
		if (rc > 0)
			__NET_ADD_STATS(sock_net(sk),
					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
		local_bh_enable();

		if (rc == LL_FLUSH_FAILED)
			break; /* permanent failure */

		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
		    busy_loop_timeout(end_time))
			break;

		if (unlikely(need_resched())) {
			if (napi_poll)
				busy_poll_stop(napi, have_poll_lock);
			preempt_enable();
			rcu_read_unlock();
			cond_resched();
			rc = !skb_queue_empty(&sk->sk_receive_queue);
			if (rc || busy_loop_timeout(end_time))
				return rc;
			goto restart;
		}
		cpu_relax();
	}
	if (napi_poll)
		busy_poll_stop(napi, have_poll_lock);
	preempt_enable();
	rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL(sk_busy_loop);

#endif /* CONFIG_NET_RX_BUSY_POLL */
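/*
 * Illustrative sketch (not part of this file): socket receive paths of
 * this era spin in sk_busy_loop() before sleeping, roughly:
 *
 *	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
 *		sk_busy_loop(sk, flags & MSG_DONTWAIT);
 */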
static void napi_hash_add(struct napi_struct *napi)
{
	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
		return;

	spin_lock(&napi_hash_lock);

	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
	do {
		if (unlikely(++napi_gen_id < NR_CPUS + 1))
			napi_gen_id = NR_CPUS + 1;
	} while (napi_by_id(napi_gen_id));
	napi->napi_id = napi_gen_id;

	hlist_add_head_rcu(&napi->napi_hash_node,
			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);

	spin_unlock(&napi_hash_lock);
}

/* Warning : caller is responsible to make sure rcu grace period
 * is respected before freeing memory containing @napi
 */
bool napi_hash_del(struct napi_struct *napi)
{
	bool rcu_sync_needed = false;

	spin_lock(&napi_hash_lock);

	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
		rcu_sync_needed = true;
		hlist_del_rcu(&napi->napi_hash_node);
	}
	spin_unlock(&napi_hash_lock);
	return rcu_sync_needed;
}
EXPORT_SYMBOL_GPL(napi_hash_del);

static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
	struct napi_struct *napi;

	napi = container_of(timer, struct napi_struct, timer);

	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
	 */
	if (napi->gro_list && !napi_disable_pending(napi) &&
	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
		__napi_schedule_irqoff(napi);

	return HRTIMER_NORESTART;
}
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	napi->timer.function = napi_watchdog;
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
			    weight, dev->name);
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
	napi_hash_add(napi);
}
EXPORT_SYMBOL(netif_napi_add);
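/*
 * Illustrative sketch (not part of this file): the usual probe-time pairing
 * for the function above.  The driver names are hypothetical; napi_enable()
 * clears the NAPI_STATE_SCHED bit set here so the instance can actually be
 * scheduled.
 *
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	// typically from ndo_open
 */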
void napi_disable(struct napi_struct *n)
{
	might_sleep();
	set_bit(NAPI_STATE_DISABLE, &n->state);

	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		msleep(1);
	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
		msleep(1);

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);

/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
	might_sleep();
	if (napi_hash_del(napi))
		synchronize_net();
	list_del_init(&napi->dev_list);
	napi_free_frags(napi);

	kfree_skb_list(napi->gro_list);
	napi->gro_list = NULL;
	napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	WARN_ON_ONCE(work > weight);

	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_list) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}

static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which allows
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}
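/*
 * A worked example of the two limits above (figures are the usual defaults,
 * not guarantees): with netdev_budget at its default of 300 and HZ=1000,
 * one net_rx_action() round ends after at most 300 packets' worth of NAPI
 * weight or roughly 2ms, whichever comes first; any unfinished NAPI
 * instances stay on sd->poll_list for the re-raised softirq.
 */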
struct netdev_adjacent {
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	struct list_head list;
	struct rcu_head rcu;
};

static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
						 struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	list_for_each_entry(adj, adj_list, list) {
		if (adj->dev == adj_dev)
			return adj;
	}
	return NULL;
}

static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *dev = data;

	return upper_dev == dev;
}
/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks only immediate upper device,
 * not through a complete stack of devices. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
			  struct net_device *upper_dev)
{
	ASSERT_RTNL();

	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
					     upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev);

/**
 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks the entire upper device chain.
 * The caller must hold rcu lock.
 */
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
				  struct net_device *upper_dev)
{
	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
					       upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);

/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_upper_dev(struct net_device *dev)
{
	ASSERT_RTNL();

	return !list_empty(&dev->adj_list.upper);
}

/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RTNL lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	ASSERT_RTNL();

	if (list_empty(&dev->adj_list.upper))
		return NULL;

	upper = list_first_entry(&dev->adj_list.upper,
				 struct netdev_adjacent, list);
	if (likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);

/**
 * netdev_has_any_lower_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to a lower device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_lower_dev(struct net_device *dev)
{
	ASSERT_RTNL();

	return !list_empty(&dev->adj_list.lower);
}

void *netdev_adjacent_get_private(struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	adj = list_entry(adj_list, struct netdev_adjacent, list);

	return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);
/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
						 struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);

static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
						    struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}

int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
				  int (*fn)(struct net_device *dev,
					    void *data),
				  void *data)
{
	struct net_device *udev;
	struct list_head *iter;
	int ret;

	for (iter = &dev->adj_list.upper,
	     udev = netdev_next_upper_dev_rcu(dev, &iter);
	     udev;
	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
		/* first is the upper device itself */
		ret = fn(udev, data);
		if (ret)
			return ret;

		/* then look at all of its upper devices */
		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
		if (ret)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
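/*
 * Illustrative sketch (not part of this file): a typical use of the walker
 * above is an "is this device stacked under X" style predicate.  The
 * callback below is hypothetical; the walk invokes it once per upper
 * device and stops as soon as it returns non-zero.
 *
 *	static int my_upper_is_bridge(struct net_device *upper, void *data)
 *	{
 *		return netif_is_bridge_master(upper);
 *	}
 *
 *	rcu_read_lock();
 *	stacked = netdev_walk_all_upper_dev_rcu(dev, my_upper_is_bridge, NULL);
 *	rcu_read_unlock();
 */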
/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *				   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold either the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
				    struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry(*iter, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = lower->list.next;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *				       lower neighbour list, RCU
 *				       variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RCU read lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
					struct list_head **iter)
{
	struct netdev_adjacent *lower;

	WARN_ON_ONCE(!rcu_read_lock_held());

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *			   list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry(*iter, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = lower->list.next;

	return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);
static struct net_device *netdev_next_lower_dev(struct net_device *dev,
						struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->dev;
}

int netdev_walk_all_lower_dev(struct net_device *dev,
			      int (*fn)(struct net_device *dev,
					void *data),
			      void *data)
{
	struct net_device *ldev;
	struct list_head *iter;
	int ret;

	for (iter = &dev->adj_list.lower,
	     ldev = netdev_next_lower_dev(dev, &iter);
	     ldev;
	     ldev = netdev_next_lower_dev(dev, &iter)) {
		/* first is the lower device itself */
		ret = fn(ldev, data);
		if (ret)
			return ret;

		/* then look at all of its lower devices */
		ret = netdev_walk_all_lower_dev(ldev, fn, data);
		if (ret)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);

static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
						    struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->dev;
}

int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
				  int (*fn)(struct net_device *dev,
					    void *data),
				  void *data)
{
	struct net_device *ldev;
	struct list_head *iter;
	int ret;

	for (iter = &dev->adj_list.lower,
	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
	     ldev;
	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
		/* first is the lower device itself */
		ret = fn(ldev, data);
		if (ret)
			return ret;

		/* then look at all of its lower devices */
		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
		if (ret)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *					lower neighbour list, RCU
 *					variant
 * @dev: device
 *
 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 * list. The caller must hold RCU read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
	struct netdev_adjacent *lower;

	lower = list_first_or_null_rcu(&dev->adj_list.lower,
				       struct netdev_adjacent, list);
	if (lower)
		return lower->private;
	return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	upper = list_first_or_null_rcu(&dev->adj_list.upper,
				       struct netdev_adjacent, list);
	if (upper && likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
static int netdev_adjacent_sysfs_add(struct net_device *dev,
				     struct net_device *adj_dev,
				     struct list_head *dev_list)
{
	char linkname[IFNAMSIZ+7];
	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", adj_dev->name);
	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
				 linkname);
}

static void netdev_adjacent_sysfs_del(struct net_device *dev,
				      char *name,
				      struct list_head *dev_list)
{
	char linkname[IFNAMSIZ+7];
	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", name);
	sysfs_remove_link(&(dev->dev.kobj), linkname);
}

static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
						 struct net_device *adj_dev,
						 struct list_head *dev_list)
{
	return (dev_list == &dev->adj_list.upper ||
		dev_list == &dev->adj_list.lower) &&
	       net_eq(dev_net(dev), dev_net(adj_dev));
}
static int __netdev_adjacent_dev_insert(struct net_device *dev,
					struct net_device *adj_dev,
					struct list_head *dev_list,
					void *private, bool master)
{
	struct netdev_adjacent *adj;
	int ret;

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (adj) {
		adj->ref_nr += 1;
		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
			 dev->name, adj_dev->name, adj->ref_nr);

		return 0;
	}

	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
	if (!adj)
		return -ENOMEM;

	adj->dev = adj_dev;
	adj->master = master;
	adj->ref_nr = 1;
	adj->private = private;
	dev_hold(adj_dev);

	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
		if (ret)
			goto free_adj;
	}

	/* Ensure that master link is always the first item in list. */
	if (master) {
		ret = sysfs_create_link(&(dev->dev.kobj),
					&(adj_dev->dev.kobj), "master");
		if (ret)
			goto remove_symlinks;

		list_add_rcu(&adj->list, dev_list);
	} else {
		list_add_tail_rcu(&adj->list, dev_list);
	}

	return 0;

remove_symlinks:
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
	kfree(adj);
	dev_put(adj_dev);

	return ret;
}

static void __netdev_adjacent_dev_remove(struct net_device *dev,
					 struct net_device *adj_dev,
					 u16 ref_nr,
					 struct list_head *dev_list)
{
	struct netdev_adjacent *adj;

	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
		 dev->name, adj_dev->name, ref_nr);

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (!adj) {
		pr_err("Adjacency does not exist for device %s from %s\n",
		       dev->name, adj_dev->name);
		WARN_ON(1);
		return;
	}

	if (adj->ref_nr > ref_nr) {
		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
			 dev->name, adj_dev->name, ref_nr,
			 adj->ref_nr - ref_nr);
		adj->ref_nr -= ref_nr;
		return;
	}

	if (adj->master)
		sysfs_remove_link(&(dev->dev.kobj), "master");

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

	list_del_rcu(&adj->list);
	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);
	dev_put(adj_dev);
	kfree_rcu(adj, rcu);
}
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
{
	int ret;

	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
					   private, master);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
					   private, false);
	if (ret) {
		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
		return ret;
	}

	return 0;
}

static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       u16 ref_nr,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}

static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
						struct net_device *upper_dev,
						void *private, bool master)
{
	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
						&dev->adj_list.upper,
						&upper_dev->adj_list.lower,
						private, master);
}

static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
						   struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
					   &dev->adj_list.upper,
					   &upper_dev->adj_list.lower);
}
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *upper_priv, void *upper_info)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (netdev_has_upper_dev(upper_dev, dev))
		return -EBUSY;

	if (netdev_has_upper_dev(dev, upper_dev))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = master;
	changeupper_info.linking = true;
	changeupper_info.upper_info = upper_info;

	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
						   master);
	if (ret)
		return ret;

	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		goto rollback;

	return 0;

rollback:
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
			  struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
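/*
 * Illustrative sketch (not part of this file): a VLAN-style driver links
 * its virtual device above the real one under RTNL, roughly like this
 * (device names and the unwind label are hypothetical):
 *
 *	ASSERT_RTNL();
 *	err = netdev_upper_dev_link(real_dev, vlan_dev);
 *	if (err)
 *		goto unwind;				// hypothetical
 *	...
 *	netdev_upper_dev_unlink(real_dev, vlan_dev);	// on teardown
 */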
/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev,
				 void *upper_priv, void *upper_info)
{
	return __netdev_upper_dev_link(dev, upper_dev, true,
				       upper_priv, upper_info);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: upper device to unlink
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_notifier_changeupper_info changeupper_info;

	ASSERT_RTNL();

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
	changeupper_info.linking = false;

	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
				      &changeupper_info.info);

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
				      &changeupper_info.info);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info)
{
	struct netdev_notifier_bonding_info info;

	memcpy(&info.bonding_info, bonding_info,
	       sizeof(struct netdev_bonding_info));
	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
				      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);
static void netdev_adjacent_add_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.lower);
	}
}

static void netdev_adjacent_del_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.lower);
	}
}

void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
	}
}
void *netdev_lower_dev_get_private(struct net_device *dev,
				   struct net_device *lower_dev)
{
	struct netdev_adjacent *lower;

	if (!lower_dev)
		return NULL;
	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
	if (!lower)
		return NULL;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);

int dev_get_nest_level(struct net_device *dev)
{
	struct net_device *lower = NULL;
	struct list_head *iter;
	int max_nest = 0;
	int nest;

	ASSERT_RTNL();

	netdev_for_each_lower_dev(dev, lower, iter) {
		nest = dev_get_nest_level(lower);
		if (max_nest < nest)
			max_nest = nest;
	}

	return max_nest + 1;
}
EXPORT_SYMBOL(dev_get_nest_level);

/**
 * netdev_lower_state_changed - Dispatch event about lower device state change
 * @lower_dev: device
 * @lower_state_info: state to dispatch
 *
 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_lower_state_changed(struct net_device *lower_dev,
				void *lower_state_info)
{
	struct netdev_notifier_changelowerstate_info changelowerstate_info;

	ASSERT_RTNL();
	changelowerstate_info.lower_state_info = lower_state_info;
	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
				      &changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);
int netdev_default_l2upper_neigh_construct(struct net_device *dev,
					   struct neighbour *n)
{
	struct net_device *lower_dev, *stop_dev;
	struct list_head *iter;
	int err;

	netdev_for_each_lower_dev(dev, lower_dev, iter) {
		if (!lower_dev->netdev_ops->ndo_neigh_construct)
			continue;
		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
		if (err) {
			stop_dev = lower_dev;
			goto rollback;
		}
	}
	return 0;

rollback:
	netdev_for_each_lower_dev(dev, lower_dev, iter) {
		if (lower_dev == stop_dev)
			break;
		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
			continue;
		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
	}
	return err;
}
EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);

void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
					  struct neighbour *n)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	netdev_for_each_lower_dev(dev, lower_dev, iter) {
		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
			continue;
		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
	}
}
EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);

static void dev_change_rx_flags(struct net_device *dev, int flags)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_rx_flags)
		ops->ndo_change_rx_flags(dev, flags);
}
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				  AUDIT_ANOM_PROMISCUOUS,
				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				  dev->name, (dev->flags & IFF_PROMISC),
				  (old_flags & IFF_PROMISC),
				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
				  from_kuid(&init_user_ns, uid),
				  from_kgid(&init_user_ns, gid),
				  audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned int old_flags = dev->flags;
	int err;

	err = __dev_set_promiscuity(dev, inc, true);
	if (err < 0)
		return err;
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
	return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
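/*
 * Illustrative sketch (not part of this file): packet-capture style code
 * takes a promiscuity reference while capturing and drops it afterwards,
 * letting the counter above decide when the interface actually leaves
 * promiscuous mode.  Must run under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter (or nest) promisc
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// release our reference
 *	rtnl_unlock();
 */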
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
	}
	return 0;
}

/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all multicast frames. Once it hits zero the device reverts back to
 * normal filtering operation. A negative @inc value is used to drop the
 * counter when releasing a resource needing all multicasts.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
/*
 * Upload unicast and multicast address lists to device and
 * configure RX filtering. When the device doesn't support unicast
 * filtering it is put in promiscuous mode while unicast addresses
 * are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags & IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1, false);
			dev->uc_promisc = true;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1, false);
			dev->uc_promisc = false;
		}
	}

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
}

void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}

/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
	unsigned int flags;

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 * Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 * Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 * Have we downed the interface? We handle IFF_UP ourselves
	 * according to user attempts to set it, rather than blindly
	 * setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP)
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	 * is important. Some (broken) drivers set IFF_PROMISC when
	 * IFF_ALLMULTI is requested, without asking us and without reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}

void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
			unsigned int gchanges)
{
	unsigned int changes = dev->flags ^ old_flags;

	if (gchanges)
		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);

	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = changes;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
	}
}

/**
 * dev_change_flags - change device settings
 * @dev: device
 * @flags: device state flags
 *
 * Change settings on device based state flags. The flags are
 * in the userspace exported format.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int ret;
	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
	__dev_notify_flags(dev, old_flags, changes);
	return ret;
}
EXPORT_SYMBOL(dev_change_flags);
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_mtu)
		return ops->ndo_change_mtu(dev, new_mtu);

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL(__dev_set_mtu);

/**
 * dev_set_mtu - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 *
 * Change the maximum transfer size of the network device.
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	int err, orig_mtu;

	if (new_mtu == dev->mtu)
		return 0;

	/* MTU must be positive, and in range */
	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
				    dev->name, new_mtu, dev->min_mtu);
		return -EINVAL;
	}

	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
				    dev->name, new_mtu, dev->max_mtu);
		return -EINVAL;
	}

	if (!netif_device_present(dev))
		return -ENODEV;

	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	orig_mtu = dev->mtu;
	err = __dev_set_mtu(dev, new_mtu);

	if (!err) {
		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
		err = notifier_to_errno(err);
		if (err) {
			/* setting mtu back and notifying everyone again,
			 * so that they have a chance to revert changes.
			 */
			__dev_set_mtu(dev, orig_mtu);
			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
		}
	}
	return err;
}
EXPORT_SYMBOL(dev_set_mtu);
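/*
 * Illustrative sketch (not part of this file): callers change the MTU under
 * RTNL and must be prepared for the notifier-driven revert above.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);	// jumbo frames, if within min/max_mtu
 *	rtnl_unlock();
 *	if (err)
 *		netdev_err(dev, "MTU change rejected: %d\n", err);
 */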
/**
 * dev_set_group - Change group this device belongs to
 * @dev: device
 * @new_group: group this device should belong to
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
/**
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 *
 * Change the hardware (MAC) address of the device
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = ops->ndo_set_mac_address(dev, sa);
	if (err)
		return err;
	dev->addr_assign_type = NET_ADDR_SET;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);
	return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);

/**
 * dev_change_carrier - Change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Change device carrier
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_carrier)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);

/**
 * dev_get_phys_port_id - Get device physical port ID
 * @dev: device
 * @ppid: port ID
 *
 * Get device physical port ID
 */
int dev_get_phys_port_id(struct net_device *dev,
			 struct netdev_phys_item_id *ppid)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_id)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);

/**
 * dev_get_phys_port_name - Get device physical port name
 * @dev: device
 * @name: port name
 * @len: limit of bytes to copy to name
 *
 * Get device physical port name
 */
int dev_get_phys_port_name(struct net_device *dev,
			   char *name, size_t len)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_name)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_name(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);

/**
 * dev_change_proto_down - update protocol port state information
 * @dev: device
 * @proto_down: new value
 *
 * This info can be used by switch drivers to set the phys state of the
 * port.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_proto_down)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);
/**
 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
 * @dev: device
 * @fd: new program fd or negative value to clear
 * @flags: xdp-related flags
 *
 * Set or clear a bpf program for a device
 */
int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct bpf_prog *prog = NULL;
	struct netdev_xdp xdp;
	int err;

	ASSERT_RTNL();

	if (!ops->ndo_xdp)
		return -EOPNOTSUPP;
	if (fd >= 0) {
		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
			memset(&xdp, 0, sizeof(xdp));
			xdp.command = XDP_QUERY_PROG;

			err = ops->ndo_xdp(dev, &xdp);
			if (err < 0)
				return err;
			if (xdp.prog_attached)
				return -EBUSY;
		}

		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	memset(&xdp, 0, sizeof(xdp));
	xdp.command = XDP_SETUP_PROG;
	xdp.prog = prog;

	err = ops->ndo_xdp(dev, &xdp);
	if (err < 0 && prog)
		bpf_prog_put(prog);

	return err;
}
EXPORT_SYMBOL(dev_change_xdp_fd);
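/*
 * Illustrative sketch (not part of this file): rtnetlink attaches an XDP
 * program from a user-supplied fd roughly like this; prog_fd is a
 * hypothetical variable, and a negative fd detaches the current program.
 *
 *	err = dev_change_xdp_fd(dev, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST);
 *	...
 *	err = dev_change_xdp_fd(dev, -1, 0);	// clear the program
 */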
/**
 * dev_new_index - allocate an ifindex
 * @net: the applicable net namespace
 *
 * Returns a suitable unique value for a new device interface
 * number. The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	int ifindex = net->ifindex;
	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return net->ifindex = ifindex;
	}
}

/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);

static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}
	flush_all_backlogs();

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);

		/* Notify protocols that we are about to destroy
		 * this device. They should clean all the things.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 * Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));
		WARN_ON(netdev_has_any_lower_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}

static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
	struct net_device *upper, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(upper->wanted_features & feature)
		    && (features & feature)) {
			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
				   &feature, upper->name);
			features &= ~feature;
		}
	}

	return features;
}

static void netdev_sync_lower_features(struct net_device *upper,
	struct net_device *lower, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(features & feature) && (lower->features & feature)) {
			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
				   &feature, lower->name);
			lower->wanted_features &= ~feature;
			netdev_update_features(lower);

			if (unlikely(lower->features & feature))
				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
					    &feature, lower->name);
		}
	}
}
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
		features &= ~NETIF_F_TSO_MANGLEID;

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* UFO needs SG and checksumming */
	if (features & NETIF_F_UFO) {
		/* maybe split UFO into V4 and V6? */
		if (!(features & NETIF_F_HW_CSUM) &&
		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no checksum offload features.\n");
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
			features &= ~NETIF_F_UFO;
		}
	}

	/* GSO partial features require GSO partial be set */
	if ((features & dev->gso_partial_features) &&
	    !(features & NETIF_F_GSO_PARTIAL)) {
		netdev_dbg(dev,
			   "Dropping partially supported GSO features since no GSO partial.\n");
		features &= ~dev->gso_partial_features;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	if (dev->netdev_ops->ndo_busy_poll)
		features |= NETIF_F_BUSY_POLL;
	else
#endif
		features &= ~NETIF_F_BUSY_POLL;

	return features;
}
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err)
		dev->features = features;

	return err < 0 ? 0 : 1;
}
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
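
/*
 * Example (editor's sketch; "foo" is hypothetical): a driver whose legal
 * feature set depends on runtime state re-runs the feature machinery after
 * that state changes.  ndo_change_mtu is invoked with the RTNL lock already
 * held, which netdev_update_features() requires:
 *
 *	static int foo_change_mtu(struct net_device *dev, int new_mtu)
 *	{
 *		dev->mtu = new_mtu;
 *		netdev_update_features(dev);
 *		return 0;
 *	}
 */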
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
/**
 *	netif_stacked_transfer_operstate -	transfer operstate
 *	@rootdev: the root or lower level device to transfer state from
 *	@dev: the device to transfer operstate to
 *
 *	Transfer operational state from root to device. This is normally
 *	called when a stacking relationship exists between the root
 *	device and the device (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
				      struct net_device *dev)
{
	if (rootdev->operstate == IF_OPER_DORMANT)
		netif_dormant_on(dev);
	else
		netif_dormant_off(dev);

	if (netif_carrier_ok(rootdev)) {
		if (!netif_carrier_ok(dev))
			netif_carrier_on(dev);
	} else {
		if (netif_carrier_ok(dev))
			netif_carrier_off(dev);
	}
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
	struct netdev_rx_queue *rx;
	size_t sz = count * sizeof(*rx);

	BUG_ON(count < 1);

	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!rx) {
		rx = vzalloc(sz);
		if (!rx)
			return -ENOMEM;
	}
	dev->_rx = rx;

	for (i = 0; i < count; i++)
		rx[i].dev = dev;
	return 0;
}
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	queue->dev = dev;
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
}
static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}
static int netif_alloc_netdev_queues(struct net_device *dev)
{
	unsigned int count = dev->num_tx_queues;
	struct netdev_queue *tx;
	size_t sz = count * sizeof(*tx);

	if (count < 1 || count > 0xffff)
		return -EINVAL;

	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!tx) {
		tx = vzalloc(sz);
		if (!tx)
			return -ENOMEM;
	}
	dev->_tx = tx;

	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);

	return 0;
}
void netif_tx_stop_all_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		netif_tx_stop_queue(txq);
	}
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
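
/*
 * Example (editor's sketch; "foo" names are hypothetical): drivers normally
 * call netif_tx_stop_all_queues() while quiescing the device, e.g. from
 * their ndo_stop hook:
 *
 *	static int foo_stop(struct net_device *dev)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		netif_tx_stop_all_queues(dev);
 *		napi_disable(&priv->napi);
 *		foo_hw_down(priv);
 *		return 0;
 *	}
 */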
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK))
		dev->hw_features |= NETIF_F_NOCACHE_COPY;

	/* If IPv4 TCP segmentation offload is supported we should also
	 * allow the device to enable segmenting the frame with the option
	 * of ignoring a static IP ID value. This doesn't enable the
	 * feature itself but allows the user to enable it later.
	 */
	if (dev->hw_features & NETIF_F_TSO)
		dev->hw_features |= NETIF_F_TSO_MANGLEID;
	if (dev->vlan_features & NETIF_F_TSO)
		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
	if (dev->mpls_features & NETIF_F_TSO)
		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
	if (dev->hw_enc_features & NETIF_F_TSO)
		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
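
/*
 * Example (editor's sketch): register_netdevice() is for callers that must
 * do more work under the same RTNL critical section, e.g. linking the new
 * device to an upper device before anyone can observe a half-built state
 * ("upper_dev" is a hypothetical existing device):
 *
 *	rtnl_lock();
 *	err = register_netdevice(dev);
 *	if (!err) {
 *		err = netdev_upper_dev_link(dev, upper_dev);
 *		if (err)
 *			unregister_netdevice(dev);
 *	}
 *	rtnl_unlock();
 */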
/**
 *	init_dummy_netdev	- init a dummy network device for NAPI
 *	@dev: device to init
 *
 *	This takes a network device structure and initializes the minimum
 *	amount of fields so it can be used to schedule NAPI polls without
 *	registering a full blown interface. This is to be used by drivers
 *	that need to tie several hardware interfaces to a single NAPI
 *	poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note : We don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
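
/*
 * Example (editor's sketch; "foo" names are hypothetical): a driver that
 * multiplexes several hardware ports onto one interrupt can hang its NAPI
 * context off a dummy netdev instead of any one real port:
 *
 *	struct foo_adapter {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	init_dummy_netdev(&adapter->napi_dev);
 *	netif_napi_add(&adapter->napi_dev, &adapter->napi, foo_poll, 64);
 *	napi_enable(&adapter->napi);
 */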
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
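
/*
 * Example (editor's sketch; "foo" is hypothetical): the common driver probe
 * pattern.  The "eth%d" format in dev->name (the alloc_etherdev() default)
 * is expanded by this call:
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */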
int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 * all the same fields in the same order as net_device_stats, with only
 * the type differing, but rtnl_link_stats64 may have additional fields
 * at the end for newer counters.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
	/* zero out counters that only exist in rtnl_link_stats64 */
	memset((char *)stats64 + sizeof(*netdev_stats), 0,
	       sizeof(*stats64) - sizeof(*netdev_stats));
#else
	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
	const unsigned long *src = (const unsigned long *)netdev_stats;
	u64 *dst = (u64 *)stats64;

	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
	for (i = 0; i < n; i++)
		dst[i] = src[i];
	/* zero out counters that only exist in rtnl_link_stats64 */
	memset((char *)stats64 + n * sizeof(u64), 0,
	       sizeof(*stats64) - n * sizeof(u64));
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
/**
 *	dev_get_stats	- get network device statistics
 *	@dev: device to get statistics from
 *	@storage: place to store stats
 *
 *	Get network statistics from device. Return @storage.
 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *	otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
	}
	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
	return storage;
}
EXPORT_SYMBOL(dev_get_stats);
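
/*
 * Example (editor's sketch; "foo" names are hypothetical): a driver keeping
 * its own counters fills @storage via ndo_get_stats64.  dev_get_stats() has
 * already zeroed @storage, so only non-zero fields need to be written.  (The
 * hook's return type differs across kernel versions; this matches the one
 * used above.)
 *
 *	static struct rtnl_link_stats64 *
 *	foo_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		s->rx_packets = priv->rx_packets;
 *		s->rx_bytes   = priv->rx_bytes;
 *		s->tx_packets = priv->tx_packets;
 *		s->tx_bytes   = priv->tx_bytes;
 *		return s;
 *	}
 */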
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}
static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}
/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization. Also allocates subqueue structs
 * for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
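
/*
 * Example (editor's sketch; "foo" names are hypothetical): allocating a
 * multiqueue device with a private area and a setup callback:
 *
 *	static void foo_setup(struct net_device *dev)
 *	{
 *		ether_setup(dev);
 *		dev->netdev_ops = &foo_netdev_ops;
 *	}
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *
 * The single-queue alloc_netdev() macro wraps this with txqs = rxqs = 1.
 */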
/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released.
 * If this is the last reference then it will be freed.
 * Must be called in process context.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	might_sleep();
	netif_free_tx_queues(dev);
	kvfree(dev->_rx);

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If head not NULL, device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 *
 *	Note: As most callers use a stack allocated list_head,
 *	we force a list_del() to make sure stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
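
/*
 * Example (editor's sketch; "foo_owned" is hypothetical): tearing down
 * several devices with one RTNL section and a single notifier/rcu pass,
 * instead of paying that cost per device:
 *
 *	LIST_HEAD(kill_list);
 *	struct net_device *dev, *tmp;
 *
 *	rtnl_lock();
 *	for_each_netdev_safe(net, dev, tmp)
 *		if (foo_owned(dev))
 *			unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */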
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore. In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		dev->ifindex = dev_new_index(net);

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
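
/*
 * Example (editor's sketch): moving a device into another namespace, with a
 * fallback name pattern in case dev->name is already taken there
 * ("target_net" is a hypothetical struct net the caller looked up):
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "moved%d");
 *	rtnl_unlock();
 */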
static int dev_cpu_dead(unsigned int oldcpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu;
	struct softnet_data *sd, *oldsd;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return 0;
}
/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all. Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_HW_CSUM)
		mask |= NETIF_F_CSUM_MASK;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_HW_CSUM)
		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
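
/*
 * Example (editor's sketch; "foo" names are hypothetical): an aggregating
 * driver (bonding-style) folds its slaves' feature sets into one, starting
 * from a set where the ALL_FOR_ALL features are on so they can only be
 * switched off by a slave that lacks them:
 *
 *	netdev_features_t mask = master->vlan_features;
 *	netdev_features_t features = mask & ~NETIF_F_ONE_FOR_ALL;
 *	struct foo_slave *slave;
 *
 *	features |= NETIF_F_ALL_FOR_ALL;
 *	list_for_each_entry(slave, &priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	master->vlan_features = features;
 *	netdev_change_features(master);
 */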
static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}
void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
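
/*
 * Example (editor's sketch): the helpers defined above give driver messages
 * a consistent "driver devname ifname: " prefix at the named log level:
 *
 *	netdev_info(dev, "link up, %u Mbps, %s duplex\n",
 *		    speed, duplex ? "full" : "half");
 *	netdev_warn(dev, "TX timeout on queue %u\n", queue);
 *
 * ("speed", "duplex" and "queue" are hypothetical driver variables.)
 */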
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace. Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network devices
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device that
	 * appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);