net/core/dev.c (from mirror_ubuntu-bionic-kernel.git)
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
1da177e4 76#include <linux/bitops.h>
4fc268d2 77#include <linux/capability.h>
1da177e4
LT
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
08e9897d 81#include <linux/hash.h>
5a0e3ad6 82#include <linux/slab.h>
1da177e4 83#include <linux/sched.h>
4a3e2f71 84#include <linux/mutex.h>
1da177e4
LT
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
0187bdfb 94#include <linux/ethtool.h>
1da177e4
LT
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#include <linux/proc_fs.h>
101#include <linux/seq_file.h>
102#include <linux/stat.h>
1da177e4
LT
103#include <net/dst.h>
104#include <net/pkt_sched.h>
105#include <net/checksum.h>
44540960 106#include <net/xfrm.h>
1da177e4
LT
107#include <linux/highmem.h>
108#include <linux/init.h>
109#include <linux/kmod.h>
110#include <linux/module.h>
1da177e4
LT
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
295f4a1f 114#include <net/wext.h>
1da177e4 115#include <net/iw_handler.h>
1da177e4 116#include <asm/current.h>
5bdb9886 117#include <linux/audit.h>
db217334 118#include <linux/dmaengine.h>
f6a78bfc 119#include <linux/err.h>
c7fa9d18 120#include <linux/ctype.h>
723e98b7 121#include <linux/if_arp.h>
6de329e2 122#include <linux/if_vlan.h>
8f0f2223 123#include <linux/ip.h>
ad55dcaf 124#include <net/ip.h>
8f0f2223
DM
125#include <linux/ipv6.h>
126#include <linux/in.h>
b6b2fed1
DM
127#include <linux/jhash.h>
128#include <linux/random.h>
9cbc1cb8 129#include <trace/events/napi.h>
cf66ba58 130#include <trace/events/net.h>
07dc22e7 131#include <trace/events/skb.h>
5acbbd42 132#include <linux/pci.h>
caeda9b9 133#include <linux/inetdevice.h>
c445477d 134#include <linux/cpu_rmap.h>
4dc360c5 135#include <linux/net_tstamp.h>
c5905afb 136#include <linux/static_key.h>
4504b861 137#include <net/flow_keys.h>
1da177e4 138
342709ef
PE
139#include "net-sysfs.h"
140
d565b0a1
HX
141/* Instead of increasing this, you should create a hash table. */
142#define MAX_GRO_SKBS 8
143
5d38a079
HX
144/* This should be increased if a protocol with a bigger head is added. */
145#define GRO_MAX_HEAD (MAX_HEADER + 128)
146
1da177e4
LT
147/*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
151 * Why 16. Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
3041a069 158 * the average user (w/out VLANs) will not be adversely affected.
1da177e4
LT
159 * --BLG
160 *
161 * 0800 IP
162 * 8100 802.1Q VLAN
163 * 0001 802.3
164 * 0002 AX.25
165 * 0004 802.2
166 * 8035 RARP
167 * 0005 SNAP
168 * 0805 X.25
169 * 0806 ARP
170 * 8137 IPX
171 * 0009 Localtalk
172 * 86DD IPv6
173 */
174
82d8a867
PE
175#define PTYPE_HASH_SIZE (16)
176#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177
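As a quick check of the claim in the comment above: hashing on the low nibble means masking with PTYPE_HASH_MASK (15), so SNAP (0x0005), X.25 (0x0805) and RARP (0x8035) all land in bucket 5, which is exactly the RARP/SNAP/X.25 overlap mentioned, while IP (0x0800) gets bucket 0 and ARP (0x0806) bucket 6.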
1da177e4 178static DEFINE_SPINLOCK(ptype_lock);
82d8a867 179static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
6b2bedc3 180static struct list_head ptype_all __read_mostly; /* Taps */
1da177e4 181
1da177e4 182/*
7562f876 183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
184 * semaphore.
185 *
c6d14c84 186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
187 *
188 * Writers must hold the rtnl semaphore while they loop through the
7562f876 189 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
190 * actual updates. This allows pure readers to access the list even
191 * while a writer is preparing to update it.
192 *
193 * To put it another way, dev_base_lock is held for writing only to
194 * protect against pure readers; the rtnl semaphore provides the
195 * protection against other writers.
196 *
197 * See, for example usages, register_netdevice() and
198 * unregister_netdevice(), which must be called with the rtnl
199 * semaphore held.
200 */
1da177e4 201DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
202EXPORT_SYMBOL(dev_base_lock);
203
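To make the reader rules above concrete, here is a minimal sketch (not part of this file, helper name hypothetical) of a pure reader; it only uses the iterator helpers already relied on elsewhere in this file and takes no device references:

static int count_netdevs(struct net *net)
{
    struct net_device *dev;
    int n = 0;

    rcu_read_lock();    /* a pure reader could instead take read_lock(&dev_base_lock) and use for_each_netdev() */
    for_each_netdev_rcu(net, dev)
        n++;
    rcu_read_unlock();
    return n;           /* no reference is held on any device here */
}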
4e985ada
TG
204static inline void dev_base_seq_inc(struct net *net)
205{
206 while (++net->dev_base_seq == 0);
207}
208
881d966b 209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 210{
95c96174
ED
211 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212
08e9897d 213 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
214}
215
881d966b 216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 217{
7c28bd0b 218 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
219}
220
e36fa2f7 221static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
222{
223#ifdef CONFIG_RPS
e36fa2f7 224 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
225#endif
226}
227
e36fa2f7 228static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
229{
230#ifdef CONFIG_RPS
e36fa2f7 231 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
232#endif
233}
234
ce286d32
EB
235/* Device list insertion */
236static int list_netdevice(struct net_device *dev)
237{
c346dca1 238 struct net *net = dev_net(dev);
ce286d32
EB
239
240 ASSERT_RTNL();
241
242 write_lock_bh(&dev_base_lock);
c6d14c84 243 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 244 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
245 hlist_add_head_rcu(&dev->index_hlist,
246 dev_index_hash(net, dev->ifindex));
ce286d32 247 write_unlock_bh(&dev_base_lock);
4e985ada
TG
248
249 dev_base_seq_inc(net);
250
ce286d32
EB
251 return 0;
252}
253
fb699dfd
ED
254/* Device list removal
255 * caller must respect a RCU grace period before freeing/reusing dev
256 */
ce286d32
EB
257static void unlist_netdevice(struct net_device *dev)
258{
259 ASSERT_RTNL();
260
261 /* Unlink dev from the device chain */
262 write_lock_bh(&dev_base_lock);
c6d14c84 263 list_del_rcu(&dev->dev_list);
72c9528b 264 hlist_del_rcu(&dev->name_hlist);
fb699dfd 265 hlist_del_rcu(&dev->index_hlist);
ce286d32 266 write_unlock_bh(&dev_base_lock);
4e985ada
TG
267
268 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
269}
270
1da177e4
LT
271/*
272 * Our notifier list
273 */
274
f07d5b94 275static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
276
277/*
278 * Device drivers call our routines to queue packets here. We empty the
279 * queue in the local softnet handler.
280 */
bea3348e 281
9958da05 282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 283EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 284
cf508b12 285#ifdef CONFIG_LOCKDEP
723e98b7 286/*
c773e847 287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
288 * according to dev->type
289 */
290static const unsigned short netdev_lock_type[] =
291 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
292 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
293 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
294 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
295 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
296 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
297 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
298 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
299 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
300 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
301 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
302 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
303 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
304 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
305 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 306
36cbd3dc 307static const char *const netdev_lock_name[] =
723e98b7
JP
308 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
320 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
321 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
322 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
323
324static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 325static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
326
327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328{
329 int i;
330
331 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 if (netdev_lock_type[i] == dev_type)
333 return i;
334 /* the last key is used by default */
335 return ARRAY_SIZE(netdev_lock_type) - 1;
336}
337
cf508b12
DM
338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 unsigned short dev_type)
723e98b7
JP
340{
341 int i;
342
343 i = netdev_lock_pos(dev_type);
344 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 netdev_lock_name[i]);
346}
cf508b12
DM
347
348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349{
350 int i;
351
352 i = netdev_lock_pos(dev->type);
353 lockdep_set_class_and_name(&dev->addr_list_lock,
354 &netdev_addr_lock_key[i],
355 netdev_lock_name[i]);
356}
723e98b7 357#else
cf508b12
DM
358static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
359 unsigned short dev_type)
360{
361}
362static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
363{
364}
365#endif
1da177e4
LT
366
367/*******************************************************************************
368
369 Protocol management and registration routines
370
371*******************************************************************************/
372
1da177e4
LT
373/*
374 * Add a protocol ID to the list. Now that the input handler is
375 * smarter we can dispense with all the messy stuff that used to be
376 * here.
377 *
378 * BEWARE!!! Protocol handlers, mangling input packets,
379 * MUST BE last in hash buckets and checking protocol handlers
380 * MUST start from promiscuous ptype_all chain in net_bh.
381 * It is true now, do not change it.
382 * Explanation follows: if protocol handler, mangling packet, will
383 * be the first on list, it is not able to sense, that packet
384 * is cloned and should be copied-on-write, so that it will
385 * change it and subsequent readers will get broken packet.
386 * --ANK (980803)
387 */
388
c07b68e8
ED
389static inline struct list_head *ptype_head(const struct packet_type *pt)
390{
391 if (pt->type == htons(ETH_P_ALL))
392 return &ptype_all;
393 else
394 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395}
396
1da177e4
LT
397/**
398 * dev_add_pack - add packet handler
399 * @pt: packet type declaration
400 *
401 * Add a protocol handler to the networking stack. The passed &packet_type
402 * is linked into kernel lists and may not be freed until it has been
403 * removed from the kernel lists.
404 *
4ec93edb 405 * This call does not sleep, therefore it cannot
1da177e4
LT
406 * guarantee that all CPUs that are in the middle of receiving packets
407 * will see the new packet type (until the next received packet).
408 */
409
410void dev_add_pack(struct packet_type *pt)
411{
c07b68e8 412 struct list_head *head = ptype_head(pt);
1da177e4 413
c07b68e8
ED
414 spin_lock(&ptype_lock);
415 list_add_rcu(&pt->list, head);
416 spin_unlock(&ptype_lock);
1da177e4 417}
d1b19dff 418EXPORT_SYMBOL(dev_add_pack);
1da177e4 419
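For illustration, a protocol module registers a handler roughly like the hedged sketch below (handler and structure names are hypothetical; using ETH_P_ALL instead would make it a tap on ptype_all) and later tears it down with dev_remove_pack() so the structure can be freed:

static int sample_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
    /* process or drop the packet; this handler owns a reference to skb */
    kfree_skb(skb);
    return NET_RX_SUCCESS;
}

static struct packet_type sample_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),  /* hashed into ptype_base */
    .func = sample_rcv,
};

/* module init:  dev_add_pack(&sample_packet_type);                          */
/* module exit:  dev_remove_pack(&sample_packet_type);  sleeps in synchronize_net() */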
1da177e4
LT
420/**
421 * __dev_remove_pack - remove packet handler
422 * @pt: packet type declaration
423 *
424 * Remove a protocol handler that was previously added to the kernel
425 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
426 * from the kernel lists and can be freed or reused once this function
4ec93edb 427 * returns.
1da177e4
LT
428 *
429 * The packet type might still be in use by receivers
430 * and must not be freed until after all the CPUs have gone
431 * through a quiescent state.
432 */
433void __dev_remove_pack(struct packet_type *pt)
434{
c07b68e8 435 struct list_head *head = ptype_head(pt);
1da177e4
LT
436 struct packet_type *pt1;
437
c07b68e8 438 spin_lock(&ptype_lock);
1da177e4
LT
439
440 list_for_each_entry(pt1, head, list) {
441 if (pt == pt1) {
442 list_del_rcu(&pt->list);
443 goto out;
444 }
445 }
446
7b6cd1ce 447 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 448out:
c07b68e8 449 spin_unlock(&ptype_lock);
1da177e4 450}
d1b19dff
ED
451EXPORT_SYMBOL(__dev_remove_pack);
452
1da177e4
LT
453/**
454 * dev_remove_pack - remove packet handler
455 * @pt: packet type declaration
456 *
457 * Remove a protocol handler that was previously added to the kernel
458 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
459 * from the kernel lists and can be freed or reused once this function
460 * returns.
461 *
462 * This call sleeps to guarantee that no CPU is looking at the packet
463 * type after return.
464 */
465void dev_remove_pack(struct packet_type *pt)
466{
467 __dev_remove_pack(pt);
4ec93edb 468
1da177e4
LT
469 synchronize_net();
470}
d1b19dff 471EXPORT_SYMBOL(dev_remove_pack);
1da177e4
LT
472
473/******************************************************************************
474
475 Device Boot-time Settings Routines
476
477*******************************************************************************/
478
479/* Boot time configuration table */
480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481
482/**
483 * netdev_boot_setup_add - add new setup entry
484 * @name: name of the device
485 * @map: configured settings for the device
486 *
487 * Adds new setup entry to the dev_boot_setup list. The function
488 * returns 0 on error and 1 on success. This is a generic routine for
489 * all netdevices.
490 */
491static int netdev_boot_setup_add(char *name, struct ifmap *map)
492{
493 struct netdev_boot_setup *s;
494 int i;
495
496 s = dev_boot_setup;
497 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 500 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
501 memcpy(&s[i].map, map, sizeof(s[i].map));
502 break;
503 }
504 }
505
506 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507}
508
509/**
510 * netdev_boot_setup_check - check boot time settings
511 * @dev: the netdevice
512 *
513 * Check boot time settings for the device.
514 * The found settings are set for the device to be used
515 * later in the device probing.
516 * Returns 0 if no settings found, 1 if they are.
517 */
518int netdev_boot_setup_check(struct net_device *dev)
519{
520 struct netdev_boot_setup *s = dev_boot_setup;
521 int i;
522
523 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 525 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
526 dev->irq = s[i].map.irq;
527 dev->base_addr = s[i].map.base_addr;
528 dev->mem_start = s[i].map.mem_start;
529 dev->mem_end = s[i].map.mem_end;
530 return 1;
531 }
532 }
533 return 0;
534}
d1b19dff 535EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
536
537
538/**
539 * netdev_boot_base - get address from boot time settings
540 * @prefix: prefix for network device
541 * @unit: id for network device
542 *
543 * Check boot time settings for the base address of device.
544 * The found settings are set for the device to be used
545 * later in the device probing.
546 * Returns 0 if no settings found.
547 */
548unsigned long netdev_boot_base(const char *prefix, int unit)
549{
550 const struct netdev_boot_setup *s = dev_boot_setup;
551 char name[IFNAMSIZ];
552 int i;
553
554 sprintf(name, "%s%d", prefix, unit);
555
556 /*
557 * If device already registered then return base of 1
558 * to indicate not to probe for this interface
559 */
881d966b 560 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
561 return 1;
562
563 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 if (!strcmp(name, s[i].name))
565 return s[i].map.base_addr;
566 return 0;
567}
568
569/*
570 * Saves at boot time configured settings for any netdevice.
571 */
572int __init netdev_boot_setup(char *str)
573{
574 int ints[5];
575 struct ifmap map;
576
577 str = get_options(str, ARRAY_SIZE(ints), ints);
578 if (!str || !*str)
579 return 0;
580
581 /* Save settings */
582 memset(&map, 0, sizeof(map));
583 if (ints[0] > 0)
584 map.irq = ints[1];
585 if (ints[0] > 1)
586 map.base_addr = ints[2];
587 if (ints[0] > 2)
588 map.mem_start = ints[3];
589 if (ints[0] > 3)
590 map.mem_end = ints[4];
591
592 /* Add new entry to the list */
593 return netdev_boot_setup_add(str, &map);
594}
595
596__setup("netdev=", netdev_boot_setup);
597
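Putting netdev_boot_setup() and netdev_boot_setup_add() together: the parsed integers are consumed in the order irq, base_addr, mem_start, mem_end, and whatever string remains becomes the device name. So a boot line with hypothetical values such as

    netdev=9,0x300,0xd0000,0xd4000,eth0

pre-loads dev_boot_setup so that a later netdev_boot_setup_check() on eth0 picks up IRQ 9 and I/O base 0x300.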
598/*******************************************************************************
599
600 Device Interface Subroutines
601
602*******************************************************************************/
603
604/**
605 * __dev_get_by_name - find a device by its name
c4ea43c5 606 * @net: the applicable net namespace
1da177e4
LT
607 * @name: name to find
608 *
609 * Find an interface by name. Must be called under RTNL semaphore
610 * or @dev_base_lock. If the name is found a pointer to the device
611 * is returned. If the name is not found then %NULL is returned. The
612 * reference counters are not incremented so the caller must be
613 * careful with locks.
614 */
615
881d966b 616struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
617{
618 struct hlist_node *p;
0bd8d536
ED
619 struct net_device *dev;
620 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 621
0bd8d536 622 hlist_for_each_entry(dev, p, head, name_hlist)
1da177e4
LT
623 if (!strncmp(dev->name, name, IFNAMSIZ))
624 return dev;
0bd8d536 625
1da177e4
LT
626 return NULL;
627}
d1b19dff 628EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 629
72c9528b
ED
630/**
631 * dev_get_by_name_rcu - find a device by its name
632 * @net: the applicable net namespace
633 * @name: name to find
634 *
635 * Find an interface by name.
636 * If the name is found a pointer to the device is returned.
637 * If the name is not found then %NULL is returned.
638 * The reference counters are not incremented so the caller must be
639 * careful with locks. The caller must hold RCU lock.
640 */
641
642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643{
644 struct hlist_node *p;
645 struct net_device *dev;
646 struct hlist_head *head = dev_name_hash(net, name);
647
648 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 if (!strncmp(dev->name, name, IFNAMSIZ))
650 return dev;
651
652 return NULL;
653}
654EXPORT_SYMBOL(dev_get_by_name_rcu);
655
1da177e4
LT
656/**
657 * dev_get_by_name - find a device by its name
c4ea43c5 658 * @net: the applicable net namespace
1da177e4
LT
659 * @name: name to find
660 *
661 * Find an interface by name. This can be called from any
662 * context and does its own locking. The returned handle has
663 * the usage count incremented and the caller must use dev_put() to
664 * release it when it is no longer needed. %NULL is returned if no
665 * matching device is found.
666 */
667
881d966b 668struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
669{
670 struct net_device *dev;
671
72c9528b
ED
672 rcu_read_lock();
673 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
674 if (dev)
675 dev_hold(dev);
72c9528b 676 rcu_read_unlock();
1da177e4
LT
677 return dev;
678}
d1b19dff 679EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
680
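A hedged sketch of how the two lookup flavours differ in practice (function and interface names are hypothetical): the _rcu variant is only valid inside the read-side section and leaves the refcount untouched, while dev_get_by_name() pins the device until dev_put():

static void sample_lookup(void)
{
    struct net_device *dev;

    rcu_read_lock();                        /* short-lived peek, no reference taken */
    dev = dev_get_by_name_rcu(&init_net, "eth0");
    if (dev)
        pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
    rcu_read_unlock();

    dev = dev_get_by_name(&init_net, "eth0");   /* pinned reference */
    if (dev) {
        /* safe to use dev across sleeps here */
        dev_put(dev);
    }
}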
681/**
682 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 683 * @net: the applicable net namespace
1da177e4
LT
684 * @ifindex: index of device
685 *
686 * Search for an interface by index. Returns %NULL if the device
687 * is not found or a pointer to the device. The device has not
688 * had its reference counter increased so the caller must be careful
689 * about locking. The caller must hold either the RTNL semaphore
690 * or @dev_base_lock.
691 */
692
881d966b 693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
694{
695 struct hlist_node *p;
0bd8d536
ED
696 struct net_device *dev;
697 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 698
0bd8d536 699 hlist_for_each_entry(dev, p, head, index_hlist)
1da177e4
LT
700 if (dev->ifindex == ifindex)
701 return dev;
0bd8d536 702
1da177e4
LT
703 return NULL;
704}
d1b19dff 705EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 706
fb699dfd
ED
707/**
708 * dev_get_by_index_rcu - find a device by its ifindex
709 * @net: the applicable net namespace
710 * @ifindex: index of device
711 *
712 * Search for an interface by index. Returns %NULL if the device
713 * is not found or a pointer to the device. The device has not
714 * had its reference counter increased so the caller must be careful
715 * about locking. The caller must hold RCU lock.
716 */
717
718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719{
720 struct hlist_node *p;
721 struct net_device *dev;
722 struct hlist_head *head = dev_index_hash(net, ifindex);
723
724 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 if (dev->ifindex == ifindex)
726 return dev;
727
728 return NULL;
729}
730EXPORT_SYMBOL(dev_get_by_index_rcu);
731
1da177e4
LT
732
733/**
734 * dev_get_by_index - find a device by its ifindex
c4ea43c5 735 * @net: the applicable net namespace
1da177e4
LT
736 * @ifindex: index of device
737 *
738 * Search for an interface by index. Returns NULL if the device
739 * is not found or a pointer to the device. The device returned has
740 * had a reference added and the pointer is safe until the user calls
741 * dev_put to indicate they have finished with it.
742 */
743
881d966b 744struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
745{
746 struct net_device *dev;
747
fb699dfd
ED
748 rcu_read_lock();
749 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
750 if (dev)
751 dev_hold(dev);
fb699dfd 752 rcu_read_unlock();
1da177e4
LT
753 return dev;
754}
d1b19dff 755EXPORT_SYMBOL(dev_get_by_index);
1da177e4
LT
756
757/**
941666c2 758 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 759 * @net: the applicable net namespace
1da177e4
LT
760 * @type: media type of device
761 * @ha: hardware address
762 *
763 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
764 * is not found or a pointer to the device.
765 * The caller must hold RCU or RTNL.
941666c2 766 * The returned device has not had its ref count increased
1da177e4
LT
767 * and the caller must therefore be careful about locking
768 *
1da177e4
LT
769 */
770
941666c2
ED
771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 const char *ha)
1da177e4
LT
773{
774 struct net_device *dev;
775
941666c2 776 for_each_netdev_rcu(net, dev)
1da177e4
LT
777 if (dev->type == type &&
778 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
779 return dev;
780
781 return NULL;
1da177e4 782}
941666c2 783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 784
881d966b 785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
786{
787 struct net_device *dev;
788
4e9cac2b 789 ASSERT_RTNL();
881d966b 790 for_each_netdev(net, dev)
4e9cac2b 791 if (dev->type == type)
7562f876
PE
792 return dev;
793
794 return NULL;
4e9cac2b 795}
4e9cac2b
PM
796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797
881d966b 798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 799{
99fe3c39 800 struct net_device *dev, *ret = NULL;
4e9cac2b 801
99fe3c39
ED
802 rcu_read_lock();
803 for_each_netdev_rcu(net, dev)
804 if (dev->type == type) {
805 dev_hold(dev);
806 ret = dev;
807 break;
808 }
809 rcu_read_unlock();
810 return ret;
1da177e4 811}
1da177e4
LT
812EXPORT_SYMBOL(dev_getfirstbyhwtype);
813
814/**
bb69ae04 815 * dev_get_by_flags_rcu - find any device with given flags
c4ea43c5 816 * @net: the applicable net namespace
1da177e4
LT
817 * @if_flags: IFF_* values
818 * @mask: bitmask of bits in if_flags to check
819 *
820 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04
ED
821 * is not found or a pointer to the device. Must be called inside
822 * rcu_read_lock(), and result refcount is unchanged.
1da177e4
LT
823 */
824
bb69ae04 825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
d1b19dff 826 unsigned short mask)
1da177e4 827{
7562f876 828 struct net_device *dev, *ret;
1da177e4 829
7562f876 830 ret = NULL;
c6d14c84 831 for_each_netdev_rcu(net, dev) {
1da177e4 832 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 833 ret = dev;
1da177e4
LT
834 break;
835 }
836 }
7562f876 837 return ret;
1da177e4 838}
bb69ae04 839EXPORT_SYMBOL(dev_get_by_flags_rcu);
1da177e4
LT
840
841/**
842 * dev_valid_name - check if name is okay for network device
843 * @name: name string
844 *
845 * Network device names need to be valid file names to
c7fa9d18
DM
846 * allow sysfs to work. We also disallow any kind of
847 * whitespace.
1da177e4 848 */
95f050bf 849bool dev_valid_name(const char *name)
1da177e4 850{
c7fa9d18 851 if (*name == '\0')
95f050bf 852 return false;
b6fe17d6 853 if (strlen(name) >= IFNAMSIZ)
95f050bf 854 return false;
c7fa9d18 855 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 856 return false;
c7fa9d18
DM
857
858 while (*name) {
859 if (*name == '/' || isspace(*name))
95f050bf 860 return false;
c7fa9d18
DM
861 name++;
862 }
95f050bf 863 return true;
1da177e4 864}
d1b19dff 865EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
866
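Concretely (illustrative checks, not from this file): "eth0" and "br-lan" are accepted, while an empty string, ".", "..", a name of IFNAMSIZ or more characters, or anything containing '/' or whitespace is rejected.

    BUG_ON(!dev_valid_name("eth0"));
    BUG_ON(dev_valid_name("uplink 1"));   /* embedded space */
    BUG_ON(dev_valid_name(".."));         /* reserved */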
867/**
b267b179
EB
868 * __dev_alloc_name - allocate a name for a device
869 * @net: network namespace to allocate the device name in
1da177e4 870 * @name: name format string
b267b179 871 * @buf: scratch buffer and result name string
1da177e4
LT
872 *
873 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
874 * id. It scans list of devices to build up a free map, then chooses
875 * the first empty slot. The caller must hold the dev_base or rtnl lock
876 * while allocating the name and adding the device in order to avoid
877 * duplicates.
878 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
880 */
881
b267b179 882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
883{
884 int i = 0;
1da177e4
LT
885 const char *p;
886 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 887 unsigned long *inuse;
1da177e4
LT
888 struct net_device *d;
889
890 p = strnchr(name, IFNAMSIZ-1, '%');
891 if (p) {
892 /*
893 * Verify the string as this thing may have come from
894 * the user. There must be exactly one "%d" and no other "%"
895 * characters.
896 */
897 if (p[1] != 'd' || strchr(p + 2, '%'))
898 return -EINVAL;
899
900 /* Use one page as a bit array of possible slots */
cfcabdcc 901 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
902 if (!inuse)
903 return -ENOMEM;
904
881d966b 905 for_each_netdev(net, d) {
1da177e4
LT
906 if (!sscanf(d->name, name, &i))
907 continue;
908 if (i < 0 || i >= max_netdevices)
909 continue;
910
911 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 912 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
913 if (!strncmp(buf, d->name, IFNAMSIZ))
914 set_bit(i, inuse);
915 }
916
917 i = find_first_zero_bit(inuse, max_netdevices);
918 free_page((unsigned long) inuse);
919 }
920
d9031024
OP
921 if (buf != name)
922 snprintf(buf, IFNAMSIZ, name, i);
b267b179 923 if (!__dev_get_by_name(net, buf))
1da177e4 924 return i;
1da177e4
LT
925
926 /* It is possible to run out of possible slots
927 * when the name is long and there isn't enough space left
928 * for the digits, or if all bits are used.
929 */
930 return -ENFILE;
931}
932
b267b179
EB
933/**
934 * dev_alloc_name - allocate a name for a device
935 * @dev: device
936 * @name: name format string
937 *
938 * Passed a format string - eg "lt%d" it will try and find a suitable
939 * id. It scans list of devices to build up a free map, then chooses
940 * the first empty slot. The caller must hold the dev_base or rtnl lock
941 * while allocating the name and adding the device in order to avoid
942 * duplicates.
943 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944 * Returns the number of the unit assigned or a negative errno code.
945 */
946
947int dev_alloc_name(struct net_device *dev, const char *name)
948{
949 char buf[IFNAMSIZ];
950 struct net *net;
951 int ret;
952
c346dca1
YH
953 BUG_ON(!dev_net(dev));
954 net = dev_net(dev);
b267b179
EB
955 ret = __dev_alloc_name(net, name, buf);
956 if (ret >= 0)
957 strlcpy(dev->name, buf, IFNAMSIZ);
958 return ret;
959}
d1b19dff 960EXPORT_SYMBOL(dev_alloc_name);
b267b179 961
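In a driver this is typically used with a "%d" pattern, as in the hedged sketch below; the concrete unit number written into dev->name depends on what is already registered in the namespace:

    int unit;

    unit = dev_alloc_name(dev, "eth%d");  /* may write e.g. "eth2" into dev->name */
    if (unit < 0)
        return unit;                      /* -EINVAL for a bad pattern, -ENFILE if all slots are taken */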
1c5cae81 962static int dev_get_valid_name(struct net_device *dev, const char *name)
d9031024 963{
8ce6cebc
DL
964 struct net *net;
965
966 BUG_ON(!dev_net(dev));
967 net = dev_net(dev);
968
d9031024
OP
969 if (!dev_valid_name(name))
970 return -EINVAL;
971
1c5cae81 972 if (strchr(name, '%'))
8ce6cebc 973 return dev_alloc_name(dev, name);
d9031024
OP
974 else if (__dev_get_by_name(net, name))
975 return -EEXIST;
8ce6cebc
DL
976 else if (dev->name != name)
977 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
978
979 return 0;
980}
1da177e4
LT
981
982/**
983 * dev_change_name - change name of a device
984 * @dev: device
985 * @newname: name (or format string) must be at least IFNAMSIZ
986 *
987 * Change name of a device, can pass format strings "eth%d".
988 * for wildcarding.
989 */
cf04a4c7 990int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 991{
fcc5a03a 992 char oldname[IFNAMSIZ];
1da177e4 993 int err = 0;
fcc5a03a 994 int ret;
881d966b 995 struct net *net;
1da177e4
LT
996
997 ASSERT_RTNL();
c346dca1 998 BUG_ON(!dev_net(dev));
1da177e4 999
c346dca1 1000 net = dev_net(dev);
1da177e4
LT
1001 if (dev->flags & IFF_UP)
1002 return -EBUSY;
1003
c8d90dca
SH
1004 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 return 0;
1006
fcc5a03a
HX
1007 memcpy(oldname, dev->name, IFNAMSIZ);
1008
1c5cae81 1009 err = dev_get_valid_name(dev, newname);
d9031024
OP
1010 if (err < 0)
1011 return err;
1da177e4 1012
fcc5a03a 1013rollback:
a1b3f594
EB
1014 ret = device_rename(&dev->dev, dev->name);
1015 if (ret) {
1016 memcpy(dev->name, oldname, IFNAMSIZ);
1017 return ret;
dcc99773 1018 }
7f988eab
HX
1019
1020 write_lock_bh(&dev_base_lock);
372b2312 1021 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1022 write_unlock_bh(&dev_base_lock);
1023
1024 synchronize_rcu();
1025
1026 write_lock_bh(&dev_base_lock);
1027 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1028 write_unlock_bh(&dev_base_lock);
1029
056925ab 1030 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1031 ret = notifier_to_errno(ret);
1032
1033 if (ret) {
91e9c07b
ED
1034 /* err >= 0 after dev_alloc_name() or stores the first errno */
1035 if (err >= 0) {
fcc5a03a
HX
1036 err = ret;
1037 memcpy(dev->name, oldname, IFNAMSIZ);
1038 goto rollback;
91e9c07b 1039 } else {
7b6cd1ce 1040 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1041 dev->name, ret);
fcc5a03a
HX
1042 }
1043 }
1da177e4
LT
1044
1045 return err;
1046}
1047
0b815a1a
SH
1048/**
1049 * dev_set_alias - change ifalias of a device
1050 * @dev: device
1051 * @alias: name up to IFALIASZ
f0db275a 1052 * @len: limit of bytes to copy from info
0b815a1a
SH
1053 *
1054 * Set ifalias for a device.
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058 ASSERT_RTNL();
1059
1060 if (len >= IFALIASZ)
1061 return -EINVAL;
1062
96ca4a2c
OH
1063 if (!len) {
1064 if (dev->ifalias) {
1065 kfree(dev->ifalias);
1066 dev->ifalias = NULL;
1067 }
1068 return 0;
1069 }
1070
d1b19dff 1071 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
0b815a1a
SH
1072 if (!dev->ifalias)
1073 return -ENOMEM;
1074
1075 strlcpy(dev->ifalias, alias, len+1);
1076 return len;
1077}
1078
1079
d8a33ac4 1080/**
3041a069 1081 * netdev_features_change - device changes features
d8a33ac4
SH
1082 * @dev: device to cause notification
1083 *
1084 * Called to indicate a device has changed features.
1085 */
1086void netdev_features_change(struct net_device *dev)
1087{
056925ab 1088 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1089}
1090EXPORT_SYMBOL(netdev_features_change);
1091
1da177e4
LT
1092/**
1093 * netdev_state_change - device changes state
1094 * @dev: device to cause notification
1095 *
1096 * Called to indicate a device has changed state. This function calls
1097 * the notifier chains for netdev_chain and sends a NEWLINK message
1098 * to the routing socket.
1099 */
1100void netdev_state_change(struct net_device *dev)
1101{
1102 if (dev->flags & IFF_UP) {
056925ab 1103 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
1104 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105 }
1106}
d1b19dff 1107EXPORT_SYMBOL(netdev_state_change);
1da177e4 1108
3ca5b404 1109int netdev_bonding_change(struct net_device *dev, unsigned long event)
c1da4ac7 1110{
3ca5b404 1111 return call_netdevice_notifiers(event, dev);
c1da4ac7
OG
1112}
1113EXPORT_SYMBOL(netdev_bonding_change);
1114
1da177e4
LT
1115/**
1116 * dev_load - load a network module
c4ea43c5 1117 * @net: the applicable net namespace
1da177e4
LT
1118 * @name: name of interface
1119 *
1120 * If a network interface is not present and the process has suitable
1121 * privileges this function loads the module. If module loading is not
1122 * available in this kernel then it becomes a nop.
1123 */
1124
881d966b 1125void dev_load(struct net *net, const char *name)
1da177e4 1126{
4ec93edb 1127 struct net_device *dev;
8909c9ad 1128 int no_module;
1da177e4 1129
72c9528b
ED
1130 rcu_read_lock();
1131 dev = dev_get_by_name_rcu(net, name);
1132 rcu_read_unlock();
1da177e4 1133
8909c9ad
VK
1134 no_module = !dev;
1135 if (no_module && capable(CAP_NET_ADMIN))
1136 no_module = request_module("netdev-%s", name);
1137 if (no_module && capable(CAP_SYS_MODULE)) {
1138 if (!request_module("%s", name))
7cecb523
VL
1139 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1140 name);
8909c9ad 1141 }
1da177e4 1142}
d1b19dff 1143EXPORT_SYMBOL(dev_load);
1da177e4 1144
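As the warning above suggests, the friendly way to hook into the request_module("netdev-%s") path is a module alias, so that opening a not-yet-present interface autoloads the right driver with only CAP_NET_ADMIN. A hypothetical driver owning an interface called mytap0 would declare (assuming the MODULE_ALIAS_NETDEV helper from linux/netdevice.h):

MODULE_ALIAS_NETDEV("mytap0");   /* expands to MODULE_ALIAS("netdev-mytap0") */

The same effect can be had from userspace with an "alias netdev-mytap0 mydriver" line under modprobe.d.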
bd380811 1145static int __dev_open(struct net_device *dev)
1da177e4 1146{
d314774c 1147 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1148 int ret;
1da177e4 1149
e46b66bc
BH
1150 ASSERT_RTNL();
1151
1da177e4
LT
1152 if (!netif_device_present(dev))
1153 return -ENODEV;
1154
3b8bcfd5
JB
1155 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1156 ret = notifier_to_errno(ret);
1157 if (ret)
1158 return ret;
1159
1da177e4 1160 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1161
d314774c
SH
1162 if (ops->ndo_validate_addr)
1163 ret = ops->ndo_validate_addr(dev);
bada339b 1164
d314774c
SH
1165 if (!ret && ops->ndo_open)
1166 ret = ops->ndo_open(dev);
1da177e4 1167
bada339b
JG
1168 if (ret)
1169 clear_bit(__LINK_STATE_START, &dev->state);
1170 else {
1da177e4 1171 dev->flags |= IFF_UP;
b4bd07c2 1172 net_dmaengine_get();
4417da66 1173 dev_set_rx_mode(dev);
1da177e4 1174 dev_activate(dev);
1da177e4 1175 }
bada339b 1176
1da177e4
LT
1177 return ret;
1178}
1179
1180/**
bd380811
PM
1181 * dev_open - prepare an interface for use.
1182 * @dev: device to open
1da177e4 1183 *
bd380811
PM
1184 * Takes a device from down to up state. The device's private open
1185 * function is invoked and then the multicast lists are loaded. Finally
1186 * the device is moved into the up state and a %NETDEV_UP message is
1187 * sent to the netdev notifier chain.
1188 *
1189 * Calling this function on an active interface is a nop. On a failure
1190 * a negative errno code is returned.
1da177e4 1191 */
bd380811
PM
1192int dev_open(struct net_device *dev)
1193{
1194 int ret;
1195
bd380811
PM
1196 if (dev->flags & IFF_UP)
1197 return 0;
1198
bd380811
PM
1199 ret = __dev_open(dev);
1200 if (ret < 0)
1201 return ret;
1202
bd380811
PM
1203 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1204 call_netdevice_notifiers(NETDEV_UP, dev);
1205
1206 return ret;
1207}
1208EXPORT_SYMBOL(dev_open);
1209
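A minimal sketch of an out-of-core caller (assuming it already holds a reference on dev): both dev_open() and dev_close() must run under the rtnl semaphore, and calling dev_open() on an interface that is already up simply returns 0.

    int err;

    rtnl_lock();
    err = dev_open(dev);        /* NETDEV_PRE_UP, then NETDEV_UP on success */
    rtnl_unlock();
    if (err)
        pr_err("%s: failed to bring up: %d\n", dev->name, err);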
44345724 1210static int __dev_close_many(struct list_head *head)
1da177e4 1211{
44345724 1212 struct net_device *dev;
e46b66bc 1213
bd380811 1214 ASSERT_RTNL();
9d5010db
DM
1215 might_sleep();
1216
44345724 1217 list_for_each_entry(dev, head, unreg_list) {
44345724 1218 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1219
44345724 1220 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1221
44345724
OP
1222 /* Synchronize to scheduled poll. We cannot touch poll list, it
1223 * can be even on different cpu. So just clear netif_running().
1224 *
1225 * dev->stop() will invoke napi_disable() on all of its
1226 * napi_struct instances on this device.
1227 */
1228 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1229 }
1da177e4 1230
44345724 1231 dev_deactivate_many(head);
d8b2a4d2 1232
44345724
OP
1233 list_for_each_entry(dev, head, unreg_list) {
1234 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1235
44345724
OP
1236 /*
1237 * Call the device specific close. This cannot fail.
1238 * Only if device is UP
1239 *
1240 * We allow it to be called even after a DETACH hot-plug
1241 * event.
1242 */
1243 if (ops->ndo_stop)
1244 ops->ndo_stop(dev);
1245
44345724 1246 dev->flags &= ~IFF_UP;
44345724
OP
1247 net_dmaengine_put();
1248 }
1249
1250 return 0;
1251}
1252
1253static int __dev_close(struct net_device *dev)
1254{
f87e6f47 1255 int retval;
44345724
OP
1256 LIST_HEAD(single);
1257
1258 list_add(&dev->unreg_list, &single);
f87e6f47
LT
1259 retval = __dev_close_many(&single);
1260 list_del(&single);
1261 return retval;
44345724
OP
1262}
1263
3fbd8758 1264static int dev_close_many(struct list_head *head)
44345724
OP
1265{
1266 struct net_device *dev, *tmp;
1267 LIST_HEAD(tmp_list);
1da177e4 1268
44345724
OP
1269 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1270 if (!(dev->flags & IFF_UP))
1271 list_move(&dev->unreg_list, &tmp_list);
1272
1273 __dev_close_many(head);
1da177e4 1274
44345724
OP
1275 list_for_each_entry(dev, head, unreg_list) {
1276 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1277 call_netdevice_notifiers(NETDEV_DOWN, dev);
1278 }
bd380811 1279
44345724
OP
1280 /* rollback_registered_many needs the complete original list */
1281 list_splice(&tmp_list, head);
bd380811
PM
1282 return 0;
1283}
1284
1285/**
1286 * dev_close - shutdown an interface.
1287 * @dev: device to shutdown
1288 *
1289 * This function moves an active device into down state. A
1290 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1291 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1292 * chain.
1293 */
1294int dev_close(struct net_device *dev)
1295{
e14a5993
ED
1296 if (dev->flags & IFF_UP) {
1297 LIST_HEAD(single);
1da177e4 1298
e14a5993
ED
1299 list_add(&dev->unreg_list, &single);
1300 dev_close_many(&single);
1301 list_del(&single);
1302 }
1da177e4
LT
1303 return 0;
1304}
d1b19dff 1305EXPORT_SYMBOL(dev_close);
1da177e4
LT
1306
1307
0187bdfb
BH
1308/**
1309 * dev_disable_lro - disable Large Receive Offload on a device
1310 * @dev: device
1311 *
1312 * Disable Large Receive Offload (LRO) on a net device. Must be
1313 * called under RTNL. This is needed if received packets may be
1314 * forwarded to another interface.
1315 */
1316void dev_disable_lro(struct net_device *dev)
1317{
f11970e3
NH
1318 /*
1319 * If we're trying to disable lro on a vlan device
1320 * use the underlying physical device instead
1321 */
1322 if (is_vlan_dev(dev))
1323 dev = vlan_dev_real_dev(dev);
1324
bc5787c6
MM
1325 dev->wanted_features &= ~NETIF_F_LRO;
1326 netdev_update_features(dev);
27660515 1327
22d5969f
MM
1328 if (unlikely(dev->features & NETIF_F_LRO))
1329 netdev_WARN(dev, "failed to disable LRO!\n");
0187bdfb
BH
1330}
1331EXPORT_SYMBOL(dev_disable_lro);
1332
1333
881d966b
EB
1334static int dev_boot_phase = 1;
1335
1da177e4
LT
1336/**
1337 * register_netdevice_notifier - register a network notifier block
1338 * @nb: notifier
1339 *
1340 * Register a notifier to be called when network device events occur.
1341 * The notifier passed is linked into the kernel structures and must
1342 * not be reused until it has been unregistered. A negative errno code
1343 * is returned on a failure.
1344 *
1345 * When registered, all registration and up events are replayed
4ec93edb 1346 * to the new notifier to allow the caller to have a race-free
1da177e4
LT
1347 * view of the network device list.
1348 */
1349
1350int register_netdevice_notifier(struct notifier_block *nb)
1351{
1352 struct net_device *dev;
fcc5a03a 1353 struct net_device *last;
881d966b 1354 struct net *net;
1da177e4
LT
1355 int err;
1356
1357 rtnl_lock();
f07d5b94 1358 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1359 if (err)
1360 goto unlock;
881d966b
EB
1361 if (dev_boot_phase)
1362 goto unlock;
1363 for_each_net(net) {
1364 for_each_netdev(net, dev) {
1365 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1366 err = notifier_to_errno(err);
1367 if (err)
1368 goto rollback;
1369
1370 if (!(dev->flags & IFF_UP))
1371 continue;
1da177e4 1372
881d966b
EB
1373 nb->notifier_call(nb, NETDEV_UP, dev);
1374 }
1da177e4 1375 }
fcc5a03a
HX
1376
1377unlock:
1da177e4
LT
1378 rtnl_unlock();
1379 return err;
fcc5a03a
HX
1380
1381rollback:
1382 last = dev;
881d966b
EB
1383 for_each_net(net) {
1384 for_each_netdev(net, dev) {
1385 if (dev == last)
8f891489 1386 goto outroll;
fcc5a03a 1387
881d966b
EB
1388 if (dev->flags & IFF_UP) {
1389 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1390 nb->notifier_call(nb, NETDEV_DOWN, dev);
1391 }
1392 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
a5ee1551 1393 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
fcc5a03a 1394 }
fcc5a03a 1395 }
c67625a1 1396
8f891489 1397outroll:
c67625a1 1398 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1399 goto unlock;
1da177e4 1400}
d1b19dff 1401EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1402
1403/**
1404 * unregister_netdevice_notifier - unregister a network notifier block
1405 * @nb: notifier
1406 *
1407 * Unregister a notifier previously registered by
1408 * register_netdevice_notifier(). The notifier is unlinked from the
1409 * kernel structures and may then be reused. A negative errno code
1410 * is returned on a failure.
7d3d43da
EB
1411 *
1412 * After unregistering, unregister and down device events are synthesized
1413 * for all devices on the device list to the removed notifier to remove
1414 * the need for special case cleanup code.
1da177e4
LT
1415 */
1416
1417int unregister_netdevice_notifier(struct notifier_block *nb)
1418{
7d3d43da
EB
1419 struct net_device *dev;
1420 struct net *net;
9f514950
HX
1421 int err;
1422
1423 rtnl_lock();
f07d5b94 1424 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1425 if (err)
1426 goto unlock;
1427
1428 for_each_net(net) {
1429 for_each_netdev(net, dev) {
1430 if (dev->flags & IFF_UP) {
1431 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1432 nb->notifier_call(nb, NETDEV_DOWN, dev);
1433 }
1434 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1435 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1436 }
1437 }
1438unlock:
9f514950
HX
1439 rtnl_unlock();
1440 return err;
1da177e4 1441}
d1b19dff 1442EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4
LT
1443
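A typical consumer of this chain looks like the hedged sketch below (callback and variable names are hypothetical). In this kernel the notifier payload is the struct net_device pointer itself, and, as documented above, registration replays REGISTER/UP events for existing devices while unregistration synthesizes the matching DOWN/UNREGISTER events.

static int sample_netdev_event(struct notifier_block *nb,
                               unsigned long event, void *ptr)
{
    struct net_device *dev = ptr;

    switch (event) {
    case NETDEV_UP:
        pr_info("%s is up\n", dev->name);
        break;
    case NETDEV_GOING_DOWN:
        pr_info("%s is about to go down\n", dev->name);
        break;
    }
    return NOTIFY_DONE;
}

static struct notifier_block sample_netdev_nb = {
    .notifier_call = sample_netdev_event,
};

/* module init:  register_netdevice_notifier(&sample_netdev_nb);   */
/* module exit:  unregister_netdevice_notifier(&sample_netdev_nb); */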
1444/**
1445 * call_netdevice_notifiers - call all network notifier blocks
1446 * @val: value passed unmodified to notifier function
c4ea43c5 1447 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1448 *
1449 * Call all network notifier blocks. Parameters and return value
f07d5b94 1450 * are as for raw_notifier_call_chain().
1da177e4
LT
1451 */
1452
ad7379d4 1453int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1454{
ab930471 1455 ASSERT_RTNL();
ad7379d4 1456 return raw_notifier_call_chain(&netdev_chain, val, dev);
1da177e4 1457}
edf947f1 1458EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1459
c5905afb 1460static struct static_key netstamp_needed __read_mostly;
b90e5794 1461#ifdef HAVE_JUMP_LABEL
c5905afb 1462/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1463 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1464 * static_key_slow_dec() calls.
b90e5794
ED
1465 */
1466static atomic_t netstamp_needed_deferred;
1467#endif
1da177e4
LT
1468
1469void net_enable_timestamp(void)
1470{
b90e5794
ED
1471#ifdef HAVE_JUMP_LABEL
1472 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1473
1474 if (deferred) {
1475 while (--deferred)
c5905afb 1476 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1477 return;
1478 }
1479#endif
1480 WARN_ON(in_interrupt());
c5905afb 1481 static_key_slow_inc(&netstamp_needed);
1da177e4 1482}
d1b19dff 1483EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1484
1485void net_disable_timestamp(void)
1486{
b90e5794
ED
1487#ifdef HAVE_JUMP_LABEL
1488 if (in_interrupt()) {
1489 atomic_inc(&netstamp_needed_deferred);
1490 return;
1491 }
1492#endif
c5905afb 1493 static_key_slow_dec(&netstamp_needed);
1da177e4 1494}
d1b19dff 1495EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1496
3b098e2d 1497static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1498{
588f0330 1499 skb->tstamp.tv64 = 0;
c5905afb 1500 if (static_key_false(&netstamp_needed))
a61bbcf2 1501 __net_timestamp(skb);
1da177e4
LT
1502}
1503
588f0330 1504#define net_timestamp_check(COND, SKB) \
c5905afb 1505 if (static_key_false(&netstamp_needed)) { \
588f0330
ED
1506 if ((COND) && !(SKB)->tstamp.tv64) \
1507 __net_timestamp(SKB); \
1508 } \
3b098e2d 1509
4dc360c5
RC
1510static int net_hwtstamp_validate(struct ifreq *ifr)
1511{
1512 struct hwtstamp_config cfg;
1513 enum hwtstamp_tx_types tx_type;
1514 enum hwtstamp_rx_filters rx_filter;
1515 int tx_type_valid = 0;
1516 int rx_filter_valid = 0;
1517
1518 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1519 return -EFAULT;
1520
1521 if (cfg.flags) /* reserved for future extensions */
1522 return -EINVAL;
1523
1524 tx_type = cfg.tx_type;
1525 rx_filter = cfg.rx_filter;
1526
1527 switch (tx_type) {
1528 case HWTSTAMP_TX_OFF:
1529 case HWTSTAMP_TX_ON:
1530 case HWTSTAMP_TX_ONESTEP_SYNC:
1531 tx_type_valid = 1;
1532 break;
1533 }
1534
1535 switch (rx_filter) {
1536 case HWTSTAMP_FILTER_NONE:
1537 case HWTSTAMP_FILTER_ALL:
1538 case HWTSTAMP_FILTER_SOME:
1539 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1540 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1541 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1542 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1543 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1544 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1545 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1546 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1547 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1548 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1549 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1550 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1551 rx_filter_valid = 1;
1552 break;
1553 }
1554
1555 if (!tx_type_valid || !rx_filter_valid)
1556 return -ERANGE;
1557
1558 return 0;
1559}
1560
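For reference, the structure validated here arrives from userspace through the SIOCSHWTSTAMP ioctl on the interface. A minimal userspace sketch, with socket setup and error handling elided and a hypothetical interface name:

    struct hwtstamp_config cfg = {
        .tx_type   = HWTSTAMP_TX_ON,
        .rx_filter = HWTSTAMP_FILTER_ALL,
        /* .flags stays 0: reserved, rejected above if set */
    };
    struct ifreq ifr;

    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
    ifr.ifr_data = (void *)&cfg;
    if (ioctl(fd, SIOCSHWTSTAMP, &ifr) < 0)
        perror("SIOCSHWTSTAMP");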
79b569f0
DL
1561static inline bool is_skb_forwardable(struct net_device *dev,
1562 struct sk_buff *skb)
1563{
1564 unsigned int len;
1565
1566 if (!(dev->flags & IFF_UP))
1567 return false;
1568
1569 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1570 if (skb->len <= len)
1571 return true;
1572
1573 /* if TSO is enabled, we don't care about the length as the packet
1574 * could be forwarded without being segmented before
1575 */
1576 if (skb_is_gso(skb))
1577 return true;
1578
1579 return false;
1580}
1581
44540960
AB
1582/**
1583 * dev_forward_skb - loopback an skb to another netif
1584 *
1585 * @dev: destination network device
1586 * @skb: buffer to forward
1587 *
1588 * return values:
1589 * NET_RX_SUCCESS (no congestion)
6ec82562 1590 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1591 *
1592 * dev_forward_skb can be used for injecting an skb from the
1593 * start_xmit function of one device into the receive queue
1594 * of another device.
1595 *
1596 * The receiving device may be in another namespace, so
1597 * we have to clear all information in the skb that could
1598 * impact namespace isolation.
1599 */
1600int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1601{
48c83012
MT
1602 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1603 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1604 atomic_long_inc(&dev->rx_dropped);
1605 kfree_skb(skb);
1606 return NET_RX_DROP;
1607 }
1608 }
1609
44540960 1610 skb_orphan(skb);
c736eefa 1611 nf_reset(skb);
44540960 1612
79b569f0 1613 if (unlikely(!is_skb_forwardable(dev, skb))) {
caf586e5 1614 atomic_long_inc(&dev->rx_dropped);
6ec82562 1615 kfree_skb(skb);
44540960 1616 return NET_RX_DROP;
6ec82562 1617 }
3b9785c6 1618 skb->skb_iif = 0;
59b9997b
DM
1619 skb->dev = dev;
1620 skb_dst_drop(skb);
44540960
AB
1621 skb->tstamp.tv64 = 0;
1622 skb->pkt_type = PACKET_HOST;
1623 skb->protocol = eth_type_trans(skb, dev);
59b9997b
DM
1624 skb->mark = 0;
1625 secpath_reset(skb);
1626 nf_reset(skb);
44540960
AB
1627 return netif_rx(skb);
1628}
1629EXPORT_SYMBOL_GPL(dev_forward_skb);
1630
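The classic caller is a veth-style device pair; a hedged sketch of the transmit side (the peer-lookup helper is hypothetical):

static netdev_tx_t sample_pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
    struct net_device *peer = sample_get_peer(dev);    /* hypothetical helper */

    /* hands skb to the peer's receive path and scrubs namespace state */
    if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
        dev->stats.tx_dropped++;    /* skb has already been freed on drop */
    return NETDEV_TX_OK;
}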
71d9dec2
CG
1631static inline int deliver_skb(struct sk_buff *skb,
1632 struct packet_type *pt_prev,
1633 struct net_device *orig_dev)
1634{
1080e512
MT
1635 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1636 return -ENOMEM;
71d9dec2
CG
1637 atomic_inc(&skb->users);
1638 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1639}
1640
1da177e4
LT
1641/*
1642 * Support routine. Sends outgoing frames to any network
1643 * taps currently in use.
1644 */
1645
f6a78bfc 1646static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1647{
1648 struct packet_type *ptype;
71d9dec2
CG
1649 struct sk_buff *skb2 = NULL;
1650 struct packet_type *pt_prev = NULL;
a61bbcf2 1651
1da177e4
LT
1652 rcu_read_lock();
1653 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1654 /* Never send packets back to the socket
1655 * they originated from - MvS (miquels@drinkel.ow.org)
1656 */
1657 if ((ptype->dev == dev || !ptype->dev) &&
1658 (ptype->af_packet_priv == NULL ||
1659 (struct sock *)ptype->af_packet_priv != skb->sk)) {
71d9dec2
CG
1660 if (pt_prev) {
1661 deliver_skb(skb2, pt_prev, skb->dev);
1662 pt_prev = ptype;
1663 continue;
1664 }
1665
1666 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1667 if (!skb2)
1668 break;
1669
70978182
ED
1670 net_timestamp_set(skb2);
1671
1da177e4
LT
1672 /* skb->nh should be correctly
1673 set by sender, so that the second statement is
1674 just protection against buggy protocols.
1675 */
459a98ed 1676 skb_reset_mac_header(skb2);
1da177e4 1677
d56f90a7 1678 if (skb_network_header(skb2) < skb2->data ||
27a884dc 1679 skb2->network_header > skb2->tail) {
e87cc472
JP
1680 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1681 ntohs(skb2->protocol),
1682 dev->name);
c1d2bbe1 1683 skb_reset_network_header(skb2);
1da177e4
LT
1684 }
1685
b0e380b1 1686 skb2->transport_header = skb2->network_header;
1da177e4 1687 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1688 pt_prev = ptype;
1da177e4
LT
1689 }
1690 }
71d9dec2
CG
1691 if (pt_prev)
1692 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1693 rcu_read_unlock();
1694}
1695
2c53040f
BH
1696/**
1697 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1698 * @dev: Network device
1699 * @txq: number of queues available
1700 *
1701 * If real_num_tx_queues is changed the tc mappings may no longer be
1702 * valid. To resolve this verify the tc mapping remains valid and if
1703 * not, NULL the mapping. With no priorities mapping to this
1704 * offset/count pair it will no longer be used. In the worst case, if TC0
1705 * is invalid, nothing can be done, so priority mappings are disabled. It is
1706 * expected that drivers will fix this mapping if they can before
1707 * calling netif_set_real_num_tx_queues.
1708 */
bb134d22 1709static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1710{
1711 int i;
1712 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1713
1714 /* If TC0 is invalidated disable TC mapping */
1715 if (tc->offset + tc->count > txq) {
7b6cd1ce 1716 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1717 dev->num_tc = 0;
1718 return;
1719 }
1720
1721 /* Invalidated prio to tc mappings set to TC0 */
1722 for (i = 1; i < TC_BITMASK + 1; i++) {
1723 int q = netdev_get_prio_tc_map(dev, i);
1724
1725 tc = &dev->tc_to_txq[q];
1726 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1727 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1728 i, q);
4f57c087
JF
1729 netdev_set_prio_tc_map(dev, i, 0);
1730 }
1731 }
1732}
1733
f0796d5c
JF
1734/*
1735 * Routine to help set real_num_tx_queues. To avoid leaving skbs mapped to
1736 * queues greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1737 */
e6484930 1738int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 1739{
1d24eb48
TH
1740 int rc;
1741
e6484930
TH
1742 if (txq < 1 || txq > dev->num_tx_queues)
1743 return -EINVAL;
f0796d5c 1744
5c56580b
BH
1745 if (dev->reg_state == NETREG_REGISTERED ||
1746 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
1747 ASSERT_RTNL();
1748
1d24eb48
TH
1749 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1750 txq);
bf264145
TH
1751 if (rc)
1752 return rc;
1753
4f57c087
JF
1754 if (dev->num_tc)
1755 netif_setup_tc(dev, txq);
1756
e6484930
TH
1757 if (txq < dev->real_num_tx_queues)
1758 qdisc_reset_all_tx_gt(dev, txq);
f0796d5c 1759 }
e6484930
TH
1760
1761 dev->real_num_tx_queues = txq;
1762 return 0;
f0796d5c
JF
1763}
1764EXPORT_SYMBOL(netif_set_real_num_tx_queues);
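A minimal sketch of how a multiqueue driver might use this export to shrink its active TX queue set at runtime; the function and names below are hypothetical and not part of dev.c, and the RTNL requirement comes from the ASSERT_RTNL() above.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Illustrative only: resize the TX queue set of an already-registered
 * device. The core flushes stale skbs on queues >= new_txq for us. */
static int example_shrink_tx(struct net_device *dev, unsigned int new_txq)
{
	int err;

	rtnl_lock();	/* netif_set_real_num_tx_queues() asserts RTNL once registered */
	err = netif_set_real_num_tx_queues(dev, new_txq);
	rtnl_unlock();

	return err;	/* 0 on success, -EINVAL if new_txq is out of range */
}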
56079431 1765
62fe0b40
BH
1766#ifdef CONFIG_RPS
1767/**
1768 * netif_set_real_num_rx_queues - set actual number of RX queues used
1769 * @dev: Network device
1770 * @rxq: Actual number of RX queues
1771 *
1772 * This must be called either with the rtnl_lock held or before
1773 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
1774 * negative error code. If called before registration, it always
1775 * succeeds.
62fe0b40
BH
1776 */
1777int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1778{
1779 int rc;
1780
bd25fa7b
TH
1781 if (rxq < 1 || rxq > dev->num_rx_queues)
1782 return -EINVAL;
1783
62fe0b40
BH
1784 if (dev->reg_state == NETREG_REGISTERED) {
1785 ASSERT_RTNL();
1786
62fe0b40
BH
1787 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1788 rxq);
1789 if (rc)
1790 return rc;
62fe0b40
BH
1791 }
1792
1793 dev->real_num_rx_queues = rxq;
1794 return 0;
1795}
1796EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1797#endif
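Before registration neither call needs the RTNL, so a probe path can size both queue sets in one go. A hedged sketch, assuming a hypothetical probe routine (when CONFIG_RPS is not set, netif_set_real_num_rx_queues() is expected to fall back to an inline stub in netdevice.h):

#include <linux/netdevice.h>

/* Illustrative only: size both queue sets before register_netdev(). */
static int example_size_queues(struct net_device *dev,
			       unsigned int txq, unsigned int rxq)
{
	int err;

	err = netif_set_real_num_tx_queues(dev, txq);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, rxq);
}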
1798
2c53040f
BH
1799/**
1800 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
1801 *
1802 * This routine should set an upper limit on the number of RSS queues
1803 * used by default by multiqueue devices.
1804 */
a55b138b 1805int netif_get_num_default_rss_queues(void)
16917b87
YM
1806{
1807 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1808}
1809EXPORT_SYMBOL(netif_get_num_default_rss_queues);
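A short sketch of the intended use: a driver caps the number of queues it advertises at this default instead of blindly using one queue per online CPU. EXAMPLE_MAX_HW_QUEUES is an assumed hardware limit, not a real macro.

#include <linux/kernel.h>
#include <linux/netdevice.h>

#define EXAMPLE_MAX_HW_QUEUES 16	/* hypothetical device limit */

static unsigned int example_pick_num_queues(void)
{
	/* never ask for more queues than the RSS default suggests */
	return min_t(unsigned int, EXAMPLE_MAX_HW_QUEUES,
		     netif_get_num_default_rss_queues());
}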
1810
def82a1d 1811static inline void __netif_reschedule(struct Qdisc *q)
56079431 1812{
def82a1d
JP
1813 struct softnet_data *sd;
1814 unsigned long flags;
56079431 1815
def82a1d
JP
1816 local_irq_save(flags);
1817 sd = &__get_cpu_var(softnet_data);
a9cbd588
CG
1818 q->next_sched = NULL;
1819 *sd->output_queue_tailp = q;
1820 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
1821 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1822 local_irq_restore(flags);
1823}
1824
1825void __netif_schedule(struct Qdisc *q)
1826{
1827 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1828 __netif_reschedule(q);
56079431
DV
1829}
1830EXPORT_SYMBOL(__netif_schedule);
1831
bea3348e 1832void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 1833{
3578b0c8 1834 if (atomic_dec_and_test(&skb->users)) {
bea3348e
SH
1835 struct softnet_data *sd;
1836 unsigned long flags;
56079431 1837
bea3348e
SH
1838 local_irq_save(flags);
1839 sd = &__get_cpu_var(softnet_data);
1840 skb->next = sd->completion_queue;
1841 sd->completion_queue = skb;
1842 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1843 local_irq_restore(flags);
1844 }
56079431 1845}
bea3348e 1846EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
1847
1848void dev_kfree_skb_any(struct sk_buff *skb)
1849{
1850 if (in_irq() || irqs_disabled())
1851 dev_kfree_skb_irq(skb);
1852 else
1853 dev_kfree_skb(skb);
1854}
1855EXPORT_SYMBOL(dev_kfree_skb_any);
1856
1857
bea3348e
SH
1858/**
1859 * netif_device_detach - mark device as removed
1860 * @dev: network device
1861 *
1862 * Mark device as removed from system and therefore no longer available.
1863 */
56079431
DV
1864void netif_device_detach(struct net_device *dev)
1865{
1866 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1867 netif_running(dev)) {
d543103a 1868 netif_tx_stop_all_queues(dev);
56079431
DV
1869 }
1870}
1871EXPORT_SYMBOL(netif_device_detach);
1872
bea3348e
SH
1873/**
1874 * netif_device_attach - mark device as attached
1875 * @dev: network device
1876 *
1877 * Mark device as attached to the system and restart if needed.
1878 */
56079431
DV
1879void netif_device_attach(struct net_device *dev)
1880{
1881 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1882 netif_running(dev)) {
d543103a 1883 netif_tx_wake_all_queues(dev);
4ec93edb 1884 __netdev_watchdog_up(dev);
56079431
DV
1885 }
1886}
1887EXPORT_SYMBOL(netif_device_attach);
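The usual pairing of these two helpers is a driver's suspend/resume path. A minimal sketch under that assumption; the hardware stop/start steps are elided and the function names are hypothetical.

#include <linux/netdevice.h>

static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if the device is running */
	/* ... put the hardware to sleep ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... wake the hardware ... */
	netif_device_attach(dev);	/* restarts queues and the TX watchdog */
	return 0;
}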
1888
36c92474
BH
1889static void skb_warn_bad_offload(const struct sk_buff *skb)
1890{
65e9d2fa 1891 static const netdev_features_t null_features = 0;
36c92474
BH
1892 struct net_device *dev = skb->dev;
1893 const char *driver = "";
1894
1895 if (dev && dev->dev.parent)
1896 driver = dev_driver_string(dev->dev.parent);
1897
1898 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1899 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
1900 driver, dev ? &dev->features : &null_features,
1901 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
1902 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1903 skb_shinfo(skb)->gso_type, skb->ip_summed);
1904}
1905
1da177e4
LT
1906/*
1907 * Invalidate hardware checksum when packet is to be mangled, and
1908 * complete checksum manually on outgoing path.
1909 */
84fa7933 1910int skb_checksum_help(struct sk_buff *skb)
1da177e4 1911{
d3bc23e7 1912 __wsum csum;
663ead3b 1913 int ret = 0, offset;
1da177e4 1914
84fa7933 1915 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
1916 goto out_set_summed;
1917
1918 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
1919 skb_warn_bad_offload(skb);
1920 return -EINVAL;
1da177e4
LT
1921 }
1922
55508d60 1923 offset = skb_checksum_start_offset(skb);
a030847e
HX
1924 BUG_ON(offset >= skb_headlen(skb));
1925 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1926
1927 offset += skb->csum_offset;
1928 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1929
1930 if (skb_cloned(skb) &&
1931 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
1932 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1933 if (ret)
1934 goto out;
1935 }
1936
a030847e 1937 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 1938out_set_summed:
1da177e4 1939 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 1940out:
1da177e4
LT
1941 return ret;
1942}
d1b19dff 1943EXPORT_SYMBOL(skb_checksum_help);
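A sketch of the typical driver-side caller: if the hardware cannot offload the checksum for this particular frame, the partial checksum is completed in software before the skb is handed to the DMA ring. The helper and its hw_can_csum flag are hypothetical.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int example_xmit_prepare(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
		return skb_checksum_help(skb);	/* 0 on success, negative errno otherwise */

	return 0;
}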
1da177e4 1944
f6a78bfc
HX
1945/**
1946 * skb_gso_segment - Perform segmentation on skb.
1947 * @skb: buffer to segment
576a30eb 1948 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1949 *
1950 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1951 *
1952 * It may return NULL if the skb requires no segmentation. This is
1953 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1954 */
c8f44aff
MM
1955struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1956 netdev_features_t features)
f6a78bfc
HX
1957{
1958 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1959 struct packet_type *ptype;
252e3346 1960 __be16 type = skb->protocol;
c8d5bcd1 1961 int vlan_depth = ETH_HLEN;
a430a43d 1962 int err;
f6a78bfc 1963
c8d5bcd1
JG
1964 while (type == htons(ETH_P_8021Q)) {
1965 struct vlan_hdr *vh;
7b9c6090 1966
c8d5bcd1 1967 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
7b9c6090
JG
1968 return ERR_PTR(-EINVAL);
1969
c8d5bcd1
JG
1970 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1971 type = vh->h_vlan_encapsulated_proto;
1972 vlan_depth += VLAN_HLEN;
7b9c6090
JG
1973 }
1974
459a98ed 1975 skb_reset_mac_header(skb);
b0e380b1 1976 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1977 __skb_pull(skb, skb->mac_len);
1978
67fd1a73 1979 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
36c92474 1980 skb_warn_bad_offload(skb);
67fd1a73 1981
a430a43d
HX
1982 if (skb_header_cloned(skb) &&
1983 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1984 return ERR_PTR(err);
1985 }
1986
f6a78bfc 1987 rcu_read_lock();
82d8a867
PE
1988 list_for_each_entry_rcu(ptype,
1989 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 1990 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 1991 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1992 err = ptype->gso_send_check(skb);
1993 segs = ERR_PTR(err);
1994 if (err || skb_gso_ok(skb, features))
1995 break;
d56f90a7
ACM
1996 __skb_push(skb, (skb->data -
1997 skb_network_header(skb)));
a430a43d 1998 }
576a30eb 1999 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
2000 break;
2001 }
2002 }
2003 rcu_read_unlock();
2004
98e399f8 2005 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2006
f6a78bfc
HX
2007 return segs;
2008}
f6a78bfc
HX
2009EXPORT_SYMBOL(skb_gso_segment);
2010
fb286bb2
HX
2011/* Take action when hardware reception checksum errors are detected. */
2012#ifdef CONFIG_BUG
2013void netdev_rx_csum_fault(struct net_device *dev)
2014{
2015 if (net_ratelimit()) {
7b6cd1ce 2016 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2017 dump_stack();
2018 }
2019}
2020EXPORT_SYMBOL(netdev_rx_csum_fault);
2021#endif
2022
1da177e4
LT
2023 /* Actually, we should eliminate this check as soon as we know that:
2024 * 1. The IOMMU is present and can map all of the memory.
2025 * 2. No high memory really exists on this machine.
2026 */
2027
9092c658 2028static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2029{
3d3a8533 2030#ifdef CONFIG_HIGHMEM
1da177e4 2031 int i;
5acbbd42 2032 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2033 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2034 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2035 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2036 return 1;
ea2ab693 2037 }
5acbbd42 2038 }
1da177e4 2039
5acbbd42
FT
2040 if (PCI_DMA_BUS_IS_PHYS) {
2041 struct device *pdev = dev->dev.parent;
1da177e4 2042
9092c658
ED
2043 if (!pdev)
2044 return 0;
5acbbd42 2045 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2046 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2047 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2048 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2049 return 1;
2050 }
2051 }
3d3a8533 2052#endif
1da177e4
LT
2053 return 0;
2054}
1da177e4 2055
f6a78bfc
HX
2056struct dev_gso_cb {
2057 void (*destructor)(struct sk_buff *skb);
2058};
2059
2060#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2061
2062static void dev_gso_skb_destructor(struct sk_buff *skb)
2063{
2064 struct dev_gso_cb *cb;
2065
2066 do {
2067 struct sk_buff *nskb = skb->next;
2068
2069 skb->next = nskb->next;
2070 nskb->next = NULL;
2071 kfree_skb(nskb);
2072 } while (skb->next);
2073
2074 cb = DEV_GSO_CB(skb);
2075 if (cb->destructor)
2076 cb->destructor(skb);
2077}
2078
2079/**
2080 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2081 * @skb: buffer to segment
91ecb63c 2082 * @features: device features as applicable to this skb
f6a78bfc
HX
2083 *
2084 * This function segments the given skb and stores the list of segments
2085 * in skb->next.
2086 */
c8f44aff 2087static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
f6a78bfc 2088{
f6a78bfc 2089 struct sk_buff *segs;
576a30eb
HX
2090
2091 segs = skb_gso_segment(skb, features);
2092
2093 /* Verifying header integrity only. */
2094 if (!segs)
2095 return 0;
f6a78bfc 2096
801678c5 2097 if (IS_ERR(segs))
f6a78bfc
HX
2098 return PTR_ERR(segs);
2099
2100 skb->next = segs;
2101 DEV_GSO_CB(skb)->destructor = skb->destructor;
2102 skb->destructor = dev_gso_skb_destructor;
2103
2104 return 0;
2105}
2106
c8f44aff 2107static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
03634668
JG
2108{
2109 return ((features & NETIF_F_GEN_CSUM) ||
2110 ((features & NETIF_F_V4_CSUM) &&
2111 protocol == htons(ETH_P_IP)) ||
2112 ((features & NETIF_F_V6_CSUM) &&
2113 protocol == htons(ETH_P_IPV6)) ||
2114 ((features & NETIF_F_FCOE_CRC) &&
2115 protocol == htons(ETH_P_FCOE)));
2116}
2117
c8f44aff
MM
2118static netdev_features_t harmonize_features(struct sk_buff *skb,
2119 __be16 protocol, netdev_features_t features)
f01a5236 2120{
d402786e 2121 if (!can_checksum_protocol(features, protocol)) {
f01a5236
JG
2122 features &= ~NETIF_F_ALL_CSUM;
2123 features &= ~NETIF_F_SG;
2124 } else if (illegal_highdma(skb->dev, skb)) {
2125 features &= ~NETIF_F_SG;
2126 }
2127
2128 return features;
2129}
2130
c8f44aff 2131netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6
JG
2132{
2133 __be16 protocol = skb->protocol;
c8f44aff 2134 netdev_features_t features = skb->dev->features;
58e998c6
JG
2135
2136 if (protocol == htons(ETH_P_8021Q)) {
2137 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2138 protocol = veh->h_vlan_encapsulated_proto;
f01a5236
JG
2139 } else if (!vlan_tx_tag_present(skb)) {
2140 return harmonize_features(skb, protocol, features);
2141 }
58e998c6 2142
6ee400aa 2143 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
f01a5236
JG
2144
2145 if (protocol != htons(ETH_P_8021Q)) {
2146 return harmonize_features(skb, protocol, features);
2147 } else {
2148 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
6ee400aa 2149 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
f01a5236
JG
2150 return harmonize_features(skb, protocol, features);
2151 }
58e998c6 2152}
f01a5236 2153EXPORT_SYMBOL(netif_skb_features);
58e998c6 2154
6afff0ca
JF
2155/*
2156 * Returns true if either:
2157 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2158 * 2. skb is fragmented and the device does not support SG, or if
2158 * at least one of the fragments is in highmem and the device does not
2160 * support DMA from it.
2161 */
2162static inline int skb_needs_linearize(struct sk_buff *skb,
02932ce9 2163 int features)
6afff0ca 2164{
02932ce9
JG
2165 return skb_is_nonlinear(skb) &&
2166 ((skb_has_frag_list(skb) &&
2167 !(features & NETIF_F_FRAGLIST)) ||
e1e78db6 2168 (skb_shinfo(skb)->nr_frags &&
02932ce9 2169 !(features & NETIF_F_SG)));
6afff0ca
JF
2170}
2171
fd2ea0a7
DM
2172int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2173 struct netdev_queue *txq)
f6a78bfc 2174{
00829823 2175 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 2176 int rc = NETDEV_TX_OK;
ec764bf0 2177 unsigned int skb_len;
00829823 2178
f6a78bfc 2179 if (likely(!skb->next)) {
c8f44aff 2180 netdev_features_t features;
fc741216 2181
93f154b5 2182 /*
25985edc 2183 * If device doesn't need skb->dst, release it right now while
93f154b5
ED
2184 * it's hot in this CPU's cache
2185 */
adf30907
ED
2186 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2187 skb_dst_drop(skb);
2188
15c2d75f
ED
2189 if (!list_empty(&ptype_all))
2190 dev_queue_xmit_nit(skb, dev);
2191
fc741216
JG
2192 features = netif_skb_features(skb);
2193
7b9c6090 2194 if (vlan_tx_tag_present(skb) &&
fc741216 2195 !(features & NETIF_F_HW_VLAN_TX)) {
7b9c6090
JG
2196 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2197 if (unlikely(!skb))
2198 goto out;
2199
2200 skb->vlan_tci = 0;
2201 }
2202
fc741216 2203 if (netif_needs_gso(skb, features)) {
91ecb63c 2204 if (unlikely(dev_gso_segment(skb, features)))
9ccb8975
DM
2205 goto out_kfree_skb;
2206 if (skb->next)
2207 goto gso;
6afff0ca 2208 } else {
02932ce9 2209 if (skb_needs_linearize(skb, features) &&
6afff0ca
JF
2210 __skb_linearize(skb))
2211 goto out_kfree_skb;
2212
2213 /* If packet is not checksummed and device does not
2214 * support checksumming for this protocol, complete
2215 * checksumming here.
2216 */
2217 if (skb->ip_summed == CHECKSUM_PARTIAL) {
55508d60
MM
2218 skb_set_transport_header(skb,
2219 skb_checksum_start_offset(skb));
03634668 2220 if (!(features & NETIF_F_ALL_CSUM) &&
6afff0ca
JF
2221 skb_checksum_help(skb))
2222 goto out_kfree_skb;
2223 }
9ccb8975
DM
2224 }
2225
ec764bf0 2226 skb_len = skb->len;
ac45f602 2227 rc = ops->ndo_start_xmit(skb, dev);
ec764bf0 2228 trace_net_dev_xmit(skb, rc, dev, skb_len);
ec634fe3 2229 if (rc == NETDEV_TX_OK)
08baf561 2230 txq_trans_update(txq);
ac45f602 2231 return rc;
f6a78bfc
HX
2232 }
2233
576a30eb 2234gso:
f6a78bfc
HX
2235 do {
2236 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2237
2238 skb->next = nskb->next;
2239 nskb->next = NULL;
068a2de5
KK
2240
2241 /*
25985edc 2242 * If device doesn't need nskb->dst, release it right now while
068a2de5
KK
2243 * it's hot in this CPU's cache
2244 */
2245 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2246 skb_dst_drop(nskb);
2247
ec764bf0 2248 skb_len = nskb->len;
00829823 2249 rc = ops->ndo_start_xmit(nskb, dev);
ec764bf0 2250 trace_net_dev_xmit(nskb, rc, dev, skb_len);
ec634fe3 2251 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2252 if (rc & ~NETDEV_TX_MASK)
2253 goto out_kfree_gso_skb;
f54d9e8d 2254 nskb->next = skb->next;
f6a78bfc
HX
2255 skb->next = nskb;
2256 return rc;
2257 }
08baf561 2258 txq_trans_update(txq);
73466498 2259 if (unlikely(netif_xmit_stopped(txq) && skb->next))
f54d9e8d 2260 return NETDEV_TX_BUSY;
f6a78bfc 2261 } while (skb->next);
4ec93edb 2262
572a9d7b
PM
2263out_kfree_gso_skb:
2264 if (likely(skb->next == NULL))
2265 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
2266out_kfree_skb:
2267 kfree_skb(skb);
7b9c6090 2268out:
572a9d7b 2269 return rc;
f6a78bfc
HX
2270}
2271
0a9627f2 2272static u32 hashrnd __read_mostly;
b6b2fed1 2273
a3d22a68
VZ
2274/*
2275 * Returns a Tx hash based on the given packet descriptor and a Tx queues' number
2276 * to be used as a distribution range.
2277 */
2278u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2279 unsigned int num_tx_queues)
8f0f2223 2280{
7019298a 2281 u32 hash;
4f57c087
JF
2282 u16 qoffset = 0;
2283 u16 qcount = num_tx_queues;
b6b2fed1 2284
513de11b
DM
2285 if (skb_rx_queue_recorded(skb)) {
2286 hash = skb_get_rx_queue(skb);
a3d22a68
VZ
2287 while (unlikely(hash >= num_tx_queues))
2288 hash -= num_tx_queues;
513de11b
DM
2289 return hash;
2290 }
ec581f6a 2291
4f57c087
JF
2292 if (dev->num_tc) {
2293 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2294 qoffset = dev->tc_to_txq[tc].offset;
2295 qcount = dev->tc_to_txq[tc].count;
2296 }
2297
ec581f6a 2298 if (skb->sk && skb->sk->sk_hash)
7019298a 2299 hash = skb->sk->sk_hash;
ec581f6a 2300 else
62b1a8ab 2301 hash = (__force u16) skb->protocol;
0a9627f2 2302 hash = jhash_1word(hash, hashrnd);
b6b2fed1 2303
4f57c087 2304 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
8f0f2223 2305}
a3d22a68 2306EXPORT_SYMBOL(__skb_tx_hash);
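The final scaling step above maps a 32-bit hash onto qcount queues without a modulo by treating hash / 2^32 as a fraction in [0, 1). A standalone illustration of just that arithmetic (plain C, not kernel code): hash = 0x80000000 with qcount = 8 and qoffset = 0 selects queue 4.

#include <stdint.h>

/* same reciprocal-scaling trick as ((u64) hash * qcount) >> 32 above */
static uint16_t scale_hash_to_queue(uint32_t hash, uint16_t qcount, uint16_t qoffset)
{
	return (uint16_t)(((uint64_t)hash * qcount) >> 32) + qoffset;
}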
8f0f2223 2307
ed04642f
ED
2308static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2309{
2310 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
e87cc472
JP
2311 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2312 dev->name, queue_index,
2313 dev->real_num_tx_queues);
ed04642f
ED
2314 return 0;
2315 }
2316 return queue_index;
2317}
2318
1d24eb48
TH
2319static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2320{
bf264145 2321#ifdef CONFIG_XPS
1d24eb48
TH
2322 struct xps_dev_maps *dev_maps;
2323 struct xps_map *map;
2324 int queue_index = -1;
2325
2326 rcu_read_lock();
2327 dev_maps = rcu_dereference(dev->xps_maps);
2328 if (dev_maps) {
2329 map = rcu_dereference(
2330 dev_maps->cpu_map[raw_smp_processor_id()]);
2331 if (map) {
2332 if (map->len == 1)
2333 queue_index = map->queues[0];
2334 else {
2335 u32 hash;
2336 if (skb->sk && skb->sk->sk_hash)
2337 hash = skb->sk->sk_hash;
2338 else
2339 hash = (__force u16) skb->protocol ^
2340 skb->rxhash;
2341 hash = jhash_1word(hash, hashrnd);
2342 queue_index = map->queues[
2343 ((u64)hash * map->len) >> 32];
2344 }
2345 if (unlikely(queue_index >= dev->real_num_tx_queues))
2346 queue_index = -1;
2347 }
2348 }
2349 rcu_read_unlock();
2350
2351 return queue_index;
2352#else
2353 return -1;
2354#endif
2355}
2356
e8a0464c
DM
2357static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2358 struct sk_buff *skb)
2359{
b0f77d0e 2360 int queue_index;
deabc772 2361 const struct net_device_ops *ops = dev->netdev_ops;
a4ee3ce3 2362
3853b584
TH
2363 if (dev->real_num_tx_queues == 1)
2364 queue_index = 0;
2365 else if (ops->ndo_select_queue) {
deabc772
HS
2366 queue_index = ops->ndo_select_queue(dev, skb);
2367 queue_index = dev_cap_txqueue(dev, queue_index);
2368 } else {
2369 struct sock *sk = skb->sk;
2370 queue_index = sk_tx_queue_get(sk);
a4ee3ce3 2371
3853b584
TH
2372 if (queue_index < 0 || skb->ooo_okay ||
2373 queue_index >= dev->real_num_tx_queues) {
2374 int old_index = queue_index;
fd2ea0a7 2375
1d24eb48
TH
2376 queue_index = get_xps_queue(dev, skb);
2377 if (queue_index < 0)
2378 queue_index = skb_tx_hash(dev, skb);
3853b584
TH
2379
2380 if (queue_index != old_index && sk) {
2381 struct dst_entry *dst =
2382 rcu_dereference_check(sk->sk_dst_cache, 1);
8728c544
ED
2383
2384 if (dst && skb_dst(skb) == dst)
2385 sk_tx_queue_set(sk, queue_index);
2386 }
a4ee3ce3
KK
2387 }
2388 }
eae792b7 2389
fd2ea0a7
DM
2390 skb_set_queue_mapping(skb, queue_index);
2391 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
2392}
2393
bbd8a0d3
KK
2394static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2395 struct net_device *dev,
2396 struct netdev_queue *txq)
2397{
2398 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2399 bool contended;
bbd8a0d3
KK
2400 int rc;
2401
a2da570d
ED
2402 qdisc_skb_cb(skb)->pkt_len = skb->len;
2403 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2404 /*
2405 * Heuristic to force contended enqueues to serialize on a
2406 * separate lock before trying to get qdisc main lock.
2407 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2408 * and dequeue packets faster.
2409 */
a2da570d 2410 contended = qdisc_is_running(q);
79640a4c
ED
2411 if (unlikely(contended))
2412 spin_lock(&q->busylock);
2413
bbd8a0d3
KK
2414 spin_lock(root_lock);
2415 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2416 kfree_skb(skb);
2417 rc = NET_XMIT_DROP;
2418 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2419 qdisc_run_begin(q)) {
bbd8a0d3
KK
2420 /*
2421 * This is a work-conserving queue; there are no old skbs
2422 * waiting to be sent out; and the qdisc is not running -
2423 * xmit the skb directly.
2424 */
7fee226a
ED
2425 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2426 skb_dst_force(skb);
bfe0d029 2427
bfe0d029
ED
2428 qdisc_bstats_update(q, skb);
2429
79640a4c
ED
2430 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2431 if (unlikely(contended)) {
2432 spin_unlock(&q->busylock);
2433 contended = false;
2434 }
bbd8a0d3 2435 __qdisc_run(q);
79640a4c 2436 } else
bc135b23 2437 qdisc_run_end(q);
bbd8a0d3
KK
2438
2439 rc = NET_XMIT_SUCCESS;
2440 } else {
7fee226a 2441 skb_dst_force(skb);
a2da570d 2442 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2443 if (qdisc_run_begin(q)) {
2444 if (unlikely(contended)) {
2445 spin_unlock(&q->busylock);
2446 contended = false;
2447 }
2448 __qdisc_run(q);
2449 }
bbd8a0d3
KK
2450 }
2451 spin_unlock(root_lock);
79640a4c
ED
2452 if (unlikely(contended))
2453 spin_unlock(&q->busylock);
bbd8a0d3
KK
2454 return rc;
2455}
2456
5bc1421e
NH
2457#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2458static void skb_update_prio(struct sk_buff *skb)
2459{
6977a79d 2460 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2461
91c68ce2
ED
2462 if (!skb->priority && skb->sk && map) {
2463 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2464
2465 if (prioidx < map->priomap_len)
2466 skb->priority = map->priomap[prioidx];
2467 }
5bc1421e
NH
2468}
2469#else
2470#define skb_update_prio(skb)
2471#endif
2472
745e20f1 2473static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2474#define RECURSION_LIMIT 10
745e20f1 2475
95603e22
MM
2476/**
2477 * dev_loopback_xmit - loop back @skb
2478 * @skb: buffer to transmit
2479 */
2480int dev_loopback_xmit(struct sk_buff *skb)
2481{
2482 skb_reset_mac_header(skb);
2483 __skb_pull(skb, skb_network_offset(skb));
2484 skb->pkt_type = PACKET_LOOPBACK;
2485 skb->ip_summed = CHECKSUM_UNNECESSARY;
2486 WARN_ON(!skb_dst(skb));
2487 skb_dst_force(skb);
2488 netif_rx_ni(skb);
2489 return 0;
2490}
2491EXPORT_SYMBOL(dev_loopback_xmit);
2492
d29f749e
DJ
2493/**
2494 * dev_queue_xmit - transmit a buffer
2495 * @skb: buffer to transmit
2496 *
2497 * Queue a buffer for transmission to a network device. The caller must
2498 * have set the device and priority and built the buffer before calling
2499 * this function. The function can be called from an interrupt.
2500 *
2501 * A negative errno code is returned on a failure. A success does not
2502 * guarantee the frame will be transmitted as it may be dropped due
2503 * to congestion or traffic shaping.
2504 *
2505 * -----------------------------------------------------------------------------------
2506 * I notice this method can also return errors from the queue disciplines,
2507 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2508 * be positive.
2509 *
2510 * Regardless of the return value, the skb is consumed, so it is currently
2511 * difficult to retry a send to this method. (You can bump the ref count
2512 * before sending to hold a reference for retry if you are careful.)
2513 *
2514 * When calling this method, interrupts MUST be enabled. This is because
2515 * the BH enable code must have IRQs enabled so that it will not deadlock.
2516 * --BLG
2517 */
1da177e4
LT
2518int dev_queue_xmit(struct sk_buff *skb)
2519{
2520 struct net_device *dev = skb->dev;
dc2b4847 2521 struct netdev_queue *txq;
1da177e4
LT
2522 struct Qdisc *q;
2523 int rc = -ENOMEM;
2524
4ec93edb
YH
2525 /* Disable soft irqs for various locks below. Also
2526 * stops preemption for RCU.
1da177e4 2527 */
4ec93edb 2528 rcu_read_lock_bh();
1da177e4 2529
5bc1421e
NH
2530 skb_update_prio(skb);
2531
eae792b7 2532 txq = dev_pick_tx(dev, skb);
a898def2 2533 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2534
1da177e4 2535#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2536 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2537#endif
cf66ba58 2538 trace_net_dev_queue(skb);
1da177e4 2539 if (q->enqueue) {
bbd8a0d3 2540 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2541 goto out;
1da177e4
LT
2542 }
2543
2544 /* The device has no queue. Common case for software devices:
2545 loopback, all the sorts of tunnels...
2546
932ff279
HX
2547 Really, it is unlikely that netif_tx_lock protection is necessary
2548 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1da177e4
LT
2549 counters.)
2550 However, it is possible that they rely on the protection
2551 made by us here.
2552
2553 Check this and shoot the lock. It is not prone to deadlocks.
2554 Either shoot the noqueue qdisc, it is even simpler 8)
2555 */
2556 if (dev->flags & IFF_UP) {
2557 int cpu = smp_processor_id(); /* ok because BHs are off */
2558
c773e847 2559 if (txq->xmit_lock_owner != cpu) {
1da177e4 2560
745e20f1
ED
2561 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2562 goto recursion_alert;
2563
c773e847 2564 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2565
73466498 2566 if (!netif_xmit_stopped(txq)) {
745e20f1 2567 __this_cpu_inc(xmit_recursion);
572a9d7b 2568 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2569 __this_cpu_dec(xmit_recursion);
572a9d7b 2570 if (dev_xmit_complete(rc)) {
c773e847 2571 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2572 goto out;
2573 }
2574 }
c773e847 2575 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
2576 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2577 dev->name);
1da177e4
LT
2578 } else {
2579 /* Recursion is detected! It is possible,
745e20f1
ED
2580 * unfortunately
2581 */
2582recursion_alert:
e87cc472
JP
2583 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2584 dev->name);
1da177e4
LT
2585 }
2586 }
2587
2588 rc = -ENETDOWN;
d4828d85 2589 rcu_read_unlock_bh();
1da177e4 2590
1da177e4
LT
2591 kfree_skb(skb);
2592 return rc;
2593out:
d4828d85 2594 rcu_read_unlock_bh();
1da177e4
LT
2595 return rc;
2596}
d1b19dff 2597EXPORT_SYMBOL(dev_queue_xmit);
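A minimal sketch of a kernel-side caller under the caveats in the comment above (interrupts enabled, skb consumed regardless of the return value): build one raw broadcast Ethernet frame and queue it on @dev. The function name and payload handling are illustrative only; 0x88b5 is the IEEE local-experimental EtherType.

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>
#include <linux/string.h>

static int example_send_raw(struct net_device *dev, const void *payload, size_t len)
{
	struct sk_buff *skb;
	struct ethhdr *eth;

	skb = netdev_alloc_skb(dev, ETH_HLEN + len);
	if (!skb)
		return -ENOMEM;

	eth = (struct ethhdr *)skb_put(skb, ETH_HLEN);
	memset(eth->h_dest, 0xff, ETH_ALEN);		/* broadcast destination */
	memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
	eth->h_proto = htons(0x88b5);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = eth->h_proto;
	skb_reset_mac_header(skb);

	return dev_queue_xmit(skb);	/* may also return positive NET_XMIT_* codes */
}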
1da177e4
LT
2598
2599
2600/*=======================================================================
2601 Receiver routines
2602 =======================================================================*/
2603
6b2bedc3 2604int netdev_max_backlog __read_mostly = 1000;
3b098e2d 2605int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2606int netdev_budget __read_mostly = 300;
2607int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2608
eecfd7c4
ED
2609/* Called with irq disabled */
2610static inline void ____napi_schedule(struct softnet_data *sd,
2611 struct napi_struct *napi)
2612{
2613 list_add_tail(&napi->poll_list, &sd->poll_list);
2614 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2615}
2616
0a9627f2 2617/*
bfb564e7 2618 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
bdeab991
TH
2619 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
2620 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
2621 * if hash is a canonical 4-tuple hash over transport ports.
0a9627f2 2622 */
bdeab991 2623void __skb_get_rxhash(struct sk_buff *skb)
0a9627f2 2624{
4504b861
ED
2625 struct flow_keys keys;
2626 u32 hash;
c6865cb3 2627
4504b861
ED
2628 if (!skb_flow_dissect(skb, &keys))
2629 return;
e971b722 2630
4504b861
ED
2631 if (keys.ports) {
2632 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2633 swap(keys.port16[0], keys.port16[1]);
2634 skb->l4_rxhash = 1;
0a9627f2
TH
2635 }
2636
b249dcb8 2637 /* get a consistent hash (same value on both flow directions) */
4504b861
ED
2638 if ((__force u32)keys.dst < (__force u32)keys.src)
2639 swap(keys.dst, keys.src);
0a9627f2 2640
4504b861
ED
2641 hash = jhash_3words((__force u32)keys.dst,
2642 (__force u32)keys.src,
2643 (__force u32)keys.ports, hashrnd);
bfb564e7
KK
2644 if (!hash)
2645 hash = 1;
2646
bdeab991 2647 skb->rxhash = hash;
bfb564e7
KK
2648}
2649EXPORT_SYMBOL(__skb_get_rxhash);
2650
2651#ifdef CONFIG_RPS
2652
2653/* One global table that all flow-based protocols share. */
6e3f7faf 2654struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
2655EXPORT_SYMBOL(rps_sock_flow_table);
2656
c5905afb 2657struct static_key rps_needed __read_mostly;
adc9300e 2658
c445477d
BH
2659static struct rps_dev_flow *
2660set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2661 struct rps_dev_flow *rflow, u16 next_cpu)
2662{
09994d1b 2663 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
2664#ifdef CONFIG_RFS_ACCEL
2665 struct netdev_rx_queue *rxqueue;
2666 struct rps_dev_flow_table *flow_table;
2667 struct rps_dev_flow *old_rflow;
2668 u32 flow_id;
2669 u16 rxq_index;
2670 int rc;
2671
2672 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
2673 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2674 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
2675 goto out;
2676 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2677 if (rxq_index == skb_get_rx_queue(skb))
2678 goto out;
2679
2680 rxqueue = dev->_rx + rxq_index;
2681 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2682 if (!flow_table)
2683 goto out;
2684 flow_id = skb->rxhash & flow_table->mask;
2685 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2686 rxq_index, flow_id);
2687 if (rc < 0)
2688 goto out;
2689 old_rflow = rflow;
2690 rflow = &flow_table->flows[flow_id];
c445477d
BH
2691 rflow->filter = rc;
2692 if (old_rflow->filter == rflow->filter)
2693 old_rflow->filter = RPS_NO_FILTER;
2694 out:
2695#endif
2696 rflow->last_qtail =
09994d1b 2697 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
2698 }
2699
09994d1b 2700 rflow->cpu = next_cpu;
c445477d
BH
2701 return rflow;
2702}
2703
bfb564e7
KK
2704/*
2705 * get_rps_cpu is called from netif_receive_skb and returns the target
2706 * CPU from the RPS map of the receiving queue for a given skb.
2707 * rcu_read_lock must be held on entry.
2708 */
2709static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2710 struct rps_dev_flow **rflowp)
2711{
2712 struct netdev_rx_queue *rxqueue;
6e3f7faf 2713 struct rps_map *map;
bfb564e7
KK
2714 struct rps_dev_flow_table *flow_table;
2715 struct rps_sock_flow_table *sock_flow_table;
2716 int cpu = -1;
2717 u16 tcpu;
2718
2719 if (skb_rx_queue_recorded(skb)) {
2720 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
2721 if (unlikely(index >= dev->real_num_rx_queues)) {
2722 WARN_ONCE(dev->real_num_rx_queues > 1,
2723 "%s received packet on queue %u, but number "
2724 "of RX queues is %u\n",
2725 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
2726 goto done;
2727 }
2728 rxqueue = dev->_rx + index;
2729 } else
2730 rxqueue = dev->_rx;
2731
6e3f7faf
ED
2732 map = rcu_dereference(rxqueue->rps_map);
2733 if (map) {
85875236 2734 if (map->len == 1 &&
33d480ce 2735 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
2736 tcpu = map->cpus[0];
2737 if (cpu_online(tcpu))
2738 cpu = tcpu;
2739 goto done;
2740 }
33d480ce 2741 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 2742 goto done;
6febfca9 2743 }
bfb564e7 2744
2d47b459 2745 skb_reset_network_header(skb);
bfb564e7
KK
2746 if (!skb_get_rxhash(skb))
2747 goto done;
2748
fec5e652
TH
2749 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2750 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2751 if (flow_table && sock_flow_table) {
2752 u16 next_cpu;
2753 struct rps_dev_flow *rflow;
2754
2755 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2756 tcpu = rflow->cpu;
2757
2758 next_cpu = sock_flow_table->ents[skb->rxhash &
2759 sock_flow_table->mask];
2760
2761 /*
2762 * If the desired CPU (where last recvmsg was done) is
2763 * different from current CPU (one in the rx-queue flow
2764 * table entry), switch if one of the following holds:
2765 * - Current CPU is unset (equal to RPS_NO_CPU).
2766 * - Current CPU is offline.
2767 * - The current CPU's queue tail has advanced beyond the
2768 * last packet that was enqueued using this table entry.
2769 * This guarantees that all previous packets for the flow
2770 * have been dequeued, thus preserving in order delivery.
2771 */
2772 if (unlikely(tcpu != next_cpu) &&
2773 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2774 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
c445477d
BH
2775 rflow->last_qtail)) >= 0))
2776 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2777
fec5e652
TH
2778 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2779 *rflowp = rflow;
2780 cpu = tcpu;
2781 goto done;
2782 }
2783 }
2784
0a9627f2 2785 if (map) {
fec5e652 2786 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
2787
2788 if (cpu_online(tcpu)) {
2789 cpu = tcpu;
2790 goto done;
2791 }
2792 }
2793
2794done:
0a9627f2
TH
2795 return cpu;
2796}
2797
c445477d
BH
2798#ifdef CONFIG_RFS_ACCEL
2799
2800/**
2801 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2802 * @dev: Device on which the filter was set
2803 * @rxq_index: RX queue index
2804 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2805 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2806 *
2807 * Drivers that implement ndo_rx_flow_steer() should periodically call
2808 * this function for each installed filter and remove the filters for
2809 * which it returns %true.
2810 */
2811bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2812 u32 flow_id, u16 filter_id)
2813{
2814 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2815 struct rps_dev_flow_table *flow_table;
2816 struct rps_dev_flow *rflow;
2817 bool expire = true;
2818 int cpu;
2819
2820 rcu_read_lock();
2821 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2822 if (flow_table && flow_id <= flow_table->mask) {
2823 rflow = &flow_table->flows[flow_id];
2824 cpu = ACCESS_ONCE(rflow->cpu);
2825 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2826 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2827 rflow->last_qtail) <
2828 (int)(10 * flow_table->mask)))
2829 expire = false;
2830 }
2831 rcu_read_unlock();
2832 return expire;
2833}
2834EXPORT_SYMBOL(rps_may_expire_flow);
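A sketch of the periodic scan the kernel-doc above asks drivers for (only meaningful under CONFIG_RFS_ACCEL). The filter table walked here is a hypothetical driver structure and the hardware rule removal is elided.

#include <linux/netdevice.h>

struct example_filter {		/* assumed driver-private bookkeeping */
	bool in_use;
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
};

static void example_expire_filters(struct net_device *dev,
				   struct example_filter *filters,
				   unsigned int nfilters)
{
	unsigned int i;

	for (i = 0; i < nfilters; i++) {
		struct example_filter *f = &filters[i];

		if (!f->in_use)
			continue;
		if (rps_may_expire_flow(dev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			/* ... remove the hardware steering rule here ... */
			f->in_use = false;
		}
	}
}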
2835
2836#endif /* CONFIG_RFS_ACCEL */
2837
0a9627f2 2838/* Called from hardirq (IPI) context */
e36fa2f7 2839static void rps_trigger_softirq(void *data)
0a9627f2 2840{
e36fa2f7
ED
2841 struct softnet_data *sd = data;
2842
eecfd7c4 2843 ____napi_schedule(sd, &sd->backlog);
dee42870 2844 sd->received_rps++;
0a9627f2 2845}
e36fa2f7 2846
fec5e652 2847#endif /* CONFIG_RPS */
0a9627f2 2848
e36fa2f7
ED
2849/*
2850 * Check if this softnet_data structure belongs to another CPU.
2851 * If yes, queue it to our IPI list and return 1
2852 * If no, return 0
2853 */
2854static int rps_ipi_queued(struct softnet_data *sd)
2855{
2856#ifdef CONFIG_RPS
2857 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2858
2859 if (sd != mysd) {
2860 sd->rps_ipi_next = mysd->rps_ipi_list;
2861 mysd->rps_ipi_list = sd;
2862
2863 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2864 return 1;
2865 }
2866#endif /* CONFIG_RPS */
2867 return 0;
2868}
2869
0a9627f2
TH
2870/*
2871 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2872 * queue (may be a remote CPU queue).
2873 */
fec5e652
TH
2874static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2875 unsigned int *qtail)
0a9627f2 2876{
e36fa2f7 2877 struct softnet_data *sd;
0a9627f2
TH
2878 unsigned long flags;
2879
e36fa2f7 2880 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
2881
2882 local_irq_save(flags);
0a9627f2 2883
e36fa2f7 2884 rps_lock(sd);
6e7676c1
CG
2885 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2886 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 2887enqueue:
e36fa2f7 2888 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 2889 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 2890 rps_unlock(sd);
152102c7 2891 local_irq_restore(flags);
0a9627f2
TH
2892 return NET_RX_SUCCESS;
2893 }
2894
ebda37c2
ED
2895 /* Schedule NAPI for backlog device
2896 * We can use non atomic operation since we own the queue lock
2897 */
2898 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 2899 if (!rps_ipi_queued(sd))
eecfd7c4 2900 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
2901 }
2902 goto enqueue;
2903 }
2904
dee42870 2905 sd->dropped++;
e36fa2f7 2906 rps_unlock(sd);
0a9627f2 2907
0a9627f2
TH
2908 local_irq_restore(flags);
2909
caf586e5 2910 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
2911 kfree_skb(skb);
2912 return NET_RX_DROP;
2913}
1da177e4 2914
1da177e4
LT
2915/**
2916 * netif_rx - post buffer to the network code
2917 * @skb: buffer to post
2918 *
2919 * This function receives a packet from a device driver and queues it for
2920 * the upper (protocol) levels to process. It always succeeds. The buffer
2921 * may be dropped during processing for congestion control or by the
2922 * protocol layers.
2923 *
2924 * return values:
2925 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
2926 * NET_RX_DROP (packet was dropped)
2927 *
2928 */
2929
2930int netif_rx(struct sk_buff *skb)
2931{
b0e28f1e 2932 int ret;
1da177e4
LT
2933
2934 /* if netpoll wants it, pretend we never saw it */
2935 if (netpoll_rx(skb))
2936 return NET_RX_DROP;
2937
588f0330 2938 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 2939
cf66ba58 2940 trace_netif_rx(skb);
df334545 2941#ifdef CONFIG_RPS
c5905afb 2942 if (static_key_false(&rps_needed)) {
fec5e652 2943 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
2944 int cpu;
2945
cece1945 2946 preempt_disable();
b0e28f1e 2947 rcu_read_lock();
fec5e652
TH
2948
2949 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
2950 if (cpu < 0)
2951 cpu = smp_processor_id();
fec5e652
TH
2952
2953 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2954
b0e28f1e 2955 rcu_read_unlock();
cece1945 2956 preempt_enable();
adc9300e
ED
2957 } else
2958#endif
fec5e652
TH
2959 {
2960 unsigned int qtail;
2961 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2962 put_cpu();
2963 }
b0e28f1e 2964 return ret;
1da177e4 2965}
d1b19dff 2966EXPORT_SYMBOL(netif_rx);
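A sketch of the classic non-NAPI receive path that calls this entry point: copy the frame out of a hypothetical hardware buffer in the interrupt handler and post it. The buffer/length arguments stand in for driver-specific DMA handling.

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>
#include <linux/string.h>

static void example_rx_irq(struct net_device *dev, const void *buf, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	netif_rx(skb);		/* NET_RX_SUCCESS or NET_RX_DROP */
}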
1da177e4
LT
2967
2968int netif_rx_ni(struct sk_buff *skb)
2969{
2970 int err;
2971
2972 preempt_disable();
2973 err = netif_rx(skb);
2974 if (local_softirq_pending())
2975 do_softirq();
2976 preempt_enable();
2977
2978 return err;
2979}
1da177e4
LT
2980EXPORT_SYMBOL(netif_rx_ni);
2981
1da177e4
LT
2982static void net_tx_action(struct softirq_action *h)
2983{
2984 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2985
2986 if (sd->completion_queue) {
2987 struct sk_buff *clist;
2988
2989 local_irq_disable();
2990 clist = sd->completion_queue;
2991 sd->completion_queue = NULL;
2992 local_irq_enable();
2993
2994 while (clist) {
2995 struct sk_buff *skb = clist;
2996 clist = clist->next;
2997
547b792c 2998 WARN_ON(atomic_read(&skb->users));
07dc22e7 2999 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3000 __kfree_skb(skb);
3001 }
3002 }
3003
3004 if (sd->output_queue) {
37437bb2 3005 struct Qdisc *head;
1da177e4
LT
3006
3007 local_irq_disable();
3008 head = sd->output_queue;
3009 sd->output_queue = NULL;
a9cbd588 3010 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3011 local_irq_enable();
3012
3013 while (head) {
37437bb2
DM
3014 struct Qdisc *q = head;
3015 spinlock_t *root_lock;
3016
1da177e4
LT
3017 head = head->next_sched;
3018
5fb66229 3019 root_lock = qdisc_lock(q);
37437bb2 3020 if (spin_trylock(root_lock)) {
def82a1d
JP
3021 smp_mb__before_clear_bit();
3022 clear_bit(__QDISC_STATE_SCHED,
3023 &q->state);
37437bb2
DM
3024 qdisc_run(q);
3025 spin_unlock(root_lock);
1da177e4 3026 } else {
195648bb 3027 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3028 &q->state)) {
195648bb 3029 __netif_reschedule(q);
e8a83e10
JP
3030 } else {
3031 smp_mb__before_clear_bit();
3032 clear_bit(__QDISC_STATE_SCHED,
3033 &q->state);
3034 }
1da177e4
LT
3035 }
3036 }
3037 }
3038}
3039
ab95bfe0
JP
3040#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3041 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3042/* This hook is defined here for ATM LANE */
3043int (*br_fdb_test_addr_hook)(struct net_device *dev,
3044 unsigned char *addr) __read_mostly;
4fb019a0 3045EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3046#endif
1da177e4 3047
1da177e4
LT
3048#ifdef CONFIG_NET_CLS_ACT
3049/* TODO: Maybe we should just force sch_ingress to be compiled in
3050 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3051 * instructions (a compare and 2 stores extra) right now if we don't
3052 * have it on but do have CONFIG_NET_CLS_ACT.
25985edc
LDM
3053 * NOTE: This doesn't stop any functionality; if you don't have
3054 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3055 *
3056 */
24824a09 3057static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3058{
1da177e4 3059 struct net_device *dev = skb->dev;
f697c3e8 3060 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3061 int result = TC_ACT_OK;
3062 struct Qdisc *q;
4ec93edb 3063
de384830 3064 if (unlikely(MAX_RED_LOOP < ttl++)) {
e87cc472
JP
3065 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3066 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3067 return TC_ACT_SHOT;
3068 }
1da177e4 3069
f697c3e8
HX
3070 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3071 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3072
83874000 3073 q = rxq->qdisc;
8d50b53d 3074 if (q != &noop_qdisc) {
83874000 3075 spin_lock(qdisc_lock(q));
a9312ae8
DM
3076 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3077 result = qdisc_enqueue_root(skb, q);
83874000
DM
3078 spin_unlock(qdisc_lock(q));
3079 }
f697c3e8
HX
3080
3081 return result;
3082}
86e65da9 3083
f697c3e8
HX
3084static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3085 struct packet_type **pt_prev,
3086 int *ret, struct net_device *orig_dev)
3087{
24824a09
ED
3088 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3089
3090 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 3091 goto out;
1da177e4 3092
f697c3e8
HX
3093 if (*pt_prev) {
3094 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3095 *pt_prev = NULL;
1da177e4
LT
3096 }
3097
24824a09 3098 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3099 case TC_ACT_SHOT:
3100 case TC_ACT_STOLEN:
3101 kfree_skb(skb);
3102 return NULL;
3103 }
3104
3105out:
3106 skb->tc_verd = 0;
3107 return skb;
1da177e4
LT
3108}
3109#endif
3110
ab95bfe0
JP
3111/**
3112 * netdev_rx_handler_register - register receive handler
3113 * @dev: device to register a handler for
3114 * @rx_handler: receive handler to register
93e2c32b 3115 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0
JP
3116 *
3117 * Register a receive handler for a device. This handler will then be
3118 * called from __netif_receive_skb. A negative errno code is returned
3119 * on a failure.
3120 *
3121 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3122 *
3123 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3124 */
3125int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3126 rx_handler_func_t *rx_handler,
3127 void *rx_handler_data)
ab95bfe0
JP
3128{
3129 ASSERT_RTNL();
3130
3131 if (dev->rx_handler)
3132 return -EBUSY;
3133
93e2c32b 3134 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3135 rcu_assign_pointer(dev->rx_handler, rx_handler);
3136
3137 return 0;
3138}
3139EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
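A minimal sketch of an rx_handler user in the style of bridge/bonding/macvlan; the handler and its private data are hypothetical, and this one simply lets every frame continue up the normal stack.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	void *priv = rcu_dereference(skb->dev->rx_handler_data);

	(void)priv;		/* a real handler would consult its state here */
	return RX_HANDLER_PASS;	/* or _CONSUMED, _ANOTHER, _EXACT; see above */
}

static int example_attach(struct net_device *dev, void *priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_rx_handler, priv);
	rtnl_unlock();

	return err;	/* -EBUSY if another handler is already installed */
}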
3140
3141/**
3142 * netdev_rx_handler_unregister - unregister receive handler
3143 * @dev: device to unregister a handler from
3144 *
3145 * Unregister a receive handler from a device.
3146 *
3147 * The caller must hold the rtnl_mutex.
3148 */
3149void netdev_rx_handler_unregister(struct net_device *dev)
3150{
3151
3152 ASSERT_RTNL();
a9b3cd7f
SH
3153 RCU_INIT_POINTER(dev->rx_handler, NULL);
3154 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3155}
3156EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3157
10f744d2 3158static int __netif_receive_skb(struct sk_buff *skb)
1da177e4
LT
3159{
3160 struct packet_type *ptype, *pt_prev;
ab95bfe0 3161 rx_handler_func_t *rx_handler;
f2ccd8fa 3162 struct net_device *orig_dev;
63d8ea7f 3163 struct net_device *null_or_dev;
8a4eb573 3164 bool deliver_exact = false;
1da177e4 3165 int ret = NET_RX_DROP;
252e3346 3166 __be16 type;
1da177e4 3167
588f0330 3168 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3169
cf66ba58 3170 trace_netif_receive_skb(skb);
9b22ea56 3171
1da177e4 3172 /* if we've gotten here through NAPI, check netpoll */
bea3348e 3173 if (netpoll_receive_skb(skb))
1da177e4
LT
3174 return NET_RX_DROP;
3175
cc9bd5ce 3176 orig_dev = skb->dev;
8f903c70 3177
c1d2bbe1 3178 skb_reset_network_header(skb);
badff6d0 3179 skb_reset_transport_header(skb);
0b5c9db1 3180 skb_reset_mac_len(skb);
1da177e4
LT
3181
3182 pt_prev = NULL;
3183
3184 rcu_read_lock();
3185
63d8ea7f 3186another_round:
b6858177 3187 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3188
3189 __this_cpu_inc(softnet_data.processed);
3190
bcc6d479
JP
3191 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3192 skb = vlan_untag(skb);
3193 if (unlikely(!skb))
3194 goto out;
3195 }
3196
1da177e4
LT
3197#ifdef CONFIG_NET_CLS_ACT
3198 if (skb->tc_verd & TC_NCLS) {
3199 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3200 goto ncls;
3201 }
3202#endif
3203
3204 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3205 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3206 if (pt_prev)
f2ccd8fa 3207 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3208 pt_prev = ptype;
3209 }
3210 }
3211
3212#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3213 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3214 if (!skb)
1da177e4 3215 goto out;
1da177e4
LT
3216ncls:
3217#endif
3218
6a32e4f9 3219 rx_handler = rcu_dereference(skb->dev->rx_handler);
2425717b
JF
3220 if (vlan_tx_tag_present(skb)) {
3221 if (pt_prev) {
3222 ret = deliver_skb(skb, pt_prev, orig_dev);
3223 pt_prev = NULL;
3224 }
6a32e4f9 3225 if (vlan_do_receive(&skb, !rx_handler))
2425717b
JF
3226 goto another_round;
3227 else if (unlikely(!skb))
3228 goto out;
3229 }
3230
ab95bfe0
JP
3231 if (rx_handler) {
3232 if (pt_prev) {
3233 ret = deliver_skb(skb, pt_prev, orig_dev);
3234 pt_prev = NULL;
3235 }
8a4eb573
JP
3236 switch (rx_handler(&skb)) {
3237 case RX_HANDLER_CONSUMED:
ab95bfe0 3238 goto out;
8a4eb573 3239 case RX_HANDLER_ANOTHER:
63d8ea7f 3240 goto another_round;
8a4eb573
JP
3241 case RX_HANDLER_EXACT:
3242 deliver_exact = true;
3243 case RX_HANDLER_PASS:
3244 break;
3245 default:
3246 BUG();
3247 }
ab95bfe0 3248 }
1da177e4 3249
63d8ea7f 3250 /* deliver only exact match when indicated */
8a4eb573 3251 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3252
1da177e4 3253 type = skb->protocol;
82d8a867
PE
3254 list_for_each_entry_rcu(ptype,
3255 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3256 if (ptype->type == type &&
e3f48d37
JP
3257 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3258 ptype->dev == orig_dev)) {
4ec93edb 3259 if (pt_prev)
f2ccd8fa 3260 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3261 pt_prev = ptype;
3262 }
3263 }
3264
3265 if (pt_prev) {
1080e512
MT
3266 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3267 ret = -ENOMEM;
3268 else
3269 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3270 } else {
caf586e5 3271 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3272 kfree_skb(skb);
3273 /* Jamal, now you will not be able to escape explaining
3274 * to me how you were going to use this. :-)
3275 */
3276 ret = NET_RX_DROP;
3277 }
3278
3279out:
3280 rcu_read_unlock();
3281 return ret;
3282}
0a9627f2
TH
3283
3284/**
3285 * netif_receive_skb - process receive buffer from network
3286 * @skb: buffer to process
3287 *
3288 * netif_receive_skb() is the main receive data processing function.
3289 * It always succeeds. The buffer may be dropped during processing
3290 * for congestion control or by the protocol layers.
3291 *
3292 * This function may only be called from softirq context and interrupts
3293 * should be enabled.
3294 *
3295 * Return values (usually ignored):
3296 * NET_RX_SUCCESS: no congestion
3297 * NET_RX_DROP: packet was dropped
3298 */
3299int netif_receive_skb(struct sk_buff *skb)
3300{
588f0330 3301 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3302
c1f19b51
RC
3303 if (skb_defer_rx_timestamp(skb))
3304 return NET_RX_SUCCESS;
3305
df334545 3306#ifdef CONFIG_RPS
c5905afb 3307 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3308 struct rps_dev_flow voidflow, *rflow = &voidflow;
3309 int cpu, ret;
fec5e652 3310
3b098e2d
ED
3311 rcu_read_lock();
3312
3313 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3314
3b098e2d
ED
3315 if (cpu >= 0) {
3316 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3317 rcu_read_unlock();
adc9300e 3318 return ret;
3b098e2d 3319 }
adc9300e 3320 rcu_read_unlock();
fec5e652 3321 }
1e94d72f 3322#endif
adc9300e 3323 return __netif_receive_skb(skb);
0a9627f2 3324}
d1b19dff 3325EXPORT_SYMBOL(netif_receive_skb);
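A sketch of the softirq-context caller the kernel-doc above requires: a hypothetical NAPI poll routine that pulls up to @budget frames from its ring. example_fetch_frame() is an assumed driver helper returning a fully built skb (protocol already set via eth_type_trans) or NULL when the ring is empty.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static struct sk_buff *example_fetch_frame(struct napi_struct *napi);	/* assumed driver helper */

static int example_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = example_fetch_frame(napi);

		if (!skb)
			break;
		netif_receive_skb(skb);
		work++;
	}
	if (work < budget)
		napi_complete(napi);	/* then re-enable device interrupts */

	return work;
}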
1da177e4 3326
88751275
ED
3327/* Network device is going away, flush any packets still pending
3328 * Called with irqs disabled.
3329 */
152102c7 3330static void flush_backlog(void *arg)
6e583ce5 3331{
152102c7 3332 struct net_device *dev = arg;
e36fa2f7 3333 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3334 struct sk_buff *skb, *tmp;
3335
e36fa2f7 3336 rps_lock(sd);
6e7676c1 3337 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3338 if (skb->dev == dev) {
e36fa2f7 3339 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3340 kfree_skb(skb);
76cc8b13 3341 input_queue_head_incr(sd);
6e583ce5 3342 }
6e7676c1 3343 }
e36fa2f7 3344 rps_unlock(sd);
6e7676c1
CG
3345
3346 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3347 if (skb->dev == dev) {
3348 __skb_unlink(skb, &sd->process_queue);
3349 kfree_skb(skb);
76cc8b13 3350 input_queue_head_incr(sd);
6e7676c1
CG
3351 }
3352 }
6e583ce5
SH
3353}
3354
d565b0a1
HX
3355static int napi_gro_complete(struct sk_buff *skb)
3356{
3357 struct packet_type *ptype;
3358 __be16 type = skb->protocol;
3359 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3360 int err = -ENOENT;
3361
fc59f9a3
HX
3362 if (NAPI_GRO_CB(skb)->count == 1) {
3363 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3364 goto out;
fc59f9a3 3365 }
d565b0a1
HX
3366
3367 rcu_read_lock();
3368 list_for_each_entry_rcu(ptype, head, list) {
3369 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3370 continue;
3371
3372 err = ptype->gro_complete(skb);
3373 break;
3374 }
3375 rcu_read_unlock();
3376
3377 if (err) {
3378 WARN_ON(&ptype->list == head);
3379 kfree_skb(skb);
3380 return NET_RX_SUCCESS;
3381 }
3382
3383out:
d565b0a1
HX
3384 return netif_receive_skb(skb);
3385}
3386
86cac58b 3387inline void napi_gro_flush(struct napi_struct *napi)
d565b0a1
HX
3388{
3389 struct sk_buff *skb, *next;
3390
3391 for (skb = napi->gro_list; skb; skb = next) {
3392 next = skb->next;
3393 skb->next = NULL;
3394 napi_gro_complete(skb);
3395 }
3396
4ae5544f 3397 napi->gro_count = 0;
d565b0a1
HX
3398 napi->gro_list = NULL;
3399}
86cac58b 3400EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3401
5b252f0c 3402enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3403{
3404 struct sk_buff **pp = NULL;
3405 struct packet_type *ptype;
3406 __be16 type = skb->protocol;
3407 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
0da2afd5 3408 int same_flow;
d565b0a1 3409 int mac_len;
5b252f0c 3410 enum gro_result ret;
d565b0a1 3411
ce9e76c8 3412 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3413 goto normal;
3414
21dc3301 3415 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3416 goto normal;
3417
d565b0a1
HX
3418 rcu_read_lock();
3419 list_for_each_entry_rcu(ptype, head, list) {
d565b0a1
HX
3420 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3421 continue;
3422
86911732 3423 skb_set_network_header(skb, skb_gro_offset(skb));
d565b0a1
HX
3424 mac_len = skb->network_header - skb->mac_header;
3425 skb->mac_len = mac_len;
3426 NAPI_GRO_CB(skb)->same_flow = 0;
3427 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3428 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3429
d565b0a1
HX
3430 pp = ptype->gro_receive(&napi->gro_list, skb);
3431 break;
3432 }
3433 rcu_read_unlock();
3434
3435 if (&ptype->list == head)
3436 goto normal;
3437
0da2afd5 3438 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3439 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3440
d565b0a1
HX
3441 if (pp) {
3442 struct sk_buff *nskb = *pp;
3443
3444 *pp = nskb->next;
3445 nskb->next = NULL;
3446 napi_gro_complete(nskb);
4ae5544f 3447 napi->gro_count--;
d565b0a1
HX
3448 }
3449
0da2afd5 3450 if (same_flow)
d565b0a1
HX
3451 goto ok;
3452
4ae5544f 3453 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3454 goto normal;
d565b0a1 3455
4ae5544f 3456 napi->gro_count++;
d565b0a1 3457 NAPI_GRO_CB(skb)->count = 1;
86911732 3458 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3459 skb->next = napi->gro_list;
3460 napi->gro_list = skb;
5d0d9be8 3461 ret = GRO_HELD;
d565b0a1 3462
ad0f9904 3463pull:
cb18978c
HX
3464 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3465 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3466
3467 BUG_ON(skb->end - skb->tail < grow);
3468
3469 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3470
3471 skb->tail += grow;
3472 skb->data_len -= grow;
3473
3474 skb_shinfo(skb)->frags[0].page_offset += grow;
9e903e08 3475 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
cb18978c 3476
9e903e08 3477 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
ea2ab693 3478 skb_frag_unref(skb, 0);
cb18978c
HX
3479 memmove(skb_shinfo(skb)->frags,
3480 skb_shinfo(skb)->frags + 1,
e5093aec 3481 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3482 }
ad0f9904
HX
3483 }
3484
d565b0a1 3485ok:
5d0d9be8 3486 return ret;
d565b0a1
HX
3487
3488normal:
ad0f9904
HX
3489 ret = GRO_NORMAL;
3490 goto pull;
5d38a079 3491}
96e93eab
HX
3492EXPORT_SYMBOL(dev_gro_receive);
3493
40d0802b 3494static inline gro_result_t
5b252f0c 3495__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
96e93eab
HX
3496{
3497 struct sk_buff *p;
5ca3b72c 3498 unsigned int maclen = skb->dev->hard_header_len;
96e93eab
HX
3499
3500 for (p = napi->gro_list; p; p = p->next) {
40d0802b
ED
3501 unsigned long diffs;
3502
3503 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3701e513 3504 diffs |= p->vlan_tci ^ skb->vlan_tci;
5ca3b72c
ED
3505 if (maclen == ETH_HLEN)
3506 diffs |= compare_ether_header(skb_mac_header(p),
3507 skb_gro_mac_header(skb));
3508 else if (!diffs)
3509 diffs = memcmp(skb_mac_header(p),
3510 skb_gro_mac_header(skb),
3511 maclen);
40d0802b 3512 NAPI_GRO_CB(p)->same_flow = !diffs;
96e93eab
HX
3513 NAPI_GRO_CB(p)->flush = 0;
3514 }
3515
3516 return dev_gro_receive(napi, skb);
3517}
5d38a079 3518
c7c4b3b6 3519gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3520{
5d0d9be8
HX
3521 switch (ret) {
3522 case GRO_NORMAL:
c7c4b3b6
BH
3523 if (netif_receive_skb(skb))
3524 ret = GRO_DROP;
3525 break;
5d38a079 3526
5d0d9be8 3527 case GRO_DROP:
5d38a079
HX
3528 kfree_skb(skb);
3529 break;
5b252f0c 3530
daa86548 3531 case GRO_MERGED_FREE:
d7e8883c
ED
3532 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3533 kmem_cache_free(skbuff_head_cache, skb);
3534 else
3535 __kfree_skb(skb);
daa86548
ED
3536 break;
3537
5b252f0c
BH
3538 case GRO_HELD:
3539 case GRO_MERGED:
3540 break;
5d38a079
HX
3541 }
3542
c7c4b3b6 3543 return ret;
5d0d9be8
HX
3544}
3545EXPORT_SYMBOL(napi_skb_finish);
3546
78a478d0
HX
3547void skb_gro_reset_offset(struct sk_buff *skb)
3548{
3549 NAPI_GRO_CB(skb)->data_offset = 0;
3550 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3551 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3552
78d3fd0b 3553 if (skb->mac_header == skb->tail &&
ea2ab693 3554 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
78a478d0 3555 NAPI_GRO_CB(skb)->frag0 =
ea2ab693 3556 skb_frag_address(&skb_shinfo(skb)->frags[0]);
9e903e08 3557 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
7489594c 3558 }
78a478d0
HX
3559}
3560EXPORT_SYMBOL(skb_gro_reset_offset);
3561
c7c4b3b6 3562gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3563{
86911732
HX
3564 skb_gro_reset_offset(skb);
3565
5d0d9be8 3566 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
d565b0a1
HX
3567}
3568EXPORT_SYMBOL(napi_gro_receive);
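/*
 * A minimal sketch of how a hypothetical NIC driver's NAPI poll routine
 * might feed received packets into GRO via napi_gro_receive().  The helpers
 * my_fetch_skb() and my_enable_irq() are assumptions for illustration only,
 * not real kernel APIs.
 */
static int my_napi_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = my_fetch_skb(napi->dev);

		if (!skb)
			break;
		napi_gro_receive(napi, skb);
		work++;
	}

	if (work < budget) {
		napi_complete(napi);		/* flushes gro_list via napi_gro_flush() */
		my_enable_irq(napi->dev);	/* re-arm RX interrupts */
	}
	return work;
}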
3569
d0c2b0d2 3570static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 3571{
96e93eab 3572 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
3573 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3574 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 3575 skb->vlan_tci = 0;
66c46d74 3576 skb->dev = napi->dev;
6d152e23 3577 skb->skb_iif = 0;
96e93eab
HX
3578
3579 napi->skb = skb;
3580}
96e93eab 3581
76620aaf 3582struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3583{
5d38a079 3584 struct sk_buff *skb = napi->skb;
5d38a079
HX
3585
3586 if (!skb) {
89d71a66
ED
3587 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3588 if (skb)
3589 napi->skb = skb;
80595d59 3590 }
96e93eab
HX
3591 return skb;
3592}
76620aaf 3593EXPORT_SYMBOL(napi_get_frags);
96e93eab 3594
c7c4b3b6
BH
3595gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3596 gro_result_t ret)
96e93eab 3597{
5d0d9be8
HX
3598 switch (ret) {
3599 case GRO_NORMAL:
86911732 3600 case GRO_HELD:
e76b69cc 3601 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3602
c7c4b3b6
BH
3603 if (ret == GRO_HELD)
3604 skb_gro_pull(skb, -ETH_HLEN);
3605 else if (netif_receive_skb(skb))
3606 ret = GRO_DROP;
86911732 3607 break;
5d38a079 3608
5d0d9be8 3609 case GRO_DROP:
5d0d9be8
HX
3610 case GRO_MERGED_FREE:
3611 napi_reuse_skb(napi, skb);
3612 break;
5b252f0c
BH
3613
3614 case GRO_MERGED:
3615 break;
5d0d9be8 3616 }
5d38a079 3617
c7c4b3b6 3618 return ret;
5d38a079 3619}
5d0d9be8
HX
3620EXPORT_SYMBOL(napi_frags_finish);
3621
4adb9c4a 3622static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
3623{
3624 struct sk_buff *skb = napi->skb;
3625 struct ethhdr *eth;
a5b1cf28
HX
3626 unsigned int hlen;
3627 unsigned int off;
76620aaf
HX
3628
3629 napi->skb = NULL;
3630
3631 skb_reset_mac_header(skb);
3632 skb_gro_reset_offset(skb);
3633
a5b1cf28
HX
3634 off = skb_gro_offset(skb);
3635 hlen = off + sizeof(*eth);
3636 eth = skb_gro_header_fast(skb, off);
3637 if (skb_gro_header_hard(skb, hlen)) {
3638 eth = skb_gro_header_slow(skb, hlen, off);
3639 if (unlikely(!eth)) {
3640 napi_reuse_skb(napi, skb);
3641 skb = NULL;
3642 goto out;
3643 }
76620aaf
HX
3644 }
3645
3646 skb_gro_pull(skb, sizeof(*eth));
3647
3648 /*
3649 * This works because the only protocols we care about don't require
3650 * special handling. We'll fix it up properly at the end.
3651 */
3652 skb->protocol = eth->h_proto;
3653
3654out:
3655 return skb;
3656}
76620aaf 3657
c7c4b3b6 3658gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3659{
76620aaf 3660 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3661
3662 if (!skb)
c7c4b3b6 3663 return GRO_DROP;
5d0d9be8
HX
3664
3665 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3666}
5d38a079
HX
3667EXPORT_SYMBOL(napi_gro_frags);
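/*
 * A sketch of the page-based receive path, under the assumption of a driver
 * that does not build a full skb itself: it borrows an skb from
 * napi_get_frags(), attaches its receive page(s), and hands it back through
 * napi_gro_frags(), which locates the Ethernet header on its own.
 * my_rx_desc_ready() and my_attach_rx_page() are assumed driver helpers.
 */
static int my_frag_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget && my_rx_desc_ready(napi->dev)) {
		struct sk_buff *skb = napi_get_frags(napi);

		if (!skb)
			break;				/* allocation failure, retry later */
		my_attach_rx_page(napi->dev, skb);	/* fill skb_shinfo(skb)->frags */
		napi_gro_frags(napi);
		work++;
	}

	if (work < budget)
		napi_complete(napi);			/* done for now, re-enable IRQs here */
	return work;
}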
3668
e326bed2
ED
3669/*
3670 * net_rps_action sends any pending IPIs for RPS.
3671 * Note: called with local irq disabled, but exits with local irq enabled.
3672 */
3673static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3674{
3675#ifdef CONFIG_RPS
3676 struct softnet_data *remsd = sd->rps_ipi_list;
3677
3678 if (remsd) {
3679 sd->rps_ipi_list = NULL;
3680
3681 local_irq_enable();
3682
3683 /* Send pending IPIs to kick RPS processing on remote CPUs. */
3684 while (remsd) {
3685 struct softnet_data *next = remsd->rps_ipi_next;
3686
3687 if (cpu_online(remsd->cpu))
3688 __smp_call_function_single(remsd->cpu,
3689 &remsd->csd, 0);
3690 remsd = next;
3691 }
3692 } else
3693#endif
3694 local_irq_enable();
3695}
3696
bea3348e 3697static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
3698{
3699 int work = 0;
eecfd7c4 3700 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 3701
e326bed2
ED
3702#ifdef CONFIG_RPS
3703 /* Check if we have pending IPIs; it's better to send them now
3704 * than to wait for net_rx_action() to end.
3705 */
3706 if (sd->rps_ipi_list) {
3707 local_irq_disable();
3708 net_rps_action_and_irq_enable(sd);
3709 }
3710#endif
bea3348e 3711 napi->weight = weight_p;
6e7676c1
CG
3712 local_irq_disable();
3713 while (work < quota) {
1da177e4 3714 struct sk_buff *skb;
6e7676c1
CG
3715 unsigned int qlen;
3716
3717 while ((skb = __skb_dequeue(&sd->process_queue))) {
3718 local_irq_enable();
3719 __netif_receive_skb(skb);
6e7676c1 3720 local_irq_disable();
76cc8b13
TH
3721 input_queue_head_incr(sd);
3722 if (++work >= quota) {
3723 local_irq_enable();
3724 return work;
3725 }
6e7676c1 3726 }
1da177e4 3727
e36fa2f7 3728 rps_lock(sd);
6e7676c1 3729 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 3730 if (qlen)
6e7676c1
CG
3731 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3732 &sd->process_queue);
76cc8b13 3733
6e7676c1 3734 if (qlen < quota - work) {
eecfd7c4
ED
3735 /*
3736 * Inline a custom version of __napi_complete().
3737 * Only the current cpu owns and manipulates this napi,
3738 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3739 * so we can use a plain write instead of clear_bit()
3740 * and we don't need an smp_mb() memory barrier.
3741 */
3742 list_del(&napi->poll_list);
3743 napi->state = 0;
3744
6e7676c1 3745 quota = work + qlen;
bea3348e 3746 }
e36fa2f7 3747 rps_unlock(sd);
6e7676c1
CG
3748 }
3749 local_irq_enable();
1da177e4 3750
bea3348e
SH
3751 return work;
3752}
1da177e4 3753
bea3348e
SH
3754/**
3755 * __napi_schedule - schedule for receive
c4ea43c5 3756 * @n: entry to schedule
bea3348e
SH
3757 *
3758 * The entry's receive function will be scheduled to run
3759 */
b5606c2d 3760void __napi_schedule(struct napi_struct *n)
bea3348e
SH
3761{
3762 unsigned long flags;
1da177e4 3763
bea3348e 3764 local_irq_save(flags);
eecfd7c4 3765 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 3766 local_irq_restore(flags);
1da177e4 3767}
bea3348e
SH
3768EXPORT_SYMBOL(__napi_schedule);
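/*
 * A sketch of the usual caller: an interrupt handler that defers RX work to
 * NAPI.  napi_schedule() checks NAPI_STATE_SCHED via napi_schedule_prep()
 * and then ends up in __napi_schedule() above.  struct my_priv and
 * my_disable_irq() are assumptions for illustration.
 */
static irqreturn_t my_interrupt(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;		/* registered via request_irq() */
	struct my_priv *priv = netdev_priv(dev);	/* hypothetical private data */

	my_disable_irq(dev);		/* mask further RX interrupts */
	napi_schedule(&priv->napi);	/* poll routine runs later in softirq context */
	return IRQ_HANDLED;
}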
3769
d565b0a1
HX
3770void __napi_complete(struct napi_struct *n)
3771{
3772 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3773 BUG_ON(n->gro_list);
3774
3775 list_del(&n->poll_list);
3776 smp_mb__before_clear_bit();
3777 clear_bit(NAPI_STATE_SCHED, &n->state);
3778}
3779EXPORT_SYMBOL(__napi_complete);
3780
3781void napi_complete(struct napi_struct *n)
3782{
3783 unsigned long flags;
3784
3785 /*
3786 * Don't let napi dequeue from the cpu poll list
3787 * just in case it's running on a different cpu.
3788 */
3789 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3790 return;
3791
3792 napi_gro_flush(n);
3793 local_irq_save(flags);
3794 __napi_complete(n);
3795 local_irq_restore(flags);
3796}
3797EXPORT_SYMBOL(napi_complete);
3798
3799void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3800 int (*poll)(struct napi_struct *, int), int weight)
3801{
3802 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 3803 napi->gro_count = 0;
d565b0a1 3804 napi->gro_list = NULL;
5d38a079 3805 napi->skb = NULL;
d565b0a1
HX
3806 napi->poll = poll;
3807 napi->weight = weight;
3808 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 3809 napi->dev = dev;
5d38a079 3810#ifdef CONFIG_NETPOLL
d565b0a1
HX
3811 spin_lock_init(&napi->poll_lock);
3812 napi->poll_owner = -1;
3813#endif
3814 set_bit(NAPI_STATE_SCHED, &napi->state);
3815}
3816EXPORT_SYMBOL(netif_napi_add);
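/*
 * A sketch of NAPI registration and teardown in a hypothetical driver,
 * reusing the my_napi_poll() sketch above.  The weight of 64 is the
 * conventional default budget; the priv layout is an assumption.
 */
static int my_setup_napi(struct net_device *netdev)
{
	struct my_priv *priv = netdev_priv(netdev);

	netif_napi_add(netdev, &priv->napi, my_napi_poll, 64);
	napi_enable(&priv->napi);	/* clears the NAPI_STATE_SCHED bit set above */
	return 0;
}

static void my_teardown_napi(struct net_device *netdev)
{
	struct my_priv *priv = netdev_priv(netdev);

	napi_disable(&priv->napi);
	netif_napi_del(&priv->napi);	/* frees gro_list and any held napi->skb */
}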
3817
3818void netif_napi_del(struct napi_struct *napi)
3819{
3820 struct sk_buff *skb, *next;
3821
d7b06636 3822 list_del_init(&napi->dev_list);
76620aaf 3823 napi_free_frags(napi);
d565b0a1
HX
3824
3825 for (skb = napi->gro_list; skb; skb = next) {
3826 next = skb->next;
3827 skb->next = NULL;
3828 kfree_skb(skb);
3829 }
3830
3831 napi->gro_list = NULL;
4ae5544f 3832 napi->gro_count = 0;
d565b0a1
HX
3833}
3834EXPORT_SYMBOL(netif_napi_del);
3835
1da177e4
LT
3836static void net_rx_action(struct softirq_action *h)
3837{
e326bed2 3838 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 3839 unsigned long time_limit = jiffies + 2;
51b0bded 3840 int budget = netdev_budget;
53fb95d3
MM
3841 void *have;
3842
1da177e4
LT
3843 local_irq_disable();
3844
e326bed2 3845 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
3846 struct napi_struct *n;
3847 int work, weight;
1da177e4 3848
bea3348e 3849 /* If the softirq window is exhausted then punt.
24f8b238
SH
3850 * Allow this to run for 2 jiffies, which will allow
3851 * an average latency of 1.5/HZ.
bea3348e 3852 */
24f8b238 3853 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
3854 goto softnet_break;
3855
3856 local_irq_enable();
3857
bea3348e
SH
3858 /* Even though interrupts have been re-enabled, this
3859 * access is safe because interrupts can only add new
3860 * entries to the tail of this list, and only ->poll()
3861 * calls can remove this head entry from the list.
3862 */
e326bed2 3863 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 3864
bea3348e
SH
3865 have = netpoll_poll_lock(n);
3866
3867 weight = n->weight;
3868
0a7606c1
DM
3869 /* This NAPI_STATE_SCHED test is for avoiding a race
3870 * with netpoll's poll_napi(). Only the entity which
3871 * obtains the lock and sees NAPI_STATE_SCHED set will
3872 * actually make the ->poll() call. Therefore we avoid
25985edc 3873 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
3874 */
3875 work = 0;
4ea7e386 3876 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 3877 work = n->poll(n, weight);
4ea7e386
NH
3878 trace_napi_poll(n);
3879 }
bea3348e
SH
3880
3881 WARN_ON_ONCE(work > weight);
3882
3883 budget -= work;
3884
3885 local_irq_disable();
3886
3887 /* Drivers must not modify the NAPI state if they
3888 * consume the entire weight. In such cases this code
3889 * still "owns" the NAPI instance and therefore can
3890 * move the instance around on the list at will.
3891 */
fed17f30 3892 if (unlikely(work == weight)) {
ff780cd8
HX
3893 if (unlikely(napi_disable_pending(n))) {
3894 local_irq_enable();
3895 napi_complete(n);
3896 local_irq_disable();
3897 } else
e326bed2 3898 list_move_tail(&n->poll_list, &sd->poll_list);
fed17f30 3899 }
bea3348e
SH
3900
3901 netpoll_poll_unlock(have);
1da177e4
LT
3902 }
3903out:
e326bed2 3904 net_rps_action_and_irq_enable(sd);
0a9627f2 3905
db217334
CL
3906#ifdef CONFIG_NET_DMA
3907 /*
3908 * There may not be any more sk_buffs coming right now, so push
3909 * any pending DMA copies to hardware
3910 */
2ba05622 3911 dma_issue_pending_all();
db217334 3912#endif
bea3348e 3913
1da177e4
LT
3914 return;
3915
3916softnet_break:
dee42870 3917 sd->time_squeeze++;
1da177e4
LT
3918 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3919 goto out;
3920}
3921
d1b19dff 3922static gifconf_func_t *gifconf_list[NPROTO];
1da177e4
LT
3923
3924/**
3925 * register_gifconf - register a SIOCGIF handler
3926 * @family: Address family
3927 * @gifconf: Function handler
3928 *
3929 * Register protocol dependent address dumping routines. The handler
3930 * that is passed must not be freed or reused until it has been replaced
3931 * by another handler.
3932 */
d1b19dff 3933int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
1da177e4
LT
3934{
3935 if (family >= NPROTO)
3936 return -EINVAL;
3937 gifconf_list[family] = gifconf;
3938 return 0;
3939}
d1b19dff 3940EXPORT_SYMBOL(register_gifconf);
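/*
 * A sketch of how an address family could hook into SIOCGIFCONF (in-tree
 * IPv4 does this from devinet.c).  The handler, family choice and module
 * init shown here are assumptions for illustration only.
 */
static int my_gifconf(struct net_device *dev, char __user *buf, int len)
{
	if (!buf)
		return 0;	/* NULL buffer: only report the space needed */
	return 0;		/* nothing to dump for this toy family */
}

static int __init my_af_init(void)
{
	return register_gifconf(AF_PHONET, my_gifconf);	/* family chosen only as an example */
}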
1da177e4
LT
3941
3942
3943/*
3944 * Map an interface index to its name (SIOCGIFNAME)
3945 */
3946
3947/*
3948 * We need this ioctl for efficient implementation of the
3949 * if_indextoname() function required by the IPv6 API. Without
3950 * it, we would have to search all the interfaces to find a
3951 * match. --pb
3952 */
3953
881d966b 3954static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
3955{
3956 struct net_device *dev;
3957 struct ifreq ifr;
3958
3959 /*
3960 * Fetch the caller's info block.
3961 */
3962
3963 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3964 return -EFAULT;
3965
fb699dfd
ED
3966 rcu_read_lock();
3967 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
1da177e4 3968 if (!dev) {
fb699dfd 3969 rcu_read_unlock();
1da177e4
LT
3970 return -ENODEV;
3971 }
3972
3973 strcpy(ifr.ifr_name, dev->name);
fb699dfd 3974 rcu_read_unlock();
1da177e4
LT
3975
3976 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3977 return -EFAULT;
3978 return 0;
3979}
3980
3981/*
3982 * Perform a SIOCGIFCONF call. This structure will change
3983 * size eventually, and there is nothing I can do about it.
3984 * Thus we will need a 'compatibility mode'.
3985 */
3986
881d966b 3987static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
3988{
3989 struct ifconf ifc;
3990 struct net_device *dev;
3991 char __user *pos;
3992 int len;
3993 int total;
3994 int i;
3995
3996 /*
3997 * Fetch the caller's info block.
3998 */
3999
4000 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4001 return -EFAULT;
4002
4003 pos = ifc.ifc_buf;
4004 len = ifc.ifc_len;
4005
4006 /*
4007 * Loop over the interfaces, and write an info block for each.
4008 */
4009
4010 total = 0;
881d966b 4011 for_each_netdev(net, dev) {
1da177e4
LT
4012 for (i = 0; i < NPROTO; i++) {
4013 if (gifconf_list[i]) {
4014 int done;
4015 if (!pos)
4016 done = gifconf_list[i](dev, NULL, 0);
4017 else
4018 done = gifconf_list[i](dev, pos + total,
4019 len - total);
4020 if (done < 0)
4021 return -EFAULT;
4022 total += done;
4023 }
4024 }
4ec93edb 4025 }
1da177e4
LT
4026
4027 /*
4028 * All done. Write the updated control block back to the caller.
4029 */
4030 ifc.ifc_len = total;
4031
4032 /*
4033 * Both BSD and Solaris return 0 here, so we do too.
4034 */
4035 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4036}
4037
4038#ifdef CONFIG_PROC_FS
f04565dd 4039
2def16ae 4040#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
f04565dd
MM
4041
4042#define get_bucket(x) ((x) >> BUCKET_SPACE)
4043#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4044#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4045
2def16ae 4046static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
f04565dd 4047{
f04565dd
MM
4048 struct net *net = seq_file_net(seq);
4049 struct net_device *dev;
4050 struct hlist_node *p;
4051 struct hlist_head *h;
2def16ae 4052 unsigned int count = 0, offset = get_offset(*pos);
f04565dd 4053
2def16ae 4054 h = &net->dev_name_head[get_bucket(*pos)];
f04565dd 4055 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
2def16ae 4056 if (++count == offset)
f04565dd 4057 return dev;
f04565dd
MM
4058 }
4059
4060 return NULL;
4061}
4062
2def16ae 4063static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
f04565dd 4064{
f04565dd
MM
4065 struct net_device *dev;
4066 unsigned int bucket;
4067
f04565dd 4068 do {
2def16ae 4069 dev = dev_from_same_bucket(seq, pos);
f04565dd
MM
4070 if (dev)
4071 return dev;
4072
2def16ae
ED
4073 bucket = get_bucket(*pos) + 1;
4074 *pos = set_bucket_offset(bucket, 1);
f04565dd
MM
4075 } while (bucket < NETDEV_HASHENTRIES);
4076
4077 return NULL;
4078}
4079
1da177e4
LT
4080/*
4081 * This is invoked by the /proc filesystem handler to display a device
4082 * in detail.
4083 */
7562f876 4084void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 4085 __acquires(RCU)
1da177e4 4086{
c6d14c84 4087 rcu_read_lock();
7562f876
PE
4088 if (!*pos)
4089 return SEQ_START_TOKEN;
1da177e4 4090
2def16ae 4091 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
f04565dd 4092 return NULL;
1da177e4 4093
2def16ae 4094 return dev_from_bucket(seq, pos);
1da177e4
LT
4095}
4096
4097void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4098{
f04565dd 4099 ++*pos;
2def16ae 4100 return dev_from_bucket(seq, pos);
1da177e4
LT
4101}
4102
4103void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 4104 __releases(RCU)
1da177e4 4105{
c6d14c84 4106 rcu_read_unlock();
1da177e4
LT
4107}
4108
4109static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4110{
28172739
ED
4111 struct rtnl_link_stats64 temp;
4112 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
1da177e4 4113
be1f3c2c
BH
4114 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4115 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
5a1b5898
RR
4116 dev->name, stats->rx_bytes, stats->rx_packets,
4117 stats->rx_errors,
4118 stats->rx_dropped + stats->rx_missed_errors,
4119 stats->rx_fifo_errors,
4120 stats->rx_length_errors + stats->rx_over_errors +
4121 stats->rx_crc_errors + stats->rx_frame_errors,
4122 stats->rx_compressed, stats->multicast,
4123 stats->tx_bytes, stats->tx_packets,
4124 stats->tx_errors, stats->tx_dropped,
4125 stats->tx_fifo_errors, stats->collisions,
4126 stats->tx_carrier_errors +
4127 stats->tx_aborted_errors +
4128 stats->tx_window_errors +
4129 stats->tx_heartbeat_errors,
4130 stats->tx_compressed);
1da177e4
LT
4131}
4132
4133/*
4134 * Called from the PROCfs module. This now uses the new arbitrarily sized
4135 * /proc/net interface to create /proc/net/dev.
4136 */
4137static int dev_seq_show(struct seq_file *seq, void *v)
4138{
4139 if (v == SEQ_START_TOKEN)
4140 seq_puts(seq, "Inter-| Receive "
4141 " | Transmit\n"
4142 " face |bytes packets errs drop fifo frame "
4143 "compressed multicast|bytes packets errs "
4144 "drop fifo colls carrier compressed\n");
4145 else
4146 dev_seq_printf_stats(seq, v);
4147 return 0;
4148}
4149
dee42870 4150static struct softnet_data *softnet_get_online(loff_t *pos)
1da177e4 4151{
dee42870 4152 struct softnet_data *sd = NULL;
1da177e4 4153
0c0b0aca 4154 while (*pos < nr_cpu_ids)
4ec93edb 4155 if (cpu_online(*pos)) {
dee42870 4156 sd = &per_cpu(softnet_data, *pos);
1da177e4
LT
4157 break;
4158 } else
4159 ++*pos;
dee42870 4160 return sd;
1da177e4
LT
4161}
4162
4163static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4164{
4165 return softnet_get_online(pos);
4166}
4167
4168static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4169{
4170 ++*pos;
4171 return softnet_get_online(pos);
4172}
4173
4174static void softnet_seq_stop(struct seq_file *seq, void *v)
4175{
4176}
4177
4178static int softnet_seq_show(struct seq_file *seq, void *v)
4179{
dee42870 4180 struct softnet_data *sd = v;
1da177e4 4181
0a9627f2 4182 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
dee42870 4183 sd->processed, sd->dropped, sd->time_squeeze, 0,
c1ebcdb8 4184 0, 0, 0, 0, /* was fastroute */
dee42870 4185 sd->cpu_collision, sd->received_rps);
1da177e4
LT
4186 return 0;
4187}
4188
f690808e 4189static const struct seq_operations dev_seq_ops = {
1da177e4
LT
4190 .start = dev_seq_start,
4191 .next = dev_seq_next,
4192 .stop = dev_seq_stop,
4193 .show = dev_seq_show,
4194};
4195
4196static int dev_seq_open(struct inode *inode, struct file *file)
4197{
e372c414 4198 return seq_open_net(inode, file, &dev_seq_ops,
2def16ae 4199 sizeof(struct seq_net_private));
5cac98dd
AB
4200}
4201
9a32144e 4202static const struct file_operations dev_seq_fops = {
1da177e4
LT
4203 .owner = THIS_MODULE,
4204 .open = dev_seq_open,
4205 .read = seq_read,
4206 .llseek = seq_lseek,
e372c414 4207 .release = seq_release_net,
1da177e4
LT
4208};
4209
f690808e 4210static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
4211 .start = softnet_seq_start,
4212 .next = softnet_seq_next,
4213 .stop = softnet_seq_stop,
4214 .show = softnet_seq_show,
4215};
4216
4217static int softnet_seq_open(struct inode *inode, struct file *file)
4218{
4219 return seq_open(file, &softnet_seq_ops);
4220}
4221
9a32144e 4222static const struct file_operations softnet_seq_fops = {
1da177e4
LT
4223 .owner = THIS_MODULE,
4224 .open = softnet_seq_open,
4225 .read = seq_read,
4226 .llseek = seq_lseek,
4227 .release = seq_release,
4228};
4229
0e1256ff
SH
4230static void *ptype_get_idx(loff_t pos)
4231{
4232 struct packet_type *pt = NULL;
4233 loff_t i = 0;
4234 int t;
4235
4236 list_for_each_entry_rcu(pt, &ptype_all, list) {
4237 if (i == pos)
4238 return pt;
4239 ++i;
4240 }
4241
82d8a867 4242 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
4243 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4244 if (i == pos)
4245 return pt;
4246 ++i;
4247 }
4248 }
4249 return NULL;
4250}
4251
4252static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 4253 __acquires(RCU)
0e1256ff
SH
4254{
4255 rcu_read_lock();
4256 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4257}
4258
4259static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4260{
4261 struct packet_type *pt;
4262 struct list_head *nxt;
4263 int hash;
4264
4265 ++*pos;
4266 if (v == SEQ_START_TOKEN)
4267 return ptype_get_idx(0);
4268
4269 pt = v;
4270 nxt = pt->list.next;
4271 if (pt->type == htons(ETH_P_ALL)) {
4272 if (nxt != &ptype_all)
4273 goto found;
4274 hash = 0;
4275 nxt = ptype_base[0].next;
4276 } else
82d8a867 4277 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
4278
4279 while (nxt == &ptype_base[hash]) {
82d8a867 4280 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
4281 return NULL;
4282 nxt = ptype_base[hash].next;
4283 }
4284found:
4285 return list_entry(nxt, struct packet_type, list);
4286}
4287
4288static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 4289 __releases(RCU)
0e1256ff
SH
4290{
4291 rcu_read_unlock();
4292}
4293
0e1256ff
SH
4294static int ptype_seq_show(struct seq_file *seq, void *v)
4295{
4296 struct packet_type *pt = v;
4297
4298 if (v == SEQ_START_TOKEN)
4299 seq_puts(seq, "Type Device Function\n");
c346dca1 4300 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
4301 if (pt->type == htons(ETH_P_ALL))
4302 seq_puts(seq, "ALL ");
4303 else
4304 seq_printf(seq, "%04x", ntohs(pt->type));
4305
908cd2da
AD
4306 seq_printf(seq, " %-8s %pF\n",
4307 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
4308 }
4309
4310 return 0;
4311}
4312
4313static const struct seq_operations ptype_seq_ops = {
4314 .start = ptype_seq_start,
4315 .next = ptype_seq_next,
4316 .stop = ptype_seq_stop,
4317 .show = ptype_seq_show,
4318};
4319
4320static int ptype_seq_open(struct inode *inode, struct file *file)
4321{
2feb27db
PE
4322 return seq_open_net(inode, file, &ptype_seq_ops,
4323 sizeof(struct seq_net_private));
0e1256ff
SH
4324}
4325
4326static const struct file_operations ptype_seq_fops = {
4327 .owner = THIS_MODULE,
4328 .open = ptype_seq_open,
4329 .read = seq_read,
4330 .llseek = seq_lseek,
2feb27db 4331 .release = seq_release_net,
0e1256ff
SH
4332};
4333
4334
4665079c 4335static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
4336{
4337 int rc = -ENOMEM;
4338
881d966b 4339 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 4340 goto out;
881d966b 4341 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 4342 goto out_dev;
881d966b 4343 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 4344 goto out_softnet;
0e1256ff 4345
881d966b 4346 if (wext_proc_init(net))
457c4cbc 4347 goto out_ptype;
1da177e4
LT
4348 rc = 0;
4349out:
4350 return rc;
457c4cbc 4351out_ptype:
881d966b 4352 proc_net_remove(net, "ptype");
1da177e4 4353out_softnet:
881d966b 4354 proc_net_remove(net, "softnet_stat");
1da177e4 4355out_dev:
881d966b 4356 proc_net_remove(net, "dev");
1da177e4
LT
4357 goto out;
4358}
881d966b 4359
4665079c 4360static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
4361{
4362 wext_proc_exit(net);
4363
4364 proc_net_remove(net, "ptype");
4365 proc_net_remove(net, "softnet_stat");
4366 proc_net_remove(net, "dev");
4367}
4368
022cbae6 4369static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
4370 .init = dev_proc_net_init,
4371 .exit = dev_proc_net_exit,
4372};
4373
4374static int __init dev_proc_init(void)
4375{
4376 return register_pernet_subsys(&dev_proc_ops);
4377}
1da177e4
LT
4378#else
4379#define dev_proc_init() 0
4380#endif /* CONFIG_PROC_FS */
4381
4382
4383/**
1765a575 4384 * netdev_set_master - set up master pointer
1da177e4
LT
4385 * @slave: slave device
4386 * @master: new master device
4387 *
4388 * Changes the master device of the slave. Pass %NULL to break the
4389 * bonding. The caller must hold the RTNL semaphore. On a failure
4390 * a negative errno code is returned. On success the reference counts
1765a575 4391 * are adjusted and the function returns zero.
1da177e4
LT
4392 */
4393int netdev_set_master(struct net_device *slave, struct net_device *master)
4394{
4395 struct net_device *old = slave->master;
4396
4397 ASSERT_RTNL();
4398
4399 if (master) {
4400 if (old)
4401 return -EBUSY;
4402 dev_hold(master);
4403 }
4404
4405 slave->master = master;
4ec93edb 4406
6df427fe 4407 if (old)
1da177e4 4408 dev_put(old);
1765a575
JP
4409 return 0;
4410}
4411EXPORT_SYMBOL(netdev_set_master);
4412
4413/**
4414 * netdev_set_bond_master - set up bonding master/slave pair
4415 * @slave: slave device
4416 * @master: new master device
4417 *
4418 * Changes the master device of the slave. Pass %NULL to break the
4419 * bonding. The caller must hold the RTNL semaphore. On a failure
4420 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4421 * to the routing socket and the function returns zero.
4422 */
4423int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4424{
4425 int err;
4426
4427 ASSERT_RTNL();
4428
4429 err = netdev_set_master(slave, master);
4430 if (err)
4431 return err;
1da177e4
LT
4432 if (master)
4433 slave->flags |= IFF_SLAVE;
4434 else
4435 slave->flags &= ~IFF_SLAVE;
4436
4437 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4438 return 0;
4439}
1765a575 4440EXPORT_SYMBOL(netdev_set_bond_master);
1da177e4 4441
b6c40d68
PM
4442static void dev_change_rx_flags(struct net_device *dev, int flags)
4443{
d314774c
SH
4444 const struct net_device_ops *ops = dev->netdev_ops;
4445
4446 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4447 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4448}
4449
dad9b335 4450static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4 4451{
b536db93 4452 unsigned int old_flags = dev->flags;
8192b0c4
DH
4453 uid_t uid;
4454 gid_t gid;
1da177e4 4455
24023451
PM
4456 ASSERT_RTNL();
4457
dad9b335
WC
4458 dev->flags |= IFF_PROMISC;
4459 dev->promiscuity += inc;
4460 if (dev->promiscuity == 0) {
4461 /*
4462 * Avoid overflow.
4463 * If inc causes overflow, leave promiscuity untouched and return an error.
4464 */
4465 if (inc < 0)
4466 dev->flags &= ~IFF_PROMISC;
4467 else {
4468 dev->promiscuity -= inc;
7b6cd1ce
JP
4469 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4470 dev->name);
dad9b335
WC
4471 return -EOVERFLOW;
4472 }
4473 }
52609c0b 4474 if (dev->flags != old_flags) {
7b6cd1ce
JP
4475 pr_info("device %s %s promiscuous mode\n",
4476 dev->name,
4477 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
4478 if (audit_enabled) {
4479 current_uid_gid(&uid, &gid);
7759db82
KHK
4480 audit_log(current->audit_context, GFP_ATOMIC,
4481 AUDIT_ANOM_PROMISCUOUS,
4482 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4483 dev->name, (dev->flags & IFF_PROMISC),
4484 (old_flags & IFF_PROMISC),
4485 audit_get_loginuid(current),
8192b0c4 4486 uid, gid,
7759db82 4487 audit_get_sessionid(current));
8192b0c4 4488 }
24023451 4489
b6c40d68 4490 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4491 }
dad9b335 4492 return 0;
1da177e4
LT
4493}
4494
4417da66
PM
4495/**
4496 * dev_set_promiscuity - update promiscuity count on a device
4497 * @dev: device
4498 * @inc: modifier
4499 *
4500 * Add or remove promiscuity from a device. While the count in the device
4501 * remains above zero the interface remains promiscuous. Once it hits zero
4502 * the device reverts to normal filtering operation. A negative inc
4503 * value is used to drop promiscuity on the device.
dad9b335 4504 * Return 0 if successful or a negative errno code on error.
4417da66 4505 */
dad9b335 4506int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 4507{
b536db93 4508 unsigned int old_flags = dev->flags;
dad9b335 4509 int err;
4417da66 4510
dad9b335 4511 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4512 if (err < 0)
dad9b335 4513 return err;
4417da66
PM
4514 if (dev->flags != old_flags)
4515 dev_set_rx_mode(dev);
dad9b335 4516 return err;
4417da66 4517}
d1b19dff 4518EXPORT_SYMBOL(dev_set_promiscuity);
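/*
 * A sketch of the counting pattern from a hypothetical component that needs
 * to see all traffic (e.g. a capture engine): bump promiscuity while active,
 * drop it when done.  RTNL must be held, as __dev_set_promiscuity() asserts.
 */
static int my_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void my_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}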
4417da66 4519
1da177e4
LT
4520/**
4521 * dev_set_allmulti - update allmulti count on a device
4522 * @dev: device
4523 * @inc: modifier
4524 *
4525 * Add or remove reception of all multicast frames to a device. While the
4526 * count in the device remains above zero the interface keeps listening
4527 * for all multicast frames. Once it hits zero the device reverts to normal
4528 * filtering operation. A negative @inc value is used to drop the counter
4529 * when releasing a resource needing all multicasts.
dad9b335 4530 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4531 */
4532
dad9b335 4533int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4 4534{
b536db93 4535 unsigned int old_flags = dev->flags;
1da177e4 4536
24023451
PM
4537 ASSERT_RTNL();
4538
1da177e4 4539 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4540 dev->allmulti += inc;
4541 if (dev->allmulti == 0) {
4542 /*
4543 * Avoid overflow.
4544 * If inc causes overflow, leave allmulti untouched and return an error.
4545 */
4546 if (inc < 0)
4547 dev->flags &= ~IFF_ALLMULTI;
4548 else {
4549 dev->allmulti -= inc;
7b6cd1ce
JP
4550 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4551 dev->name);
dad9b335
WC
4552 return -EOVERFLOW;
4553 }
4554 }
24023451 4555 if (dev->flags ^ old_flags) {
b6c40d68 4556 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4557 dev_set_rx_mode(dev);
24023451 4558 }
dad9b335 4559 return 0;
4417da66 4560}
d1b19dff 4561EXPORT_SYMBOL(dev_set_allmulti);
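/*
 * A sketch of the same counting scheme for all-multicast reception, as a
 * hypothetical multicast routing or tunneling component might use it.
 */
static int my_mcast_listen(struct net_device *dev, bool enable)
{
	int err;

	rtnl_lock();
	err = dev_set_allmulti(dev, enable ? 1 : -1);
	rtnl_unlock();
	return err;
}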
4417da66
PM
4562
4563/*
4564 * Upload unicast and multicast address lists to device and
4565 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4566 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4567 * are present.
4568 */
4569void __dev_set_rx_mode(struct net_device *dev)
4570{
d314774c
SH
4571 const struct net_device_ops *ops = dev->netdev_ops;
4572
4417da66
PM
4573 /* dev_open will call this function so the list will stay sane. */
4574 if (!(dev->flags&IFF_UP))
4575 return;
4576
4577 if (!netif_device_present(dev))
40b77c94 4578 return;
4417da66 4579
01789349 4580 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
4581 /* Unicast addresses changes may only happen under the rtnl,
4582 * therefore calling __dev_set_promiscuity here is safe.
4583 */
32e7bfc4 4584 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66 4585 __dev_set_promiscuity(dev, 1);
2d348d1f 4586 dev->uc_promisc = true;
32e7bfc4 4587 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66 4588 __dev_set_promiscuity(dev, -1);
2d348d1f 4589 dev->uc_promisc = false;
4417da66 4590 }
4417da66 4591 }
01789349
JP
4592
4593 if (ops->ndo_set_rx_mode)
4594 ops->ndo_set_rx_mode(dev);
4417da66
PM
4595}
4596
4597void dev_set_rx_mode(struct net_device *dev)
4598{
b9e40857 4599 netif_addr_lock_bh(dev);
4417da66 4600 __dev_set_rx_mode(dev);
b9e40857 4601 netif_addr_unlock_bh(dev);
1da177e4
LT
4602}
4603
f0db275a
SH
4604/**
4605 * dev_get_flags - get flags reported to userspace
4606 * @dev: device
4607 *
4608 * Get the combination of flag bits exported through APIs to userspace.
4609 */
95c96174 4610unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 4611{
95c96174 4612 unsigned int flags;
1da177e4
LT
4613
4614 flags = (dev->flags & ~(IFF_PROMISC |
4615 IFF_ALLMULTI |
b00055aa
SR
4616 IFF_RUNNING |
4617 IFF_LOWER_UP |
4618 IFF_DORMANT)) |
1da177e4
LT
4619 (dev->gflags & (IFF_PROMISC |
4620 IFF_ALLMULTI));
4621
b00055aa
SR
4622 if (netif_running(dev)) {
4623 if (netif_oper_up(dev))
4624 flags |= IFF_RUNNING;
4625 if (netif_carrier_ok(dev))
4626 flags |= IFF_LOWER_UP;
4627 if (netif_dormant(dev))
4628 flags |= IFF_DORMANT;
4629 }
1da177e4
LT
4630
4631 return flags;
4632}
d1b19dff 4633EXPORT_SYMBOL(dev_get_flags);
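/*
 * A sketch of checking operational state through the same flag combination
 * that is reported to userspace; the helper name is an assumption.
 */
static bool my_link_is_usable(const struct net_device *dev)
{
	unsigned int flags = dev_get_flags(dev);

	return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}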
1da177e4 4634
bd380811 4635int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4636{
b536db93 4637 unsigned int old_flags = dev->flags;
bd380811 4638 int ret;
1da177e4 4639
24023451
PM
4640 ASSERT_RTNL();
4641
1da177e4
LT
4642 /*
4643 * Set the flags on our device.
4644 */
4645
4646 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4647 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4648 IFF_AUTOMEDIA)) |
4649 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4650 IFF_ALLMULTI));
4651
4652 /*
4653 * Load in the correct multicast list now the flags have changed.
4654 */
4655
b6c40d68
PM
4656 if ((old_flags ^ flags) & IFF_MULTICAST)
4657 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4658
4417da66 4659 dev_set_rx_mode(dev);
1da177e4
LT
4660
4661 /*
4662 * Have we downed the interface? We handle IFF_UP ourselves
4663 * according to user attempts to set it, rather than blindly
4664 * setting it.
4665 */
4666
4667 ret = 0;
4668 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4669 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4670
4671 if (!ret)
4417da66 4672 dev_set_rx_mode(dev);
1da177e4
LT
4673 }
4674
1da177e4 4675 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4676 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4677
1da177e4
LT
4678 dev->gflags ^= IFF_PROMISC;
4679 dev_set_promiscuity(dev, inc);
4680 }
4681
4682 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4683 is important. Some (broken) drivers set IFF_PROMISC when
4684 IFF_ALLMULTI is requested, without asking us and without reporting it.
4685 */
4686 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4687 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4688
1da177e4
LT
4689 dev->gflags ^= IFF_ALLMULTI;
4690 dev_set_allmulti(dev, inc);
4691 }
4692
bd380811
PM
4693 return ret;
4694}
4695
4696void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4697{
4698 unsigned int changes = dev->flags ^ old_flags;
4699
4700 if (changes & IFF_UP) {
4701 if (dev->flags & IFF_UP)
4702 call_netdevice_notifiers(NETDEV_UP, dev);
4703 else
4704 call_netdevice_notifiers(NETDEV_DOWN, dev);
4705 }
4706
4707 if (dev->flags & IFF_UP &&
4708 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4709 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4710}
4711
4712/**
4713 * dev_change_flags - change device settings
4714 * @dev: device
4715 * @flags: device state flags
4716 *
4717 * Change settings on device based state flags. The flags are
4718 * in the userspace exported format.
4719 */
b536db93 4720int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 4721{
b536db93
ED
4722 int ret;
4723 unsigned int changes, old_flags = dev->flags;
bd380811
PM
4724
4725 ret = __dev_change_flags(dev, flags);
4726 if (ret < 0)
4727 return ret;
4728
4729 changes = old_flags ^ dev->flags;
7c355f53
TG
4730 if (changes)
4731 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4732
bd380811 4733 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4734 return ret;
4735}
d1b19dff 4736EXPORT_SYMBOL(dev_change_flags);
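/*
 * A sketch of bringing an interface administratively up from kernel code,
 * mirroring what "ip link set dev X up" does via rtnetlink.  RTNL must be
 * held because __dev_change_flags() asserts it.
 */
static int my_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}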
1da177e4 4737
f0db275a
SH
4738/**
4739 * dev_set_mtu - Change maximum transfer unit
4740 * @dev: device
4741 * @new_mtu: new transfer unit
4742 *
4743 * Change the maximum transfer size of the network device.
4744 */
1da177e4
LT
4745int dev_set_mtu(struct net_device *dev, int new_mtu)
4746{
d314774c 4747 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4748 int err;
4749
4750 if (new_mtu == dev->mtu)
4751 return 0;
4752
4753 /* MTU must be positive. */
4754 if (new_mtu < 0)
4755 return -EINVAL;
4756
4757 if (!netif_device_present(dev))
4758 return -ENODEV;
4759
4760 err = 0;
d314774c
SH
4761 if (ops->ndo_change_mtu)
4762 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4763 else
4764 dev->mtu = new_mtu;
d314774c 4765
1da177e4 4766 if (!err && dev->flags & IFF_UP)
056925ab 4767 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4768 return err;
4769}
d1b19dff 4770EXPORT_SYMBOL(dev_set_mtu);
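/*
 * A sketch of a tunnel-style user lowering the MTU of a device.  1480 is
 * just an example value; elsewhere in this file dev_set_mtu() is reached
 * under RTNL (via dev_ifsioc()), so the sketch takes it as well.
 */
static int my_shrink_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 1480);	/* NETDEV_CHANGEMTU is notified on success */
	rtnl_unlock();
	return err;
}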
1da177e4 4771
cbda10fa
VD
4772/**
4773 * dev_set_group - Change group this device belongs to
4774 * @dev: device
4775 * @new_group: group this device should belong to
4776 */
4777void dev_set_group(struct net_device *dev, int new_group)
4778{
4779 dev->group = new_group;
4780}
4781EXPORT_SYMBOL(dev_set_group);
4782
f0db275a
SH
4783/**
4784 * dev_set_mac_address - Change Media Access Control Address
4785 * @dev: device
4786 * @sa: new address
4787 *
4788 * Change the hardware (MAC) address of the device
4789 */
1da177e4
LT
4790int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4791{
d314774c 4792 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4793 int err;
4794
d314774c 4795 if (!ops->ndo_set_mac_address)
1da177e4
LT
4796 return -EOPNOTSUPP;
4797 if (sa->sa_family != dev->type)
4798 return -EINVAL;
4799 if (!netif_device_present(dev))
4800 return -ENODEV;
d314774c 4801 err = ops->ndo_set_mac_address(dev, sa);
1da177e4 4802 if (!err)
056925ab 4803 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
4804 return err;
4805}
d1b19dff 4806EXPORT_SYMBOL(dev_set_mac_address);
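/*
 * A sketch of programming a new hardware address: the sockaddr carries the
 * address in sa_data and must use the device's own type.  new_mac is an
 * assumed caller-supplied buffer of dev->addr_len bytes.
 */
static int my_set_mac(struct net_device *dev, const u8 *new_mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, new_mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);	/* notifies NETDEV_CHANGEADDR on success */
	rtnl_unlock();
	return err;
}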
1da177e4
LT
4807
4808/*
3710becf 4809 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
1da177e4 4810 */
14e3e079 4811static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
1da177e4
LT
4812{
4813 int err;
3710becf 4814 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
1da177e4
LT
4815
4816 if (!dev)
4817 return -ENODEV;
4818
4819 switch (cmd) {
d1b19dff
ED
4820 case SIOCGIFFLAGS: /* Get interface flags */
4821 ifr->ifr_flags = (short) dev_get_flags(dev);
4822 return 0;
1da177e4 4823
d1b19dff
ED
4824 case SIOCGIFMETRIC: /* Get the metric on the interface
4825 (currently unused) */
4826 ifr->ifr_metric = 0;
4827 return 0;
1da177e4 4828
d1b19dff
ED
4829 case SIOCGIFMTU: /* Get the MTU of a device */
4830 ifr->ifr_mtu = dev->mtu;
4831 return 0;
1da177e4 4832
d1b19dff
ED
4833 case SIOCGIFHWADDR:
4834 if (!dev->addr_len)
4835 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4836 else
4837 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4838 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4839 ifr->ifr_hwaddr.sa_family = dev->type;
4840 return 0;
1da177e4 4841
d1b19dff
ED
4842 case SIOCGIFSLAVE:
4843 err = -EINVAL;
4844 break;
14e3e079 4845
d1b19dff
ED
4846 case SIOCGIFMAP:
4847 ifr->ifr_map.mem_start = dev->mem_start;
4848 ifr->ifr_map.mem_end = dev->mem_end;
4849 ifr->ifr_map.base_addr = dev->base_addr;
4850 ifr->ifr_map.irq = dev->irq;
4851 ifr->ifr_map.dma = dev->dma;
4852 ifr->ifr_map.port = dev->if_port;
4853 return 0;
14e3e079 4854
d1b19dff
ED
4855 case SIOCGIFINDEX:
4856 ifr->ifr_ifindex = dev->ifindex;
4857 return 0;
14e3e079 4858
d1b19dff
ED
4859 case SIOCGIFTXQLEN:
4860 ifr->ifr_qlen = dev->tx_queue_len;
4861 return 0;
14e3e079 4862
d1b19dff
ED
4863 default:
4864 /* dev_ioctl() should ensure this case
4865 * is never reached
4866 */
4867 WARN_ON(1);
41c31f31 4868 err = -ENOTTY;
d1b19dff 4869 break;
14e3e079
JG
4870
4871 }
4872 return err;
4873}
4874
4875/*
4876 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4877 */
4878static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4879{
4880 int err;
4881 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5f2f6da7 4882 const struct net_device_ops *ops;
14e3e079
JG
4883
4884 if (!dev)
4885 return -ENODEV;
4886
5f2f6da7
JP
4887 ops = dev->netdev_ops;
4888
14e3e079 4889 switch (cmd) {
d1b19dff
ED
4890 case SIOCSIFFLAGS: /* Set interface flags */
4891 return dev_change_flags(dev, ifr->ifr_flags);
14e3e079 4892
d1b19dff
ED
4893 case SIOCSIFMETRIC: /* Set the metric on the interface
4894 (currently unused) */
4895 return -EOPNOTSUPP;
14e3e079 4896
d1b19dff
ED
4897 case SIOCSIFMTU: /* Set the MTU of a device */
4898 return dev_set_mtu(dev, ifr->ifr_mtu);
1da177e4 4899
d1b19dff
ED
4900 case SIOCSIFHWADDR:
4901 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
1da177e4 4902
d1b19dff
ED
4903 case SIOCSIFHWBROADCAST:
4904 if (ifr->ifr_hwaddr.sa_family != dev->type)
4905 return -EINVAL;
4906 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4907 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4908 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4909 return 0;
1da177e4 4910
d1b19dff
ED
4911 case SIOCSIFMAP:
4912 if (ops->ndo_set_config) {
1da177e4
LT
4913 if (!netif_device_present(dev))
4914 return -ENODEV;
d1b19dff
ED
4915 return ops->ndo_set_config(dev, &ifr->ifr_map);
4916 }
4917 return -EOPNOTSUPP;
1da177e4 4918
d1b19dff 4919 case SIOCADDMULTI:
b81693d9 4920 if (!ops->ndo_set_rx_mode ||
d1b19dff
ED
4921 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4922 return -EINVAL;
4923 if (!netif_device_present(dev))
4924 return -ENODEV;
22bedad3 4925 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
d1b19dff
ED
4926
4927 case SIOCDELMULTI:
b81693d9 4928 if (!ops->ndo_set_rx_mode ||
d1b19dff
ED
4929 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4930 return -EINVAL;
4931 if (!netif_device_present(dev))
4932 return -ENODEV;
22bedad3 4933 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
1da177e4 4934
d1b19dff
ED
4935 case SIOCSIFTXQLEN:
4936 if (ifr->ifr_qlen < 0)
4937 return -EINVAL;
4938 dev->tx_queue_len = ifr->ifr_qlen;
4939 return 0;
1da177e4 4940
d1b19dff
ED
4941 case SIOCSIFNAME:
4942 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4943 return dev_change_name(dev, ifr->ifr_newname);
1da177e4 4944
4dc360c5
RC
4945 case SIOCSHWTSTAMP:
4946 err = net_hwtstamp_validate(ifr);
4947 if (err)
4948 return err;
4949 /* fall through */
4950
d1b19dff
ED
4951 /*
4952 * Unknown or private ioctl
4953 */
4954 default:
4955 if ((cmd >= SIOCDEVPRIVATE &&
4956 cmd <= SIOCDEVPRIVATE + 15) ||
4957 cmd == SIOCBONDENSLAVE ||
4958 cmd == SIOCBONDRELEASE ||
4959 cmd == SIOCBONDSETHWADDR ||
4960 cmd == SIOCBONDSLAVEINFOQUERY ||
4961 cmd == SIOCBONDINFOQUERY ||
4962 cmd == SIOCBONDCHANGEACTIVE ||
4963 cmd == SIOCGMIIPHY ||
4964 cmd == SIOCGMIIREG ||
4965 cmd == SIOCSMIIREG ||
4966 cmd == SIOCBRADDIF ||
4967 cmd == SIOCBRDELIF ||
4968 cmd == SIOCSHWTSTAMP ||
4969 cmd == SIOCWANDEV) {
4970 err = -EOPNOTSUPP;
4971 if (ops->ndo_do_ioctl) {
4972 if (netif_device_present(dev))
4973 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4974 else
4975 err = -ENODEV;
4976 }
4977 } else
4978 err = -EINVAL;
1da177e4
LT
4979
4980 }
4981 return err;
4982}
4983
4984/*
4985 * This function handles all "interface"-type I/O control requests. The actual
4986 * 'doing' part of this is dev_ifsioc above.
4987 */
4988
4989/**
4990 * dev_ioctl - network device ioctl
c4ea43c5 4991 * @net: the applicable net namespace
1da177e4
LT
4992 * @cmd: command to issue
4993 * @arg: pointer to a struct ifreq in user space
4994 *
4995 * Issue ioctl functions to devices. This is normally called by the
4996 * user space syscall interfaces but can sometimes be useful for
4997 * other purposes. The return value is the return from the syscall if
4998 * positive or a negative errno code on error.
4999 */
5000
881d966b 5001int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4
LT
5002{
5003 struct ifreq ifr;
5004 int ret;
5005 char *colon;
5006
5007 /* One special case: SIOCGIFCONF takes an ifconf argument
5008 and requires a shared lock, because it sleeps while writing
5009 to user space.
5010 */
5011
5012 if (cmd == SIOCGIFCONF) {
6756ae4b 5013 rtnl_lock();
881d966b 5014 ret = dev_ifconf(net, (char __user *) arg);
6756ae4b 5015 rtnl_unlock();
1da177e4
LT
5016 return ret;
5017 }
5018 if (cmd == SIOCGIFNAME)
881d966b 5019 return dev_ifname(net, (struct ifreq __user *)arg);
1da177e4
LT
5020
5021 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5022 return -EFAULT;
5023
5024 ifr.ifr_name[IFNAMSIZ-1] = 0;
5025
5026 colon = strchr(ifr.ifr_name, ':');
5027 if (colon)
5028 *colon = 0;
5029
5030 /*
5031 * See which interface the caller is talking about.
5032 */
5033
5034 switch (cmd) {
d1b19dff
ED
5035 /*
5036 * These ioctl calls:
5037 * - can be done by all.
5038 * - atomic and do not require locking.
5039 * - return a value
5040 */
5041 case SIOCGIFFLAGS:
5042 case SIOCGIFMETRIC:
5043 case SIOCGIFMTU:
5044 case SIOCGIFHWADDR:
5045 case SIOCGIFSLAVE:
5046 case SIOCGIFMAP:
5047 case SIOCGIFINDEX:
5048 case SIOCGIFTXQLEN:
5049 dev_load(net, ifr.ifr_name);
3710becf 5050 rcu_read_lock();
d1b19dff 5051 ret = dev_ifsioc_locked(net, &ifr, cmd);
3710becf 5052 rcu_read_unlock();
d1b19dff
ED
5053 if (!ret) {
5054 if (colon)
5055 *colon = ':';
5056 if (copy_to_user(arg, &ifr,
5057 sizeof(struct ifreq)))
5058 ret = -EFAULT;
5059 }
5060 return ret;
1da177e4 5061
d1b19dff
ED
5062 case SIOCETHTOOL:
5063 dev_load(net, ifr.ifr_name);
5064 rtnl_lock();
5065 ret = dev_ethtool(net, &ifr);
5066 rtnl_unlock();
5067 if (!ret) {
5068 if (colon)
5069 *colon = ':';
5070 if (copy_to_user(arg, &ifr,
5071 sizeof(struct ifreq)))
5072 ret = -EFAULT;
5073 }
5074 return ret;
1da177e4 5075
d1b19dff
ED
5076 /*
5077 * These ioctl calls:
5078 * - require superuser power.
5079 * - require strict serialization.
5080 * - return a value
5081 */
5082 case SIOCGMIIPHY:
5083 case SIOCGMIIREG:
5084 case SIOCSIFNAME:
5085 if (!capable(CAP_NET_ADMIN))
5086 return -EPERM;
5087 dev_load(net, ifr.ifr_name);
5088 rtnl_lock();
5089 ret = dev_ifsioc(net, &ifr, cmd);
5090 rtnl_unlock();
5091 if (!ret) {
5092 if (colon)
5093 *colon = ':';
5094 if (copy_to_user(arg, &ifr,
5095 sizeof(struct ifreq)))
5096 ret = -EFAULT;
5097 }
5098 return ret;
1da177e4 5099
d1b19dff
ED
5100 /*
5101 * These ioctl calls:
5102 * - require superuser power.
5103 * - require strict serialization.
5104 * - do not return a value
5105 */
5106 case SIOCSIFFLAGS:
5107 case SIOCSIFMETRIC:
5108 case SIOCSIFMTU:
5109 case SIOCSIFMAP:
5110 case SIOCSIFHWADDR:
5111 case SIOCSIFSLAVE:
5112 case SIOCADDMULTI:
5113 case SIOCDELMULTI:
5114 case SIOCSIFHWBROADCAST:
5115 case SIOCSIFTXQLEN:
5116 case SIOCSMIIREG:
5117 case SIOCBONDENSLAVE:
5118 case SIOCBONDRELEASE:
5119 case SIOCBONDSETHWADDR:
5120 case SIOCBONDCHANGEACTIVE:
5121 case SIOCBRADDIF:
5122 case SIOCBRDELIF:
5123 case SIOCSHWTSTAMP:
5124 if (!capable(CAP_NET_ADMIN))
5125 return -EPERM;
5126 /* fall through */
5127 case SIOCBONDSLAVEINFOQUERY:
5128 case SIOCBONDINFOQUERY:
5129 dev_load(net, ifr.ifr_name);
5130 rtnl_lock();
5131 ret = dev_ifsioc(net, &ifr, cmd);
5132 rtnl_unlock();
5133 return ret;
5134
5135 case SIOCGIFMEM:
5136 /* Get the per device memory space. We can add this but
5137 * currently do not support it */
5138 case SIOCSIFMEM:
5139 /* Set the per device memory buffer space.
5140 * Not applicable in our case */
5141 case SIOCSIFLINK:
41c31f31 5142 return -ENOTTY;
d1b19dff
ED
5143
5144 /*
5145 * Unknown or private ioctl.
5146 */
5147 default:
5148 if (cmd == SIOCWANDEV ||
5149 (cmd >= SIOCDEVPRIVATE &&
5150 cmd <= SIOCDEVPRIVATE + 15)) {
881d966b 5151 dev_load(net, ifr.ifr_name);
1da177e4 5152 rtnl_lock();
881d966b 5153 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4 5154 rtnl_unlock();
d1b19dff
ED
5155 if (!ret && copy_to_user(arg, &ifr,
5156 sizeof(struct ifreq)))
5157 ret = -EFAULT;
1da177e4 5158 return ret;
d1b19dff
ED
5159 }
5160 /* Take care of Wireless Extensions */
5161 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5162 return wext_handle_ioctl(net, &ifr, cmd, arg);
41c31f31 5163 return -ENOTTY;
1da177e4
LT
5164 }
5165}
5166
5167
5168/**
5169 * dev_new_index - allocate an ifindex
c4ea43c5 5170 * @net: the applicable net namespace
1da177e4
LT
5171 *
5172 * Returns a suitable unique value for a new device interface
5173 * number. The caller must hold the rtnl semaphore or the
5174 * dev_base_lock to be sure it remains unique.
5175 */
881d966b 5176static int dev_new_index(struct net *net)
1da177e4
LT
5177{
5178 static int ifindex;
5179 for (;;) {
5180 if (++ifindex <= 0)
5181 ifindex = 1;
881d966b 5182 if (!__dev_get_by_index(net, ifindex))
1da177e4
LT
5183 return ifindex;
5184 }
5185}
5186
1da177e4 5187/* Delayed registration/unregisteration */
3b5b34fd 5188static LIST_HEAD(net_todo_list);
1da177e4 5189
6f05f629 5190static void net_set_todo(struct net_device *dev)
1da177e4 5191{
1da177e4 5192 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
5193}
5194
9b5e383c 5195static void rollback_registered_many(struct list_head *head)
93ee31f1 5196{
e93737b0 5197 struct net_device *dev, *tmp;
9b5e383c 5198
93ee31f1
DL
5199 BUG_ON(dev_boot_phase);
5200 ASSERT_RTNL();
5201
e93737b0 5202 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 5203 /* Some devices call this without ever registering,
e93737b0
KK
5204 * to unwind a failed initialization. Remove those
5205 * devices and proceed with the remaining ones.
9b5e383c
ED
5206 */
5207 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
5208 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5209 dev->name, dev);
93ee31f1 5210
9b5e383c 5211 WARN_ON(1);
e93737b0
KK
5212 list_del(&dev->unreg_list);
5213 continue;
9b5e383c 5214 }
449f4544 5215 dev->dismantle = true;
9b5e383c 5216 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 5217 }
93ee31f1 5218
44345724
OP
5219 /* If device is running, close it first. */
5220 dev_close_many(head);
93ee31f1 5221
44345724 5222 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5223 /* And unlink it from device chain. */
5224 unlist_netdevice(dev);
93ee31f1 5225
9b5e383c
ED
5226 dev->reg_state = NETREG_UNREGISTERING;
5227 }
93ee31f1
DL
5228
5229 synchronize_net();
5230
9b5e383c
ED
5231 list_for_each_entry(dev, head, unreg_list) {
5232 /* Shutdown queueing discipline. */
5233 dev_shutdown(dev);
93ee31f1
DL
5234
5235
9b5e383c
ED
5236 /* Notify protocols that we are about to destroy
5237 this device. They should clean up all of their state.
5238 */
5239 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5240
a2835763
PM
5241 if (!dev->rtnl_link_ops ||
5242 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5243 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5244
9b5e383c
ED
5245 /*
5246 * Flush the unicast and multicast chains
5247 */
a748ee24 5248 dev_uc_flush(dev);
22bedad3 5249 dev_mc_flush(dev);
93ee31f1 5250
9b5e383c
ED
5251 if (dev->netdev_ops->ndo_uninit)
5252 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5253
9b5e383c
ED
5254 /* Notifier chain MUST detach us from master device. */
5255 WARN_ON(dev->master);
93ee31f1 5256
9b5e383c
ED
5257 /* Remove entries from kobject tree */
5258 netdev_unregister_kobject(dev);
5259 }
93ee31f1 5260
a5ee1551 5261 /* Process any work delayed until the end of the batch */
e5e26d75 5262 dev = list_first_entry(head, struct net_device, unreg_list);
a5ee1551 5263 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
93ee31f1 5264
850a545b 5265 synchronize_net();
395264d5 5266
a5ee1551 5267 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5268 dev_put(dev);
5269}
5270
5271static void rollback_registered(struct net_device *dev)
5272{
5273 LIST_HEAD(single);
5274
5275 list_add(&dev->unreg_list, &single);
5276 rollback_registered_many(&single);
ceaaec98 5277 list_del(&single);
93ee31f1
DL
5278}
5279
c8f44aff
MM
5280static netdev_features_t netdev_fix_features(struct net_device *dev,
5281 netdev_features_t features)
b63365a2 5282{
57422dc5
MM
5283 /* Fix illegal checksum combinations */
5284 if ((features & NETIF_F_HW_CSUM) &&
5285 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5286 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
5287 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5288 }
5289
b63365a2
HX
5290 /* Fix illegal SG+CSUM combinations. */
5291 if ((features & NETIF_F_SG) &&
5292 !(features & NETIF_F_ALL_CSUM)) {
6f404e44
MM
5293 netdev_dbg(dev,
5294 "Dropping NETIF_F_SG since no checksum feature.\n");
b63365a2
HX
5295 features &= ~NETIF_F_SG;
5296 }
5297
5298 /* TSO requires that SG is present as well. */
ea2d3688 5299 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 5300 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 5301 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
5302 }
5303
31d8b9e0
BH
5304 /* TSO ECN requires that TSO is present as well. */
5305 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5306 features &= ~NETIF_F_TSO_ECN;
5307
212b573f
MM
5308 /* Software GSO depends on SG. */
5309 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5310 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5311 features &= ~NETIF_F_GSO;
5312 }
5313
acd1130e 5314 /* UFO needs SG and checksumming */
b63365a2 5315 if (features & NETIF_F_UFO) {
79032644
MM
5316 /* maybe split UFO into V4 and V6? */
5317 if (!((features & NETIF_F_GEN_CSUM) ||
5318 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5319 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5320 netdev_dbg(dev,
acd1130e 5321 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5322 features &= ~NETIF_F_UFO;
5323 }
5324
5325 if (!(features & NETIF_F_SG)) {
6f404e44 5326 netdev_dbg(dev,
acd1130e 5327 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
5328 features &= ~NETIF_F_UFO;
5329 }
5330 }
5331
5332 return features;
5333}
b63365a2 5334
6cb6a27c 5335int __netdev_update_features(struct net_device *dev)
5455c699 5336{
c8f44aff 5337 netdev_features_t features;
5455c699
MM
5338 int err = 0;
5339
87267485
MM
5340 ASSERT_RTNL();
5341
5455c699
MM
5342 features = netdev_get_wanted_features(dev);
5343
5344 if (dev->netdev_ops->ndo_fix_features)
5345 features = dev->netdev_ops->ndo_fix_features(dev, features);
5346
5347 /* driver might be less strict about feature dependencies */
5348 features = netdev_fix_features(dev, features);
5349
5350 if (dev->features == features)
6cb6a27c 5351 return 0;
5455c699 5352
c8f44aff
MM
5353 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5354 &dev->features, &features);
5455c699
MM
5355
5356 if (dev->netdev_ops->ndo_set_features)
5357 err = dev->netdev_ops->ndo_set_features(dev, features);
5358
6cb6a27c 5359 if (unlikely(err < 0)) {
5455c699 5360 netdev_err(dev,
c8f44aff
MM
5361 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5362 err, &features, &dev->features);
6cb6a27c
MM
5363 return -1;
5364 }
5365
5366 if (!err)
5367 dev->features = features;
5368
5369 return 1;
5370}
5371
afe12cc8
MM
5372/**
5373 * netdev_update_features - recalculate device features
5374 * @dev: the device to check
5375 *
5376 * Recalculate dev->features set and send notifications if it
5377 * has changed. Should be called after driver or hardware dependent
5378 * conditions might have changed that influence the features.
5379 */
6cb6a27c
MM
5380void netdev_update_features(struct net_device *dev)
5381{
5382 if (__netdev_update_features(dev))
5383 netdev_features_change(dev);
5455c699
MM
5384}
5385EXPORT_SYMBOL(netdev_update_features);
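/*
 * Example: a minimal sketch of the expected caller, a driver ethtool-style
 * hook (run under RTNL) that flips one offload bit in wanted_features and
 * lets the core recompute the effective set.  The foo_* name is
 * hypothetical, not a real driver.
 */
static int foo_set_rx_csum(struct net_device *dev, bool enable)
{
	if (enable)
		dev->wanted_features |= NETIF_F_RXCSUM;
	else
		dev->wanted_features &= ~NETIF_F_RXCSUM;

	/* revalidates feature dependencies and calls ndo_set_features() */
	netdev_update_features(dev);
	return 0;
}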
5386
afe12cc8
MM
5387/**
5388 * netdev_change_features - recalculate device features
5389 * @dev: the device to check
5390 *
5391 * Recalculate dev->features set and send notifications even
5392 * if they have not changed. Should be called instead of
5394 * netdev_update_features() if dev->vlan_features might also have
5395 * changed, to allow the changes to be propagated to stacked
5396 * VLAN devices.
5396 */
5397void netdev_change_features(struct net_device *dev)
5398{
5399 __netdev_update_features(dev);
5400 netdev_features_change(dev);
5401}
5402EXPORT_SYMBOL(netdev_change_features);
5403
fc4a7489
PM
5404/**
5405 * netif_stacked_transfer_operstate - transfer operstate
5406 * @rootdev: the root or lower level device to transfer state from
5407 * @dev: the device to transfer operstate to
5408 *
5409 * Transfer operational state from root to device. This is normally
5410 * called when a stacking relationship exists between the root
5411 * device and the device (a leaf device).
5412 */
5413void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5414 struct net_device *dev)
5415{
5416 if (rootdev->operstate == IF_OPER_DORMANT)
5417 netif_dormant_on(dev);
5418 else
5419 netif_dormant_off(dev);
5420
5421 if (netif_carrier_ok(rootdev)) {
5422 if (!netif_carrier_ok(dev))
5423 netif_carrier_on(dev);
5424 } else {
5425 if (netif_carrier_ok(dev))
5426 netif_carrier_off(dev);
5427 }
5428}
5429EXPORT_SYMBOL(netif_stacked_transfer_operstate);
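/*
 * Example: a sketch of how a stacking driver might use the helper above
 * from its netdevice notifier to mirror the lower device's state onto the
 * stacked device.  foo_upper_dev_of() is a hypothetical lookup; on this
 * kernel the notifier's ptr argument is the struct net_device itself.
 */
static int foo_device_event(struct notifier_block *unused,
			    unsigned long event, void *ptr)
{
	struct net_device *lowerdev = ptr;
	struct net_device *upper = foo_upper_dev_of(lowerdev);

	if (!upper)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
	case NETDEV_DOWN:
	case NETDEV_CHANGE:
		netif_stacked_transfer_operstate(lowerdev, upper);
		break;
	}
	return NOTIFY_DONE;
}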
5430
bf264145 5431#ifdef CONFIG_RPS
1b4bf461
ED
5432static int netif_alloc_rx_queues(struct net_device *dev)
5433{
1b4bf461 5434 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5435 struct netdev_rx_queue *rx;
1b4bf461 5436
bd25fa7b 5437 BUG_ON(count < 1);
1b4bf461 5438
bd25fa7b
TH
5439 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5440 if (!rx) {
7b6cd1ce 5441 pr_err("netdev: Unable to allocate %u rx queues\n", count);
bd25fa7b 5442 return -ENOMEM;
1b4bf461 5443 }
bd25fa7b
TH
5444 dev->_rx = rx;
5445
bd25fa7b 5446 for (i = 0; i < count; i++)
fe822240 5447 rx[i].dev = dev;
1b4bf461
ED
5448 return 0;
5449}
bf264145 5450#endif
1b4bf461 5451
aa942104
CG
5452static void netdev_init_one_queue(struct net_device *dev,
5453 struct netdev_queue *queue, void *_unused)
5454{
5455 /* Initialize queue lock */
5456 spin_lock_init(&queue->_xmit_lock);
5457 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5458 queue->xmit_lock_owner = -1;
b236da69 5459 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 5460 queue->dev = dev;
114cf580
TH
5461#ifdef CONFIG_BQL
5462 dql_init(&queue->dql, HZ);
5463#endif
aa942104
CG
5464}
5465
e6484930
TH
5466static int netif_alloc_netdev_queues(struct net_device *dev)
5467{
5468 unsigned int count = dev->num_tx_queues;
5469 struct netdev_queue *tx;
5470
5471 BUG_ON(count < 1);
5472
5473 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5474 if (!tx) {
7b6cd1ce 5475 pr_err("netdev: Unable to allocate %u tx queues\n", count);
e6484930
TH
5476 return -ENOMEM;
5477 }
5478 dev->_tx = tx;
1d24eb48 5479
e6484930
TH
5480 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5481 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
5482
5483 return 0;
e6484930
TH
5484}
5485
1da177e4
LT
5486/**
5487 * register_netdevice - register a network device
5488 * @dev: device to register
5489 *
5490 * Take a completed network device structure and add it to the kernel
5491 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5492 * chain. 0 is returned on success. A negative errno code is returned
5493 * on a failure to set up the device, or if the name is a duplicate.
5494 *
5495 * Callers must hold the rtnl semaphore. You may want
5496 * register_netdev() instead of this.
5497 *
5498 * BUGS:
5499 * The locking appears insufficient to guarantee two parallel registers
5500 * will not get the same name.
5501 */
5502
5503int register_netdevice(struct net_device *dev)
5504{
1da177e4 5505 int ret;
d314774c 5506 struct net *net = dev_net(dev);
1da177e4
LT
5507
5508 BUG_ON(dev_boot_phase);
5509 ASSERT_RTNL();
5510
b17a7c17
SH
5511 might_sleep();
5512
1da177e4
LT
5513 /* When net_device's are persistent, this will be fatal. */
5514 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 5515 BUG_ON(!net);
1da177e4 5516
f1f28aa3 5517 spin_lock_init(&dev->addr_list_lock);
cf508b12 5518 netdev_set_addr_lockdep_class(dev);
1da177e4 5519
1da177e4
LT
5520 dev->iflink = -1;
5521
0696c3a8
PP
5522 ret = dev_get_valid_name(dev, dev->name);
5523 if (ret < 0)
5524 goto out;
5525
1da177e4 5526 /* Init, if this function is available */
d314774c
SH
5527 if (dev->netdev_ops->ndo_init) {
5528 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5529 if (ret) {
5530 if (ret > 0)
5531 ret = -EIO;
90833aa4 5532 goto out;
1da177e4
LT
5533 }
5534 }
4ec93edb 5535
881d966b 5536 dev->ifindex = dev_new_index(net);
1da177e4
LT
5537 if (dev->iflink == -1)
5538 dev->iflink = dev->ifindex;
5539
5455c699
MM
5540 /* Transfer changeable features to wanted_features and enable
5541 * software offloads (GSO and GRO).
5542 */
5543 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
5544 dev->features |= NETIF_F_SOFT_FEATURES;
5545 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 5546
c6e1a0d1 5547 /* Turn on no cache copy if HW is doing checksum */
34324dc2
MM
5548 if (!(dev->flags & IFF_LOOPBACK)) {
5549 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5550 if (dev->features & NETIF_F_ALL_CSUM) {
5551 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5552 dev->features |= NETIF_F_NOCACHE_COPY;
5553 }
c6e1a0d1
TH
5554 }
5555
1180e7d6 5556 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 5557 */
1180e7d6 5558 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 5559
7ffbe3fd
JB
5560 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5561 ret = notifier_to_errno(ret);
5562 if (ret)
5563 goto err_uninit;
5564
8b41d188 5565 ret = netdev_register_kobject(dev);
b17a7c17 5566 if (ret)
7ce1b0ed 5567 goto err_uninit;
b17a7c17
SH
5568 dev->reg_state = NETREG_REGISTERED;
5569
6cb6a27c 5570 __netdev_update_features(dev);
8e9b59b2 5571
1da177e4
LT
5572 /*
5573 * Default initial state at registry is that the
5574 * device is present.
5575 */
5576
5577 set_bit(__LINK_STATE_PRESENT, &dev->state);
5578
1da177e4 5579 dev_init_scheduler(dev);
1da177e4 5580 dev_hold(dev);
ce286d32 5581 list_netdevice(dev);
1da177e4
LT
5582
5583 /* Notify protocols that a new device appeared. */
056925ab 5584 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5585 ret = notifier_to_errno(ret);
93ee31f1
DL
5586 if (ret) {
5587 rollback_registered(dev);
5588 dev->reg_state = NETREG_UNREGISTERED;
5589 }
d90a909e
EB
5590 /*
5591 * Prevent userspace races by waiting until the network
5592 * device is fully setup before sending notifications.
5593 */
a2835763
PM
5594 if (!dev->rtnl_link_ops ||
5595 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5596 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5597
5598out:
5599 return ret;
7ce1b0ed
HX
5600
5601err_uninit:
d314774c
SH
5602 if (dev->netdev_ops->ndo_uninit)
5603 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5604 goto out;
1da177e4 5605}
d1b19dff 5606EXPORT_SYMBOL(register_netdevice);
1da177e4 5607
937f1ba5
BH
5608/**
5609 * init_dummy_netdev - init a dummy network device for NAPI
5610 * @dev: device to init
5611 *
5612 * This takes a network device structure and initializes the minimum
5613 * amount of fields so it can be used to schedule NAPI polls without
5614 * registering a full blown interface. This is to be used by drivers
5615 * that need to tie several hardware interfaces to a single NAPI
5616 * poll scheduler due to HW limitations.
5617 */
5618int init_dummy_netdev(struct net_device *dev)
5619{
5620 /* Clear everything. Note we don't initialize spinlocks
5621 * as they aren't supposed to be taken by any of the
5622 * NAPI code and this dummy netdev is supposed to be
5623 * only ever used for NAPI polls
5624 */
5625 memset(dev, 0, sizeof(struct net_device));
5626
5627 /* make sure we BUG if trying to hit standard
5628 * register/unregister code path
5629 */
5630 dev->reg_state = NETREG_DUMMY;
5631
937f1ba5
BH
5632 /* NAPI wants this */
5633 INIT_LIST_HEAD(&dev->napi_list);
5634
5635 /* a dummy interface is started by default */
5636 set_bit(__LINK_STATE_PRESENT, &dev->state);
5637 set_bit(__LINK_STATE_START, &dev->state);
5638
29b4433d
ED
5639 /* Note: We don't allocate pcpu_refcnt for dummy devices,
5640 * because users of this 'device' don't need to change
5641 * its refcount.
5642 */
5643
937f1ba5
BH
5644 return 0;
5645}
5646EXPORT_SYMBOL_GPL(init_dummy_netdev);
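/*
 * Example: a sketch of the intended use; a driver whose hardware exposes
 * several RX contexts behind one interface can hang all of its NAPI
 * instances off a single dummy netdev.  All foo_* names and the poll
 * callback are hypothetical.
 */
struct foo_rx_ctx {
	struct napi_struct napi;
	/* ring pointers, interrupt state, ... */
};

static struct net_device foo_napi_dev;
static struct foo_rx_ctx foo_rx_ctx[4];

static void foo_init_napi(int (*foo_poll)(struct napi_struct *, int))
{
	int i;

	init_dummy_netdev(&foo_napi_dev);
	for (i = 0; i < ARRAY_SIZE(foo_rx_ctx); i++)
		netif_napi_add(&foo_napi_dev, &foo_rx_ctx[i].napi,
			       foo_poll, 64);
}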
5647
5648
1da177e4
LT
5649/**
5650 * register_netdev - register a network device
5651 * @dev: device to register
5652 *
5653 * Take a completed network device structure and add it to the kernel
5654 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5655 * chain. 0 is returned on success. A negative errno code is returned
5656 * on a failure to set up the device, or if the name is a duplicate.
5657 *
38b4da38 5658 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5659 * and expands the device name if you passed a format string to
5660 * alloc_netdev.
5661 */
5662int register_netdev(struct net_device *dev)
5663{
5664 int err;
5665
5666 rtnl_lock();
1da177e4 5667 err = register_netdevice(dev);
1da177e4
LT
5668 rtnl_unlock();
5669 return err;
5670}
5671EXPORT_SYMBOL(register_netdev);
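/*
 * Example: the usual probe-time pairing around register_netdev(); a
 * sketch only, with a hypothetical foo_priv/foo_netdev_ops driver.
 */
struct foo_priv {
	int link_up;			/* driver-private state lives here */
};

static const struct net_device_ops foo_netdev_ops = {
	/* ndo_open, ndo_stop, ndo_start_xmit, ... would be filled in */
};

static struct net_device *foo_create(void)
{
	struct net_device *dev;

	dev = alloc_etherdev(sizeof(struct foo_priv));
	if (!dev)
		return NULL;

	dev->netdev_ops = &foo_netdev_ops;
	random_ether_addr(dev->dev_addr);

	if (register_netdev(dev)) {	/* takes the RTNL lock internally */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}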
5672
29b4433d
ED
5673int netdev_refcnt_read(const struct net_device *dev)
5674{
5675 int i, refcnt = 0;
5676
5677 for_each_possible_cpu(i)
5678 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5679 return refcnt;
5680}
5681EXPORT_SYMBOL(netdev_refcnt_read);
5682
2c53040f 5683/**
1da177e4
LT
5684 * netdev_wait_allrefs - wait until all references are gone.
5685 *
5686 * This is called when unregistering network devices.
5687 *
5688 * Any protocol or device that holds a reference should register
5689 * for netdevice notification, and cleanup and put back the
5690 * reference if they receive an UNREGISTER event.
5691 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5692 * call dev_put.
1da177e4
LT
5693 */
5694static void netdev_wait_allrefs(struct net_device *dev)
5695{
5696 unsigned long rebroadcast_time, warning_time;
29b4433d 5697 int refcnt;
1da177e4 5698
e014debe
ED
5699 linkwatch_forget_dev(dev);
5700
1da177e4 5701 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
5702 refcnt = netdev_refcnt_read(dev);
5703
5704 while (refcnt != 0) {
1da177e4 5705 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5706 rtnl_lock();
1da177e4
LT
5707
5708 /* Rebroadcast unregister notification */
056925ab 5709 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5710 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
395264d5 5711 * should have already handled it the first time */
1da177e4
LT
5712
5713 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5714 &dev->state)) {
5715 /* We must not have linkwatch events
5716 * pending on unregister. If this
5717 * happens, we simply run the queue
5718 * unscheduled, resulting in a noop
5719 * for this device.
5720 */
5721 linkwatch_run_queue();
5722 }
5723
6756ae4b 5724 __rtnl_unlock();
1da177e4
LT
5725
5726 rebroadcast_time = jiffies;
5727 }
5728
5729 msleep(250);
5730
29b4433d
ED
5731 refcnt = netdev_refcnt_read(dev);
5732
1da177e4 5733 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
5734 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5735 dev->name, refcnt);
1da177e4
LT
5736 warning_time = jiffies;
5737 }
5738 }
5739}
5740
5741/* The sequence is:
5742 *
5743 * rtnl_lock();
5744 * ...
5745 * register_netdevice(x1);
5746 * register_netdevice(x2);
5747 * ...
5748 * unregister_netdevice(y1);
5749 * unregister_netdevice(y2);
5750 * ...
5751 * rtnl_unlock();
5752 * free_netdev(y1);
5753 * free_netdev(y2);
5754 *
58ec3b4d 5755 * We are invoked by rtnl_unlock().
1da177e4 5756 * This allows us to deal with problems:
b17a7c17 5757 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5758 * without deadlocking with linkwatch via keventd.
5759 * 2) Since we run with the RTNL semaphore not held, we can sleep
5760 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5761 *
5762 * We must not return until all unregister events added during
5763 * the interval the lock was held have been completed.
1da177e4 5764 */
1da177e4
LT
5765void netdev_run_todo(void)
5766{
626ab0e6 5767 struct list_head list;
1da177e4 5768
1da177e4 5769 /* Snapshot list, allow later requests */
626ab0e6 5770 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5771
5772 __rtnl_unlock();
626ab0e6 5773
850a545b
EB
5774 /* Wait for rcu callbacks to finish before attempting to drain
5775 * the device list. This usually avoids a 250ms wait.
5776 */
5777 if (!list_empty(&list))
5778 rcu_barrier();
5779
1da177e4
LT
5780 while (!list_empty(&list)) {
5781 struct net_device *dev
e5e26d75 5782 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5783 list_del(&dev->todo_list);
5784
b17a7c17 5785 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 5786 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
5787 dev->name, dev->reg_state);
5788 dump_stack();
5789 continue;
5790 }
1da177e4 5791
b17a7c17 5792 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5793
152102c7 5794 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5795
b17a7c17 5796 netdev_wait_allrefs(dev);
1da177e4 5797
b17a7c17 5798 /* paranoia */
29b4433d 5799 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
5800 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5801 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 5802 WARN_ON(dev->dn_ptr);
1da177e4 5803
b17a7c17
SH
5804 if (dev->destructor)
5805 dev->destructor(dev);
9093bbb2
SH
5806
5807 /* Free network device */
5808 kobject_put(&dev->dev.kobj);
1da177e4 5809 }
1da177e4
LT
5810}
5811
3cfde79c
BH
5812/* Convert net_device_stats to rtnl_link_stats64. They have the same
5813 * fields in the same order, with only the type differing.
5814 */
77a1abf5
ED
5815void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5816 const struct net_device_stats *netdev_stats)
3cfde79c
BH
5817{
5818#if BITS_PER_LONG == 64
77a1abf5
ED
5819 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5820 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
5821#else
5822 size_t i, n = sizeof(*stats64) / sizeof(u64);
5823 const unsigned long *src = (const unsigned long *)netdev_stats;
5824 u64 *dst = (u64 *)stats64;
5825
5826 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5827 sizeof(*stats64) / sizeof(u64));
5828 for (i = 0; i < n; i++)
5829 dst[i] = src[i];
5830#endif
5831}
77a1abf5 5832EXPORT_SYMBOL(netdev_stats_to_stats64);
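/*
 * Example: a sketch of a driver-side ndo_get_stats64() for a driver that
 * only keeps the legacy counters in dev->stats and converts them with the
 * helper above; the foo_ prefix is hypothetical.
 */
static struct rtnl_link_stats64 *foo_get_stats64(struct net_device *dev,
						 struct rtnl_link_stats64 *storage)
{
	netdev_stats_to_stats64(storage, &dev->stats);
	return storage;
}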
3cfde79c 5833
eeda3fd6
SH
5834/**
5835 * dev_get_stats - get network device statistics
5836 * @dev: device to get statistics from
28172739 5837 * @storage: place to store stats
eeda3fd6 5838 *
d7753516
BH
5839 * Get network statistics from device. Return @storage.
5840 * The device driver may provide its own method by setting
5841 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5842 * otherwise the internal statistics structure is used.
eeda3fd6 5843 */
d7753516
BH
5844struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5845 struct rtnl_link_stats64 *storage)
7004bf25 5846{
eeda3fd6
SH
5847 const struct net_device_ops *ops = dev->netdev_ops;
5848
28172739
ED
5849 if (ops->ndo_get_stats64) {
5850 memset(storage, 0, sizeof(*storage));
caf586e5
ED
5851 ops->ndo_get_stats64(dev, storage);
5852 } else if (ops->ndo_get_stats) {
3cfde79c 5853 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
5854 } else {
5855 netdev_stats_to_stats64(storage, &dev->stats);
28172739 5856 }
caf586e5 5857 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
28172739 5858 return storage;
c45d286e 5859}
eeda3fd6 5860EXPORT_SYMBOL(dev_get_stats);
c45d286e 5861
24824a09 5862struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 5863{
24824a09 5864 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 5865
24824a09
ED
5866#ifdef CONFIG_NET_CLS_ACT
5867 if (queue)
5868 return queue;
5869 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5870 if (!queue)
5871 return NULL;
5872 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
5873 queue->qdisc = &noop_qdisc;
5874 queue->qdisc_sleeping = &noop_qdisc;
5875 rcu_assign_pointer(dev->ingress_queue, queue);
5876#endif
5877 return queue;
bb949fbd
DM
5878}
5879
1da177e4 5880/**
36909ea4 5881 * alloc_netdev_mqs - allocate network device
1da177e4
LT
5882 * @sizeof_priv: size of private data to allocate space for
5883 * @name: device name format string
5884 * @setup: callback to initialize device
36909ea4
TH
5885 * @txqs: the number of TX subqueues to allocate
5886 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
5887 *
5888 * Allocates a struct net_device with private data area for driver use
f25f4e44 5889 * and performs basic initialization. Also allocates subqueue structs
36909ea4 5890 * for each queue on the device.
1da177e4 5891 */
36909ea4
TH
5892struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5893 void (*setup)(struct net_device *),
5894 unsigned int txqs, unsigned int rxqs)
1da177e4 5895{
1da177e4 5896 struct net_device *dev;
7943986c 5897 size_t alloc_size;
1ce8e7b5 5898 struct net_device *p;
1da177e4 5899
b6fe17d6
SH
5900 BUG_ON(strlen(name) >= sizeof(dev->name));
5901
36909ea4 5902 if (txqs < 1) {
7b6cd1ce 5903 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
5904 return NULL;
5905 }
5906
36909ea4
TH
5907#ifdef CONFIG_RPS
5908 if (rxqs < 1) {
7b6cd1ce 5909 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
5910 return NULL;
5911 }
5912#endif
5913
fd2ea0a7 5914 alloc_size = sizeof(struct net_device);
d1643d24
AD
5915 if (sizeof_priv) {
5916 /* ensure 32-byte alignment of private area */
1ce8e7b5 5917 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5918 alloc_size += sizeof_priv;
5919 }
5920 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5921 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5922
31380de9 5923 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 5924 if (!p) {
7b6cd1ce 5925 pr_err("alloc_netdev: Unable to allocate device\n");
1da177e4
LT
5926 return NULL;
5927 }
1da177e4 5928
1ce8e7b5 5929 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5930 dev->padded = (char *)dev - (char *)p;
ab9c73cc 5931
29b4433d
ED
5932 dev->pcpu_refcnt = alloc_percpu(int);
5933 if (!dev->pcpu_refcnt)
e6484930 5934 goto free_p;
ab9c73cc 5935
ab9c73cc 5936 if (dev_addr_init(dev))
29b4433d 5937 goto free_pcpu;
ab9c73cc 5938
22bedad3 5939 dev_mc_init(dev);
a748ee24 5940 dev_uc_init(dev);
ccffad25 5941
c346dca1 5942 dev_net_set(dev, &init_net);
1da177e4 5943
8d3bdbd5
DM
5944 dev->gso_max_size = GSO_MAX_SIZE;
5945
8d3bdbd5
DM
5946 INIT_LIST_HEAD(&dev->napi_list);
5947 INIT_LIST_HEAD(&dev->unreg_list);
5948 INIT_LIST_HEAD(&dev->link_watch_list);
5949 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5950 setup(dev);
5951
36909ea4
TH
5952 dev->num_tx_queues = txqs;
5953 dev->real_num_tx_queues = txqs;
ed9af2e8 5954 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 5955 goto free_all;
e8a0464c 5956
df334545 5957#ifdef CONFIG_RPS
36909ea4
TH
5958 dev->num_rx_queues = rxqs;
5959 dev->real_num_rx_queues = rxqs;
fe822240 5960 if (netif_alloc_rx_queues(dev))
8d3bdbd5 5961 goto free_all;
df334545 5962#endif
0a9627f2 5963
1da177e4 5964 strcpy(dev->name, name);
cbda10fa 5965 dev->group = INIT_NETDEV_GROUP;
1da177e4 5966 return dev;
ab9c73cc 5967
8d3bdbd5
DM
5968free_all:
5969 free_netdev(dev);
5970 return NULL;
5971
29b4433d
ED
5972free_pcpu:
5973 free_percpu(dev->pcpu_refcnt);
ed9af2e8 5974 kfree(dev->_tx);
fe822240
TH
5975#ifdef CONFIG_RPS
5976 kfree(dev->_rx);
5977#endif
5978
ab9c73cc
JP
5979free_p:
5980 kfree(p);
5981 return NULL;
1da177e4 5982}
36909ea4 5983EXPORT_SYMBOL(alloc_netdev_mqs);
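/*
 * Example: a sketch of a multiqueue allocation with 8 TX and 8 RX queues,
 * no private area, and a "%d"-expanded name; foo_setup is a hypothetical
 * link-type initializer.
 */
static void foo_setup(struct net_device *dev)
{
	ether_setup(dev);
}

static struct net_device *foo_alloc_mq(void)
{
	return alloc_netdev_mqs(0, "foo%d", foo_setup, 8, 8);
}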
1da177e4
LT
5984
5985/**
5986 * free_netdev - free network device
5987 * @dev: device
5988 *
4ec93edb
YH
5989 * This function does the last stage of destroying an allocated device
5990 * interface. The reference to the device object is released.
1da177e4
LT
5991 * If this is the last reference then it will be freed.
5992 */
5993void free_netdev(struct net_device *dev)
5994{
d565b0a1
HX
5995 struct napi_struct *p, *n;
5996
f3005d7f
DL
5997 release_net(dev_net(dev));
5998
e8a0464c 5999 kfree(dev->_tx);
fe822240
TH
6000#ifdef CONFIG_RPS
6001 kfree(dev->_rx);
6002#endif
e8a0464c 6003
33d480ce 6004 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 6005
f001fde5
JP
6006 /* Flush device addresses */
6007 dev_addr_flush(dev);
6008
d565b0a1
HX
6009 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6010 netif_napi_del(p);
6011
29b4433d
ED
6012 free_percpu(dev->pcpu_refcnt);
6013 dev->pcpu_refcnt = NULL;
6014
3041a069 6015 /* Compatibility with error handling in drivers */
1da177e4
LT
6016 if (dev->reg_state == NETREG_UNINITIALIZED) {
6017 kfree((char *)dev - dev->padded);
6018 return;
6019 }
6020
6021 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6022 dev->reg_state = NETREG_RELEASED;
6023
43cb76d9
GKH
6024 /* will free via device release */
6025 put_device(&dev->dev);
1da177e4 6026}
d1b19dff 6027EXPORT_SYMBOL(free_netdev);
4ec93edb 6028
f0db275a
SH
6029/**
6030 * synchronize_net - Synchronize with packet receive processing
6031 *
6032 * Wait for packets currently being received to be done.
6033 * Does not block later packets from starting.
6034 */
4ec93edb 6035void synchronize_net(void)
1da177e4
LT
6036{
6037 might_sleep();
be3fc413
ED
6038 if (rtnl_is_locked())
6039 synchronize_rcu_expedited();
6040 else
6041 synchronize_rcu();
1da177e4 6042}
d1b19dff 6043EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
6044
6045/**
44a0873d 6046 * unregister_netdevice_queue - remove device from the kernel
1da177e4 6047 * @dev: device
44a0873d 6048 * @head: list
6ebfbc06 6049 *
1da177e4 6050 * This function shuts down a device interface and removes it
d59b54b1 6051 * from the kernel tables.
44a0873d 6052 * If head not NULL, device is queued to be unregistered later.
1da177e4
LT
6053 *
6054 * Callers must hold the rtnl semaphore. You may want
6055 * unregister_netdev() instead of this.
6056 */
6057
44a0873d 6058void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 6059{
a6620712
HX
6060 ASSERT_RTNL();
6061
44a0873d 6062 if (head) {
9fdce099 6063 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
6064 } else {
6065 rollback_registered(dev);
6066 /* Finish processing unregister after unlock */
6067 net_set_todo(dev);
6068 }
1da177e4 6069}
44a0873d 6070EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 6071
9b5e383c
ED
6072/**
6073 * unregister_netdevice_many - unregister many devices
6074 * @head: list of devices
9b5e383c
ED
6075 */
6076void unregister_netdevice_many(struct list_head *head)
6077{
6078 struct net_device *dev;
6079
6080 if (!list_empty(head)) {
6081 rollback_registered_many(head);
6082 list_for_each_entry(dev, head, unreg_list)
6083 net_set_todo(dev);
6084 }
6085}
63c8099d 6086EXPORT_SYMBOL(unregister_netdevice_many);
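/*
 * Example: a sketch of batched teardown under RTNL; queueing the devices
 * first and unregistering them together lets the core share one RCU/notifier
 * round for the whole set.  foo_destroy_pair() is hypothetical.
 */
static void foo_destroy_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	rtnl_lock();
	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
	list_del(&kill_list);
	rtnl_unlock();
}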
9b5e383c 6087
1da177e4
LT
6088/**
6089 * unregister_netdev - remove device from the kernel
6090 * @dev: device
6091 *
6092 * This function shuts down a device interface and removes it
d59b54b1 6093 * from the kernel tables.
1da177e4
LT
6094 *
6095 * This is just a wrapper for unregister_netdevice that takes
6096 * the rtnl semaphore. In general you want to use this and not
6097 * unregister_netdevice.
6098 */
6099void unregister_netdev(struct net_device *dev)
6100{
6101 rtnl_lock();
6102 unregister_netdevice(dev);
6103 rtnl_unlock();
6104}
1da177e4
LT
6105EXPORT_SYMBOL(unregister_netdev);
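/*
 * Example: the matching remove path for a register_netdev() based driver;
 * a sketch with a hypothetical foo_remove().
 */
static void foo_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes and drops RTNL itself */
	free_netdev(dev);		/* release the device once unregistered */
}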
6106
ce286d32
EB
6107/**
6108 * dev_change_net_namespace - move device to a different network namespace
6109 * @dev: device
6110 * @net: network namespace
6111 * @pat: If not NULL name pattern to try if the current device name
6112 * is already taken in the destination network namespace.
6113 *
6114 * This function shuts down a device interface and moves it
6115 * to a new network namespace. On success 0 is returned, on
6116 * a failure a netagive errno code is returned.
6117 *
6118 * Callers must hold the rtnl semaphore.
6119 */
6120
6121int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6122{
ce286d32
EB
6123 int err;
6124
6125 ASSERT_RTNL();
6126
6127 /* Don't allow namespace local devices to be moved. */
6128 err = -EINVAL;
6129 if (dev->features & NETIF_F_NETNS_LOCAL)
6130 goto out;
6131
6132 /* Ensure the device has been registered */
6133 err = -EINVAL;
6134 if (dev->reg_state != NETREG_REGISTERED)
6135 goto out;
6136
6137 /* Get out if there is nothing to do */
6138 err = 0;
878628fb 6139 if (net_eq(dev_net(dev), net))
ce286d32
EB
6140 goto out;
6141
6142 /* Pick the destination device name, and ensure
6143 * we can use it in the destination network namespace.
6144 */
6145 err = -EEXIST;
d9031024 6146 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
6147 /* We get here if we can't use the current device name */
6148 if (!pat)
6149 goto out;
1c5cae81 6150 if (dev_get_valid_name(dev, pat) < 0)
ce286d32
EB
6151 goto out;
6152 }
6153
6154 /*
6155 * And now a mini version of register_netdevice and unregister_netdevice.
6156 */
6157
6158 /* If device is running close it first. */
9b772652 6159 dev_close(dev);
ce286d32
EB
6160
6161 /* And unlink it from device chain */
6162 err = -ENODEV;
6163 unlist_netdevice(dev);
6164
6165 synchronize_net();
6166
6167 /* Shutdown queueing discipline. */
6168 dev_shutdown(dev);
6169
6170 /* Notify protocols that we are about to destroy
6171 this device. They should clean all the things.
3b27e105
DL
6172
6173 Note that dev->reg_state stays at NETREG_REGISTERED.
6174 This is wanted because this way 8021q and macvlan know
6175 the device is just moving and can keep their slaves up.
ce286d32
EB
6176 */
6177 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 6178 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
d2237d35 6179 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
ce286d32
EB
6180
6181 /*
6182 * Flush the unicast and multicast chains
6183 */
a748ee24 6184 dev_uc_flush(dev);
22bedad3 6185 dev_mc_flush(dev);
ce286d32
EB
6186
6187 /* Actually switch the network namespace */
c346dca1 6188 dev_net_set(dev, net);
ce286d32 6189
ce286d32
EB
6190 /* If there is an ifindex conflict assign a new one */
6191 if (__dev_get_by_index(net, dev->ifindex)) {
6192 int iflink = (dev->iflink == dev->ifindex);
6193 dev->ifindex = dev_new_index(net);
6194 if (iflink)
6195 dev->iflink = dev->ifindex;
6196 }
6197
8b41d188 6198 /* Fixup kobjects */
a1b3f594 6199 err = device_rename(&dev->dev, dev->name);
8b41d188 6200 WARN_ON(err);
ce286d32
EB
6201
6202 /* Add the device back in the hashes */
6203 list_netdevice(dev);
6204
6205 /* Notify protocols that a new device appeared. */
6206 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6207
d90a909e
EB
6208 /*
6209 * Prevent userspace races by waiting until the network
6210 * device is fully setup before sending notifications.
6211 */
6212 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6213
ce286d32
EB
6214 synchronize_net();
6215 err = 0;
6216out:
6217 return err;
6218}
463d0183 6219EXPORT_SYMBOL_GPL(dev_change_net_namespace);
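/*
 * Example: a sketch of moving a device into another namespace, falling
 * back to a "dev%d" name if the current one is already taken there; the
 * caller is assumed to hold a reference on target_net.
 */
static int foo_move_to_ns(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target_net, "dev%d");
	rtnl_unlock();
	return err;
}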
ce286d32 6220
1da177e4
LT
6221static int dev_cpu_callback(struct notifier_block *nfb,
6222 unsigned long action,
6223 void *ocpu)
6224{
6225 struct sk_buff **list_skb;
1da177e4
LT
6226 struct sk_buff *skb;
6227 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6228 struct softnet_data *sd, *oldsd;
6229
8bb78442 6230 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
6231 return NOTIFY_OK;
6232
6233 local_irq_disable();
6234 cpu = smp_processor_id();
6235 sd = &per_cpu(softnet_data, cpu);
6236 oldsd = &per_cpu(softnet_data, oldcpu);
6237
6238 /* Find end of our completion_queue. */
6239 list_skb = &sd->completion_queue;
6240 while (*list_skb)
6241 list_skb = &(*list_skb)->next;
6242 /* Append completion queue from offline CPU. */
6243 *list_skb = oldsd->completion_queue;
6244 oldsd->completion_queue = NULL;
6245
1da177e4 6246 /* Append output queue from offline CPU. */
a9cbd588
CG
6247 if (oldsd->output_queue) {
6248 *sd->output_queue_tailp = oldsd->output_queue;
6249 sd->output_queue_tailp = oldsd->output_queue_tailp;
6250 oldsd->output_queue = NULL;
6251 oldsd->output_queue_tailp = &oldsd->output_queue;
6252 }
264524d5
HC
6253 /* Append NAPI poll list from offline CPU. */
6254 if (!list_empty(&oldsd->poll_list)) {
6255 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6256 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6257 }
1da177e4
LT
6258
6259 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6260 local_irq_enable();
6261
6262 /* Process offline CPU's input_pkt_queue */
76cc8b13 6263 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 6264 netif_rx(skb);
76cc8b13 6265 input_queue_head_incr(oldsd);
fec5e652 6266 }
76cc8b13 6267 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 6268 netif_rx(skb);
76cc8b13
TH
6269 input_queue_head_incr(oldsd);
6270 }
1da177e4
LT
6271
6272 return NOTIFY_OK;
6273}
1da177e4
LT
6274
6275
7f353bf2 6276/**
b63365a2
HX
6277 * netdev_increment_features - increment feature set by one
6278 * @all: current feature set
6279 * @one: new feature set
6280 * @mask: mask feature set
7f353bf2
HX
6281 *
6282 * Computes a new feature set after adding a device with feature set
b63365a2
HX
6283 * @one to the master device with current feature set @all. Will not
6284 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 6285 */
c8f44aff
MM
6286netdev_features_t netdev_increment_features(netdev_features_t all,
6287 netdev_features_t one, netdev_features_t mask)
b63365a2 6288{
1742f183
MM
6289 if (mask & NETIF_F_GEN_CSUM)
6290 mask |= NETIF_F_ALL_CSUM;
6291 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 6292
1742f183
MM
6293 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6294 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 6295
1742f183
MM
6296 /* If one device supports hw checksumming, set for all. */
6297 if (all & NETIF_F_GEN_CSUM)
6298 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
6299
6300 return all;
6301}
b63365a2 6302EXPORT_SYMBOL(netdev_increment_features);
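/*
 * Example: a sketch of how an aggregating (master) driver might fold the
 * feature sets of two slaves into its own advertised set; the candidate
 * mask and the foo_ name are hypothetical.
 */
static netdev_features_t foo_compute_features(const struct net_device *s1,
					      const struct net_device *s2)
{
	netdev_features_t mask = NETIF_F_ALL_CSUM | NETIF_F_SG |
				 NETIF_F_ALL_TSO | NETIF_F_HIGHDMA;
	netdev_features_t features = mask;

	features = netdev_increment_features(features, s1->features, mask);
	features = netdev_increment_features(features, s2->features, mask);
	return features;
}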
7f353bf2 6303
30d97d35
PE
6304static struct hlist_head *netdev_create_hash(void)
6305{
6306 int i;
6307 struct hlist_head *hash;
6308
6309 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6310 if (hash != NULL)
6311 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6312 INIT_HLIST_HEAD(&hash[i]);
6313
6314 return hash;
6315}
6316
881d966b 6317/* Initialize per network namespace state */
4665079c 6318static int __net_init netdev_init(struct net *net)
881d966b 6319{
734b6541
RM
6320 if (net != &init_net)
6321 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 6322
30d97d35
PE
6323 net->dev_name_head = netdev_create_hash();
6324 if (net->dev_name_head == NULL)
6325 goto err_name;
881d966b 6326
30d97d35
PE
6327 net->dev_index_head = netdev_create_hash();
6328 if (net->dev_index_head == NULL)
6329 goto err_idx;
881d966b
EB
6330
6331 return 0;
30d97d35
PE
6332
6333err_idx:
6334 kfree(net->dev_name_head);
6335err_name:
6336 return -ENOMEM;
881d966b
EB
6337}
6338
f0db275a
SH
6339/**
6340 * netdev_drivername - network driver for the device
6341 * @dev: network device
f0db275a
SH
6342 *
6343 * Determine network driver for device.
6344 */
3019de12 6345const char *netdev_drivername(const struct net_device *dev)
6579e57b 6346{
cf04a4c7
SH
6347 const struct device_driver *driver;
6348 const struct device *parent;
3019de12 6349 const char *empty = "";
6579e57b
AV
6350
6351 parent = dev->dev.parent;
6579e57b 6352 if (!parent)
3019de12 6353 return empty;
6579e57b
AV
6354
6355 driver = parent->driver;
6356 if (driver && driver->name)
3019de12
DM
6357 return driver->name;
6358 return empty;
6579e57b
AV
6359}
6360
ffa10cb4 6361int __netdev_printk(const char *level, const struct net_device *dev,
256df2f3
JP
6362 struct va_format *vaf)
6363{
6364 int r;
6365
6366 if (dev && dev->dev.parent)
6367 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6368 netdev_name(dev), vaf);
6369 else if (dev)
6370 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6371 else
6372 r = printk("%s(NULL net_device): %pV", level, vaf);
6373
6374 return r;
6375}
ffa10cb4 6376EXPORT_SYMBOL(__netdev_printk);
256df2f3
JP
6377
6378int netdev_printk(const char *level, const struct net_device *dev,
6379 const char *format, ...)
6380{
6381 struct va_format vaf;
6382 va_list args;
6383 int r;
6384
6385 va_start(args, format);
6386
6387 vaf.fmt = format;
6388 vaf.va = &args;
6389
6390 r = __netdev_printk(level, dev, &vaf);
6391 va_end(args);
6392
6393 return r;
6394}
6395EXPORT_SYMBOL(netdev_printk);
6396
6397#define define_netdev_printk_level(func, level) \
6398int func(const struct net_device *dev, const char *fmt, ...) \
6399{ \
6400 int r; \
6401 struct va_format vaf; \
6402 va_list args; \
6403 \
6404 va_start(args, fmt); \
6405 \
6406 vaf.fmt = fmt; \
6407 vaf.va = &args; \
6408 \
6409 r = __netdev_printk(level, dev, &vaf); \
6410 va_end(args); \
6411 \
6412 return r; \
6413} \
6414EXPORT_SYMBOL(func);
6415
6416define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6417define_netdev_printk_level(netdev_alert, KERN_ALERT);
6418define_netdev_printk_level(netdev_crit, KERN_CRIT);
6419define_netdev_printk_level(netdev_err, KERN_ERR);
6420define_netdev_printk_level(netdev_warn, KERN_WARNING);
6421define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6422define_netdev_printk_level(netdev_info, KERN_INFO);
6423
4665079c 6424static void __net_exit netdev_exit(struct net *net)
881d966b
EB
6425{
6426 kfree(net->dev_name_head);
6427 kfree(net->dev_index_head);
6428}
6429
022cbae6 6430static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
6431 .init = netdev_init,
6432 .exit = netdev_exit,
6433};
6434
4665079c 6435static void __net_exit default_device_exit(struct net *net)
ce286d32 6436{
e008b5fc 6437 struct net_device *dev, *aux;
ce286d32 6438 /*
e008b5fc 6439 * Push all migratable network devices back to the
ce286d32
EB
6440 * initial network namespace
6441 */
6442 rtnl_lock();
e008b5fc 6443 for_each_netdev_safe(net, dev, aux) {
ce286d32 6444 int err;
aca51397 6445 char fb_name[IFNAMSIZ];
ce286d32
EB
6446
6447 /* Ignore unmoveable devices (i.e. loopback) */
6448 if (dev->features & NETIF_F_NETNS_LOCAL)
6449 continue;
6450
e008b5fc
EB
6451 /* Leave virtual devices for the generic cleanup */
6452 if (dev->rtnl_link_ops)
6453 continue;
d0c082ce 6454
25985edc 6455 /* Push remaining network devices to init_net */
aca51397
PE
6456 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6457 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 6458 if (err) {
7b6cd1ce
JP
6459 pr_emerg("%s: failed to move %s to init_net: %d\n",
6460 __func__, dev->name, err);
aca51397 6461 BUG();
ce286d32
EB
6462 }
6463 }
6464 rtnl_unlock();
6465}
6466
04dc7f6b
EB
6467static void __net_exit default_device_exit_batch(struct list_head *net_list)
6468{
6469 /* At exit all network devices must be removed from a network
b595076a 6470 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
6471 * Do this across as many network namespaces as possible to
6472 * improve batching efficiency.
6473 */
6474 struct net_device *dev;
6475 struct net *net;
6476 LIST_HEAD(dev_kill_list);
6477
6478 rtnl_lock();
6479 list_for_each_entry(net, net_list, exit_list) {
6480 for_each_netdev_reverse(net, dev) {
6481 if (dev->rtnl_link_ops)
6482 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6483 else
6484 unregister_netdevice_queue(dev, &dev_kill_list);
6485 }
6486 }
6487 unregister_netdevice_many(&dev_kill_list);
ceaaec98 6488 list_del(&dev_kill_list);
04dc7f6b
EB
6489 rtnl_unlock();
6490}
6491
022cbae6 6492static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6493 .exit = default_device_exit,
04dc7f6b 6494 .exit_batch = default_device_exit_batch,
ce286d32
EB
6495};
6496
1da177e4
LT
6497/*
6498 * Initialize the DEV module. At boot time this walks the device list and
6499 * unhooks any devices that fail to initialise (normally hardware not
6500 * present) and leaves us with a valid list of present and active devices.
6501 *
6502 */
6503
6504/*
6505 * This is called single threaded during boot, so no need
6506 * to take the rtnl semaphore.
6507 */
6508static int __init net_dev_init(void)
6509{
6510 int i, rc = -ENOMEM;
6511
6512 BUG_ON(!dev_boot_phase);
6513
1da177e4
LT
6514 if (dev_proc_init())
6515 goto out;
6516
8b41d188 6517 if (netdev_kobject_init())
1da177e4
LT
6518 goto out;
6519
6520 INIT_LIST_HEAD(&ptype_all);
82d8a867 6521 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
6522 INIT_LIST_HEAD(&ptype_base[i]);
6523
881d966b
EB
6524 if (register_pernet_subsys(&netdev_net_ops))
6525 goto out;
1da177e4
LT
6526
6527 /*
6528 * Initialise the packet receive queues.
6529 */
6530
6f912042 6531 for_each_possible_cpu(i) {
e36fa2f7 6532 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6533
dee42870 6534 memset(sd, 0, sizeof(*sd));
e36fa2f7 6535 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6536 skb_queue_head_init(&sd->process_queue);
e36fa2f7
ED
6537 sd->completion_queue = NULL;
6538 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588
CG
6539 sd->output_queue = NULL;
6540 sd->output_queue_tailp = &sd->output_queue;
df334545 6541#ifdef CONFIG_RPS
e36fa2f7
ED
6542 sd->csd.func = rps_trigger_softirq;
6543 sd->csd.info = sd;
6544 sd->csd.flags = 0;
6545 sd->cpu = i;
1e94d72f 6546#endif
0a9627f2 6547
e36fa2f7
ED
6548 sd->backlog.poll = process_backlog;
6549 sd->backlog.weight = weight_p;
6550 sd->backlog.gro_list = NULL;
6551 sd->backlog.gro_count = 0;
1da177e4
LT
6552 }
6553
1da177e4
LT
6554 dev_boot_phase = 0;
6555
505d4f73
EB
6556 /* The loopback device is special: if any other network device
6557 * is present in a network namespace, the loopback device must
6558 * be present. Since we now dynamically allocate and free the
6559 * loopback device, ensure this invariant is maintained by
6560 * keeping the loopback device as the first device on the
6561 * list of network devices, ensuring the loopback device
6562 * is the first device that appears and the last network device
6563 * that disappears.
6564 */
6565 if (register_pernet_device(&loopback_net_ops))
6566 goto out;
6567
6568 if (register_pernet_device(&default_device_ops))
6569 goto out;
6570
962cf36c
CM
6571 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6572 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6573
6574 hotcpu_notifier(dev_cpu_callback, 0);
6575 dst_init();
6576 dev_mcast_init();
6577 rc = 0;
6578out:
6579 return rc;
6580}
6581
6582subsys_initcall(net_dev_init);
6583
e88721f8
KK
6584static int __init initialize_hashrnd(void)
6585{
0a9627f2 6586 get_random_bytes(&hashrnd, sizeof(hashrnd));
e88721f8
KK
6587 return 0;
6588}
6589
6590late_initcall_sync(initialize_hashrnd);
6591