net/core/dev.c
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <linux/hashtable.h>
133 #include <linux/vmalloc.h>
134 #include <linux/if_macvlan.h>
135 #include <linux/errqueue.h>
136
137 #include "net-sysfs.h"
138
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145 static DEFINE_SPINLOCK(ptype_lock);
146 static DEFINE_SPINLOCK(offload_lock);
147 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
148 struct list_head ptype_all __read_mostly; /* Taps */
149 static struct list_head offload_base __read_mostly;
150
151 static int netif_rx_internal(struct sk_buff *skb);
152 static int call_netdevice_notifiers_info(unsigned long val,
153 struct net_device *dev,
154 struct netdev_notifier_info *info);
155
156 /*
157 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
158 * semaphore.
159 *
160 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
161 *
162 * Writers must hold the rtnl semaphore while they loop through the
163 * dev_base_head list, and hold dev_base_lock for writing when they do the
164 * actual updates. This allows pure readers to access the list even
165 * while a writer is preparing to update it.
166 *
167 * To put it another way, dev_base_lock is held for writing only to
168 * protect against pure readers; the rtnl semaphore provides the
169 * protection against other writers.
170 *
171 * See, for example usages, register_netdevice() and
172 * unregister_netdevice(), which must be called with the rtnl
173 * semaphore held.
174 */
175 DEFINE_RWLOCK(dev_base_lock);
176 EXPORT_SYMBOL(dev_base_lock);
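
/*
 * Illustrative sketch (not part of the original file): a pure reader
 * walking the device list under rcu_read_lock(), as permitted by the
 * locking rules above.  example_count_running_devs() is a hypothetical
 * helper.
 */
static int __maybe_unused example_count_running_devs(struct net *net)
{
	struct net_device *dev;
	int count = 0;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->flags & IFF_UP)
			count++;
	rcu_read_unlock();

	return count;
}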
177
178 /* protects napi_hash addition/deletion and napi_gen_id */
179 static DEFINE_SPINLOCK(napi_hash_lock);
180
181 static unsigned int napi_gen_id;
182 static DEFINE_HASHTABLE(napi_hash, 8);
183
184 static seqcount_t devnet_rename_seq;
185
186 static inline void dev_base_seq_inc(struct net *net)
187 {
188 while (++net->dev_base_seq == 0);
189 }
190
191 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
192 {
193 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
194
195 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
196 }
197
198 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
199 {
200 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
201 }
202
203 static inline void rps_lock(struct softnet_data *sd)
204 {
205 #ifdef CONFIG_RPS
206 spin_lock(&sd->input_pkt_queue.lock);
207 #endif
208 }
209
210 static inline void rps_unlock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 spin_unlock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216
217 /* Device list insertion */
218 static void list_netdevice(struct net_device *dev)
219 {
220 struct net *net = dev_net(dev);
221
222 ASSERT_RTNL();
223
224 write_lock_bh(&dev_base_lock);
225 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
226 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
227 hlist_add_head_rcu(&dev->index_hlist,
228 dev_index_hash(net, dev->ifindex));
229 write_unlock_bh(&dev_base_lock);
230
231 dev_base_seq_inc(net);
232 }
233
234 /* Device list removal
235 * caller must respect a RCU grace period before freeing/reusing dev
236 */
237 static void unlist_netdevice(struct net_device *dev)
238 {
239 ASSERT_RTNL();
240
241 /* Unlink dev from the device chain */
242 write_lock_bh(&dev_base_lock);
243 list_del_rcu(&dev->dev_list);
244 hlist_del_rcu(&dev->name_hlist);
245 hlist_del_rcu(&dev->index_hlist);
246 write_unlock_bh(&dev_base_lock);
247
248 dev_base_seq_inc(dev_net(dev));
249 }
250
251 /*
252 * Our notifier list
253 */
254
255 static RAW_NOTIFIER_HEAD(netdev_chain);
256
257 /*
258 * Device drivers call our routines to queue packets here. We empty the
259 * queue in the local softnet handler.
260 */
261
262 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
263 EXPORT_PER_CPU_SYMBOL(softnet_data);
264
265 #ifdef CONFIG_LOCKDEP
266 /*
267 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
268 * according to dev->type
269 */
270 static const unsigned short netdev_lock_type[] =
271 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
272 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
273 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
274 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
275 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
276 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
277 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
278 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
279 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
280 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
281 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
282 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
283 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
284 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
285 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
286
287 static const char *const netdev_lock_name[] =
288 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
289 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
290 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
291 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
292 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
293 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
294 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
295 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
296 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
297 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
298 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
299 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
300 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
301 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
302 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
303
304 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
305 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
306
307 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
308 {
309 int i;
310
311 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
312 if (netdev_lock_type[i] == dev_type)
313 return i;
314 /* the last key is used by default */
315 return ARRAY_SIZE(netdev_lock_type) - 1;
316 }
317
318 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
319 unsigned short dev_type)
320 {
321 int i;
322
323 i = netdev_lock_pos(dev_type);
324 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
325 netdev_lock_name[i]);
326 }
327
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330 int i;
331
332 i = netdev_lock_pos(dev->type);
333 lockdep_set_class_and_name(&dev->addr_list_lock,
334 &netdev_addr_lock_key[i],
335 netdev_lock_name[i]);
336 }
337 #else
338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 unsigned short dev_type)
340 {
341 }
342 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
343 {
344 }
345 #endif
346
347 /*******************************************************************************
348
349 Protocol management and registration routines
350
351 *******************************************************************************/
352
353 /*
354 * Add a protocol ID to the list. Now that the input handler is
355 * smarter we can dispense with all the messy stuff that used to be
356 * here.
357 *
358 * BEWARE!!! Protocol handlers, mangling input packets,
359 * MUST BE last in hash buckets and checking protocol handlers
360 * MUST start from promiscuous ptype_all chain in net_bh.
361 * It is true now, do not change it.
362 * Explanation follows: if protocol handler, mangling packet, will
363 * be the first on list, it is not able to sense, that packet
364 * is cloned and should be copied-on-write, so that it will
365 * change it and subsequent readers will get broken packet.
366 * --ANK (980803)
367 */
368
369 static inline struct list_head *ptype_head(const struct packet_type *pt)
370 {
371 if (pt->type == htons(ETH_P_ALL))
372 return &ptype_all;
373 else
374 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
375 }
376
377 /**
378 * dev_add_pack - add packet handler
379 * @pt: packet type declaration
380 *
381 * Add a protocol handler to the networking stack. The passed &packet_type
382 * is linked into kernel lists and may not be freed until it has been
383 * removed from the kernel lists.
384 *
385 * This call does not sleep, therefore it cannot guarantee that all
386 * CPUs that are in the middle of receiving packets will see the new
387 * packet type (until the next received packet).
388 */
389
390 void dev_add_pack(struct packet_type *pt)
391 {
392 struct list_head *head = ptype_head(pt);
393
394 spin_lock(&ptype_lock);
395 list_add_rcu(&pt->list, head);
396 spin_unlock(&ptype_lock);
397 }
398 EXPORT_SYMBOL(dev_add_pack);
399
400 /**
401 * __dev_remove_pack - remove packet handler
402 * @pt: packet type declaration
403 *
404 * Remove a protocol handler that was previously added to the kernel
405 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
406 * from the kernel lists and can be freed or reused once this function
407 * returns.
408 *
409 * The packet type might still be in use by receivers
410 * and must not be freed until after all the CPUs have gone
411 * through a quiescent state.
412 */
413 void __dev_remove_pack(struct packet_type *pt)
414 {
415 struct list_head *head = ptype_head(pt);
416 struct packet_type *pt1;
417
418 spin_lock(&ptype_lock);
419
420 list_for_each_entry(pt1, head, list) {
421 if (pt == pt1) {
422 list_del_rcu(&pt->list);
423 goto out;
424 }
425 }
426
427 pr_warn("dev_remove_pack: %p not found\n", pt);
428 out:
429 spin_unlock(&ptype_lock);
430 }
431 EXPORT_SYMBOL(__dev_remove_pack);
432
433 /**
434 * dev_remove_pack - remove packet handler
435 * @pt: packet type declaration
436 *
437 * Remove a protocol handler that was previously added to the kernel
438 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
439 * from the kernel lists and can be freed or reused once this function
440 * returns.
441 *
442 * This call sleeps to guarantee that no CPU is looking at the packet
443 * type after return.
444 */
445 void dev_remove_pack(struct packet_type *pt)
446 {
447 __dev_remove_pack(pt);
448
449 synchronize_net();
450 }
451 EXPORT_SYMBOL(dev_remove_pack);
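
/*
 * Illustrative sketch (not part of the original file): the typical
 * lifecycle of a packet handler registered with dev_add_pack() and
 * removed with dev_remove_pack().  The example_* names are
 * hypothetical; error handling is omitted.
 */
static int example_pack_rcv(struct sk_buff *skb, struct net_device *dev,
			    struct packet_type *pt,
			    struct net_device *orig_dev)
{
	/* handlers receive their own reference to the skb; consume it */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pack __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* ptype_head() puts this on ptype_all */
	.func = example_pack_rcv,
};

static void __maybe_unused example_pack_register(void)
{
	dev_add_pack(&example_pack);	/* does not sleep */
}

static void __maybe_unused example_pack_unregister(void)
{
	dev_remove_pack(&example_pack);	/* sleeps via synchronize_net() */
}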
452
453
454 /**
455 * dev_add_offload - register offload handlers
456 * @po: protocol offload declaration
457 *
458 * Add protocol offload handlers to the networking stack. The passed
459 * &proto_offload is linked into kernel lists and may not be freed until
460 * it has been removed from the kernel lists.
461 *
462 * This call does not sleep, therefore it cannot guarantee that all
463 * CPUs that are in the middle of receiving packets will see the new
464 * offload handlers (until the next received packet).
465 */
466 void dev_add_offload(struct packet_offload *po)
467 {
468 struct list_head *head = &offload_base;
469
470 spin_lock(&offload_lock);
471 list_add_rcu(&po->list, head);
472 spin_unlock(&offload_lock);
473 }
474 EXPORT_SYMBOL(dev_add_offload);
475
476 /**
477 * __dev_remove_offload - remove offload handler
478 * @po: packet offload declaration
479 *
480 * Remove a protocol offload handler that was previously added to the
481 * kernel offload handlers by dev_add_offload(). The passed &offload_type
482 * is removed from the kernel lists and can be freed or reused once this
483 * function returns.
484 *
485 * The packet type might still be in use by receivers
486 * and must not be freed until after all the CPUs have gone
487 * through a quiescent state.
488 */
489 static void __dev_remove_offload(struct packet_offload *po)
490 {
491 struct list_head *head = &offload_base;
492 struct packet_offload *po1;
493
494 spin_lock(&offload_lock);
495
496 list_for_each_entry(po1, head, list) {
497 if (po == po1) {
498 list_del_rcu(&po->list);
499 goto out;
500 }
501 }
502
503 pr_warn("dev_remove_offload: %p not found\n", po);
504 out:
505 spin_unlock(&offload_lock);
506 }
507
508 /**
509 * dev_remove_offload - remove packet offload handler
510 * @po: packet offload declaration
511 *
512 * Remove a packet offload handler that was previously added to the kernel
513 * offload handlers by dev_add_offload(). The passed &offload_type is
514 * removed from the kernel lists and can be freed or reused once this
515 * function returns.
516 *
517 * This call sleeps to guarantee that no CPU is looking at the packet
518 * type after return.
519 */
520 void dev_remove_offload(struct packet_offload *po)
521 {
522 __dev_remove_offload(po);
523
524 synchronize_net();
525 }
526 EXPORT_SYMBOL(dev_remove_offload);
527
528 /******************************************************************************
529
530 Device Boot-time Settings Routines
531
532 *******************************************************************************/
533
534 /* Boot time configuration table */
535 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
536
537 /**
538 * netdev_boot_setup_add - add new setup entry
539 * @name: name of the device
540 * @map: configured settings for the device
541 *
542 * Adds new setup entry to the dev_boot_setup list. The function
543 * returns 0 on error and 1 on success. This is a generic routine for
544 * all netdevices.
545 */
546 static int netdev_boot_setup_add(char *name, struct ifmap *map)
547 {
548 struct netdev_boot_setup *s;
549 int i;
550
551 s = dev_boot_setup;
552 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
553 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
554 memset(s[i].name, 0, sizeof(s[i].name));
555 strlcpy(s[i].name, name, IFNAMSIZ);
556 memcpy(&s[i].map, map, sizeof(s[i].map));
557 break;
558 }
559 }
560
561 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
562 }
563
564 /**
565 * netdev_boot_setup_check - check boot time settings
566 * @dev: the netdevice
567 *
568 * Check boot time settings for the device.
569 * The found settings are set for the device to be used
570 * later in the device probing.
571 * Returns 0 if no settings are found, 1 if they are.
572 */
573 int netdev_boot_setup_check(struct net_device *dev)
574 {
575 struct netdev_boot_setup *s = dev_boot_setup;
576 int i;
577
578 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
579 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
580 !strcmp(dev->name, s[i].name)) {
581 dev->irq = s[i].map.irq;
582 dev->base_addr = s[i].map.base_addr;
583 dev->mem_start = s[i].map.mem_start;
584 dev->mem_end = s[i].map.mem_end;
585 return 1;
586 }
587 }
588 return 0;
589 }
590 EXPORT_SYMBOL(netdev_boot_setup_check);
591
592
593 /**
594 * netdev_boot_base - get address from boot time settings
595 * @prefix: prefix for network device
596 * @unit: id for network device
597 *
598 * Check boot time settings for the base address of device.
599 * The found settings are set for the device to be used
600 * later in the device probing.
601 * Returns 0 if no settings found.
602 */
603 unsigned long netdev_boot_base(const char *prefix, int unit)
604 {
605 const struct netdev_boot_setup *s = dev_boot_setup;
606 char name[IFNAMSIZ];
607 int i;
608
609 sprintf(name, "%s%d", prefix, unit);
610
611 /*
612 * If device already registered then return base of 1
613 * to indicate not to probe for this interface
614 */
615 if (__dev_get_by_name(&init_net, name))
616 return 1;
617
618 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
619 if (!strcmp(name, s[i].name))
620 return s[i].map.base_addr;
621 return 0;
622 }
623
624 /*
625 * Saves at boot time configured settings for any netdevice.
626 */
627 int __init netdev_boot_setup(char *str)
628 {
629 int ints[5];
630 struct ifmap map;
631
632 str = get_options(str, ARRAY_SIZE(ints), ints);
633 if (!str || !*str)
634 return 0;
635
636 /* Save settings */
637 memset(&map, 0, sizeof(map));
638 if (ints[0] > 0)
639 map.irq = ints[1];
640 if (ints[0] > 1)
641 map.base_addr = ints[2];
642 if (ints[0] > 2)
643 map.mem_start = ints[3];
644 if (ints[0] > 3)
645 map.mem_end = ints[4];
646
647 /* Add new entry to the list */
648 return netdev_boot_setup_add(str, &map);
649 }
650
651 __setup("netdev=", netdev_boot_setup);
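
/*
 * Example (illustrative, not part of the original file): with the
 * parser above, a command line such as
 *
 *	netdev=9,0x300,0,0,eth1
 *
 * requests IRQ 9 and I/O base 0x300 for the device that will probe as
 * "eth1"; the two zeros leave mem_start/mem_end unset.
 */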
652
653 /*******************************************************************************
654
655 Device Interface Subroutines
656
657 *******************************************************************************/
658
659 /**
660 * __dev_get_by_name - find a device by its name
661 * @net: the applicable net namespace
662 * @name: name to find
663 *
664 * Find an interface by name. Must be called under RTNL semaphore
665 * or @dev_base_lock. If the name is found a pointer to the device
666 * is returned. If the name is not found then %NULL is returned. The
667 * reference counters are not incremented so the caller must be
668 * careful with locks.
669 */
670
671 struct net_device *__dev_get_by_name(struct net *net, const char *name)
672 {
673 struct net_device *dev;
674 struct hlist_head *head = dev_name_hash(net, name);
675
676 hlist_for_each_entry(dev, head, name_hlist)
677 if (!strncmp(dev->name, name, IFNAMSIZ))
678 return dev;
679
680 return NULL;
681 }
682 EXPORT_SYMBOL(__dev_get_by_name);
683
684 /**
685 * dev_get_by_name_rcu - find a device by its name
686 * @net: the applicable net namespace
687 * @name: name to find
688 *
689 * Find an interface by name.
690 * If the name is found a pointer to the device is returned.
691 * If the name is not found then %NULL is returned.
692 * The reference counters are not incremented so the caller must be
693 * careful with locks. The caller must hold RCU lock.
694 */
695
696 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
697 {
698 struct net_device *dev;
699 struct hlist_head *head = dev_name_hash(net, name);
700
701 hlist_for_each_entry_rcu(dev, head, name_hlist)
702 if (!strncmp(dev->name, name, IFNAMSIZ))
703 return dev;
704
705 return NULL;
706 }
707 EXPORT_SYMBOL(dev_get_by_name_rcu);
708
709 /**
710 * dev_get_by_name - find a device by its name
711 * @net: the applicable net namespace
712 * @name: name to find
713 *
714 * Find an interface by name. This can be called from any
715 * context and does its own locking. The returned handle has
716 * the usage count incremented and the caller must use dev_put() to
717 * release it when it is no longer needed. %NULL is returned if no
718 * matching device is found.
719 */
720
721 struct net_device *dev_get_by_name(struct net *net, const char *name)
722 {
723 struct net_device *dev;
724
725 rcu_read_lock();
726 dev = dev_get_by_name_rcu(net, name);
727 if (dev)
728 dev_hold(dev);
729 rcu_read_unlock();
730 return dev;
731 }
732 EXPORT_SYMBOL(dev_get_by_name);
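
/*
 * Illustrative sketch (not part of the original file): the two lookup
 * patterns described above.  example_lo_is_up() and example_hold_lo()
 * are hypothetical helpers.
 */
static bool __maybe_unused example_lo_is_up(struct net *net)
{
	struct net_device *dev;
	bool up;

	/* short, non-sleeping access: no reference count is taken */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "lo");
	up = dev && (dev->flags & IFF_UP);
	rcu_read_unlock();

	return up;
}

static void __maybe_unused example_hold_lo(struct net *net)
{
	/* longer-lived access: a reference is taken and must be released */
	struct net_device *dev = dev_get_by_name(net, "lo");

	if (dev) {
		/* ... use dev, possibly sleeping ... */
		dev_put(dev);
	}
}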
733
734 /**
735 * __dev_get_by_index - find a device by its ifindex
736 * @net: the applicable net namespace
737 * @ifindex: index of device
738 *
739 * Search for an interface by index. Returns %NULL if the device
740 * is not found or a pointer to the device. The device has not
741 * had its reference counter increased so the caller must be careful
742 * about locking. The caller must hold either the RTNL semaphore
743 * or @dev_base_lock.
744 */
745
746 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
747 {
748 struct net_device *dev;
749 struct hlist_head *head = dev_index_hash(net, ifindex);
750
751 hlist_for_each_entry(dev, head, index_hlist)
752 if (dev->ifindex == ifindex)
753 return dev;
754
755 return NULL;
756 }
757 EXPORT_SYMBOL(__dev_get_by_index);
758
759 /**
760 * dev_get_by_index_rcu - find a device by its ifindex
761 * @net: the applicable net namespace
762 * @ifindex: index of device
763 *
764 * Search for an interface by index. Returns %NULL if the device
765 * is not found or a pointer to the device. The device has not
766 * had its reference counter increased so the caller must be careful
767 * about locking. The caller must hold RCU lock.
768 */
769
770 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
771 {
772 struct net_device *dev;
773 struct hlist_head *head = dev_index_hash(net, ifindex);
774
775 hlist_for_each_entry_rcu(dev, head, index_hlist)
776 if (dev->ifindex == ifindex)
777 return dev;
778
779 return NULL;
780 }
781 EXPORT_SYMBOL(dev_get_by_index_rcu);
782
783
784 /**
785 * dev_get_by_index - find a device by its ifindex
786 * @net: the applicable net namespace
787 * @ifindex: index of device
788 *
789 * Search for an interface by index. Returns NULL if the device
790 * is not found or a pointer to the device. The device returned has
791 * had a reference added and the pointer is safe until the user calls
792 * dev_put to indicate they have finished with it.
793 */
794
795 struct net_device *dev_get_by_index(struct net *net, int ifindex)
796 {
797 struct net_device *dev;
798
799 rcu_read_lock();
800 dev = dev_get_by_index_rcu(net, ifindex);
801 if (dev)
802 dev_hold(dev);
803 rcu_read_unlock();
804 return dev;
805 }
806 EXPORT_SYMBOL(dev_get_by_index);
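
/*
 * Illustrative sketch (not part of the original file): resolving an
 * ifindex without taking a reference, as described above.
 * example_ifindex_mtu() is a hypothetical helper.
 */
static int __maybe_unused example_ifindex_mtu(struct net *net, int ifindex)
{
	struct net_device *dev;
	int mtu = -ENODEV;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		mtu = dev->mtu;
	rcu_read_unlock();

	return mtu;	/* MTU, or -ENODEV if the index is unknown */
}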
807
808 /**
809 * netdev_get_name - get a netdevice name, knowing its ifindex.
810 * @net: network namespace
811 * @name: a pointer to the buffer where the name will be stored.
812 * @ifindex: the ifindex of the interface to get the name from.
813 *
814 * The use of raw_seqcount_begin() and cond_resched() before
815 * retrying is required as we want to give the writers a chance
816 * to complete when CONFIG_PREEMPT is not set.
817 */
818 int netdev_get_name(struct net *net, char *name, int ifindex)
819 {
820 struct net_device *dev;
821 unsigned int seq;
822
823 retry:
824 seq = raw_seqcount_begin(&devnet_rename_seq);
825 rcu_read_lock();
826 dev = dev_get_by_index_rcu(net, ifindex);
827 if (!dev) {
828 rcu_read_unlock();
829 return -ENODEV;
830 }
831
832 strcpy(name, dev->name);
833 rcu_read_unlock();
834 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
835 cond_resched();
836 goto retry;
837 }
838
839 return 0;
840 }
841
842 /**
843 * dev_getbyhwaddr_rcu - find a device by its hardware address
844 * @net: the applicable net namespace
845 * @type: media type of device
846 * @ha: hardware address
847 *
848 * Search for an interface by MAC address. Returns NULL if the device
849 * is not found or a pointer to the device.
850 * The caller must hold RCU or RTNL.
851 * The returned device has not had its ref count increased
852 * and the caller must therefore be careful about locking
853 *
854 */
855
856 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
857 const char *ha)
858 {
859 struct net_device *dev;
860
861 for_each_netdev_rcu(net, dev)
862 if (dev->type == type &&
863 !memcmp(dev->dev_addr, ha, dev->addr_len))
864 return dev;
865
866 return NULL;
867 }
868 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
869
870 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
871 {
872 struct net_device *dev;
873
874 ASSERT_RTNL();
875 for_each_netdev(net, dev)
876 if (dev->type == type)
877 return dev;
878
879 return NULL;
880 }
881 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
882
883 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
884 {
885 struct net_device *dev, *ret = NULL;
886
887 rcu_read_lock();
888 for_each_netdev_rcu(net, dev)
889 if (dev->type == type) {
890 dev_hold(dev);
891 ret = dev;
892 break;
893 }
894 rcu_read_unlock();
895 return ret;
896 }
897 EXPORT_SYMBOL(dev_getfirstbyhwtype);
898
899 /**
900 * dev_get_by_flags_rcu - find any device with given flags
901 * @net: the applicable net namespace
902 * @if_flags: IFF_* values
903 * @mask: bitmask of bits in if_flags to check
904 *
905 * Search for any interface with the given flags. Returns NULL if a device
906 * is not found or a pointer to the device. Must be called inside
907 * rcu_read_lock(), and result refcount is unchanged.
908 */
909
910 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
911 unsigned short mask)
912 {
913 struct net_device *dev, *ret;
914
915 ret = NULL;
916 for_each_netdev_rcu(net, dev) {
917 if (((dev->flags ^ if_flags) & mask) == 0) {
918 ret = dev;
919 break;
920 }
921 }
922 return ret;
923 }
924 EXPORT_SYMBOL(dev_get_by_flags_rcu);
925
926 /**
927 * dev_valid_name - check if name is okay for network device
928 * @name: name string
929 *
930 * Network device names need to be valid file names
931 * to allow sysfs to work. We also disallow any kind of
932 * whitespace.
933 */
934 bool dev_valid_name(const char *name)
935 {
936 if (*name == '\0')
937 return false;
938 if (strlen(name) >= IFNAMSIZ)
939 return false;
940 if (!strcmp(name, ".") || !strcmp(name, ".."))
941 return false;
942
943 while (*name) {
944 if (*name == '/' || isspace(*name))
945 return false;
946 name++;
947 }
948 return true;
949 }
950 EXPORT_SYMBOL(dev_valid_name);
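
/*
 * Examples (illustrative): "eth0", "br-lan" and the template "veth%d"
 * pass dev_valid_name(); "", ".", "..", names containing '/' or
 * whitespace, and names of IFNAMSIZ characters or more do not.
 */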
951
952 /**
953 * __dev_alloc_name - allocate a name for a device
954 * @net: network namespace to allocate the device name in
955 * @name: name format string
956 * @buf: scratch buffer and result name string
957 *
958 * Passed a format string - eg "lt%d" it will try and find a suitable
959 * id. It scans list of devices to build up a free map, then chooses
960 * the first empty slot. The caller must hold the dev_base or rtnl lock
961 * while allocating the name and adding the device in order to avoid
962 * duplicates.
963 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
964 * Returns the number of the unit assigned or a negative errno code.
965 */
966
967 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
968 {
969 int i = 0;
970 const char *p;
971 const int max_netdevices = 8*PAGE_SIZE;
972 unsigned long *inuse;
973 struct net_device *d;
974
975 p = strnchr(name, IFNAMSIZ-1, '%');
976 if (p) {
977 /*
978 * Verify the string as this thing may have come from
979 * the user. There must be either one "%d" and no other "%"
980 * characters.
981 */
982 if (p[1] != 'd' || strchr(p + 2, '%'))
983 return -EINVAL;
984
985 /* Use one page as a bit array of possible slots */
986 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
987 if (!inuse)
988 return -ENOMEM;
989
990 for_each_netdev(net, d) {
991 if (!sscanf(d->name, name, &i))
992 continue;
993 if (i < 0 || i >= max_netdevices)
994 continue;
995
996 /* avoid cases where sscanf is not exact inverse of printf */
997 snprintf(buf, IFNAMSIZ, name, i);
998 if (!strncmp(buf, d->name, IFNAMSIZ))
999 set_bit(i, inuse);
1000 }
1001
1002 i = find_first_zero_bit(inuse, max_netdevices);
1003 free_page((unsigned long) inuse);
1004 }
1005
1006 if (buf != name)
1007 snprintf(buf, IFNAMSIZ, name, i);
1008 if (!__dev_get_by_name(net, buf))
1009 return i;
1010
1011 /* It is possible to run out of possible slots
1012 * when the name is long and there isn't enough space left
1013 * for the digits, or if all bits are used.
1014 */
1015 return -ENFILE;
1016 }
1017
1018 /**
1019 * dev_alloc_name - allocate a name for a device
1020 * @dev: device
1021 * @name: name format string
1022 *
1023 * Passed a format string - eg "lt%d" it will try and find a suitable
1024 * id. It scans list of devices to build up a free map, then chooses
1025 * the first empty slot. The caller must hold the dev_base or rtnl lock
1026 * while allocating the name and adding the device in order to avoid
1027 * duplicates.
1028 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1029 * Returns the number of the unit assigned or a negative errno code.
1030 */
1031
1032 int dev_alloc_name(struct net_device *dev, const char *name)
1033 {
1034 char buf[IFNAMSIZ];
1035 struct net *net;
1036 int ret;
1037
1038 BUG_ON(!dev_net(dev));
1039 net = dev_net(dev);
1040 ret = __dev_alloc_name(net, name, buf);
1041 if (ret >= 0)
1042 strlcpy(dev->name, buf, IFNAMSIZ);
1043 return ret;
1044 }
1045 EXPORT_SYMBOL(dev_alloc_name);
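
/*
 * Illustrative sketch (not part of the original file): letting the core
 * pick the next free "veth%d" name before registration.  Must run under
 * the rtnl lock, as noted above; example_pick_name() is hypothetical.
 */
static int __maybe_unused example_pick_name(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();

	unit = dev_alloc_name(dev, "veth%d");	/* fills in dev->name */
	if (unit < 0)
		return unit;	/* negative errno, e.g. -EINVAL or -ENFILE */

	return 0;
}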
1046
1047 static int dev_alloc_name_ns(struct net *net,
1048 struct net_device *dev,
1049 const char *name)
1050 {
1051 char buf[IFNAMSIZ];
1052 int ret;
1053
1054 ret = __dev_alloc_name(net, name, buf);
1055 if (ret >= 0)
1056 strlcpy(dev->name, buf, IFNAMSIZ);
1057 return ret;
1058 }
1059
1060 static int dev_get_valid_name(struct net *net,
1061 struct net_device *dev,
1062 const char *name)
1063 {
1064 BUG_ON(!net);
1065
1066 if (!dev_valid_name(name))
1067 return -EINVAL;
1068
1069 if (strchr(name, '%'))
1070 return dev_alloc_name_ns(net, dev, name);
1071 else if (__dev_get_by_name(net, name))
1072 return -EEXIST;
1073 else if (dev->name != name)
1074 strlcpy(dev->name, name, IFNAMSIZ);
1075
1076 return 0;
1077 }
1078
1079 /**
1080 * dev_change_name - change name of a device
1081 * @dev: device
1082 * @newname: name (or format string) must be at least IFNAMSIZ
1083 *
1084 * Change the name of a device. A format string such as "eth%d"
1085 * can be passed for wildcarding.
1086 */
1087 int dev_change_name(struct net_device *dev, const char *newname)
1088 {
1089 unsigned char old_assign_type;
1090 char oldname[IFNAMSIZ];
1091 int err = 0;
1092 int ret;
1093 struct net *net;
1094
1095 ASSERT_RTNL();
1096 BUG_ON(!dev_net(dev));
1097
1098 net = dev_net(dev);
1099 if (dev->flags & IFF_UP)
1100 return -EBUSY;
1101
1102 write_seqcount_begin(&devnet_rename_seq);
1103
1104 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1105 write_seqcount_end(&devnet_rename_seq);
1106 return 0;
1107 }
1108
1109 memcpy(oldname, dev->name, IFNAMSIZ);
1110
1111 err = dev_get_valid_name(net, dev, newname);
1112 if (err < 0) {
1113 write_seqcount_end(&devnet_rename_seq);
1114 return err;
1115 }
1116
1117 if (oldname[0] && !strchr(oldname, '%'))
1118 netdev_info(dev, "renamed from %s\n", oldname);
1119
1120 old_assign_type = dev->name_assign_type;
1121 dev->name_assign_type = NET_NAME_RENAMED;
1122
1123 rollback:
1124 ret = device_rename(&dev->dev, dev->name);
1125 if (ret) {
1126 memcpy(dev->name, oldname, IFNAMSIZ);
1127 dev->name_assign_type = old_assign_type;
1128 write_seqcount_end(&devnet_rename_seq);
1129 return ret;
1130 }
1131
1132 write_seqcount_end(&devnet_rename_seq);
1133
1134 netdev_adjacent_rename_links(dev, oldname);
1135
1136 write_lock_bh(&dev_base_lock);
1137 hlist_del_rcu(&dev->name_hlist);
1138 write_unlock_bh(&dev_base_lock);
1139
1140 synchronize_rcu();
1141
1142 write_lock_bh(&dev_base_lock);
1143 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1144 write_unlock_bh(&dev_base_lock);
1145
1146 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1147 ret = notifier_to_errno(ret);
1148
1149 if (ret) {
1150 /* err >= 0 after dev_alloc_name() or stores the first errno */
1151 if (err >= 0) {
1152 err = ret;
1153 write_seqcount_begin(&devnet_rename_seq);
1154 memcpy(dev->name, oldname, IFNAMSIZ);
1155 memcpy(oldname, newname, IFNAMSIZ);
1156 dev->name_assign_type = old_assign_type;
1157 old_assign_type = NET_NAME_RENAMED;
1158 goto rollback;
1159 } else {
1160 pr_err("%s: name change rollback failed: %d\n",
1161 dev->name, ret);
1162 }
1163 }
1164
1165 return err;
1166 }
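
/*
 * Illustrative sketch (not part of the original file): renaming a
 * device from process context.  The rtnl lock must be held and the
 * device must be down; example_rename() is a hypothetical helper.
 */
static int __maybe_unused example_rename(struct net_device *dev,
					 const char *newname)
{
	int err;

	rtnl_lock();
	err = dev_change_name(dev, newname);	/* -EBUSY if dev is up */
	rtnl_unlock();

	return err;
}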
1167
1168 /**
1169 * dev_set_alias - change ifalias of a device
1170 * @dev: device
1171 * @alias: name up to IFALIASZ
1172 * @len: limit of bytes to copy from info
1173 *
1174 * Set the ifalias for a device.
1175 */
1176 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1177 {
1178 char *new_ifalias;
1179
1180 ASSERT_RTNL();
1181
1182 if (len >= IFALIASZ)
1183 return -EINVAL;
1184
1185 if (!len) {
1186 kfree(dev->ifalias);
1187 dev->ifalias = NULL;
1188 return 0;
1189 }
1190
1191 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1192 if (!new_ifalias)
1193 return -ENOMEM;
1194 dev->ifalias = new_ifalias;
1195
1196 strlcpy(dev->ifalias, alias, len+1);
1197 return len;
1198 }
1199
1200
1201 /**
1202 * netdev_features_change - device changes features
1203 * @dev: device to cause notification
1204 *
1205 * Called to indicate a device has changed features.
1206 */
1207 void netdev_features_change(struct net_device *dev)
1208 {
1209 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1210 }
1211 EXPORT_SYMBOL(netdev_features_change);
1212
1213 /**
1214 * netdev_state_change - device changes state
1215 * @dev: device to cause notification
1216 *
1217 * Called to indicate a device has changed state. This function calls
1218 * the notifier chains for netdev_chain and sends a NEWLINK message
1219 * to the routing socket.
1220 */
1221 void netdev_state_change(struct net_device *dev)
1222 {
1223 if (dev->flags & IFF_UP) {
1224 struct netdev_notifier_change_info change_info;
1225
1226 change_info.flags_changed = 0;
1227 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1228 &change_info.info);
1229 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1230 }
1231 }
1232 EXPORT_SYMBOL(netdev_state_change);
1233
1234 /**
1235 * netdev_notify_peers - notify network peers about existence of @dev
1236 * @dev: network device
1237 *
1238 * Generate traffic such that interested network peers are aware of
1239 * @dev, such as by generating a gratuitous ARP. This may be used when
1240 * a device wants to inform the rest of the network about some sort of
1241 * reconfiguration such as a failover event or virtual machine
1242 * migration.
1243 */
1244 void netdev_notify_peers(struct net_device *dev)
1245 {
1246 rtnl_lock();
1247 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1248 rtnl_unlock();
1249 }
1250 EXPORT_SYMBOL(netdev_notify_peers);
1251
1252 static int __dev_open(struct net_device *dev)
1253 {
1254 const struct net_device_ops *ops = dev->netdev_ops;
1255 int ret;
1256
1257 ASSERT_RTNL();
1258
1259 if (!netif_device_present(dev))
1260 return -ENODEV;
1261
1262 /* Block netpoll from trying to do any rx path servicing.
1263 * If we don't do this there is a chance ndo_poll_controller
1264 * or ndo_poll may be running while we open the device
1265 */
1266 netpoll_poll_disable(dev);
1267
1268 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1269 ret = notifier_to_errno(ret);
1270 if (ret)
1271 return ret;
1272
1273 set_bit(__LINK_STATE_START, &dev->state);
1274
1275 if (ops->ndo_validate_addr)
1276 ret = ops->ndo_validate_addr(dev);
1277
1278 if (!ret && ops->ndo_open)
1279 ret = ops->ndo_open(dev);
1280
1281 netpoll_poll_enable(dev);
1282
1283 if (ret)
1284 clear_bit(__LINK_STATE_START, &dev->state);
1285 else {
1286 dev->flags |= IFF_UP;
1287 net_dmaengine_get();
1288 dev_set_rx_mode(dev);
1289 dev_activate(dev);
1290 add_device_randomness(dev->dev_addr, dev->addr_len);
1291 }
1292
1293 return ret;
1294 }
1295
1296 /**
1297 * dev_open - prepare an interface for use.
1298 * @dev: device to open
1299 *
1300 * Takes a device from down to up state. The device's private open
1301 * function is invoked and then the multicast lists are loaded. Finally
1302 * the device is moved into the up state and a %NETDEV_UP message is
1303 * sent to the netdev notifier chain.
1304 *
1305 * Calling this function on an active interface is a nop. On a failure
1306 * a negative errno code is returned.
1307 */
1308 int dev_open(struct net_device *dev)
1309 {
1310 int ret;
1311
1312 if (dev->flags & IFF_UP)
1313 return 0;
1314
1315 ret = __dev_open(dev);
1316 if (ret < 0)
1317 return ret;
1318
1319 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1320 call_netdevice_notifiers(NETDEV_UP, dev);
1321
1322 return ret;
1323 }
1324 EXPORT_SYMBOL(dev_open);
1325
1326 static int __dev_close_many(struct list_head *head)
1327 {
1328 struct net_device *dev;
1329
1330 ASSERT_RTNL();
1331 might_sleep();
1332
1333 list_for_each_entry(dev, head, close_list) {
1334 /* Temporarily disable netpoll until the interface is down */
1335 netpoll_poll_disable(dev);
1336
1337 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1338
1339 clear_bit(__LINK_STATE_START, &dev->state);
1340
1341 /* Synchronize to scheduled poll. We cannot touch poll list, it
1342 * can be even on different cpu. So just clear netif_running().
1343 *
1344 * dev->stop() will invoke napi_disable() on all of its
1345 * napi_struct instances on this device.
1346 */
1347 smp_mb__after_atomic(); /* Commit netif_running(). */
1348 }
1349
1350 dev_deactivate_many(head);
1351
1352 list_for_each_entry(dev, head, close_list) {
1353 const struct net_device_ops *ops = dev->netdev_ops;
1354
1355 /*
1356 * Call the device specific close. This cannot fail.
1357 * Only if device is UP
1358 *
1359 * We allow it to be called even after a DETACH hot-plug
1360 * event.
1361 */
1362 if (ops->ndo_stop)
1363 ops->ndo_stop(dev);
1364
1365 dev->flags &= ~IFF_UP;
1366 net_dmaengine_put();
1367 netpoll_poll_enable(dev);
1368 }
1369
1370 return 0;
1371 }
1372
1373 static int __dev_close(struct net_device *dev)
1374 {
1375 int retval;
1376 LIST_HEAD(single);
1377
1378 list_add(&dev->close_list, &single);
1379 retval = __dev_close_many(&single);
1380 list_del(&single);
1381
1382 return retval;
1383 }
1384
1385 static int dev_close_many(struct list_head *head)
1386 {
1387 struct net_device *dev, *tmp;
1388
1389 /* Remove the devices that don't need to be closed */
1390 list_for_each_entry_safe(dev, tmp, head, close_list)
1391 if (!(dev->flags & IFF_UP))
1392 list_del_init(&dev->close_list);
1393
1394 __dev_close_many(head);
1395
1396 list_for_each_entry_safe(dev, tmp, head, close_list) {
1397 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1398 call_netdevice_notifiers(NETDEV_DOWN, dev);
1399 list_del_init(&dev->close_list);
1400 }
1401
1402 return 0;
1403 }
1404
1405 /**
1406 * dev_close - shutdown an interface.
1407 * @dev: device to shutdown
1408 *
1409 * This function moves an active device into down state. A
1410 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1411 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1412 * chain.
1413 */
1414 int dev_close(struct net_device *dev)
1415 {
1416 if (dev->flags & IFF_UP) {
1417 LIST_HEAD(single);
1418
1419 list_add(&dev->close_list, &single);
1420 dev_close_many(&single);
1421 list_del(&single);
1422 }
1423 return 0;
1424 }
1425 EXPORT_SYMBOL(dev_close);
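
/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface up and back down from the kernel, as described above.
 * Both calls require the rtnl lock; example_cycle_dev() is
 * hypothetical.
 */
static int __maybe_unused example_cycle_dev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);		/* nop if the device is already up */
	if (!err)
		dev_close(dev);		/* always returns 0 */
	rtnl_unlock();

	return err;
}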
1426
1427
1428 /**
1429 * dev_disable_lro - disable Large Receive Offload on a device
1430 * @dev: device
1431 *
1432 * Disable Large Receive Offload (LRO) on a net device. Must be
1433 * called under RTNL. This is needed if received packets may be
1434 * forwarded to another interface.
1435 */
1436 void dev_disable_lro(struct net_device *dev)
1437 {
1438 /*
1439 * If we're trying to disable lro on a vlan device
1440 * use the underlying physical device instead
1441 */
1442 if (is_vlan_dev(dev))
1443 dev = vlan_dev_real_dev(dev);
1444
1445 /* the same for macvlan devices */
1446 if (netif_is_macvlan(dev))
1447 dev = macvlan_dev_real_dev(dev);
1448
1449 dev->wanted_features &= ~NETIF_F_LRO;
1450 netdev_update_features(dev);
1451
1452 if (unlikely(dev->features & NETIF_F_LRO))
1453 netdev_WARN(dev, "failed to disable LRO!\n");
1454 }
1455 EXPORT_SYMBOL(dev_disable_lro);
1456
1457 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1458 struct net_device *dev)
1459 {
1460 struct netdev_notifier_info info;
1461
1462 netdev_notifier_info_init(&info, dev);
1463 return nb->notifier_call(nb, val, &info);
1464 }
1465
1466 static int dev_boot_phase = 1;
1467
1468 /**
1469 * register_netdevice_notifier - register a network notifier block
1470 * @nb: notifier
1471 *
1472 * Register a notifier to be called when network device events occur.
1473 * The notifier passed is linked into the kernel structures and must
1474 * not be reused until it has been unregistered. A negative errno code
1475 * is returned on a failure.
1476 *
1477 * When registered all registration and up events are replayed
1478 * to the new notifier to allow device to have a race free
1479 * view of the network device list.
1480 */
1481
1482 int register_netdevice_notifier(struct notifier_block *nb)
1483 {
1484 struct net_device *dev;
1485 struct net_device *last;
1486 struct net *net;
1487 int err;
1488
1489 rtnl_lock();
1490 err = raw_notifier_chain_register(&netdev_chain, nb);
1491 if (err)
1492 goto unlock;
1493 if (dev_boot_phase)
1494 goto unlock;
1495 for_each_net(net) {
1496 for_each_netdev(net, dev) {
1497 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1498 err = notifier_to_errno(err);
1499 if (err)
1500 goto rollback;
1501
1502 if (!(dev->flags & IFF_UP))
1503 continue;
1504
1505 call_netdevice_notifier(nb, NETDEV_UP, dev);
1506 }
1507 }
1508
1509 unlock:
1510 rtnl_unlock();
1511 return err;
1512
1513 rollback:
1514 last = dev;
1515 for_each_net(net) {
1516 for_each_netdev(net, dev) {
1517 if (dev == last)
1518 goto outroll;
1519
1520 if (dev->flags & IFF_UP) {
1521 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1522 dev);
1523 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1524 }
1525 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1526 }
1527 }
1528
1529 outroll:
1530 raw_notifier_chain_unregister(&netdev_chain, nb);
1531 goto unlock;
1532 }
1533 EXPORT_SYMBOL(register_netdevice_notifier);
1534
1535 /**
1536 * unregister_netdevice_notifier - unregister a network notifier block
1537 * @nb: notifier
1538 *
1539 * Unregister a notifier previously registered by
1540 * register_netdevice_notifier(). The notifier is unlinked from the
1541 * kernel structures and may then be reused. A negative errno code
1542 * is returned on a failure.
1543 *
1544 * After unregistering, unregister and down device events are synthesized
1545 * for all devices on the device list to the removed notifier to remove
1546 * the need for special case cleanup code.
1547 */
1548
1549 int unregister_netdevice_notifier(struct notifier_block *nb)
1550 {
1551 struct net_device *dev;
1552 struct net *net;
1553 int err;
1554
1555 rtnl_lock();
1556 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1557 if (err)
1558 goto unlock;
1559
1560 for_each_net(net) {
1561 for_each_netdev(net, dev) {
1562 if (dev->flags & IFF_UP) {
1563 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1564 dev);
1565 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1566 }
1567 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1568 }
1569 }
1570 unlock:
1571 rtnl_unlock();
1572 return err;
1573 }
1574 EXPORT_SYMBOL(unregister_netdevice_notifier);
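
/*
 * Illustrative sketch (not part of the original file): a minimal
 * netdevice notifier used with the register/unregister calls above.
 * The example_* names are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		netdev_info(dev, "is up\n");
		break;
	case NETDEV_GOING_DOWN:
		netdev_info(dev, "is going down\n");
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb __maybe_unused = {
	.notifier_call = example_netdev_event,
};

/*
 * register_netdevice_notifier(&example_netdev_nb) replays REGISTER/UP
 * events for already-present devices; pair it with
 * unregister_netdevice_notifier(&example_netdev_nb) on teardown.
 */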
1575
1576 /**
1577 * call_netdevice_notifiers_info - call all network notifier blocks
1578 * @val: value passed unmodified to notifier function
1579 * @dev: net_device pointer passed unmodified to notifier function
1580 * @info: notifier information data
1581 *
1582 * Call all network notifier blocks. Parameters and return value
1583 * are as for raw_notifier_call_chain().
1584 */
1585
1586 static int call_netdevice_notifiers_info(unsigned long val,
1587 struct net_device *dev,
1588 struct netdev_notifier_info *info)
1589 {
1590 ASSERT_RTNL();
1591 netdev_notifier_info_init(info, dev);
1592 return raw_notifier_call_chain(&netdev_chain, val, info);
1593 }
1594
1595 /**
1596 * call_netdevice_notifiers - call all network notifier blocks
1597 * @val: value passed unmodified to notifier function
1598 * @dev: net_device pointer passed unmodified to notifier function
1599 *
1600 * Call all network notifier blocks. Parameters and return value
1601 * are as for raw_notifier_call_chain().
1602 */
1603
1604 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1605 {
1606 struct netdev_notifier_info info;
1607
1608 return call_netdevice_notifiers_info(val, dev, &info);
1609 }
1610 EXPORT_SYMBOL(call_netdevice_notifiers);
1611
1612 static struct static_key netstamp_needed __read_mostly;
1613 #ifdef HAVE_JUMP_LABEL
1614 /* We are not allowed to call static_key_slow_dec() from irq context
1615 * If net_disable_timestamp() is called from irq context, defer the
1616 * static_key_slow_dec() calls.
1617 */
1618 static atomic_t netstamp_needed_deferred;
1619 #endif
1620
1621 void net_enable_timestamp(void)
1622 {
1623 #ifdef HAVE_JUMP_LABEL
1624 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1625
1626 if (deferred) {
1627 while (--deferred)
1628 static_key_slow_dec(&netstamp_needed);
1629 return;
1630 }
1631 #endif
1632 static_key_slow_inc(&netstamp_needed);
1633 }
1634 EXPORT_SYMBOL(net_enable_timestamp);
1635
1636 void net_disable_timestamp(void)
1637 {
1638 #ifdef HAVE_JUMP_LABEL
1639 if (in_interrupt()) {
1640 atomic_inc(&netstamp_needed_deferred);
1641 return;
1642 }
1643 #endif
1644 static_key_slow_dec(&netstamp_needed);
1645 }
1646 EXPORT_SYMBOL(net_disable_timestamp);
1647
1648 static inline void net_timestamp_set(struct sk_buff *skb)
1649 {
1650 skb->tstamp.tv64 = 0;
1651 if (static_key_false(&netstamp_needed))
1652 __net_timestamp(skb);
1653 }
1654
1655 #define net_timestamp_check(COND, SKB) \
1656 if (static_key_false(&netstamp_needed)) { \
1657 if ((COND) && !(SKB)->tstamp.tv64) \
1658 __net_timestamp(SKB); \
1659 } \
1660
1661 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1662 {
1663 unsigned int len;
1664
1665 if (!(dev->flags & IFF_UP))
1666 return false;
1667
1668 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1669 if (skb->len <= len)
1670 return true;
1671
1672 /* if TSO is enabled, we don't care about the length as the packet
1673 * could be forwarded without being segmented before
1674 */
1675 if (skb_is_gso(skb))
1676 return true;
1677
1678 return false;
1679 }
1680 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1681
1682 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1683 {
1684 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1685 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1686 atomic_long_inc(&dev->rx_dropped);
1687 kfree_skb(skb);
1688 return NET_RX_DROP;
1689 }
1690 }
1691
1692 if (unlikely(!is_skb_forwardable(dev, skb))) {
1693 atomic_long_inc(&dev->rx_dropped);
1694 kfree_skb(skb);
1695 return NET_RX_DROP;
1696 }
1697
1698 skb_scrub_packet(skb, true);
1699 skb->protocol = eth_type_trans(skb, dev);
1700
1701 return 0;
1702 }
1703 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1704
1705 /**
1706 * dev_forward_skb - loopback an skb to another netif
1707 *
1708 * @dev: destination network device
1709 * @skb: buffer to forward
1710 *
1711 * return values:
1712 * NET_RX_SUCCESS (no congestion)
1713 * NET_RX_DROP (packet was dropped, but freed)
1714 *
1715 * dev_forward_skb can be used for injecting an skb from the
1716 * start_xmit function of one device into the receive queue
1717 * of another device.
1718 *
1719 * The receiving device may be in another namespace, so
1720 * we have to clear all information in the skb that could
1721 * impact namespace isolation.
1722 */
1723 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1724 {
1725 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1726 }
1727 EXPORT_SYMBOL_GPL(dev_forward_skb);
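
/*
 * Illustrative sketch (not part of the original file): a helper that a
 * pair-style driver's ndo_start_xmit() might call to inject frames into
 * its peer device, the use case described above.  The peer argument and
 * the example_* name are hypothetical.
 */
static netdev_tx_t __maybe_unused example_xmit_to_peer(struct sk_buff *skb,
							struct net_device *dev,
							struct net_device *peer)
{
	/* dev_forward_skb() scrubs the skb and queues it on peer's rx path */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
		dev->stats.tx_packets++;
	else
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}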
1728
1729 static inline int deliver_skb(struct sk_buff *skb,
1730 struct packet_type *pt_prev,
1731 struct net_device *orig_dev)
1732 {
1733 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1734 return -ENOMEM;
1735 atomic_inc(&skb->users);
1736 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1737 }
1738
1739 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1740 {
1741 if (!ptype->af_packet_priv || !skb->sk)
1742 return false;
1743
1744 if (ptype->id_match)
1745 return ptype->id_match(ptype, skb->sk);
1746 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1747 return true;
1748
1749 return false;
1750 }
1751
1752 /*
1753 * Support routine. Sends outgoing frames to any network
1754 * taps currently in use.
1755 */
1756
1757 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1758 {
1759 struct packet_type *ptype;
1760 struct sk_buff *skb2 = NULL;
1761 struct packet_type *pt_prev = NULL;
1762
1763 rcu_read_lock();
1764 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1765 /* Never send packets back to the socket
1766 * they originated from - MvS (miquels@drinkel.ow.org)
1767 */
1768 if ((ptype->dev == dev || !ptype->dev) &&
1769 (!skb_loop_sk(ptype, skb))) {
1770 if (pt_prev) {
1771 deliver_skb(skb2, pt_prev, skb->dev);
1772 pt_prev = ptype;
1773 continue;
1774 }
1775
1776 skb2 = skb_clone(skb, GFP_ATOMIC);
1777 if (!skb2)
1778 break;
1779
1780 net_timestamp_set(skb2);
1781
1782 /* skb->nh should be correctly
1783 set by sender, so that the second statement is
1784 just protection against buggy protocols.
1785 */
1786 skb_reset_mac_header(skb2);
1787
1788 if (skb_network_header(skb2) < skb2->data ||
1789 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1790 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1791 ntohs(skb2->protocol),
1792 dev->name);
1793 skb_reset_network_header(skb2);
1794 }
1795
1796 skb2->transport_header = skb2->network_header;
1797 skb2->pkt_type = PACKET_OUTGOING;
1798 pt_prev = ptype;
1799 }
1800 }
1801 if (pt_prev)
1802 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1803 rcu_read_unlock();
1804 }
1805
1806 /**
1807 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1808 * @dev: Network device
1809 * @txq: number of queues available
1810 *
1811 * If real_num_tx_queues is changed the tc mappings may no longer be
1812 * valid. To resolve this verify the tc mapping remains valid and if
1813 * not, zero the mapping. With no priorities mapping to this
1814 * offset/count pair it will no longer be used. In the worst case, if
1815 * TC0 is invalid, nothing can be done, so priority mappings are disabled.
1816 * It is expected that drivers will fix this mapping if they can before
1817 * calling netif_set_real_num_tx_queues.
1818 */
1819 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1820 {
1821 int i;
1822 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1823
1824 /* If TC0 is invalidated disable TC mapping */
1825 if (tc->offset + tc->count > txq) {
1826 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1827 dev->num_tc = 0;
1828 return;
1829 }
1830
1831 /* Invalidated prio to tc mappings set to TC0 */
1832 for (i = 1; i < TC_BITMASK + 1; i++) {
1833 int q = netdev_get_prio_tc_map(dev, i);
1834
1835 tc = &dev->tc_to_txq[q];
1836 if (tc->offset + tc->count > txq) {
1837 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1838 i, q);
1839 netdev_set_prio_tc_map(dev, i, 0);
1840 }
1841 }
1842 }
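
/*
 * Illustrative sketch (not part of the original file): how a driver
 * might set up the prio/tc/txq mappings that netif_setup_tc() later
 * revalidates, assuming two traffic classes over eight queues.
 * example_setup_tc() is hypothetical and the helpers are assumed to
 * have their usual netdevice.h signatures.
 */
static int __maybe_unused example_setup_tc(struct net_device *dev)
{
	int err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	/* TC0 covers queues 0-3, TC1 covers queues 4-7 */
	netdev_set_tc_queue(dev, 0, 4, 0);
	netdev_set_tc_queue(dev, 1, 4, 4);

	/* map priority 0 to TC0 and priority 1 to TC1 */
	netdev_set_prio_tc_map(dev, 0, 0);
	netdev_set_prio_tc_map(dev, 1, 1);

	return 0;
}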
1843
1844 #ifdef CONFIG_XPS
1845 static DEFINE_MUTEX(xps_map_mutex);
1846 #define xmap_dereference(P) \
1847 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1848
1849 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1850 int cpu, u16 index)
1851 {
1852 struct xps_map *map = NULL;
1853 int pos;
1854
1855 if (dev_maps)
1856 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1857
1858 for (pos = 0; map && pos < map->len; pos++) {
1859 if (map->queues[pos] == index) {
1860 if (map->len > 1) {
1861 map->queues[pos] = map->queues[--map->len];
1862 } else {
1863 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1864 kfree_rcu(map, rcu);
1865 map = NULL;
1866 }
1867 break;
1868 }
1869 }
1870
1871 return map;
1872 }
1873
1874 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1875 {
1876 struct xps_dev_maps *dev_maps;
1877 int cpu, i;
1878 bool active = false;
1879
1880 mutex_lock(&xps_map_mutex);
1881 dev_maps = xmap_dereference(dev->xps_maps);
1882
1883 if (!dev_maps)
1884 goto out_no_maps;
1885
1886 for_each_possible_cpu(cpu) {
1887 for (i = index; i < dev->num_tx_queues; i++) {
1888 if (!remove_xps_queue(dev_maps, cpu, i))
1889 break;
1890 }
1891 if (i == dev->num_tx_queues)
1892 active = true;
1893 }
1894
1895 if (!active) {
1896 RCU_INIT_POINTER(dev->xps_maps, NULL);
1897 kfree_rcu(dev_maps, rcu);
1898 }
1899
1900 for (i = index; i < dev->num_tx_queues; i++)
1901 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1902 NUMA_NO_NODE);
1903
1904 out_no_maps:
1905 mutex_unlock(&xps_map_mutex);
1906 }
1907
1908 static struct xps_map *expand_xps_map(struct xps_map *map,
1909 int cpu, u16 index)
1910 {
1911 struct xps_map *new_map;
1912 int alloc_len = XPS_MIN_MAP_ALLOC;
1913 int i, pos;
1914
1915 for (pos = 0; map && pos < map->len; pos++) {
1916 if (map->queues[pos] != index)
1917 continue;
1918 return map;
1919 }
1920
1921 /* Need to add queue to this CPU's existing map */
1922 if (map) {
1923 if (pos < map->alloc_len)
1924 return map;
1925
1926 alloc_len = map->alloc_len * 2;
1927 }
1928
1929 /* Need to allocate a new map to store the queue in this CPU's map */
1930 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1931 cpu_to_node(cpu));
1932 if (!new_map)
1933 return NULL;
1934
1935 for (i = 0; i < pos; i++)
1936 new_map->queues[i] = map->queues[i];
1937 new_map->alloc_len = alloc_len;
1938 new_map->len = pos;
1939
1940 return new_map;
1941 }
1942
1943 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1944 u16 index)
1945 {
1946 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1947 struct xps_map *map, *new_map;
1948 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1949 int cpu, numa_node_id = -2;
1950 bool active = false;
1951
1952 mutex_lock(&xps_map_mutex);
1953
1954 dev_maps = xmap_dereference(dev->xps_maps);
1955
1956 /* allocate memory for queue storage */
1957 for_each_online_cpu(cpu) {
1958 if (!cpumask_test_cpu(cpu, mask))
1959 continue;
1960
1961 if (!new_dev_maps)
1962 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1963 if (!new_dev_maps) {
1964 mutex_unlock(&xps_map_mutex);
1965 return -ENOMEM;
1966 }
1967
1968 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1969 NULL;
1970
1971 map = expand_xps_map(map, cpu, index);
1972 if (!map)
1973 goto error;
1974
1975 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1976 }
1977
1978 if (!new_dev_maps)
1979 goto out_no_new_maps;
1980
1981 for_each_possible_cpu(cpu) {
1982 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1983 /* add queue to CPU maps */
1984 int pos = 0;
1985
1986 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1987 while ((pos < map->len) && (map->queues[pos] != index))
1988 pos++;
1989
1990 if (pos == map->len)
1991 map->queues[map->len++] = index;
1992 #ifdef CONFIG_NUMA
1993 if (numa_node_id == -2)
1994 numa_node_id = cpu_to_node(cpu);
1995 else if (numa_node_id != cpu_to_node(cpu))
1996 numa_node_id = -1;
1997 #endif
1998 } else if (dev_maps) {
1999 /* fill in the new device map from the old device map */
2000 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2001 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2002 }
2003
2004 }
2005
2006 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2007
2008 /* Cleanup old maps */
2009 if (dev_maps) {
2010 for_each_possible_cpu(cpu) {
2011 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2012 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2013 if (map && map != new_map)
2014 kfree_rcu(map, rcu);
2015 }
2016
2017 kfree_rcu(dev_maps, rcu);
2018 }
2019
2020 dev_maps = new_dev_maps;
2021 active = true;
2022
2023 out_no_new_maps:
2024 /* update Tx queue numa node */
2025 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2026 (numa_node_id >= 0) ? numa_node_id :
2027 NUMA_NO_NODE);
2028
2029 if (!dev_maps)
2030 goto out_no_maps;
2031
2032 /* removes queue from unused CPUs */
2033 for_each_possible_cpu(cpu) {
2034 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2035 continue;
2036
2037 if (remove_xps_queue(dev_maps, cpu, index))
2038 active = true;
2039 }
2040
2041 /* free map if not active */
2042 if (!active) {
2043 RCU_INIT_POINTER(dev->xps_maps, NULL);
2044 kfree_rcu(dev_maps, rcu);
2045 }
2046
2047 out_no_maps:
2048 mutex_unlock(&xps_map_mutex);
2049
2050 return 0;
2051 error:
2052 /* remove any maps that we added */
2053 for_each_possible_cpu(cpu) {
2054 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2055 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2056 NULL;
2057 if (new_map && new_map != map)
2058 kfree(new_map);
2059 }
2060
2061 mutex_unlock(&xps_map_mutex);
2062
2063 kfree(new_dev_maps);
2064 return -ENOMEM;
2065 }
2066 EXPORT_SYMBOL(netif_set_xps_queue);
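
/*
 * Usage sketch: a hypothetical multiqueue driver ("foo", illustrative name)
 * steering each TX queue to a single CPU with netif_set_xps_queue().
 */
static void foo_setup_xps(struct net_device *dev)
{
	u16 qid = 0;
	int cpu, err;

	for_each_online_cpu(cpu) {
		if (qid >= dev->real_num_tx_queues)
			break;
		err = netif_set_xps_queue(dev, cpumask_of(cpu), qid);
		if (err)
			netdev_warn(dev, "XPS setup for queue %u failed: %d\n",
				    qid, err);
		qid++;
	}
}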
2067
2068 #endif
2069 /*
2070 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2071 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2072 */
2073 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2074 {
2075 int rc;
2076
2077 if (txq < 1 || txq > dev->num_tx_queues)
2078 return -EINVAL;
2079
2080 if (dev->reg_state == NETREG_REGISTERED ||
2081 dev->reg_state == NETREG_UNREGISTERING) {
2082 ASSERT_RTNL();
2083
2084 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2085 txq);
2086 if (rc)
2087 return rc;
2088
2089 if (dev->num_tc)
2090 netif_setup_tc(dev, txq);
2091
2092 if (txq < dev->real_num_tx_queues) {
2093 qdisc_reset_all_tx_gt(dev, txq);
2094 #ifdef CONFIG_XPS
2095 netif_reset_xps_queues_gt(dev, txq);
2096 #endif
2097 }
2098 }
2099
2100 dev->real_num_tx_queues = txq;
2101 return 0;
2102 }
2103 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2104
2105 #ifdef CONFIG_SYSFS
2106 /**
2107 * netif_set_real_num_rx_queues - set actual number of RX queues used
2108 * @dev: Network device
2109 * @rxq: Actual number of RX queues
2110 *
2111 * This must be called either with the rtnl_lock held or before
2112 * registration of the net device. Returns 0 on success, or a
2113 * negative error code. If called before registration, it always
2114 * succeeds.
2115 */
2116 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2117 {
2118 int rc;
2119
2120 if (rxq < 1 || rxq > dev->num_rx_queues)
2121 return -EINVAL;
2122
2123 if (dev->reg_state == NETREG_REGISTERED) {
2124 ASSERT_RTNL();
2125
2126 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2127 rxq);
2128 if (rc)
2129 return rc;
2130 }
2131
2132 dev->real_num_rx_queues = rxq;
2133 return 0;
2134 }
2135 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2136 #endif
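
/*
 * Usage sketch: a hypothetical ethtool-style channel change ("foo" names are
 * illustrative) resizing both directions under rtnl, as required above.
 */
static int foo_set_channels(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();		/* ethtool/ndo callers already hold rtnl */

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}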
2137
2138 /**
2139 * netif_get_num_default_rss_queues - default number of RSS queues
2140 *
2141 * This routine should set an upper limit on the number of RSS queues
2142 * used by default by multiqueue devices.
2143 */
2144 int netif_get_num_default_rss_queues(void)
2145 {
2146 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2147 }
2148 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
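
/*
 * Usage sketch: a hypothetical driver ("foo") capping its queue count at
 * probe time; FOO_MAX_QUEUES is an illustrative hardware limit.
 */
#define FOO_MAX_QUEUES 16

static unsigned int foo_pick_num_queues(void)
{
	return min_t(unsigned int, FOO_MAX_QUEUES,
		     netif_get_num_default_rss_queues());
}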
2149
2150 static inline void __netif_reschedule(struct Qdisc *q)
2151 {
2152 struct softnet_data *sd;
2153 unsigned long flags;
2154
2155 local_irq_save(flags);
2156 sd = &__get_cpu_var(softnet_data);
2157 q->next_sched = NULL;
2158 *sd->output_queue_tailp = q;
2159 sd->output_queue_tailp = &q->next_sched;
2160 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2161 local_irq_restore(flags);
2162 }
2163
2164 void __netif_schedule(struct Qdisc *q)
2165 {
2166 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2167 __netif_reschedule(q);
2168 }
2169 EXPORT_SYMBOL(__netif_schedule);
2170
2171 struct dev_kfree_skb_cb {
2172 enum skb_free_reason reason;
2173 };
2174
2175 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2176 {
2177 return (struct dev_kfree_skb_cb *)skb->cb;
2178 }
2179
2180 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2181 {
2182 unsigned long flags;
2183
2184 if (likely(atomic_read(&skb->users) == 1)) {
2185 smp_rmb();
2186 atomic_set(&skb->users, 0);
2187 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2188 return;
2189 }
2190 get_kfree_skb_cb(skb)->reason = reason;
2191 local_irq_save(flags);
2192 skb->next = __this_cpu_read(softnet_data.completion_queue);
2193 __this_cpu_write(softnet_data.completion_queue, skb);
2194 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2195 local_irq_restore(flags);
2196 }
2197 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2198
2199 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2200 {
2201 if (in_irq() || irqs_disabled())
2202 __dev_kfree_skb_irq(skb, reason);
2203 else
2204 dev_kfree_skb(skb);
2205 }
2206 EXPORT_SYMBOL(__dev_kfree_skb_any);
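
/*
 * Usage sketch: a TX completion path of a hypothetical driver ("foo") that
 * can run from hard IRQ, softirq or process context and therefore uses the
 * context-agnostic free helper above.
 */
static void foo_tx_complete(struct sk_buff *skb, bool xmit_ok)
{
	__dev_kfree_skb_any(skb, xmit_ok ? SKB_REASON_CONSUMED :
					   SKB_REASON_DROPPED);
}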
2207
2208
2209 /**
2210 * netif_device_detach - mark device as removed
2211 * @dev: network device
2212 *
2213 * Mark device as removed from the system and therefore no longer available.
2214 */
2215 void netif_device_detach(struct net_device *dev)
2216 {
2217 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2218 netif_running(dev)) {
2219 netif_tx_stop_all_queues(dev);
2220 }
2221 }
2222 EXPORT_SYMBOL(netif_device_detach);
2223
2224 /**
2225 * netif_device_attach - mark device as attached
2226 * @dev: network device
2227 *
2228 * Mark device as attached to the system and restart its queues if needed.
2229 */
2230 void netif_device_attach(struct net_device *dev)
2231 {
2232 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2233 netif_running(dev)) {
2234 netif_tx_wake_all_queues(dev);
2235 __netdev_watchdog_up(dev);
2236 }
2237 }
2238 EXPORT_SYMBOL(netif_device_attach);
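
/*
 * Usage sketch: suspend/resume hooks of a hypothetical driver ("foo")
 * pairing netif_device_detach() with netif_device_attach().
 */
static int foo_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... power the hardware down ... */
	return 0;
}

static int foo_resume(struct net_device *dev)
{
	/* ... power the hardware back up ... */
	netif_device_attach(dev);	/* restarts queues and the watchdog */
	return 0;
}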
2239
2240 static void skb_warn_bad_offload(const struct sk_buff *skb)
2241 {
2242 static const netdev_features_t null_features = 0;
2243 struct net_device *dev = skb->dev;
2244 const char *driver = "";
2245
2246 if (!net_ratelimit())
2247 return;
2248
2249 if (dev && dev->dev.parent)
2250 driver = dev_driver_string(dev->dev.parent);
2251
2252 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2253 "gso_type=%d ip_summed=%d\n",
2254 driver, dev ? &dev->features : &null_features,
2255 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2256 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2257 skb_shinfo(skb)->gso_type, skb->ip_summed);
2258 }
2259
2260 /*
2261 * Invalidate hardware checksum when packet is to be mangled, and
2262 * complete checksum manually on outgoing path.
2263 */
2264 int skb_checksum_help(struct sk_buff *skb)
2265 {
2266 __wsum csum;
2267 int ret = 0, offset;
2268
2269 if (skb->ip_summed == CHECKSUM_COMPLETE)
2270 goto out_set_summed;
2271
2272 if (unlikely(skb_shinfo(skb)->gso_size)) {
2273 skb_warn_bad_offload(skb);
2274 return -EINVAL;
2275 }
2276
2277 /* Before computing a checksum, we should make sure no frag could
2278 * be modified by an external entity: the checksum could be wrong.
2279 */
2280 if (skb_has_shared_frag(skb)) {
2281 ret = __skb_linearize(skb);
2282 if (ret)
2283 goto out;
2284 }
2285
2286 offset = skb_checksum_start_offset(skb);
2287 BUG_ON(offset >= skb_headlen(skb));
2288 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2289
2290 offset += skb->csum_offset;
2291 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2292
2293 if (skb_cloned(skb) &&
2294 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2295 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2296 if (ret)
2297 goto out;
2298 }
2299
2300 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2301 out_set_summed:
2302 skb->ip_summed = CHECKSUM_NONE;
2303 out:
2304 return ret;
2305 }
2306 EXPORT_SYMBOL(skb_checksum_help);
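
/*
 * Usage sketch: before software mangles the payload of a CHECKSUM_PARTIAL
 * skb, the pending hardware checksum must be resolved with
 * skb_checksum_help(). foo_mangle() stands in for the actual editing.
 */
static int foo_prepare_to_mangle(struct sk_buff *skb)
{
	int err;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		err = skb_checksum_help(skb);
		if (err)
			return err;
	}
	/* foo_mangle(skb); -- safe to edit payload bytes from here on */
	return 0;
}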
2307
2308 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2309 {
2310 unsigned int vlan_depth = skb->mac_len;
2311 __be16 type = skb->protocol;
2312
2313 /* Tunnel gso handlers can set protocol to ethernet. */
2314 if (type == htons(ETH_P_TEB)) {
2315 struct ethhdr *eth;
2316
2317 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2318 return 0;
2319
2320 eth = (struct ethhdr *)skb_mac_header(skb);
2321 type = eth->h_proto;
2322 }
2323
2324 /* if skb->protocol is 802.1Q/AD then the header should already be
2325 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2326 * ETH_HLEN otherwise
2327 */
2328 if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2329 if (vlan_depth) {
2330 if (WARN_ON(vlan_depth < VLAN_HLEN))
2331 return 0;
2332 vlan_depth -= VLAN_HLEN;
2333 } else {
2334 vlan_depth = ETH_HLEN;
2335 }
2336 do {
2337 struct vlan_hdr *vh;
2338
2339 if (unlikely(!pskb_may_pull(skb,
2340 vlan_depth + VLAN_HLEN)))
2341 return 0;
2342
2343 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2344 type = vh->h_vlan_encapsulated_proto;
2345 vlan_depth += VLAN_HLEN;
2346 } while (type == htons(ETH_P_8021Q) ||
2347 type == htons(ETH_P_8021AD));
2348 }
2349
2350 *depth = vlan_depth;
2351
2352 return type;
2353 }
2354
2355 /**
2356 * skb_mac_gso_segment - mac layer segmentation handler.
2357 * @skb: buffer to segment
2358 * @features: features for the output path (see dev->features)
2359 */
2360 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2361 netdev_features_t features)
2362 {
2363 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2364 struct packet_offload *ptype;
2365 int vlan_depth = skb->mac_len;
2366 __be16 type = skb_network_protocol(skb, &vlan_depth);
2367
2368 if (unlikely(!type))
2369 return ERR_PTR(-EINVAL);
2370
2371 __skb_pull(skb, vlan_depth);
2372
2373 rcu_read_lock();
2374 list_for_each_entry_rcu(ptype, &offload_base, list) {
2375 if (ptype->type == type && ptype->callbacks.gso_segment) {
2376 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2377 int err;
2378
2379 err = ptype->callbacks.gso_send_check(skb);
2380 segs = ERR_PTR(err);
2381 if (err || skb_gso_ok(skb, features))
2382 break;
2383 __skb_push(skb, (skb->data -
2384 skb_network_header(skb)));
2385 }
2386 segs = ptype->callbacks.gso_segment(skb, features);
2387 break;
2388 }
2389 }
2390 rcu_read_unlock();
2391
2392 __skb_push(skb, skb->data - skb_mac_header(skb));
2393
2394 return segs;
2395 }
2396 EXPORT_SYMBOL(skb_mac_gso_segment);
2397
2398
2399 /* openvswitch calls this on rx path, so we need a different check.
2400 */
2401 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2402 {
2403 if (tx_path)
2404 return skb->ip_summed != CHECKSUM_PARTIAL;
2405 else
2406 return skb->ip_summed == CHECKSUM_NONE;
2407 }
2408
2409 /**
2410 * __skb_gso_segment - Perform segmentation on skb.
2411 * @skb: buffer to segment
2412 * @features: features for the output path (see dev->features)
2413 * @tx_path: whether it is called in TX path
2414 *
2415 * This function segments the given skb and returns a list of segments.
2416 *
2417 * It may return NULL if the skb requires no segmentation. This is
2418 * only possible when GSO is used for verifying header integrity.
2419 */
2420 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2421 netdev_features_t features, bool tx_path)
2422 {
2423 if (unlikely(skb_needs_check(skb, tx_path))) {
2424 int err;
2425
2426 skb_warn_bad_offload(skb);
2427
2428 err = skb_cow_head(skb, 0);
2429 if (err < 0)
2430 return ERR_PTR(err);
2431 }
2432
2433 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2434 SKB_GSO_CB(skb)->encap_level = 0;
2435
2436 skb_reset_mac_header(skb);
2437 skb_reset_mac_len(skb);
2438
2439 return skb_mac_gso_segment(skb, features);
2440 }
2441 EXPORT_SYMBOL(__skb_gso_segment);
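
/*
 * Usage sketch: segmenting in software when the device cannot.
 * skb_gso_segment() is the tx-path wrapper around __skb_gso_segment();
 * foo_hw_xmit() is a hypothetical low-level transmit helper of an assumed
 * driver, not a real API.
 */
static int foo_sw_gso(struct sk_buff *skb, struct net_device *dev,
		      netdev_features_t features)
{
	struct sk_buff *segs, *nskb;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)			/* header verification only */
		return foo_hw_xmit(dev, skb);

	consume_skb(skb);		/* original is now covered by the segments */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		foo_hw_xmit(dev, nskb);
	}
	return 0;
}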
2442
2443 /* Take action when hardware reception checksum errors are detected. */
2444 #ifdef CONFIG_BUG
2445 void netdev_rx_csum_fault(struct net_device *dev)
2446 {
2447 if (net_ratelimit()) {
2448 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2449 dump_stack();
2450 }
2451 }
2452 EXPORT_SYMBOL(netdev_rx_csum_fault);
2453 #endif
2454
2455 /* Actually, we should eliminate this check as soon as we know that:
2456 * 1. An IOMMU is present and can map all of the memory.
2457 * 2. No high memory really exists on this machine.
2458 */
2459
2460 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2461 {
2462 #ifdef CONFIG_HIGHMEM
2463 int i;
2464 if (!(dev->features & NETIF_F_HIGHDMA)) {
2465 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2466 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2467 if (PageHighMem(skb_frag_page(frag)))
2468 return 1;
2469 }
2470 }
2471
2472 if (PCI_DMA_BUS_IS_PHYS) {
2473 struct device *pdev = dev->dev.parent;
2474
2475 if (!pdev)
2476 return 0;
2477 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2478 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2479 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2480 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2481 return 1;
2482 }
2483 }
2484 #endif
2485 return 0;
2486 }
2487
2488 struct dev_gso_cb {
2489 void (*destructor)(struct sk_buff *skb);
2490 };
2491
2492 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2493
2494 static void dev_gso_skb_destructor(struct sk_buff *skb)
2495 {
2496 struct dev_gso_cb *cb;
2497
2498 kfree_skb_list(skb->next);
2499 skb->next = NULL;
2500
2501 cb = DEV_GSO_CB(skb);
2502 if (cb->destructor)
2503 cb->destructor(skb);
2504 }
2505
2506 /**
2507 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2508 * @skb: buffer to segment
2509 * @features: device features as applicable to this skb
2510 *
2511 * This function segments the given skb and stores the list of segments
2512 * in skb->next.
2513 */
2514 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2515 {
2516 struct sk_buff *segs;
2517
2518 segs = skb_gso_segment(skb, features);
2519
2520 /* Verifying header integrity only. */
2521 if (!segs)
2522 return 0;
2523
2524 if (IS_ERR(segs))
2525 return PTR_ERR(segs);
2526
2527 skb->next = segs;
2528 DEV_GSO_CB(skb)->destructor = skb->destructor;
2529 skb->destructor = dev_gso_skb_destructor;
2530
2531 return 0;
2532 }
2533
2534 /* If this is an MPLS offload request, verify we are testing hardware MPLS features
2535 * instead of standard features for the netdev.
2536 */
2537 #ifdef CONFIG_NET_MPLS_GSO
2538 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2539 netdev_features_t features,
2540 __be16 type)
2541 {
2542 if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2543 features &= skb->dev->mpls_features;
2544
2545 return features;
2546 }
2547 #else
2548 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2549 netdev_features_t features,
2550 __be16 type)
2551 {
2552 return features;
2553 }
2554 #endif
2555
2556 static netdev_features_t harmonize_features(struct sk_buff *skb,
2557 netdev_features_t features)
2558 {
2559 int tmp;
2560 __be16 type;
2561
2562 type = skb_network_protocol(skb, &tmp);
2563 features = net_mpls_features(skb, features, type);
2564
2565 if (skb->ip_summed != CHECKSUM_NONE &&
2566 !can_checksum_protocol(features, type)) {
2567 features &= ~NETIF_F_ALL_CSUM;
2568 } else if (illegal_highdma(skb->dev, skb)) {
2569 features &= ~NETIF_F_SG;
2570 }
2571
2572 return features;
2573 }
2574
2575 netdev_features_t netif_skb_features(struct sk_buff *skb)
2576 {
2577 __be16 protocol = skb->protocol;
2578 netdev_features_t features = skb->dev->features;
2579
2580 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2581 features &= ~NETIF_F_GSO_MASK;
2582
2583 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2584 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2585 protocol = veh->h_vlan_encapsulated_proto;
2586 } else if (!vlan_tx_tag_present(skb)) {
2587 return harmonize_features(skb, features);
2588 }
2589
2590 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2591 NETIF_F_HW_VLAN_STAG_TX);
2592
2593 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2594 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2595 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2596 NETIF_F_HW_VLAN_STAG_TX;
2597
2598 return harmonize_features(skb, features);
2599 }
2600 EXPORT_SYMBOL(netif_skb_features);
2601
2602 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2603 struct netdev_queue *txq)
2604 {
2605 int rc = NETDEV_TX_OK;
2606 unsigned int skb_len;
2607
2608 if (likely(!skb->next)) {
2609 netdev_features_t features;
2610
2611 /*
2612 * If the device doesn't need skb->dst, release it right now while
2613 * it's hot in this CPU's cache.
2614 */
2615 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2616 skb_dst_drop(skb);
2617
2618 features = netif_skb_features(skb);
2619
2620 if (vlan_tx_tag_present(skb) &&
2621 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2622 skb = __vlan_put_tag(skb, skb->vlan_proto,
2623 vlan_tx_tag_get(skb));
2624 if (unlikely(!skb))
2625 goto out;
2626
2627 skb->vlan_tci = 0;
2628 }
2629
2630 /* If this is an encapsulation offload request, verify we are testing
2631 * hardware encapsulation features instead of standard
2632 * features for the netdev.
2633 */
2634 if (skb->encapsulation)
2635 features &= dev->hw_enc_features;
2636
2637 if (netif_needs_gso(skb, features)) {
2638 if (unlikely(dev_gso_segment(skb, features)))
2639 goto out_kfree_skb;
2640 if (skb->next)
2641 goto gso;
2642 } else {
2643 if (skb_needs_linearize(skb, features) &&
2644 __skb_linearize(skb))
2645 goto out_kfree_skb;
2646
2647 /* If packet is not checksummed and device does not
2648 * support checksumming for this protocol, complete
2649 * checksumming here.
2650 */
2651 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2652 if (skb->encapsulation)
2653 skb_set_inner_transport_header(skb,
2654 skb_checksum_start_offset(skb));
2655 else
2656 skb_set_transport_header(skb,
2657 skb_checksum_start_offset(skb));
2658 if (!(features & NETIF_F_ALL_CSUM) &&
2659 skb_checksum_help(skb))
2660 goto out_kfree_skb;
2661 }
2662 }
2663
2664 if (!list_empty(&ptype_all))
2665 dev_queue_xmit_nit(skb, dev);
2666
2667 skb_len = skb->len;
2668 trace_net_dev_start_xmit(skb, dev);
2669 rc = netdev_start_xmit(skb, dev);
2670 trace_net_dev_xmit(skb, rc, dev, skb_len);
2671 if (rc == NETDEV_TX_OK)
2672 txq_trans_update(txq);
2673 return rc;
2674 }
2675
2676 gso:
2677 do {
2678 struct sk_buff *nskb = skb->next;
2679
2680 skb->next = nskb->next;
2681 nskb->next = NULL;
2682
2683 if (!list_empty(&ptype_all))
2684 dev_queue_xmit_nit(nskb, dev);
2685
2686 skb_len = nskb->len;
2687 trace_net_dev_start_xmit(nskb, dev);
2688 rc = netdev_start_xmit(nskb, dev);
2689 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2690 if (unlikely(rc != NETDEV_TX_OK)) {
2691 if (rc & ~NETDEV_TX_MASK)
2692 goto out_kfree_gso_skb;
2693 nskb->next = skb->next;
2694 skb->next = nskb;
2695 return rc;
2696 }
2697 txq_trans_update(txq);
2698 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2699 return NETDEV_TX_BUSY;
2700 } while (skb->next);
2701
2702 out_kfree_gso_skb:
2703 if (likely(skb->next == NULL)) {
2704 skb->destructor = DEV_GSO_CB(skb)->destructor;
2705 consume_skb(skb);
2706 return rc;
2707 }
2708 out_kfree_skb:
2709 kfree_skb(skb);
2710 out:
2711 return rc;
2712 }
2713 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2714
2715 static void qdisc_pkt_len_init(struct sk_buff *skb)
2716 {
2717 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2718
2719 qdisc_skb_cb(skb)->pkt_len = skb->len;
2720
2721 /* To get a more precise estimate of bytes sent on the wire,
2722 * we add to pkt_len the header size of all segments
2723 */
2724 if (shinfo->gso_size) {
2725 unsigned int hdr_len;
2726 u16 gso_segs = shinfo->gso_segs;
2727
2728 /* mac layer + network layer */
2729 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2730
2731 /* + transport layer */
2732 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2733 hdr_len += tcp_hdrlen(skb);
2734 else
2735 hdr_len += sizeof(struct udphdr);
2736
2737 if (shinfo->gso_type & SKB_GSO_DODGY)
2738 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2739 shinfo->gso_size);
2740
2741 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2742 }
2743 }
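
/*
 * Worked example of the estimate above (illustrative numbers): a TSO skb
 * with skb->len = 65226, gso_size = 1448 and hdr_len = 66 (14 MAC + 20 IP
 * + 32 TCP) carries gso_segs = (65226 - 66) / 1448 = 45 segments, so
 * pkt_len becomes 65226 + 44 * 66 = 68130 bytes of estimated wire data.
 */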
2744
2745 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2746 struct net_device *dev,
2747 struct netdev_queue *txq)
2748 {
2749 spinlock_t *root_lock = qdisc_lock(q);
2750 bool contended;
2751 int rc;
2752
2753 qdisc_pkt_len_init(skb);
2754 qdisc_calculate_pkt_len(skb, q);
2755 /*
2756 * Heuristic to force contended enqueues to serialize on a
2757 * separate lock before trying to get qdisc main lock.
2758 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2759 * often and dequeue packets faster.
2760 */
2761 contended = qdisc_is_running(q);
2762 if (unlikely(contended))
2763 spin_lock(&q->busylock);
2764
2765 spin_lock(root_lock);
2766 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2767 kfree_skb(skb);
2768 rc = NET_XMIT_DROP;
2769 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2770 qdisc_run_begin(q)) {
2771 /*
2772 * This is a work-conserving queue; there are no old skbs
2773 * waiting to be sent out; and the qdisc is not running -
2774 * xmit the skb directly.
2775 */
2776 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2777 skb_dst_force(skb);
2778
2779 qdisc_bstats_update(q, skb);
2780
2781 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2782 if (unlikely(contended)) {
2783 spin_unlock(&q->busylock);
2784 contended = false;
2785 }
2786 __qdisc_run(q);
2787 } else
2788 qdisc_run_end(q);
2789
2790 rc = NET_XMIT_SUCCESS;
2791 } else {
2792 skb_dst_force(skb);
2793 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2794 if (qdisc_run_begin(q)) {
2795 if (unlikely(contended)) {
2796 spin_unlock(&q->busylock);
2797 contended = false;
2798 }
2799 __qdisc_run(q);
2800 }
2801 }
2802 spin_unlock(root_lock);
2803 if (unlikely(contended))
2804 spin_unlock(&q->busylock);
2805 return rc;
2806 }
2807
2808 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2809 static void skb_update_prio(struct sk_buff *skb)
2810 {
2811 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2812
2813 if (!skb->priority && skb->sk && map) {
2814 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2815
2816 if (prioidx < map->priomap_len)
2817 skb->priority = map->priomap[prioidx];
2818 }
2819 }
2820 #else
2821 #define skb_update_prio(skb)
2822 #endif
2823
2824 static DEFINE_PER_CPU(int, xmit_recursion);
2825 #define RECURSION_LIMIT 10
2826
2827 /**
2828 * dev_loopback_xmit - loop back @skb
2829 * @skb: buffer to transmit
2830 */
2831 int dev_loopback_xmit(struct sk_buff *skb)
2832 {
2833 skb_reset_mac_header(skb);
2834 __skb_pull(skb, skb_network_offset(skb));
2835 skb->pkt_type = PACKET_LOOPBACK;
2836 skb->ip_summed = CHECKSUM_UNNECESSARY;
2837 WARN_ON(!skb_dst(skb));
2838 skb_dst_force(skb);
2839 netif_rx_ni(skb);
2840 return 0;
2841 }
2842 EXPORT_SYMBOL(dev_loopback_xmit);
2843
2844 /**
2845 * __dev_queue_xmit - transmit a buffer
2846 * @skb: buffer to transmit
2847 * @accel_priv: private data used for L2 forwarding offload
2848 *
2849 * Queue a buffer for transmission to a network device. The caller must
2850 * have set the device and priority and built the buffer before calling
2851 * this function. The function can be called from an interrupt.
2852 *
2853 * A negative errno code is returned on a failure. A success does not
2854 * guarantee the frame will be transmitted as it may be dropped due
2855 * to congestion or traffic shaping.
2856 *
2857 * -----------------------------------------------------------------------------------
2858 * I notice this method can also return errors from the queue disciplines,
2859 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2860 * be positive.
2861 *
2862 * Regardless of the return value, the skb is consumed, so it is currently
2863 * difficult to retry a send to this method. (You can bump the ref count
2864 * before sending to hold a reference for retry if you are careful.)
2865 *
2866 * When calling this method, interrupts MUST be enabled. This is because
2867 * the BH enable code must have IRQs enabled so that it will not deadlock.
2868 * --BLG
2869 */
2870 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2871 {
2872 struct net_device *dev = skb->dev;
2873 struct netdev_queue *txq;
2874 struct Qdisc *q;
2875 int rc = -ENOMEM;
2876
2877 skb_reset_mac_header(skb);
2878
2879 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2880 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2881
2882 /* Disable soft irqs for various locks below. Also
2883 * stops preemption for RCU.
2884 */
2885 rcu_read_lock_bh();
2886
2887 skb_update_prio(skb);
2888
2889 txq = netdev_pick_tx(dev, skb, accel_priv);
2890 q = rcu_dereference_bh(txq->qdisc);
2891
2892 #ifdef CONFIG_NET_CLS_ACT
2893 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2894 #endif
2895 trace_net_dev_queue(skb);
2896 if (q->enqueue) {
2897 rc = __dev_xmit_skb(skb, q, dev, txq);
2898 goto out;
2899 }
2900
2901 /* The device has no queue. Common case for software devices:
2902 loopback, all sorts of tunnels...
2903
2904 Really, it is unlikely that netif_tx_lock protection is necessary
2905 here (e.g. loopback and IP tunnels are clean, ignoring statistics
2906 counters).
2907 However, it is possible that they rely on the protection
2908 we provide here.
2909
2910 Check this and take the lock. It is not prone to deadlocks.
2911 Taking it for the noqueue qdisc is even simpler 8)
2912 */
2913 if (dev->flags & IFF_UP) {
2914 int cpu = smp_processor_id(); /* ok because BHs are off */
2915
2916 if (txq->xmit_lock_owner != cpu) {
2917
2918 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2919 goto recursion_alert;
2920
2921 HARD_TX_LOCK(dev, txq, cpu);
2922
2923 if (!netif_xmit_stopped(txq)) {
2924 __this_cpu_inc(xmit_recursion);
2925 rc = dev_hard_start_xmit(skb, dev, txq);
2926 __this_cpu_dec(xmit_recursion);
2927 if (dev_xmit_complete(rc)) {
2928 HARD_TX_UNLOCK(dev, txq);
2929 goto out;
2930 }
2931 }
2932 HARD_TX_UNLOCK(dev, txq);
2933 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2934 dev->name);
2935 } else {
2936 /* Recursion is detected! It is possible,
2937 * unfortunately
2938 */
2939 recursion_alert:
2940 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2941 dev->name);
2942 }
2943 }
2944
2945 rc = -ENETDOWN;
2946 rcu_read_unlock_bh();
2947
2948 atomic_long_inc(&dev->tx_dropped);
2949 kfree_skb(skb);
2950 return rc;
2951 out:
2952 rcu_read_unlock_bh();
2953 return rc;
2954 }
2955
2956 int dev_queue_xmit(struct sk_buff *skb)
2957 {
2958 return __dev_queue_xmit(skb, NULL);
2959 }
2960 EXPORT_SYMBOL(dev_queue_xmit);
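
/*
 * Usage sketch: building and queueing a raw test frame from a hypothetical
 * module ("foo"). ETH_P_802_EX1 is the local-experimental ethertype; the
 * frame layout is purely illustrative.
 */
static int foo_send_test_frame(struct net_device *dev, const u8 *dst_mac,
			       const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, LL_RESERVED_SPACE(dev) + len);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), payload, len);
	skb->protocol = htons(ETH_P_802_EX1);

	if (dev_hard_header(skb, dev, ETH_P_802_EX1, dst_mac,
			    dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dev_queue_xmit(skb);	/* consumes the skb in all cases */
}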
2961
2962 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2963 {
2964 return __dev_queue_xmit(skb, accel_priv);
2965 }
2966 EXPORT_SYMBOL(dev_queue_xmit_accel);
2967
2968
2969 /*=======================================================================
2970 Receiver routines
2971 =======================================================================*/
2972
2973 int netdev_max_backlog __read_mostly = 1000;
2974 EXPORT_SYMBOL(netdev_max_backlog);
2975
2976 int netdev_tstamp_prequeue __read_mostly = 1;
2977 int netdev_budget __read_mostly = 300;
2978 int weight_p __read_mostly = 64; /* old backlog weight */
2979
2980 /* Called with irq disabled */
2981 static inline void ____napi_schedule(struct softnet_data *sd,
2982 struct napi_struct *napi)
2983 {
2984 list_add_tail(&napi->poll_list, &sd->poll_list);
2985 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2986 }
2987
2988 #ifdef CONFIG_RPS
2989
2990 /* One global table that all flow-based protocols share. */
2991 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2992 EXPORT_SYMBOL(rps_sock_flow_table);
2993
2994 struct static_key rps_needed __read_mostly;
2995
2996 static struct rps_dev_flow *
2997 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2998 struct rps_dev_flow *rflow, u16 next_cpu)
2999 {
3000 if (next_cpu != RPS_NO_CPU) {
3001 #ifdef CONFIG_RFS_ACCEL
3002 struct netdev_rx_queue *rxqueue;
3003 struct rps_dev_flow_table *flow_table;
3004 struct rps_dev_flow *old_rflow;
3005 u32 flow_id;
3006 u16 rxq_index;
3007 int rc;
3008
3009 /* Should we steer this flow to a different hardware queue? */
3010 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3011 !(dev->features & NETIF_F_NTUPLE))
3012 goto out;
3013 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3014 if (rxq_index == skb_get_rx_queue(skb))
3015 goto out;
3016
3017 rxqueue = dev->_rx + rxq_index;
3018 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3019 if (!flow_table)
3020 goto out;
3021 flow_id = skb_get_hash(skb) & flow_table->mask;
3022 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3023 rxq_index, flow_id);
3024 if (rc < 0)
3025 goto out;
3026 old_rflow = rflow;
3027 rflow = &flow_table->flows[flow_id];
3028 rflow->filter = rc;
3029 if (old_rflow->filter == rflow->filter)
3030 old_rflow->filter = RPS_NO_FILTER;
3031 out:
3032 #endif
3033 rflow->last_qtail =
3034 per_cpu(softnet_data, next_cpu).input_queue_head;
3035 }
3036
3037 rflow->cpu = next_cpu;
3038 return rflow;
3039 }
3040
3041 /*
3042 * get_rps_cpu is called from netif_receive_skb and returns the target
3043 * CPU from the RPS map of the receiving queue for a given skb.
3044 * rcu_read_lock must be held on entry.
3045 */
3046 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3047 struct rps_dev_flow **rflowp)
3048 {
3049 struct netdev_rx_queue *rxqueue;
3050 struct rps_map *map;
3051 struct rps_dev_flow_table *flow_table;
3052 struct rps_sock_flow_table *sock_flow_table;
3053 int cpu = -1;
3054 u16 tcpu;
3055 u32 hash;
3056
3057 if (skb_rx_queue_recorded(skb)) {
3058 u16 index = skb_get_rx_queue(skb);
3059 if (unlikely(index >= dev->real_num_rx_queues)) {
3060 WARN_ONCE(dev->real_num_rx_queues > 1,
3061 "%s received packet on queue %u, but number "
3062 "of RX queues is %u\n",
3063 dev->name, index, dev->real_num_rx_queues);
3064 goto done;
3065 }
3066 rxqueue = dev->_rx + index;
3067 } else
3068 rxqueue = dev->_rx;
3069
3070 map = rcu_dereference(rxqueue->rps_map);
3071 if (map) {
3072 if (map->len == 1 &&
3073 !rcu_access_pointer(rxqueue->rps_flow_table)) {
3074 tcpu = map->cpus[0];
3075 if (cpu_online(tcpu))
3076 cpu = tcpu;
3077 goto done;
3078 }
3079 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3080 goto done;
3081 }
3082
3083 skb_reset_network_header(skb);
3084 hash = skb_get_hash(skb);
3085 if (!hash)
3086 goto done;
3087
3088 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3089 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3090 if (flow_table && sock_flow_table) {
3091 u16 next_cpu;
3092 struct rps_dev_flow *rflow;
3093
3094 rflow = &flow_table->flows[hash & flow_table->mask];
3095 tcpu = rflow->cpu;
3096
3097 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3098
3099 /*
3100 * If the desired CPU (where last recvmsg was done) is
3101 * different from current CPU (one in the rx-queue flow
3102 * table entry), switch if one of the following holds:
3103 * - Current CPU is unset (equal to RPS_NO_CPU).
3104 * - Current CPU is offline.
3105 * - The current CPU's queue tail has advanced beyond the
3106 * last packet that was enqueued using this table entry.
3107 * This guarantees that all previous packets for the flow
3108 * have been dequeued, thus preserving in-order delivery.
3109 */
3110 if (unlikely(tcpu != next_cpu) &&
3111 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3112 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3113 rflow->last_qtail)) >= 0)) {
3114 tcpu = next_cpu;
3115 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3116 }
3117
3118 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3119 *rflowp = rflow;
3120 cpu = tcpu;
3121 goto done;
3122 }
3123 }
3124
3125 if (map) {
3126 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3127 if (cpu_online(tcpu)) {
3128 cpu = tcpu;
3129 goto done;
3130 }
3131 }
3132
3133 done:
3134 return cpu;
3135 }
3136
3137 #ifdef CONFIG_RFS_ACCEL
3138
3139 /**
3140 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3141 * @dev: Device on which the filter was set
3142 * @rxq_index: RX queue index
3143 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3144 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3145 *
3146 * Drivers that implement ndo_rx_flow_steer() should periodically call
3147 * this function for each installed filter and remove the filters for
3148 * which it returns %true.
3149 */
3150 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3151 u32 flow_id, u16 filter_id)
3152 {
3153 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3154 struct rps_dev_flow_table *flow_table;
3155 struct rps_dev_flow *rflow;
3156 bool expire = true;
3157 int cpu;
3158
3159 rcu_read_lock();
3160 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3161 if (flow_table && flow_id <= flow_table->mask) {
3162 rflow = &flow_table->flows[flow_id];
3163 cpu = ACCESS_ONCE(rflow->cpu);
3164 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3165 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3166 rflow->last_qtail) <
3167 (int)(10 * flow_table->mask)))
3168 expire = false;
3169 }
3170 rcu_read_unlock();
3171 return expire;
3172 }
3173 EXPORT_SYMBOL(rps_may_expire_flow);
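
/*
 * Usage sketch: how a hypothetical driver ("foo") might age out its
 * accelerated RFS filters. struct foo_rfs_filter and its table are
 * illustrative bookkeeping, not a real driver structure.
 */
struct foo_rfs_filter {
	bool	installed;
	u16	rxq_index;
	u32	flow_id;
};

static void foo_expire_rfs_filters(struct net_device *dev,
				   struct foo_rfs_filter *tbl, u16 n)
{
	u16 i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].installed)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, i)) {
			/* remove the hardware steering rule for slot i here */
			tbl[i].installed = false;
		}
	}
}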
3174
3175 #endif /* CONFIG_RFS_ACCEL */
3176
3177 /* Called from hardirq (IPI) context */
3178 static void rps_trigger_softirq(void *data)
3179 {
3180 struct softnet_data *sd = data;
3181
3182 ____napi_schedule(sd, &sd->backlog);
3183 sd->received_rps++;
3184 }
3185
3186 #endif /* CONFIG_RPS */
3187
3188 /*
3189 * Check if this softnet_data structure belongs to another CPU.
3190 * If yes, queue it to our IPI list and return 1;
3191 * if no, return 0.
3192 */
3193 static int rps_ipi_queued(struct softnet_data *sd)
3194 {
3195 #ifdef CONFIG_RPS
3196 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3197
3198 if (sd != mysd) {
3199 sd->rps_ipi_next = mysd->rps_ipi_list;
3200 mysd->rps_ipi_list = sd;
3201
3202 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3203 return 1;
3204 }
3205 #endif /* CONFIG_RPS */
3206 return 0;
3207 }
3208
3209 #ifdef CONFIG_NET_FLOW_LIMIT
3210 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3211 #endif
3212
3213 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3214 {
3215 #ifdef CONFIG_NET_FLOW_LIMIT
3216 struct sd_flow_limit *fl;
3217 struct softnet_data *sd;
3218 unsigned int old_flow, new_flow;
3219
3220 if (qlen < (netdev_max_backlog >> 1))
3221 return false;
3222
3223 sd = &__get_cpu_var(softnet_data);
3224
3225 rcu_read_lock();
3226 fl = rcu_dereference(sd->flow_limit);
3227 if (fl) {
3228 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3229 old_flow = fl->history[fl->history_head];
3230 fl->history[fl->history_head] = new_flow;
3231
3232 fl->history_head++;
3233 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3234
3235 if (likely(fl->buckets[old_flow]))
3236 fl->buckets[old_flow]--;
3237
3238 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3239 fl->count++;
3240 rcu_read_unlock();
3241 return true;
3242 }
3243 }
3244 rcu_read_unlock();
3245 #endif
3246 return false;
3247 }
3248
3249 /*
3250 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3251 * queue (may be a remote CPU queue).
3252 */
3253 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3254 unsigned int *qtail)
3255 {
3256 struct softnet_data *sd;
3257 unsigned long flags;
3258 unsigned int qlen;
3259
3260 sd = &per_cpu(softnet_data, cpu);
3261
3262 local_irq_save(flags);
3263
3264 rps_lock(sd);
3265 qlen = skb_queue_len(&sd->input_pkt_queue);
3266 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3267 if (skb_queue_len(&sd->input_pkt_queue)) {
3268 enqueue:
3269 __skb_queue_tail(&sd->input_pkt_queue, skb);
3270 input_queue_tail_incr_save(sd, qtail);
3271 rps_unlock(sd);
3272 local_irq_restore(flags);
3273 return NET_RX_SUCCESS;
3274 }
3275
3276 /* Schedule NAPI for the backlog device.
3277 * We can use a non-atomic operation since we own the queue lock.
3278 */
3279 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3280 if (!rps_ipi_queued(sd))
3281 ____napi_schedule(sd, &sd->backlog);
3282 }
3283 goto enqueue;
3284 }
3285
3286 sd->dropped++;
3287 rps_unlock(sd);
3288
3289 local_irq_restore(flags);
3290
3291 atomic_long_inc(&skb->dev->rx_dropped);
3292 kfree_skb(skb);
3293 return NET_RX_DROP;
3294 }
3295
3296 static int netif_rx_internal(struct sk_buff *skb)
3297 {
3298 int ret;
3299
3300 net_timestamp_check(netdev_tstamp_prequeue, skb);
3301
3302 trace_netif_rx(skb);
3303 #ifdef CONFIG_RPS
3304 if (static_key_false(&rps_needed)) {
3305 struct rps_dev_flow voidflow, *rflow = &voidflow;
3306 int cpu;
3307
3308 preempt_disable();
3309 rcu_read_lock();
3310
3311 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3312 if (cpu < 0)
3313 cpu = smp_processor_id();
3314
3315 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3316
3317 rcu_read_unlock();
3318 preempt_enable();
3319 } else
3320 #endif
3321 {
3322 unsigned int qtail;
3323 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3324 put_cpu();
3325 }
3326 return ret;
3327 }
3328
3329 /**
3330 * netif_rx - post buffer to the network code
3331 * @skb: buffer to post
3332 *
3333 * This function receives a packet from a device driver and queues it for
3334 * the upper (protocol) levels to process. It always succeeds. The buffer
3335 * may be dropped during processing for congestion control or by the
3336 * protocol layers.
3337 *
3338 * return values:
3339 * NET_RX_SUCCESS (no congestion)
3340 * NET_RX_DROP (packet was dropped)
3341 *
3342 */
3343
3344 int netif_rx(struct sk_buff *skb)
3345 {
3346 trace_netif_rx_entry(skb);
3347
3348 return netif_rx_internal(skb);
3349 }
3350 EXPORT_SYMBOL(netif_rx);
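
/*
 * Usage sketch: the RX side of a hypothetical non-NAPI driver ("foo")
 * copying a received frame out of its hardware buffer and posting it with
 * netif_rx().
 */
static void foo_rx_one(struct net_device *dev, const void *buf,
		       unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);		/* queues to the per-CPU backlog */
	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}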
3351
3352 int netif_rx_ni(struct sk_buff *skb)
3353 {
3354 int err;
3355
3356 trace_netif_rx_ni_entry(skb);
3357
3358 preempt_disable();
3359 err = netif_rx_internal(skb);
3360 if (local_softirq_pending())
3361 do_softirq();
3362 preempt_enable();
3363
3364 return err;
3365 }
3366 EXPORT_SYMBOL(netif_rx_ni);
3367
3368 static void net_tx_action(struct softirq_action *h)
3369 {
3370 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3371
3372 if (sd->completion_queue) {
3373 struct sk_buff *clist;
3374
3375 local_irq_disable();
3376 clist = sd->completion_queue;
3377 sd->completion_queue = NULL;
3378 local_irq_enable();
3379
3380 while (clist) {
3381 struct sk_buff *skb = clist;
3382 clist = clist->next;
3383
3384 WARN_ON(atomic_read(&skb->users));
3385 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3386 trace_consume_skb(skb);
3387 else
3388 trace_kfree_skb(skb, net_tx_action);
3389 __kfree_skb(skb);
3390 }
3391 }
3392
3393 if (sd->output_queue) {
3394 struct Qdisc *head;
3395
3396 local_irq_disable();
3397 head = sd->output_queue;
3398 sd->output_queue = NULL;
3399 sd->output_queue_tailp = &sd->output_queue;
3400 local_irq_enable();
3401
3402 while (head) {
3403 struct Qdisc *q = head;
3404 spinlock_t *root_lock;
3405
3406 head = head->next_sched;
3407
3408 root_lock = qdisc_lock(q);
3409 if (spin_trylock(root_lock)) {
3410 smp_mb__before_atomic();
3411 clear_bit(__QDISC_STATE_SCHED,
3412 &q->state);
3413 qdisc_run(q);
3414 spin_unlock(root_lock);
3415 } else {
3416 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3417 &q->state)) {
3418 __netif_reschedule(q);
3419 } else {
3420 smp_mb__before_atomic();
3421 clear_bit(__QDISC_STATE_SCHED,
3422 &q->state);
3423 }
3424 }
3425 }
3426 }
3427 }
3428
3429 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3430 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3431 /* This hook is defined here for ATM LANE */
3432 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3433 unsigned char *addr) __read_mostly;
3434 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3435 #endif
3436
3437 #ifdef CONFIG_NET_CLS_ACT
3438 /* TODO: Maybe we should just force sch_ingress to be compiled in
3439 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3440 * instructions (a compare and 2 extra stores) right now if we don't
3441 * have it on but do have CONFIG_NET_CLS_ACT.
3442 * NOTE: This doesn't stop any functionality; if you don't have
3443 * the ingress scheduler, you just can't add policies on ingress.
3444 *
3445 */
3446 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3447 {
3448 struct net_device *dev = skb->dev;
3449 u32 ttl = G_TC_RTTL(skb->tc_verd);
3450 int result = TC_ACT_OK;
3451 struct Qdisc *q;
3452
3453 if (unlikely(MAX_RED_LOOP < ttl++)) {
3454 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3455 skb->skb_iif, dev->ifindex);
3456 return TC_ACT_SHOT;
3457 }
3458
3459 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3460 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3461
3462 q = rxq->qdisc;
3463 if (q != &noop_qdisc) {
3464 spin_lock(qdisc_lock(q));
3465 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3466 result = qdisc_enqueue_root(skb, q);
3467 spin_unlock(qdisc_lock(q));
3468 }
3469
3470 return result;
3471 }
3472
3473 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3474 struct packet_type **pt_prev,
3475 int *ret, struct net_device *orig_dev)
3476 {
3477 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3478
3479 if (!rxq || rxq->qdisc == &noop_qdisc)
3480 goto out;
3481
3482 if (*pt_prev) {
3483 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3484 *pt_prev = NULL;
3485 }
3486
3487 switch (ing_filter(skb, rxq)) {
3488 case TC_ACT_SHOT:
3489 case TC_ACT_STOLEN:
3490 kfree_skb(skb);
3491 return NULL;
3492 }
3493
3494 out:
3495 skb->tc_verd = 0;
3496 return skb;
3497 }
3498 #endif
3499
3500 /**
3501 * netdev_rx_handler_register - register receive handler
3502 * @dev: device to register a handler for
3503 * @rx_handler: receive handler to register
3504 * @rx_handler_data: data pointer that is used by rx handler
3505 *
3506 * Register a receive handler for a device. This handler will then be
3507 * called from __netif_receive_skb. A negative errno code is returned
3508 * on a failure.
3509 *
3510 * The caller must hold the rtnl_mutex.
3511 *
3512 * For a general description of rx_handler, see enum rx_handler_result.
3513 */
3514 int netdev_rx_handler_register(struct net_device *dev,
3515 rx_handler_func_t *rx_handler,
3516 void *rx_handler_data)
3517 {
3518 ASSERT_RTNL();
3519
3520 if (dev->rx_handler)
3521 return -EBUSY;
3522
3523 /* Note: rx_handler_data must be set before rx_handler */
3524 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3525 rcu_assign_pointer(dev->rx_handler, rx_handler);
3526
3527 return 0;
3528 }
3529 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
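
/*
 * Usage sketch: a minimal rx_handler, in the style of bridge/macvlan, for a
 * hypothetical upper device ("foo"). It consumes frames of the
 * local-experimental ethertype and passes everything else up unchanged.
 */
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return RX_HANDLER_PASS;

	if (skb->protocol != htons(ETH_P_802_EX1))
		return RX_HANDLER_PASS;

	/* ... process the frame ... */
	consume_skb(skb);
	return RX_HANDLER_CONSUMED;
}

/* called under rtnl when the lower device is enslaved */
static int foo_enslave(struct net_device *lower, void *foo_priv)
{
	return netdev_rx_handler_register(lower, foo_handle_frame, foo_priv);
}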
3530
3531 /**
3532 * netdev_rx_handler_unregister - unregister receive handler
3533 * @dev: device to unregister a handler from
3534 *
3535 * Unregister a receive handler from a device.
3536 *
3537 * The caller must hold the rtnl_mutex.
3538 */
3539 void netdev_rx_handler_unregister(struct net_device *dev)
3540 {
3541
3542 ASSERT_RTNL();
3543 RCU_INIT_POINTER(dev->rx_handler, NULL);
3544 /* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3545 * section is guaranteed to see a non-NULL rx_handler_data
3546 * as well.
3547 */
3548 synchronize_net();
3549 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3550 }
3551 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3552
3553 /*
3554 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3555 * the special handling of PFMEMALLOC skbs.
3556 */
3557 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3558 {
3559 switch (skb->protocol) {
3560 case htons(ETH_P_ARP):
3561 case htons(ETH_P_IP):
3562 case htons(ETH_P_IPV6):
3563 case htons(ETH_P_8021Q):
3564 case htons(ETH_P_8021AD):
3565 return true;
3566 default:
3567 return false;
3568 }
3569 }
3570
3571 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3572 {
3573 struct packet_type *ptype, *pt_prev;
3574 rx_handler_func_t *rx_handler;
3575 struct net_device *orig_dev;
3576 struct net_device *null_or_dev;
3577 bool deliver_exact = false;
3578 int ret = NET_RX_DROP;
3579 __be16 type;
3580
3581 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3582
3583 trace_netif_receive_skb(skb);
3584
3585 orig_dev = skb->dev;
3586
3587 skb_reset_network_header(skb);
3588 if (!skb_transport_header_was_set(skb))
3589 skb_reset_transport_header(skb);
3590 skb_reset_mac_len(skb);
3591
3592 pt_prev = NULL;
3593
3594 rcu_read_lock();
3595
3596 another_round:
3597 skb->skb_iif = skb->dev->ifindex;
3598
3599 __this_cpu_inc(softnet_data.processed);
3600
3601 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3602 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3603 skb = skb_vlan_untag(skb);
3604 if (unlikely(!skb))
3605 goto unlock;
3606 }
3607
3608 #ifdef CONFIG_NET_CLS_ACT
3609 if (skb->tc_verd & TC_NCLS) {
3610 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3611 goto ncls;
3612 }
3613 #endif
3614
3615 if (pfmemalloc)
3616 goto skip_taps;
3617
3618 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3619 if (!ptype->dev || ptype->dev == skb->dev) {
3620 if (pt_prev)
3621 ret = deliver_skb(skb, pt_prev, orig_dev);
3622 pt_prev = ptype;
3623 }
3624 }
3625
3626 skip_taps:
3627 #ifdef CONFIG_NET_CLS_ACT
3628 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3629 if (!skb)
3630 goto unlock;
3631 ncls:
3632 #endif
3633
3634 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3635 goto drop;
3636
3637 if (vlan_tx_tag_present(skb)) {
3638 if (pt_prev) {
3639 ret = deliver_skb(skb, pt_prev, orig_dev);
3640 pt_prev = NULL;
3641 }
3642 if (vlan_do_receive(&skb))
3643 goto another_round;
3644 else if (unlikely(!skb))
3645 goto unlock;
3646 }
3647
3648 rx_handler = rcu_dereference(skb->dev->rx_handler);
3649 if (rx_handler) {
3650 if (pt_prev) {
3651 ret = deliver_skb(skb, pt_prev, orig_dev);
3652 pt_prev = NULL;
3653 }
3654 switch (rx_handler(&skb)) {
3655 case RX_HANDLER_CONSUMED:
3656 ret = NET_RX_SUCCESS;
3657 goto unlock;
3658 case RX_HANDLER_ANOTHER:
3659 goto another_round;
3660 case RX_HANDLER_EXACT:
3661 deliver_exact = true;
3662 case RX_HANDLER_PASS:
3663 break;
3664 default:
3665 BUG();
3666 }
3667 }
3668
3669 if (unlikely(vlan_tx_tag_present(skb))) {
3670 if (vlan_tx_tag_get_id(skb))
3671 skb->pkt_type = PACKET_OTHERHOST;
3672 /* Note: we might in the future use prio bits
3673 * and set skb->priority like in vlan_do_receive().
3674 * For the time being, just ignore the Priority Code Point.
3675 */
3676 skb->vlan_tci = 0;
3677 }
3678
3679 /* deliver only exact match when indicated */
3680 null_or_dev = deliver_exact ? skb->dev : NULL;
3681
3682 type = skb->protocol;
3683 list_for_each_entry_rcu(ptype,
3684 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3685 if (ptype->type == type &&
3686 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3687 ptype->dev == orig_dev)) {
3688 if (pt_prev)
3689 ret = deliver_skb(skb, pt_prev, orig_dev);
3690 pt_prev = ptype;
3691 }
3692 }
3693
3694 if (pt_prev) {
3695 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3696 goto drop;
3697 else
3698 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3699 } else {
3700 drop:
3701 atomic_long_inc(&skb->dev->rx_dropped);
3702 kfree_skb(skb);
3703 /* Jamal, now you will not be able to escape explaining
3704 * to me how you were going to use this. :-)
3705 */
3706 ret = NET_RX_DROP;
3707 }
3708
3709 unlock:
3710 rcu_read_unlock();
3711 return ret;
3712 }
3713
3714 static int __netif_receive_skb(struct sk_buff *skb)
3715 {
3716 int ret;
3717
3718 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3719 unsigned long pflags = current->flags;
3720
3721 /*
3722 * PFMEMALLOC skbs are special, they should
3723 * - be delivered to SOCK_MEMALLOC sockets only
3724 * - stay away from userspace
3725 * - have bounded memory usage
3726 *
3727 * Use PF_MEMALLOC as this saves us from propagating the allocation
3728 * context down to all allocation sites.
3729 */
3730 current->flags |= PF_MEMALLOC;
3731 ret = __netif_receive_skb_core(skb, true);
3732 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3733 } else
3734 ret = __netif_receive_skb_core(skb, false);
3735
3736 return ret;
3737 }
3738
3739 static int netif_receive_skb_internal(struct sk_buff *skb)
3740 {
3741 net_timestamp_check(netdev_tstamp_prequeue, skb);
3742
3743 if (skb_defer_rx_timestamp(skb))
3744 return NET_RX_SUCCESS;
3745
3746 #ifdef CONFIG_RPS
3747 if (static_key_false(&rps_needed)) {
3748 struct rps_dev_flow voidflow, *rflow = &voidflow;
3749 int cpu, ret;
3750
3751 rcu_read_lock();
3752
3753 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3754
3755 if (cpu >= 0) {
3756 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3757 rcu_read_unlock();
3758 return ret;
3759 }
3760 rcu_read_unlock();
3761 }
3762 #endif
3763 return __netif_receive_skb(skb);
3764 }
3765
3766 /**
3767 * netif_receive_skb - process receive buffer from network
3768 * @skb: buffer to process
3769 *
3770 * netif_receive_skb() is the main receive data processing function.
3771 * It always succeeds. The buffer may be dropped during processing
3772 * for congestion control or by the protocol layers.
3773 *
3774 * This function may only be called from softirq context and interrupts
3775 * should be enabled.
3776 *
3777 * Return values (usually ignored):
3778 * NET_RX_SUCCESS: no congestion
3779 * NET_RX_DROP: packet was dropped
3780 */
3781 int netif_receive_skb(struct sk_buff *skb)
3782 {
3783 trace_netif_receive_skb_entry(skb);
3784
3785 return netif_receive_skb_internal(skb);
3786 }
3787 EXPORT_SYMBOL(netif_receive_skb);
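
/*
 * Usage sketch: a NAPI poll routine of a hypothetical driver ("foo") feeding
 * completed frames to netif_receive_skb(). struct foo_ring is illustrative;
 * its rxq is assumed to be filled elsewhere by the device's IRQ path.
 */
struct foo_ring {
	struct napi_struct	napi;
	struct sk_buff_head	rxq;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
	struct sk_buff *skb;
	int done = 0;

	while (done < budget && (skb = skb_dequeue(&ring->rxq)) != NULL) {
		netif_receive_skb(skb);	/* softirq context, irqs enabled */
		done++;
	}

	if (done < budget)
		napi_complete(napi);	/* re-enable device interrupts here */

	return done;
}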
3788
3789 /* Network device is going away, flush any packets still pending.
3790 * Called with irqs disabled.
3791 */
3792 static void flush_backlog(void *arg)
3793 {
3794 struct net_device *dev = arg;
3795 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3796 struct sk_buff *skb, *tmp;
3797
3798 rps_lock(sd);
3799 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3800 if (skb->dev == dev) {
3801 __skb_unlink(skb, &sd->input_pkt_queue);
3802 kfree_skb(skb);
3803 input_queue_head_incr(sd);
3804 }
3805 }
3806 rps_unlock(sd);
3807
3808 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3809 if (skb->dev == dev) {
3810 __skb_unlink(skb, &sd->process_queue);
3811 kfree_skb(skb);
3812 input_queue_head_incr(sd);
3813 }
3814 }
3815 }
3816
3817 static int napi_gro_complete(struct sk_buff *skb)
3818 {
3819 struct packet_offload *ptype;
3820 __be16 type = skb->protocol;
3821 struct list_head *head = &offload_base;
3822 int err = -ENOENT;
3823
3824 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3825
3826 if (NAPI_GRO_CB(skb)->count == 1) {
3827 skb_shinfo(skb)->gso_size = 0;
3828 goto out;
3829 }
3830
3831 rcu_read_lock();
3832 list_for_each_entry_rcu(ptype, head, list) {
3833 if (ptype->type != type || !ptype->callbacks.gro_complete)
3834 continue;
3835
3836 err = ptype->callbacks.gro_complete(skb, 0);
3837 break;
3838 }
3839 rcu_read_unlock();
3840
3841 if (err) {
3842 WARN_ON(&ptype->list == head);
3843 kfree_skb(skb);
3844 return NET_RX_SUCCESS;
3845 }
3846
3847 out:
3848 return netif_receive_skb_internal(skb);
3849 }
3850
3851 /* napi->gro_list contains packets ordered by age, with the
3852 * youngest packets at the head of it.
3853 * Complete skbs in reverse order to reduce latencies.
3854 */
3855 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3856 {
3857 struct sk_buff *skb, *prev = NULL;
3858
3859 /* scan list and build reverse chain */
3860 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3861 skb->prev = prev;
3862 prev = skb;
3863 }
3864
3865 for (skb = prev; skb; skb = prev) {
3866 skb->next = NULL;
3867
3868 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3869 return;
3870
3871 prev = skb->prev;
3872 napi_gro_complete(skb);
3873 napi->gro_count--;
3874 }
3875
3876 napi->gro_list = NULL;
3877 }
3878 EXPORT_SYMBOL(napi_gro_flush);
3879
3880 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3881 {
3882 struct sk_buff *p;
3883 unsigned int maclen = skb->dev->hard_header_len;
3884 u32 hash = skb_get_hash_raw(skb);
3885
3886 for (p = napi->gro_list; p; p = p->next) {
3887 unsigned long diffs;
3888
3889 NAPI_GRO_CB(p)->flush = 0;
3890
3891 if (hash != skb_get_hash_raw(p)) {
3892 NAPI_GRO_CB(p)->same_flow = 0;
3893 continue;
3894 }
3895
3896 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3897 diffs |= p->vlan_tci ^ skb->vlan_tci;
3898 if (maclen == ETH_HLEN)
3899 diffs |= compare_ether_header(skb_mac_header(p),
3900 skb_mac_header(skb));
3901 else if (!diffs)
3902 diffs = memcmp(skb_mac_header(p),
3903 skb_mac_header(skb),
3904 maclen);
3905 NAPI_GRO_CB(p)->same_flow = !diffs;
3906 }
3907 }
3908
3909 static void skb_gro_reset_offset(struct sk_buff *skb)
3910 {
3911 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3912 const skb_frag_t *frag0 = &pinfo->frags[0];
3913
3914 NAPI_GRO_CB(skb)->data_offset = 0;
3915 NAPI_GRO_CB(skb)->frag0 = NULL;
3916 NAPI_GRO_CB(skb)->frag0_len = 0;
3917
3918 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3919 pinfo->nr_frags &&
3920 !PageHighMem(skb_frag_page(frag0))) {
3921 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3922 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3923 }
3924 }
3925
3926 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3927 {
3928 struct skb_shared_info *pinfo = skb_shinfo(skb);
3929
3930 BUG_ON(skb->end - skb->tail < grow);
3931
3932 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3933
3934 skb->data_len -= grow;
3935 skb->tail += grow;
3936
3937 pinfo->frags[0].page_offset += grow;
3938 skb_frag_size_sub(&pinfo->frags[0], grow);
3939
3940 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3941 skb_frag_unref(skb, 0);
3942 memmove(pinfo->frags, pinfo->frags + 1,
3943 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3944 }
3945 }
3946
3947 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3948 {
3949 struct sk_buff **pp = NULL;
3950 struct packet_offload *ptype;
3951 __be16 type = skb->protocol;
3952 struct list_head *head = &offload_base;
3953 int same_flow;
3954 enum gro_result ret;
3955 int grow;
3956
3957 if (!(skb->dev->features & NETIF_F_GRO))
3958 goto normal;
3959
3960 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3961 goto normal;
3962
3963 gro_list_prepare(napi, skb);
3964
3965 if (skb->ip_summed == CHECKSUM_COMPLETE) {
3966 NAPI_GRO_CB(skb)->csum = skb->csum;
3967 NAPI_GRO_CB(skb)->csum_valid = 1;
3968 } else {
3969 NAPI_GRO_CB(skb)->csum_valid = 0;
3970 }
3971
3972 rcu_read_lock();
3973 list_for_each_entry_rcu(ptype, head, list) {
3974 if (ptype->type != type || !ptype->callbacks.gro_receive)
3975 continue;
3976
3977 skb_set_network_header(skb, skb_gro_offset(skb));
3978 skb_reset_mac_len(skb);
3979 NAPI_GRO_CB(skb)->same_flow = 0;
3980 NAPI_GRO_CB(skb)->flush = 0;
3981 NAPI_GRO_CB(skb)->free = 0;
3982 NAPI_GRO_CB(skb)->udp_mark = 0;
3983 NAPI_GRO_CB(skb)->encapsulation = 0;
3984
3985 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3986 break;
3987 }
3988 rcu_read_unlock();
3989
3990 if (&ptype->list == head)
3991 goto normal;
3992
3993 same_flow = NAPI_GRO_CB(skb)->same_flow;
3994 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3995
3996 if (pp) {
3997 struct sk_buff *nskb = *pp;
3998
3999 *pp = nskb->next;
4000 nskb->next = NULL;
4001 napi_gro_complete(nskb);
4002 napi->gro_count--;
4003 }
4004
4005 if (same_flow)
4006 goto ok;
4007
4008 if (NAPI_GRO_CB(skb)->flush)
4009 goto normal;
4010
4011 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4012 struct sk_buff *nskb = napi->gro_list;
4013
4014 /* locate the end of the list to select the 'oldest' flow */
4015 while (nskb->next) {
4016 pp = &nskb->next;
4017 nskb = *pp;
4018 }
4019 *pp = NULL;
4020 nskb->next = NULL;
4021 napi_gro_complete(nskb);
4022 } else {
4023 napi->gro_count++;
4024 }
4025 NAPI_GRO_CB(skb)->count = 1;
4026 NAPI_GRO_CB(skb)->age = jiffies;
4027 NAPI_GRO_CB(skb)->last = skb;
4028 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4029 skb->next = napi->gro_list;
4030 napi->gro_list = skb;
4031 ret = GRO_HELD;
4032
4033 pull:
4034 grow = skb_gro_offset(skb) - skb_headlen(skb);
4035 if (grow > 0)
4036 gro_pull_from_frag0(skb, grow);
4037 ok:
4038 return ret;
4039
4040 normal:
4041 ret = GRO_NORMAL;
4042 goto pull;
4043 }
4044
4045 struct packet_offload *gro_find_receive_by_type(__be16 type)
4046 {
4047 struct list_head *offload_head = &offload_base;
4048 struct packet_offload *ptype;
4049
4050 list_for_each_entry_rcu(ptype, offload_head, list) {
4051 if (ptype->type != type || !ptype->callbacks.gro_receive)
4052 continue;
4053 return ptype;
4054 }
4055 return NULL;
4056 }
4057 EXPORT_SYMBOL(gro_find_receive_by_type);
4058
4059 struct packet_offload *gro_find_complete_by_type(__be16 type)
4060 {
4061 struct list_head *offload_head = &offload_base;
4062 struct packet_offload *ptype;
4063
4064 list_for_each_entry_rcu(ptype, offload_head, list) {
4065 if (ptype->type != type || !ptype->callbacks.gro_complete)
4066 continue;
4067 return ptype;
4068 }
4069 return NULL;
4070 }
4071 EXPORT_SYMBOL(gro_find_complete_by_type);
4072
4073 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4074 {
4075 switch (ret) {
4076 case GRO_NORMAL:
4077 if (netif_receive_skb_internal(skb))
4078 ret = GRO_DROP;
4079 break;
4080
4081 case GRO_DROP:
4082 kfree_skb(skb);
4083 break;
4084
4085 case GRO_MERGED_FREE:
4086 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4087 kmem_cache_free(skbuff_head_cache, skb);
4088 else
4089 __kfree_skb(skb);
4090 break;
4091
4092 case GRO_HELD:
4093 case GRO_MERGED:
4094 break;
4095 }
4096
4097 return ret;
4098 }
4099
4100 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4101 {
4102 trace_napi_gro_receive_entry(skb);
4103
4104 skb_gro_reset_offset(skb);
4105
4106 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4107 }
4108 EXPORT_SYMBOL(napi_gro_receive);
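
/* Example: a minimal NAPI poll sketch for a hypothetical driver, showing the
 * intended calling pattern for napi_gro_receive().  The mydrv_*() helpers are
 * assumptions and not part of this file; only the NAPI/GRO calls are real.
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = mydrv_rx_next_skb(napi)) != NULL) {
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget) {
 *			napi_complete(napi);
 *			mydrv_enable_rx_irq(napi);
 *		}
 *		return work;
 *	}
 */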
4109
4110 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4111 {
4112 __skb_pull(skb, skb_headlen(skb));
4113 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4114 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4115 skb->vlan_tci = 0;
4116 skb->dev = napi->dev;
4117 skb->skb_iif = 0;
4118 skb->encapsulation = 0;
4119 skb_shinfo(skb)->gso_type = 0;
4120 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4121
4122 napi->skb = skb;
4123 }
4124
4125 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4126 {
4127 struct sk_buff *skb = napi->skb;
4128
4129 if (!skb) {
4130 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4131 napi->skb = skb;
4132 }
4133 return skb;
4134 }
4135 EXPORT_SYMBOL(napi_get_frags);
4136
4137 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4138 struct sk_buff *skb,
4139 gro_result_t ret)
4140 {
4141 switch (ret) {
4142 case GRO_NORMAL:
4143 case GRO_HELD:
4144 __skb_push(skb, ETH_HLEN);
4145 skb->protocol = eth_type_trans(skb, skb->dev);
4146 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4147 ret = GRO_DROP;
4148 break;
4149
4150 case GRO_DROP:
4151 case GRO_MERGED_FREE:
4152 napi_reuse_skb(napi, skb);
4153 break;
4154
4155 case GRO_MERGED:
4156 break;
4157 }
4158
4159 return ret;
4160 }
4161
4162 /* Upper GRO stack assumes the network header starts at gro_offset=0.
4163  * Drivers could call both napi_gro_frags() and napi_gro_receive(),
4164  * so we copy the Ethernet header into skb->data to have a common layout.
4165 */
4166 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4167 {
4168 struct sk_buff *skb = napi->skb;
4169 const struct ethhdr *eth;
4170 unsigned int hlen = sizeof(*eth);
4171
4172 napi->skb = NULL;
4173
4174 skb_reset_mac_header(skb);
4175 skb_gro_reset_offset(skb);
4176
4177 eth = skb_gro_header_fast(skb, 0);
4178 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4179 eth = skb_gro_header_slow(skb, hlen, 0);
4180 if (unlikely(!eth)) {
4181 napi_reuse_skb(napi, skb);
4182 return NULL;
4183 }
4184 } else {
4185 gro_pull_from_frag0(skb, hlen);
4186 NAPI_GRO_CB(skb)->frag0 += hlen;
4187 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4188 }
4189 __skb_pull(skb, hlen);
4190
4191 /*
4192 * This works because the only protocols we care about don't require
4193 * special handling.
4194 * We'll fix it up properly in napi_frags_finish()
4195 */
4196 skb->protocol = eth->h_proto;
4197
4198 return skb;
4199 }
4200
4201 gro_result_t napi_gro_frags(struct napi_struct *napi)
4202 {
4203 struct sk_buff *skb = napi_frags_skb(napi);
4204
4205 if (!skb)
4206 return GRO_DROP;
4207
4208 trace_napi_gro_frags_entry(skb);
4209
4210 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4211 }
4212 EXPORT_SYMBOL(napi_gro_frags);
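
/* Example: the frag-based GRO path as a hypothetical page-flipping driver
 * might use it inside its RX loop.  The skb returned by napi_get_frags() has
 * no linear data; the driver attaches a page fragment and hands it back via
 * napi_gro_frags(), which pulls the Ethernet header itself.  page/offset/len
 * come from the driver's RX ring and the truesize accounting is simplified.
 *
 *	skb = napi_get_frags(napi);
 *	if (unlikely(!skb))
 *		break;
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += len;
 *	napi_gro_frags(napi);
 */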
4213
4214 /* Compute the checksum from gro_offset and return the folded value
4215 * after adding in any pseudo checksum.
4216 */
4217 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4218 {
4219 __wsum wsum;
4220 __sum16 sum;
4221
4222 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4223
4224 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4225 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4226 if (likely(!sum)) {
4227 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4228 !skb->csum_complete_sw)
4229 netdev_rx_csum_fault(skb->dev);
4230 }
4231
4232 NAPI_GRO_CB(skb)->csum = wsum;
4233 NAPI_GRO_CB(skb)->csum_valid = 1;
4234
4235 return sum;
4236 }
4237 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4238
4239 /*
4240  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4241 * Note: called with local irq disabled, but exits with local irq enabled.
4242 */
4243 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4244 {
4245 #ifdef CONFIG_RPS
4246 struct softnet_data *remsd = sd->rps_ipi_list;
4247
4248 if (remsd) {
4249 sd->rps_ipi_list = NULL;
4250
4251 local_irq_enable();
4252
4253 /* Send pending IPIs to kick RPS processing on remote cpus. */
4254 while (remsd) {
4255 struct softnet_data *next = remsd->rps_ipi_next;
4256
4257 if (cpu_online(remsd->cpu))
4258 smp_call_function_single_async(remsd->cpu,
4259 &remsd->csd);
4260 remsd = next;
4261 }
4262 } else
4263 #endif
4264 local_irq_enable();
4265 }
4266
4267 static int process_backlog(struct napi_struct *napi, int quota)
4268 {
4269 int work = 0;
4270 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4271
4272 #ifdef CONFIG_RPS
4273 /* Check if we have pending IPIs; it's better to send them now
4274  * rather than waiting for net_rx_action() to end.
4275 */
4276 if (sd->rps_ipi_list) {
4277 local_irq_disable();
4278 net_rps_action_and_irq_enable(sd);
4279 }
4280 #endif
4281 napi->weight = weight_p;
4282 local_irq_disable();
4283 while (1) {
4284 struct sk_buff *skb;
4285
4286 while ((skb = __skb_dequeue(&sd->process_queue))) {
4287 local_irq_enable();
4288 __netif_receive_skb(skb);
4289 local_irq_disable();
4290 input_queue_head_incr(sd);
4291 if (++work >= quota) {
4292 local_irq_enable();
4293 return work;
4294 }
4295 }
4296
4297 rps_lock(sd);
4298 if (skb_queue_empty(&sd->input_pkt_queue)) {
4299 /*
4300 * Inline a custom version of __napi_complete().
4301  * Only the current cpu owns and manipulates this napi,
4302  * and NAPI_STATE_SCHED is the only possible flag set
4303  * on the backlog.
4304  * We can use a plain write instead of clear_bit(),
4305  * and we don't need an smp_mb() memory barrier.
4306 */
4307 list_del(&napi->poll_list);
4308 napi->state = 0;
4309 rps_unlock(sd);
4310
4311 break;
4312 }
4313
4314 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4315 &sd->process_queue);
4316 rps_unlock(sd);
4317 }
4318 local_irq_enable();
4319
4320 return work;
4321 }
4322
4323 /**
4324 * __napi_schedule - schedule for receive
4325 * @n: entry to schedule
4326 *
4327 * The entry's receive function will be scheduled to run
4328 */
4329 void __napi_schedule(struct napi_struct *n)
4330 {
4331 unsigned long flags;
4332
4333 local_irq_save(flags);
4334 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4335 local_irq_restore(flags);
4336 }
4337 EXPORT_SYMBOL(__napi_schedule);
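
/* Example: the usual interrupt-handler pattern in a hypothetical driver.
 * Most drivers simply call napi_schedule(), which combines
 * napi_schedule_prep() with __napi_schedule(); the split form below also
 * masks the device RX interrupt first.  mydrv_* names are assumptions.
 *
 *	static irqreturn_t mydrv_isr(int irq, void *dev_id)
 *	{
 *		struct mydrv_priv *priv = dev_id;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			mydrv_disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */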
4338
4339 void __napi_complete(struct napi_struct *n)
4340 {
4341 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4342 BUG_ON(n->gro_list);
4343
4344 list_del(&n->poll_list);
4345 smp_mb__before_atomic();
4346 clear_bit(NAPI_STATE_SCHED, &n->state);
4347 }
4348 EXPORT_SYMBOL(__napi_complete);
4349
4350 void napi_complete(struct napi_struct *n)
4351 {
4352 unsigned long flags;
4353
4354 /*
4355 * don't let napi dequeue from the cpu poll list
4356  * just in case it's running on a different cpu
4357 */
4358 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4359 return;
4360
4361 napi_gro_flush(n, false);
4362 local_irq_save(flags);
4363 __napi_complete(n);
4364 local_irq_restore(flags);
4365 }
4366 EXPORT_SYMBOL(napi_complete);
4367
4368 /* must be called under rcu_read_lock(), as we don't take a reference */
4369 struct napi_struct *napi_by_id(unsigned int napi_id)
4370 {
4371 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4372 struct napi_struct *napi;
4373
4374 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4375 if (napi->napi_id == napi_id)
4376 return napi;
4377
4378 return NULL;
4379 }
4380 EXPORT_SYMBOL_GPL(napi_by_id);
4381
4382 void napi_hash_add(struct napi_struct *napi)
4383 {
4384 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4385
4386 spin_lock(&napi_hash_lock);
4387
4388 /* 0 is not a valid id; we also skip an id that is already taken.
4389  * We expect both events to be extremely rare.
4390 */
4391 napi->napi_id = 0;
4392 while (!napi->napi_id) {
4393 napi->napi_id = ++napi_gen_id;
4394 if (napi_by_id(napi->napi_id))
4395 napi->napi_id = 0;
4396 }
4397
4398 hlist_add_head_rcu(&napi->napi_hash_node,
4399 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4400
4401 spin_unlock(&napi_hash_lock);
4402 }
4403 }
4404 EXPORT_SYMBOL_GPL(napi_hash_add);
4405
4406 /* Warning: the caller is responsible for making sure an RCU grace period
4407  * has elapsed before freeing the memory containing @napi
4408 */
4409 void napi_hash_del(struct napi_struct *napi)
4410 {
4411 spin_lock(&napi_hash_lock);
4412
4413 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4414 hlist_del_rcu(&napi->napi_hash_node);
4415
4416 spin_unlock(&napi_hash_lock);
4417 }
4418 EXPORT_SYMBOL_GPL(napi_hash_del);
4419
4420 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4421 int (*poll)(struct napi_struct *, int), int weight)
4422 {
4423 INIT_LIST_HEAD(&napi->poll_list);
4424 napi->gro_count = 0;
4425 napi->gro_list = NULL;
4426 napi->skb = NULL;
4427 napi->poll = poll;
4428 if (weight > NAPI_POLL_WEIGHT)
4429 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4430 weight, dev->name);
4431 napi->weight = weight;
4432 list_add(&napi->dev_list, &dev->napi_list);
4433 napi->dev = dev;
4434 #ifdef CONFIG_NETPOLL
4435 spin_lock_init(&napi->poll_lock);
4436 napi->poll_owner = -1;
4437 #endif
4438 set_bit(NAPI_STATE_SCHED, &napi->state);
4439 }
4440 EXPORT_SYMBOL(netif_napi_add);
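
/* Example: typical NAPI lifetime in a hypothetical driver.  netif_napi_add()
 * runs at probe time with NAPI_POLL_WEIGHT (larger weights only trigger the
 * warning above), napi_enable()/napi_disable() bracket ndo_open/ndo_stop, and
 * netif_napi_del() runs on removal.  mydrv_poll and priv are assumptions.
 *
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
 *	napi_enable(&priv->napi);
 *	napi_disable(&priv->napi);
 *	netif_napi_del(&priv->napi);
 */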
4441
4442 void netif_napi_del(struct napi_struct *napi)
4443 {
4444 list_del_init(&napi->dev_list);
4445 napi_free_frags(napi);
4446
4447 kfree_skb_list(napi->gro_list);
4448 napi->gro_list = NULL;
4449 napi->gro_count = 0;
4450 }
4451 EXPORT_SYMBOL(netif_napi_del);
4452
4453 static void net_rx_action(struct softirq_action *h)
4454 {
4455 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4456 unsigned long time_limit = jiffies + 2;
4457 int budget = netdev_budget;
4458 void *have;
4459
4460 local_irq_disable();
4461
4462 while (!list_empty(&sd->poll_list)) {
4463 struct napi_struct *n;
4464 int work, weight;
4465
4466 /* If the softirq window is exhausted then punt.
4467  * Allow this to run for 2 jiffies, which allows
4468  * an average latency of 1.5/HZ.
4469 */
4470 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4471 goto softnet_break;
4472
4473 local_irq_enable();
4474
4475 /* Even though interrupts have been re-enabled, this
4476 * access is safe because interrupts can only add new
4477 * entries to the tail of this list, and only ->poll()
4478 * calls can remove this head entry from the list.
4479 */
4480 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4481
4482 have = netpoll_poll_lock(n);
4483
4484 weight = n->weight;
4485
4486 /* This NAPI_STATE_SCHED test is for avoiding a race
4487 * with netpoll's poll_napi(). Only the entity which
4488 * obtains the lock and sees NAPI_STATE_SCHED set will
4489 * actually make the ->poll() call. Therefore we avoid
4490 * accidentally calling ->poll() when NAPI is not scheduled.
4491 */
4492 work = 0;
4493 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4494 work = n->poll(n, weight);
4495 trace_napi_poll(n);
4496 }
4497
4498 WARN_ON_ONCE(work > weight);
4499
4500 budget -= work;
4501
4502 local_irq_disable();
4503
4504 /* Drivers must not modify the NAPI state if they
4505 * consume the entire weight. In such cases this code
4506 * still "owns" the NAPI instance and therefore can
4507 * move the instance around on the list at-will.
4508 */
4509 if (unlikely(work == weight)) {
4510 if (unlikely(napi_disable_pending(n))) {
4511 local_irq_enable();
4512 napi_complete(n);
4513 local_irq_disable();
4514 } else {
4515 if (n->gro_list) {
4516 /* flush packets that are too old.
4517 * If HZ < 1000, flush all packets.
4518 */
4519 local_irq_enable();
4520 napi_gro_flush(n, HZ >= 1000);
4521 local_irq_disable();
4522 }
4523 list_move_tail(&n->poll_list, &sd->poll_list);
4524 }
4525 }
4526
4527 netpoll_poll_unlock(have);
4528 }
4529 out:
4530 net_rps_action_and_irq_enable(sd);
4531
4532 #ifdef CONFIG_NET_DMA
4533 /*
4534 * There may not be any more sk_buffs coming right now, so push
4535 * any pending DMA copies to hardware
4536 */
4537 dma_issue_pending_all();
4538 #endif
4539
4540 return;
4541
4542 softnet_break:
4543 sd->time_squeeze++;
4544 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4545 goto out;
4546 }
4547
4548 struct netdev_adjacent {
4549 struct net_device *dev;
4550
4551 /* upper master flag, there can only be one master device per list */
4552 bool master;
4553
4554 /* counter for the number of times this device was added to us */
4555 u16 ref_nr;
4556
4557 /* private field for the users */
4558 void *private;
4559
4560 struct list_head list;
4561 struct rcu_head rcu;
4562 };
4563
4564 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4565 struct net_device *adj_dev,
4566 struct list_head *adj_list)
4567 {
4568 struct netdev_adjacent *adj;
4569
4570 list_for_each_entry(adj, adj_list, list) {
4571 if (adj->dev == adj_dev)
4572 return adj;
4573 }
4574 return NULL;
4575 }
4576
4577 /**
4578 * netdev_has_upper_dev - Check if device is linked to an upper device
4579 * @dev: device
4580 * @upper_dev: upper device to check
4581 *
4582  * Find out if a device is linked to the specified upper device and return true
4583  * in case it is. Note that this checks only the immediate upper device,
4584 * not through a complete stack of devices. The caller must hold the RTNL lock.
4585 */
4586 bool netdev_has_upper_dev(struct net_device *dev,
4587 struct net_device *upper_dev)
4588 {
4589 ASSERT_RTNL();
4590
4591 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4592 }
4593 EXPORT_SYMBOL(netdev_has_upper_dev);
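
/* Example: a hypothetical bonding-style caller checking, under RTNL, whether
 * a candidate slave is already linked under this master before proceeding:
 *
 *	ASSERT_RTNL();
 *	if (netdev_has_upper_dev(slave_dev, bond_dev))
 *		pr_err("%s is already enslaved to %s\n",
 *		       slave_dev->name, bond_dev->name);
 */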
4594
4595 /**
4596 * netdev_has_any_upper_dev - Check if device is linked to some device
4597 * @dev: device
4598 *
4599 * Find out if a device is linked to an upper device and return true in case
4600 * it is. The caller must hold the RTNL lock.
4601 */
4602 static bool netdev_has_any_upper_dev(struct net_device *dev)
4603 {
4604 ASSERT_RTNL();
4605
4606 return !list_empty(&dev->all_adj_list.upper);
4607 }
4608
4609 /**
4610 * netdev_master_upper_dev_get - Get master upper device
4611 * @dev: device
4612 *
4613 * Find a master upper device and return pointer to it or NULL in case
4614 * it's not there. The caller must hold the RTNL lock.
4615 */
4616 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4617 {
4618 struct netdev_adjacent *upper;
4619
4620 ASSERT_RTNL();
4621
4622 if (list_empty(&dev->adj_list.upper))
4623 return NULL;
4624
4625 upper = list_first_entry(&dev->adj_list.upper,
4626 struct netdev_adjacent, list);
4627 if (likely(upper->master))
4628 return upper->dev;
4629 return NULL;
4630 }
4631 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4632
4633 void *netdev_adjacent_get_private(struct list_head *adj_list)
4634 {
4635 struct netdev_adjacent *adj;
4636
4637 adj = list_entry(adj_list, struct netdev_adjacent, list);
4638
4639 return adj->private;
4640 }
4641 EXPORT_SYMBOL(netdev_adjacent_get_private);
4642
4643 /**
4644 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4645 * @dev: device
4646 * @iter: list_head ** of the current position
4647 *
4648 * Gets the next device from the dev's upper list, starting from iter
4649 * position. The caller must hold RCU read lock.
4650 */
4651 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4652 struct list_head **iter)
4653 {
4654 struct netdev_adjacent *upper;
4655
4656 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4657
4658 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4659
4660 if (&upper->list == &dev->adj_list.upper)
4661 return NULL;
4662
4663 *iter = &upper->list;
4664
4665 return upper->dev;
4666 }
4667 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4668
4669 /**
4670 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4671 * @dev: device
4672 * @iter: list_head ** of the current position
4673 *
4674 * Gets the next device from the dev's upper list, starting from iter
4675 * position. The caller must hold RCU read lock.
4676 */
4677 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4678 struct list_head **iter)
4679 {
4680 struct netdev_adjacent *upper;
4681
4682 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4683
4684 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4685
4686 if (&upper->list == &dev->all_adj_list.upper)
4687 return NULL;
4688
4689 *iter = &upper->list;
4690
4691 return upper->dev;
4692 }
4693 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4694
4695 /**
4696 * netdev_lower_get_next_private - Get the next ->private from the
4697 * lower neighbour list
4698 * @dev: device
4699 * @iter: list_head ** of the current position
4700 *
4701 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4702  * list, starting from iter position. The caller must either hold the
4703  * RTNL lock or its own locking that guarantees that the neighbour lower
4704  * list will remain unchanged.
4705 */
4706 void *netdev_lower_get_next_private(struct net_device *dev,
4707 struct list_head **iter)
4708 {
4709 struct netdev_adjacent *lower;
4710
4711 lower = list_entry(*iter, struct netdev_adjacent, list);
4712
4713 if (&lower->list == &dev->adj_list.lower)
4714 return NULL;
4715
4716 *iter = lower->list.next;
4717
4718 return lower->private;
4719 }
4720 EXPORT_SYMBOL(netdev_lower_get_next_private);
4721
4722 /**
4723 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4724 * lower neighbour list, RCU
4725 * variant
4726 * @dev: device
4727 * @iter: list_head ** of the current position
4728 *
4729 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4730 * list, starting from iter position. The caller must hold RCU read lock.
4731 */
4732 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4733 struct list_head **iter)
4734 {
4735 struct netdev_adjacent *lower;
4736
4737 WARN_ON_ONCE(!rcu_read_lock_held());
4738
4739 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4740
4741 if (&lower->list == &dev->adj_list.lower)
4742 return NULL;
4743
4744 *iter = &lower->list;
4745
4746 return lower->private;
4747 }
4748 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4749
4750 /**
4751 * netdev_lower_get_next - Get the next device from the lower neighbour
4752 * list
4753 * @dev: device
4754 * @iter: list_head ** of the current position
4755 *
4756 * Gets the next netdev_adjacent from the dev's lower neighbour
4757  * list, starting from iter position. The caller must hold the RTNL lock or
4758  * its own locking that guarantees that the neighbour lower
4759  * list will remain unchanged.
4760 */
4761 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4762 {
4763 struct netdev_adjacent *lower;
4764
4765 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4766
4767 if (&lower->list == &dev->adj_list.lower)
4768 return NULL;
4769
4770 *iter = &lower->list;
4771
4772 return lower->dev;
4773 }
4774 EXPORT_SYMBOL(netdev_lower_get_next);
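
/* Example: walking the immediate lower devices with the
 * netdev_for_each_lower_dev() helper from <linux/netdevice.h>, which is built
 * on netdev_lower_get_next() (dev_get_nest_level() below is an in-tree user).
 * The caller holds RTNL.
 *
 *	struct net_device *lower;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(dev, lower, iter)
 *		pr_debug("%s: lower device %s\n", dev->name, lower->name);
 */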
4775
4776 /**
4777 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4778 * lower neighbour list, RCU
4779 * variant
4780 * @dev: device
4781 *
4782 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4783 * list. The caller must hold RCU read lock.
4784 */
4785 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4786 {
4787 struct netdev_adjacent *lower;
4788
4789 lower = list_first_or_null_rcu(&dev->adj_list.lower,
4790 struct netdev_adjacent, list);
4791 if (lower)
4792 return lower->private;
4793 return NULL;
4794 }
4795 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4796
4797 /**
4798 * netdev_master_upper_dev_get_rcu - Get master upper device
4799 * @dev: device
4800 *
4801 * Find a master upper device and return pointer to it or NULL in case
4802 * it's not there. The caller must hold the RCU read lock.
4803 */
4804 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4805 {
4806 struct netdev_adjacent *upper;
4807
4808 upper = list_first_or_null_rcu(&dev->adj_list.upper,
4809 struct netdev_adjacent, list);
4810 if (upper && likely(upper->master))
4811 return upper->dev;
4812 return NULL;
4813 }
4814 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4815
4816 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4817 struct net_device *adj_dev,
4818 struct list_head *dev_list)
4819 {
4820 char linkname[IFNAMSIZ+7];
4821 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4822 "upper_%s" : "lower_%s", adj_dev->name);
4823 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4824 linkname);
4825 }
4826 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4827 char *name,
4828 struct list_head *dev_list)
4829 {
4830 char linkname[IFNAMSIZ+7];
4831 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4832 "upper_%s" : "lower_%s", name);
4833 sysfs_remove_link(&(dev->dev.kobj), linkname);
4834 }
4835
4836 #define netdev_adjacent_is_neigh_list(dev, dev_list) \
4837 (dev_list == &dev->adj_list.upper || \
4838 dev_list == &dev->adj_list.lower)
4839
4840 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4841 struct net_device *adj_dev,
4842 struct list_head *dev_list,
4843 void *private, bool master)
4844 {
4845 struct netdev_adjacent *adj;
4846 int ret;
4847
4848 adj = __netdev_find_adj(dev, adj_dev, dev_list);
4849
4850 if (adj) {
4851 adj->ref_nr++;
4852 return 0;
4853 }
4854
4855 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4856 if (!adj)
4857 return -ENOMEM;
4858
4859 adj->dev = adj_dev;
4860 adj->master = master;
4861 adj->ref_nr = 1;
4862 adj->private = private;
4863 dev_hold(adj_dev);
4864
4865 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4866 adj_dev->name, dev->name, adj_dev->name);
4867
4868 if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4869 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4870 if (ret)
4871 goto free_adj;
4872 }
4873
4874 /* Ensure that master link is always the first item in list. */
4875 if (master) {
4876 ret = sysfs_create_link(&(dev->dev.kobj),
4877 &(adj_dev->dev.kobj), "master");
4878 if (ret)
4879 goto remove_symlinks;
4880
4881 list_add_rcu(&adj->list, dev_list);
4882 } else {
4883 list_add_tail_rcu(&adj->list, dev_list);
4884 }
4885
4886 return 0;
4887
4888 remove_symlinks:
4889 if (netdev_adjacent_is_neigh_list(dev, dev_list))
4890 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4891 free_adj:
4892 kfree(adj);
4893 dev_put(adj_dev);
4894
4895 return ret;
4896 }
4897
4898 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4899 struct net_device *adj_dev,
4900 struct list_head *dev_list)
4901 {
4902 struct netdev_adjacent *adj;
4903
4904 adj = __netdev_find_adj(dev, adj_dev, dev_list);
4905
4906 if (!adj) {
4907 pr_err("tried to remove device %s from %s\n",
4908 dev->name, adj_dev->name);
4909 BUG();
4910 }
4911
4912 if (adj->ref_nr > 1) {
4913 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4914 adj->ref_nr-1);
4915 adj->ref_nr--;
4916 return;
4917 }
4918
4919 if (adj->master)
4920 sysfs_remove_link(&(dev->dev.kobj), "master");
4921
4922 if (netdev_adjacent_is_neigh_list(dev, dev_list))
4923 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4924
4925 list_del_rcu(&adj->list);
4926 pr_debug("dev_put for %s, because link removed from %s to %s\n",
4927 adj_dev->name, dev->name, adj_dev->name);
4928 dev_put(adj_dev);
4929 kfree_rcu(adj, rcu);
4930 }
4931
4932 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4933 struct net_device *upper_dev,
4934 struct list_head *up_list,
4935 struct list_head *down_list,
4936 void *private, bool master)
4937 {
4938 int ret;
4939
4940 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4941 master);
4942 if (ret)
4943 return ret;
4944
4945 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4946 false);
4947 if (ret) {
4948 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4949 return ret;
4950 }
4951
4952 return 0;
4953 }
4954
4955 static int __netdev_adjacent_dev_link(struct net_device *dev,
4956 struct net_device *upper_dev)
4957 {
4958 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4959 &dev->all_adj_list.upper,
4960 &upper_dev->all_adj_list.lower,
4961 NULL, false);
4962 }
4963
4964 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4965 struct net_device *upper_dev,
4966 struct list_head *up_list,
4967 struct list_head *down_list)
4968 {
4969 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4970 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
4971 }
4972
4973 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4974 struct net_device *upper_dev)
4975 {
4976 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4977 &dev->all_adj_list.upper,
4978 &upper_dev->all_adj_list.lower);
4979 }
4980
4981 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4982 struct net_device *upper_dev,
4983 void *private, bool master)
4984 {
4985 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4986
4987 if (ret)
4988 return ret;
4989
4990 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4991 &dev->adj_list.upper,
4992 &upper_dev->adj_list.lower,
4993 private, master);
4994 if (ret) {
4995 __netdev_adjacent_dev_unlink(dev, upper_dev);
4996 return ret;
4997 }
4998
4999 return 0;
5000 }
5001
5002 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5003 struct net_device *upper_dev)
5004 {
5005 __netdev_adjacent_dev_unlink(dev, upper_dev);
5006 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5007 &dev->adj_list.upper,
5008 &upper_dev->adj_list.lower);
5009 }
5010
5011 static int __netdev_upper_dev_link(struct net_device *dev,
5012 struct net_device *upper_dev, bool master,
5013 void *private)
5014 {
5015 struct netdev_adjacent *i, *j, *to_i, *to_j;
5016 int ret = 0;
5017
5018 ASSERT_RTNL();
5019
5020 if (dev == upper_dev)
5021 return -EBUSY;
5022
5023 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
5024 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5025 return -EBUSY;
5026
5027 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5028 return -EEXIST;
5029
5030 if (master && netdev_master_upper_dev_get(dev))
5031 return -EBUSY;
5032
5033 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5034 master);
5035 if (ret)
5036 return ret;
5037
5038 /* Now that we linked these devs, make all the upper_dev's
5039  * all_adj_list.upper visible to every dev's all_adj_list.lower and vice
5040  * versa, and don't forget the devices themselves. All of these
5041 * links are non-neighbours.
5042 */
5043 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5044 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5045 pr_debug("Interlinking %s with %s, non-neighbour\n",
5046 i->dev->name, j->dev->name);
5047 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5048 if (ret)
5049 goto rollback_mesh;
5050 }
5051 }
5052
5053 /* add dev to every upper_dev's upper device */
5054 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5055 pr_debug("linking %s's upper device %s with %s\n",
5056 upper_dev->name, i->dev->name, dev->name);
5057 ret = __netdev_adjacent_dev_link(dev, i->dev);
5058 if (ret)
5059 goto rollback_upper_mesh;
5060 }
5061
5062 /* add upper_dev to every dev's lower device */
5063 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5064 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5065 i->dev->name, upper_dev->name);
5066 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5067 if (ret)
5068 goto rollback_lower_mesh;
5069 }
5070
5071 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5072 return 0;
5073
5074 rollback_lower_mesh:
5075 to_i = i;
5076 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5077 if (i == to_i)
5078 break;
5079 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5080 }
5081
5082 i = NULL;
5083
5084 rollback_upper_mesh:
5085 to_i = i;
5086 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5087 if (i == to_i)
5088 break;
5089 __netdev_adjacent_dev_unlink(dev, i->dev);
5090 }
5091
5092 i = j = NULL;
5093
5094 rollback_mesh:
5095 to_i = i;
5096 to_j = j;
5097 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5098 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5099 if (i == to_i && j == to_j)
5100 break;
5101 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5102 }
5103 if (i == to_i)
5104 break;
5105 }
5106
5107 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5108
5109 return ret;
5110 }
5111
5112 /**
5113 * netdev_upper_dev_link - Add a link to the upper device
5114 * @dev: device
5115 * @upper_dev: new upper device
5116 *
5117 * Adds a link to device which is upper to this one. The caller must hold
5118 * the RTNL lock. On a failure a negative errno code is returned.
5119 * On success the reference counts are adjusted and the function
5120 * returns zero.
5121 */
5122 int netdev_upper_dev_link(struct net_device *dev,
5123 struct net_device *upper_dev)
5124 {
5125 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5126 }
5127 EXPORT_SYMBOL(netdev_upper_dev_link);
5128
5129 /**
5130 * netdev_master_upper_dev_link - Add a master link to the upper device
5131 * @dev: device
5132 * @upper_dev: new upper device
5133 *
5134 * Adds a link to device which is upper to this one. In this case, only
5135 * one master upper device can be linked, although other non-master devices
5136 * might be linked as well. The caller must hold the RTNL lock.
5137 * On a failure a negative errno code is returned. On success the reference
5138 * counts are adjusted and the function returns zero.
5139 */
5140 int netdev_master_upper_dev_link(struct net_device *dev,
5141 struct net_device *upper_dev)
5142 {
5143 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5144 }
5145 EXPORT_SYMBOL(netdev_master_upper_dev_link);
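
/* Example: how a bonding/team-style driver would attach a slave, sketched
 * with hypothetical device pointers.  Runs under RTNL; on success the slave
 * gains "master" and "upper_<master>" sysfs links and the master gains
 * "lower_<slave>".  netdev_upper_dev_unlink() reverses the operation when the
 * slave is released.
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev);
 *	if (err)
 *		return err;
 */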
5146
5147 int netdev_master_upper_dev_link_private(struct net_device *dev,
5148 struct net_device *upper_dev,
5149 void *private)
5150 {
5151 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5152 }
5153 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5154
5155 /**
5156 * netdev_upper_dev_unlink - Removes a link to upper device
5157 * @dev: device
5158  * @upper_dev: upper device to unlink
5159 *
5160 * Removes a link to device which is upper to this one. The caller must hold
5161 * the RTNL lock.
5162 */
5163 void netdev_upper_dev_unlink(struct net_device *dev,
5164 struct net_device *upper_dev)
5165 {
5166 struct netdev_adjacent *i, *j;
5167 ASSERT_RTNL();
5168
5169 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5170
5171 /* Here is the tricky part. We must remove all dev's lower
5172 * devices from all upper_dev's upper devices and vice
5173 * versa, to maintain the graph relationship.
5174 */
5175 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5176 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5177 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5178
5179 /* also remove the devices themselves from the lower/upper device
5180  * lists
5181 */
5182 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5183 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5184
5185 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5186 __netdev_adjacent_dev_unlink(dev, i->dev);
5187
5188 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5189 }
5190 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5191
5192 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5193 {
5194 struct netdev_adjacent *iter;
5195
5196 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5197 netdev_adjacent_sysfs_del(iter->dev, oldname,
5198 &iter->dev->adj_list.lower);
5199 netdev_adjacent_sysfs_add(iter->dev, dev,
5200 &iter->dev->adj_list.lower);
5201 }
5202
5203 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5204 netdev_adjacent_sysfs_del(iter->dev, oldname,
5205 &iter->dev->adj_list.upper);
5206 netdev_adjacent_sysfs_add(iter->dev, dev,
5207 &iter->dev->adj_list.upper);
5208 }
5209 }
5210
5211 void *netdev_lower_dev_get_private(struct net_device *dev,
5212 struct net_device *lower_dev)
5213 {
5214 struct netdev_adjacent *lower;
5215
5216 if (!lower_dev)
5217 return NULL;
5218 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5219 if (!lower)
5220 return NULL;
5221
5222 return lower->private;
5223 }
5224 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5225
5226
5227 int dev_get_nest_level(struct net_device *dev,
5228 bool (*type_check)(struct net_device *dev))
5229 {
5230 struct net_device *lower = NULL;
5231 struct list_head *iter;
5232 int max_nest = -1;
5233 int nest;
5234
5235 ASSERT_RTNL();
5236
5237 netdev_for_each_lower_dev(dev, lower, iter) {
5238 nest = dev_get_nest_level(lower, type_check);
5239 if (max_nest < nest)
5240 max_nest = nest;
5241 }
5242
5243 if (type_check(dev))
5244 max_nest++;
5245
5246 return max_nest;
5247 }
5248 EXPORT_SYMBOL(dev_get_nest_level);
5249
5250 static void dev_change_rx_flags(struct net_device *dev, int flags)
5251 {
5252 const struct net_device_ops *ops = dev->netdev_ops;
5253
5254 if (ops->ndo_change_rx_flags)
5255 ops->ndo_change_rx_flags(dev, flags);
5256 }
5257
5258 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5259 {
5260 unsigned int old_flags = dev->flags;
5261 kuid_t uid;
5262 kgid_t gid;
5263
5264 ASSERT_RTNL();
5265
5266 dev->flags |= IFF_PROMISC;
5267 dev->promiscuity += inc;
5268 if (dev->promiscuity == 0) {
5269 /*
5270 * Avoid overflow.
5271  * If inc causes overflow, leave promisc untouched and return an error.
5272 */
5273 if (inc < 0)
5274 dev->flags &= ~IFF_PROMISC;
5275 else {
5276 dev->promiscuity -= inc;
5277 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5278 dev->name);
5279 return -EOVERFLOW;
5280 }
5281 }
5282 if (dev->flags != old_flags) {
5283 pr_info("device %s %s promiscuous mode\n",
5284 dev->name,
5285 dev->flags & IFF_PROMISC ? "entered" : "left");
5286 if (audit_enabled) {
5287 current_uid_gid(&uid, &gid);
5288 audit_log(current->audit_context, GFP_ATOMIC,
5289 AUDIT_ANOM_PROMISCUOUS,
5290 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5291 dev->name, (dev->flags & IFF_PROMISC),
5292 (old_flags & IFF_PROMISC),
5293 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5294 from_kuid(&init_user_ns, uid),
5295 from_kgid(&init_user_ns, gid),
5296 audit_get_sessionid(current));
5297 }
5298
5299 dev_change_rx_flags(dev, IFF_PROMISC);
5300 }
5301 if (notify)
5302 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5303 return 0;
5304 }
5305
5306 /**
5307 * dev_set_promiscuity - update promiscuity count on a device
5308 * @dev: device
5309 * @inc: modifier
5310 *
5311 * Add or remove promiscuity from a device. While the count in the device
5312 * remains above zero the interface remains promiscuous. Once it hits zero
5313 * the device reverts back to normal filtering operation. A negative inc
5314 * value is used to drop promiscuity on the device.
5315 * Return 0 if successful or a negative errno code on error.
5316 */
5317 int dev_set_promiscuity(struct net_device *dev, int inc)
5318 {
5319 unsigned int old_flags = dev->flags;
5320 int err;
5321
5322 err = __dev_set_promiscuity(dev, inc, true);
5323 if (err < 0)
5324 return err;
5325 if (dev->flags != old_flags)
5326 dev_set_rx_mode(dev);
5327 return err;
5328 }
5329 EXPORT_SYMBOL(dev_set_promiscuity);
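
/* Example: a packet-capture style user taking and later dropping a
 * promiscuity reference.  The count is cumulative, so every +1 must
 * eventually be paired with a -1, and both calls must run under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */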
5330
5331 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5332 {
5333 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5334
5335 ASSERT_RTNL();
5336
5337 dev->flags |= IFF_ALLMULTI;
5338 dev->allmulti += inc;
5339 if (dev->allmulti == 0) {
5340 /*
5341 * Avoid overflow.
5342  * If inc causes overflow, leave allmulti untouched and return an error.
5343 */
5344 if (inc < 0)
5345 dev->flags &= ~IFF_ALLMULTI;
5346 else {
5347 dev->allmulti -= inc;
5348 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5349 dev->name);
5350 return -EOVERFLOW;
5351 }
5352 }
5353 if (dev->flags ^ old_flags) {
5354 dev_change_rx_flags(dev, IFF_ALLMULTI);
5355 dev_set_rx_mode(dev);
5356 if (notify)
5357 __dev_notify_flags(dev, old_flags,
5358 dev->gflags ^ old_gflags);
5359 }
5360 return 0;
5361 }
5362
5363 /**
5364 * dev_set_allmulti - update allmulti count on a device
5365 * @dev: device
5366 * @inc: modifier
5367 *
5368 * Add or remove reception of all multicast frames to a device. While the
5369 * count in the device remains above zero the interface remains listening
5370  * to all multicast frames. Once it hits zero the device reverts back to normal
5371 * filtering operation. A negative @inc value is used to drop the counter
5372 * when releasing a resource needing all multicasts.
5373 * Return 0 if successful or a negative errno code on error.
5374 */
5375
5376 int dev_set_allmulti(struct net_device *dev, int inc)
5377 {
5378 return __dev_set_allmulti(dev, inc, true);
5379 }
5380 EXPORT_SYMBOL(dev_set_allmulti);
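
/* Example: the same reference-counted pattern applies to allmulti.  A caller
 * that needs every multicast frame takes a reference and releases it later,
 * both times under RTNL:
 *
 *	dev_set_allmulti(dev, 1);
 *	dev_set_allmulti(dev, -1);
 */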
5381
5382 /*
5383 * Upload unicast and multicast address lists to device and
5384 * configure RX filtering. When the device doesn't support unicast
5385 * filtering it is put in promiscuous mode while unicast addresses
5386 * are present.
5387 */
5388 void __dev_set_rx_mode(struct net_device *dev)
5389 {
5390 const struct net_device_ops *ops = dev->netdev_ops;
5391
5392 /* dev_open will call this function so the list will stay sane. */
5393 if (!(dev->flags&IFF_UP))
5394 return;
5395
5396 if (!netif_device_present(dev))
5397 return;
5398
5399 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5400 /* Unicast address changes may only happen under the rtnl,
5401 * therefore calling __dev_set_promiscuity here is safe.
5402 */
5403 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5404 __dev_set_promiscuity(dev, 1, false);
5405 dev->uc_promisc = true;
5406 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5407 __dev_set_promiscuity(dev, -1, false);
5408 dev->uc_promisc = false;
5409 }
5410 }
5411
5412 if (ops->ndo_set_rx_mode)
5413 ops->ndo_set_rx_mode(dev);
5414 }
5415
5416 void dev_set_rx_mode(struct net_device *dev)
5417 {
5418 netif_addr_lock_bh(dev);
5419 __dev_set_rx_mode(dev);
5420 netif_addr_unlock_bh(dev);
5421 }
5422
5423 /**
5424 * dev_get_flags - get flags reported to userspace
5425 * @dev: device
5426 *
5427 * Get the combination of flag bits exported through APIs to userspace.
5428 */
5429 unsigned int dev_get_flags(const struct net_device *dev)
5430 {
5431 unsigned int flags;
5432
5433 flags = (dev->flags & ~(IFF_PROMISC |
5434 IFF_ALLMULTI |
5435 IFF_RUNNING |
5436 IFF_LOWER_UP |
5437 IFF_DORMANT)) |
5438 (dev->gflags & (IFF_PROMISC |
5439 IFF_ALLMULTI));
5440
5441 if (netif_running(dev)) {
5442 if (netif_oper_up(dev))
5443 flags |= IFF_RUNNING;
5444 if (netif_carrier_ok(dev))
5445 flags |= IFF_LOWER_UP;
5446 if (netif_dormant(dev))
5447 flags |= IFF_DORMANT;
5448 }
5449
5450 return flags;
5451 }
5452 EXPORT_SYMBOL(dev_get_flags);
5453
5454 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5455 {
5456 unsigned int old_flags = dev->flags;
5457 int ret;
5458
5459 ASSERT_RTNL();
5460
5461 /*
5462 * Set the flags on our device.
5463 */
5464
5465 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5466 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5467 IFF_AUTOMEDIA)) |
5468 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5469 IFF_ALLMULTI));
5470
5471 /*
5472 * Load in the correct multicast list now the flags have changed.
5473 */
5474
5475 if ((old_flags ^ flags) & IFF_MULTICAST)
5476 dev_change_rx_flags(dev, IFF_MULTICAST);
5477
5478 dev_set_rx_mode(dev);
5479
5480 /*
5481  * Have we downed the interface? We handle IFF_UP ourselves
5482 * according to user attempts to set it, rather than blindly
5483 * setting it.
5484 */
5485
5486 ret = 0;
5487 if ((old_flags ^ flags) & IFF_UP)
5488 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5489
5490 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5491 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5492 unsigned int old_flags = dev->flags;
5493
5494 dev->gflags ^= IFF_PROMISC;
5495
5496 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5497 if (dev->flags != old_flags)
5498 dev_set_rx_mode(dev);
5499 }
5500
5501 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5502 is important. Some (broken) drivers set IFF_PROMISC when
5503 IFF_ALLMULTI is requested, without asking us and without reporting it.
5504 */
5505 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5506 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5507
5508 dev->gflags ^= IFF_ALLMULTI;
5509 __dev_set_allmulti(dev, inc, false);
5510 }
5511
5512 return ret;
5513 }
5514
5515 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5516 unsigned int gchanges)
5517 {
5518 unsigned int changes = dev->flags ^ old_flags;
5519
5520 if (gchanges)
5521 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5522
5523 if (changes & IFF_UP) {
5524 if (dev->flags & IFF_UP)
5525 call_netdevice_notifiers(NETDEV_UP, dev);
5526 else
5527 call_netdevice_notifiers(NETDEV_DOWN, dev);
5528 }
5529
5530 if (dev->flags & IFF_UP &&
5531 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5532 struct netdev_notifier_change_info change_info;
5533
5534 change_info.flags_changed = changes;
5535 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5536 &change_info.info);
5537 }
5538 }
5539
5540 /**
5541 * dev_change_flags - change device settings
5542 * @dev: device
5543 * @flags: device state flags
5544 *
5545  * Change settings on a device based on the state flags. The flags are
5546 * in the userspace exported format.
5547 */
5548 int dev_change_flags(struct net_device *dev, unsigned int flags)
5549 {
5550 int ret;
5551 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5552
5553 ret = __dev_change_flags(dev, flags);
5554 if (ret < 0)
5555 return ret;
5556
5557 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5558 __dev_notify_flags(dev, old_flags, changes);
5559 return ret;
5560 }
5561 EXPORT_SYMBOL(dev_change_flags);
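
/* Example: toggling the administrative UP state from code that already holds
 * RTNL, the same way the SIOCSIFFLAGS ioctl path does:
 *
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	err = dev_change_flags(dev, dev->flags & ~IFF_UP);
 */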
5562
5563 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5564 {
5565 const struct net_device_ops *ops = dev->netdev_ops;
5566
5567 if (ops->ndo_change_mtu)
5568 return ops->ndo_change_mtu(dev, new_mtu);
5569
5570 dev->mtu = new_mtu;
5571 return 0;
5572 }
5573
5574 /**
5575 * dev_set_mtu - Change maximum transfer unit
5576 * @dev: device
5577 * @new_mtu: new transfer unit
5578 *
5579 * Change the maximum transfer size of the network device.
5580 */
5581 int dev_set_mtu(struct net_device *dev, int new_mtu)
5582 {
5583 int err, orig_mtu;
5584
5585 if (new_mtu == dev->mtu)
5586 return 0;
5587
5588 /* MTU must not be negative. */
5589 if (new_mtu < 0)
5590 return -EINVAL;
5591
5592 if (!netif_device_present(dev))
5593 return -ENODEV;
5594
5595 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5596 err = notifier_to_errno(err);
5597 if (err)
5598 return err;
5599
5600 orig_mtu = dev->mtu;
5601 err = __dev_set_mtu(dev, new_mtu);
5602
5603 if (!err) {
5604 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5605 err = notifier_to_errno(err);
5606 if (err) {
5607 /* setting mtu back and notifying everyone again,
5608 * so that they have a chance to revert changes.
5609 */
5610 __dev_set_mtu(dev, orig_mtu);
5611 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5612 }
5613 }
5614 return err;
5615 }
5616 EXPORT_SYMBOL(dev_set_mtu);
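
/* Example: requesting a jumbo MTU from a caller that holds RTNL.  The core
 * only rejects negative values; the driver's ndo_change_mtu (or a
 * NETDEV_PRECHANGEMTU listener) may still refuse the new size.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("%s: MTU change rejected: %d\n", dev->name, err);
 */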
5617
5618 /**
5619 * dev_set_group - Change group this device belongs to
5620 * @dev: device
5621 * @new_group: group this device should belong to
5622 */
5623 void dev_set_group(struct net_device *dev, int new_group)
5624 {
5625 dev->group = new_group;
5626 }
5627 EXPORT_SYMBOL(dev_set_group);
5628
5629 /**
5630 * dev_set_mac_address - Change Media Access Control Address
5631 * @dev: device
5632 * @sa: new address
5633 *
5634 * Change the hardware (MAC) address of the device
5635 */
5636 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5637 {
5638 const struct net_device_ops *ops = dev->netdev_ops;
5639 int err;
5640
5641 if (!ops->ndo_set_mac_address)
5642 return -EOPNOTSUPP;
5643 if (sa->sa_family != dev->type)
5644 return -EINVAL;
5645 if (!netif_device_present(dev))
5646 return -ENODEV;
5647 err = ops->ndo_set_mac_address(dev, sa);
5648 if (err)
5649 return err;
5650 dev->addr_assign_type = NET_ADDR_SET;
5651 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5652 add_device_randomness(dev->dev_addr, dev->addr_len);
5653 return 0;
5654 }
5655 EXPORT_SYMBOL(dev_set_mac_address);
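
/* Example: setting an Ethernet device's MAC address from code that holds
 * RTNL.  sa_family must match dev->type; new_mac is a hypothetical
 * u8[ETH_ALEN] buffer holding the desired address.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *	err = dev_set_mac_address(dev, &sa);
 */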
5656
5657 /**
5658 * dev_change_carrier - Change device carrier
5659 * @dev: device
5660 * @new_carrier: new value
5661 *
5662 * Change device carrier
5663 */
5664 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5665 {
5666 const struct net_device_ops *ops = dev->netdev_ops;
5667
5668 if (!ops->ndo_change_carrier)
5669 return -EOPNOTSUPP;
5670 if (!netif_device_present(dev))
5671 return -ENODEV;
5672 return ops->ndo_change_carrier(dev, new_carrier);
5673 }
5674 EXPORT_SYMBOL(dev_change_carrier);
5675
5676 /**
5677 * dev_get_phys_port_id - Get device physical port ID
5678 * @dev: device
5679 * @ppid: port ID
5680 *
5681 * Get device physical port ID
5682 */
5683 int dev_get_phys_port_id(struct net_device *dev,
5684 struct netdev_phys_port_id *ppid)
5685 {
5686 const struct net_device_ops *ops = dev->netdev_ops;
5687
5688 if (!ops->ndo_get_phys_port_id)
5689 return -EOPNOTSUPP;
5690 return ops->ndo_get_phys_port_id(dev, ppid);
5691 }
5692 EXPORT_SYMBOL(dev_get_phys_port_id);
5693
5694 /**
5695 * dev_new_index - allocate an ifindex
5696 * @net: the applicable net namespace
5697 *
5698 * Returns a suitable unique value for a new device interface
5699 * number. The caller must hold the rtnl semaphore or the
5700 * dev_base_lock to be sure it remains unique.
5701 */
5702 static int dev_new_index(struct net *net)
5703 {
5704 int ifindex = net->ifindex;
5705 for (;;) {
5706 if (++ifindex <= 0)
5707 ifindex = 1;
5708 if (!__dev_get_by_index(net, ifindex))
5709 return net->ifindex = ifindex;
5710 }
5711 }
5712
5713 /* Delayed registration/unregistration */
5714 static LIST_HEAD(net_todo_list);
5715 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5716
5717 static void net_set_todo(struct net_device *dev)
5718 {
5719 list_add_tail(&dev->todo_list, &net_todo_list);
5720 dev_net(dev)->dev_unreg_count++;
5721 }
5722
5723 static void rollback_registered_many(struct list_head *head)
5724 {
5725 struct net_device *dev, *tmp;
5726 LIST_HEAD(close_head);
5727
5728 BUG_ON(dev_boot_phase);
5729 ASSERT_RTNL();
5730
5731 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5732 /* Some devices call without registering
5733 * for initialization unwind. Remove those
5734 * devices and proceed with the remaining.
5735 */
5736 if (dev->reg_state == NETREG_UNINITIALIZED) {
5737 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5738 dev->name, dev);
5739
5740 WARN_ON(1);
5741 list_del(&dev->unreg_list);
5742 continue;
5743 }
5744 dev->dismantle = true;
5745 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5746 }
5747
5748 /* If device is running, close it first. */
5749 list_for_each_entry(dev, head, unreg_list)
5750 list_add_tail(&dev->close_list, &close_head);
5751 dev_close_many(&close_head);
5752
5753 list_for_each_entry(dev, head, unreg_list) {
5754 /* And unlink it from device chain. */
5755 unlist_netdevice(dev);
5756
5757 dev->reg_state = NETREG_UNREGISTERING;
5758 }
5759
5760 synchronize_net();
5761
5762 list_for_each_entry(dev, head, unreg_list) {
5763 /* Shutdown queueing discipline. */
5764 dev_shutdown(dev);
5765
5766
5767 /* Notify protocols that we are about to destroy
5768 this device. They should clean up all their state.
5769 */
5770 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5771
5772 /*
5773 * Flush the unicast and multicast chains
5774 */
5775 dev_uc_flush(dev);
5776 dev_mc_flush(dev);
5777
5778 if (dev->netdev_ops->ndo_uninit)
5779 dev->netdev_ops->ndo_uninit(dev);
5780
5781 if (!dev->rtnl_link_ops ||
5782 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5783 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5784
5785 /* Notifier chain MUST detach us all upper devices. */
5786 WARN_ON(netdev_has_any_upper_dev(dev));
5787
5788 /* Remove entries from kobject tree */
5789 netdev_unregister_kobject(dev);
5790 #ifdef CONFIG_XPS
5791 /* Remove XPS queueing entries */
5792 netif_reset_xps_queues_gt(dev, 0);
5793 #endif
5794 }
5795
5796 synchronize_net();
5797
5798 list_for_each_entry(dev, head, unreg_list)
5799 dev_put(dev);
5800 }
5801
5802 static void rollback_registered(struct net_device *dev)
5803 {
5804 LIST_HEAD(single);
5805
5806 list_add(&dev->unreg_list, &single);
5807 rollback_registered_many(&single);
5808 list_del(&single);
5809 }
5810
5811 static netdev_features_t netdev_fix_features(struct net_device *dev,
5812 netdev_features_t features)
5813 {
5814 /* Fix illegal checksum combinations */
5815 if ((features & NETIF_F_HW_CSUM) &&
5816 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5817 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5818 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5819 }
5820
5821 /* TSO requires that SG is present as well. */
5822 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5823 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5824 features &= ~NETIF_F_ALL_TSO;
5825 }
5826
5827 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5828 !(features & NETIF_F_IP_CSUM)) {
5829 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5830 features &= ~NETIF_F_TSO;
5831 features &= ~NETIF_F_TSO_ECN;
5832 }
5833
5834 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5835 !(features & NETIF_F_IPV6_CSUM)) {
5836 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5837 features &= ~NETIF_F_TSO6;
5838 }
5839
5840 /* TSO ECN requires that TSO is present as well. */
5841 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5842 features &= ~NETIF_F_TSO_ECN;
5843
5844 /* Software GSO depends on SG. */
5845 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5846 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5847 features &= ~NETIF_F_GSO;
5848 }
5849
5850 /* UFO needs SG and checksumming */
5851 if (features & NETIF_F_UFO) {
5852 /* maybe split UFO into V4 and V6? */
5853 if (!((features & NETIF_F_GEN_CSUM) ||
5854 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5855 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5856 netdev_dbg(dev,
5857 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5858 features &= ~NETIF_F_UFO;
5859 }
5860
5861 if (!(features & NETIF_F_SG)) {
5862 netdev_dbg(dev,
5863 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5864 features &= ~NETIF_F_UFO;
5865 }
5866 }
5867
5868 #ifdef CONFIG_NET_RX_BUSY_POLL
5869 if (dev->netdev_ops->ndo_busy_poll)
5870 features |= NETIF_F_BUSY_POLL;
5871 else
5872 #endif
5873 features &= ~NETIF_F_BUSY_POLL;
5874
5875 return features;
5876 }
5877
5878 int __netdev_update_features(struct net_device *dev)
5879 {
5880 netdev_features_t features;
5881 int err = 0;
5882
5883 ASSERT_RTNL();
5884
5885 features = netdev_get_wanted_features(dev);
5886
5887 if (dev->netdev_ops->ndo_fix_features)
5888 features = dev->netdev_ops->ndo_fix_features(dev, features);
5889
5890 /* driver might be less strict about feature dependencies */
5891 features = netdev_fix_features(dev, features);
5892
5893 if (dev->features == features)
5894 return 0;
5895
5896 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5897 &dev->features, &features);
5898
5899 if (dev->netdev_ops->ndo_set_features)
5900 err = dev->netdev_ops->ndo_set_features(dev, features);
5901
5902 if (unlikely(err < 0)) {
5903 netdev_err(dev,
5904 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5905 err, &features, &dev->features);
5906 return -1;
5907 }
5908
5909 if (!err)
5910 dev->features = features;
5911
5912 return 1;
5913 }
5914
5915 /**
5916 * netdev_update_features - recalculate device features
5917 * @dev: the device to check
5918 *
5919 * Recalculate dev->features set and send notifications if it
5920  * has changed. Should be called after driver- or hardware-dependent
5921  * conditions that influence the features might have changed.
5922 */
5923 void netdev_update_features(struct net_device *dev)
5924 {
5925 if (__netdev_update_features(dev))
5926 netdev_features_change(dev);
5927 }
5928 EXPORT_SYMBOL(netdev_update_features);
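
/*
 * Illustrative sketch, not part of this file: a driver whose hardware
 * cannot checksum jumbo frames might mask the offload from its
 * ndo_fix_features callback and ask the core to re-evaluate features
 * from ndo_change_mtu by calling netdev_update_features() under RTNL.
 * All names and the 1500-byte limit below are hypothetical.
 */
static netdev_features_t example_fix_features(struct net_device *dev,
					      netdev_features_t features)
{
	/* hypothetical hardware limit: no RX checksum offload on jumbo MTUs */
	if (dev->mtu > 1500)
		features &= ~NETIF_F_RXCSUM;
	return features;
}

static int example_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	/* ndo_change_mtu runs under RTNL, which netdev_update_features() needs */
	netdev_update_features(dev);
	return 0;
}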
5929
5930 /**
5931 * netdev_change_features - recalculate device features
5932 * @dev: the device to check
5933 *
5934 * Recalculate dev->features set and send notifications even
5935 * if they have not changed. Should be called instead of
5936 * netdev_update_features() if also dev->vlan_features might
5937 * have changed to allow the changes to be propagated to stacked
5938 * VLAN devices.
5939 */
5940 void netdev_change_features(struct net_device *dev)
5941 {
5942 __netdev_update_features(dev);
5943 netdev_features_change(dev);
5944 }
5945 EXPORT_SYMBOL(netdev_change_features);
5946
5947 /**
5948 * netif_stacked_transfer_operstate - transfer operstate
5949 * @rootdev: the root or lower level device to transfer state from
5950 * @dev: the device to transfer operstate to
5951 *
5952 * Transfer operational state from root to device. This is normally
5953 * called when a stacking relationship exists between the root
5954 * device and the device (a leaf device).
5955 */
5956 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5957 struct net_device *dev)
5958 {
5959 if (rootdev->operstate == IF_OPER_DORMANT)
5960 netif_dormant_on(dev);
5961 else
5962 netif_dormant_off(dev);
5963
5964 if (netif_carrier_ok(rootdev)) {
5965 if (!netif_carrier_ok(dev))
5966 netif_carrier_on(dev);
5967 } else {
5968 if (netif_carrier_ok(dev))
5969 netif_carrier_off(dev);
5970 }
5971 }
5972 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
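
/*
 * Illustrative sketch, not part of this file: a stacked (VLAN-like)
 * upper device can mirror its lower device's carrier and dormancy by
 * calling netif_stacked_transfer_operstate() whenever the lower device
 * reports a state change. The notifier below is hypothetical; netdev
 * notifiers run under RTNL, so netdev_master_upper_dev_get() is safe.
 */
static int example_lower_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct net_device *lower = netdev_notifier_info_to_dev(ptr);
	struct net_device *upper;

	if (event != NETDEV_CHANGE && event != NETDEV_UP &&
	    event != NETDEV_DOWN)
		return NOTIFY_DONE;

	upper = netdev_master_upper_dev_get(lower);
	if (upper)
		netif_stacked_transfer_operstate(lower, upper);

	return NOTIFY_DONE;
}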
5973
5974 #ifdef CONFIG_SYSFS
5975 static int netif_alloc_rx_queues(struct net_device *dev)
5976 {
5977 unsigned int i, count = dev->num_rx_queues;
5978 struct netdev_rx_queue *rx;
5979
5980 BUG_ON(count < 1);
5981
5982 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5983 if (!rx)
5984 return -ENOMEM;
5985
5986 dev->_rx = rx;
5987
5988 for (i = 0; i < count; i++)
5989 rx[i].dev = dev;
5990 return 0;
5991 }
5992 #endif
5993
5994 static void netdev_init_one_queue(struct net_device *dev,
5995 struct netdev_queue *queue, void *_unused)
5996 {
5997 /* Initialize queue lock */
5998 spin_lock_init(&queue->_xmit_lock);
5999 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6000 queue->xmit_lock_owner = -1;
6001 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6002 queue->dev = dev;
6003 #ifdef CONFIG_BQL
6004 dql_init(&queue->dql, HZ);
6005 #endif
6006 }
6007
6008 static void netif_free_tx_queues(struct net_device *dev)
6009 {
6010 kvfree(dev->_tx);
6011 }
6012
6013 static int netif_alloc_netdev_queues(struct net_device *dev)
6014 {
6015 unsigned int count = dev->num_tx_queues;
6016 struct netdev_queue *tx;
6017 size_t sz = count * sizeof(*tx);
6018
6019 BUG_ON(count < 1 || count > 0xffff);
6020
6021 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6022 if (!tx) {
6023 tx = vzalloc(sz);
6024 if (!tx)
6025 return -ENOMEM;
6026 }
6027 dev->_tx = tx;
6028
6029 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6030 spin_lock_init(&dev->tx_global_lock);
6031
6032 return 0;
6033 }
6034
6035 /**
6036 * register_netdevice - register a network device
6037 * @dev: device to register
6038 *
6039 * Take a completed network device structure and add it to the kernel
6040 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6041 * chain. 0 is returned on success. A negative errno code is returned
6042 * on a failure to set up the device, or if the name is a duplicate.
6043 *
6044 * Callers must hold the rtnl semaphore. You may want
6045 * register_netdev() instead of this.
6046 *
6047 * BUGS:
6048 * The locking appears insufficient to guarantee two parallel registers
6049 * will not get the same name.
6050 */
6051
6052 int register_netdevice(struct net_device *dev)
6053 {
6054 int ret;
6055 struct net *net = dev_net(dev);
6056
6057 BUG_ON(dev_boot_phase);
6058 ASSERT_RTNL();
6059
6060 might_sleep();
6061
6062 /* When net_devices are persistent, this will be fatal. */
6063 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6064 BUG_ON(!net);
6065
6066 spin_lock_init(&dev->addr_list_lock);
6067 netdev_set_addr_lockdep_class(dev);
6068
6069 dev->iflink = -1;
6070
6071 ret = dev_get_valid_name(net, dev, dev->name);
6072 if (ret < 0)
6073 goto out;
6074
6075 /* Init, if this function is available */
6076 if (dev->netdev_ops->ndo_init) {
6077 ret = dev->netdev_ops->ndo_init(dev);
6078 if (ret) {
6079 if (ret > 0)
6080 ret = -EIO;
6081 goto out;
6082 }
6083 }
6084
6085 if (((dev->hw_features | dev->features) &
6086 NETIF_F_HW_VLAN_CTAG_FILTER) &&
6087 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6088 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6089 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6090 ret = -EINVAL;
6091 goto err_uninit;
6092 }
6093
6094 ret = -EBUSY;
6095 if (!dev->ifindex)
6096 dev->ifindex = dev_new_index(net);
6097 else if (__dev_get_by_index(net, dev->ifindex))
6098 goto err_uninit;
6099
6100 if (dev->iflink == -1)
6101 dev->iflink = dev->ifindex;
6102
6103 /* Transfer changeable features to wanted_features and enable
6104 * software offloads (GSO and GRO).
6105 */
6106 dev->hw_features |= NETIF_F_SOFT_FEATURES;
6107 dev->features |= NETIF_F_SOFT_FEATURES;
6108 dev->wanted_features = dev->features & dev->hw_features;
6109
6110 if (!(dev->flags & IFF_LOOPBACK)) {
6111 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6112 }
6113
6114 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6115 */
6116 dev->vlan_features |= NETIF_F_HIGHDMA;
6117
6118 /* Make NETIF_F_SG inheritable to tunnel devices.
6119 */
6120 dev->hw_enc_features |= NETIF_F_SG;
6121
6122 /* Make NETIF_F_SG inheritable to MPLS.
6123 */
6124 dev->mpls_features |= NETIF_F_SG;
6125
6126 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6127 ret = notifier_to_errno(ret);
6128 if (ret)
6129 goto err_uninit;
6130
6131 ret = netdev_register_kobject(dev);
6132 if (ret)
6133 goto err_uninit;
6134 dev->reg_state = NETREG_REGISTERED;
6135
6136 __netdev_update_features(dev);
6137
6138 /*
6139 * The default initial state at registration is that the
6140 * device is present.
6141 */
6142
6143 set_bit(__LINK_STATE_PRESENT, &dev->state);
6144
6145 linkwatch_init_dev(dev);
6146
6147 dev_init_scheduler(dev);
6148 dev_hold(dev);
6149 list_netdevice(dev);
6150 add_device_randomness(dev->dev_addr, dev->addr_len);
6151
6152 /* If the device has a permanent device address, the driver should
6153 * set dev_addr, and addr_assign_type should remain
6154 * NET_ADDR_PERM (the default value).
6155 */
6156 if (dev->addr_assign_type == NET_ADDR_PERM)
6157 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6158
6159 /* Notify protocols that a new device appeared. */
6160 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6161 ret = notifier_to_errno(ret);
6162 if (ret) {
6163 rollback_registered(dev);
6164 dev->reg_state = NETREG_UNREGISTERED;
6165 }
6166 /*
6167 * Prevent userspace races by waiting until the network
6168 * device is fully setup before sending notifications.
6169 */
6170 if (!dev->rtnl_link_ops ||
6171 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6172 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6173
6174 out:
6175 return ret;
6176
6177 err_uninit:
6178 if (dev->netdev_ops->ndo_uninit)
6179 dev->netdev_ops->ndo_uninit(dev);
6180 goto out;
6181 }
6182 EXPORT_SYMBOL(register_netdevice);
6183
6184 /**
6185 * init_dummy_netdev - init a dummy network device for NAPI
6186 * @dev: device to init
6187 *
6188 * This takes a network device structure and initializes the minimum
6189 * number of fields so it can be used to schedule NAPI polls without
6190 * registering a full-blown interface. This is to be used by drivers
6191 * that need to tie several hardware interfaces to a single NAPI
6192 * poll scheduler due to HW limitations.
6193 */
6194 int init_dummy_netdev(struct net_device *dev)
6195 {
6196 /* Clear everything. Note we don't initialize spinlocks
6197 * as they aren't supposed to be taken by any of the
6198 * NAPI code, and this dummy netdev is supposed to be
6199 * used only for NAPI polls.
6200 */
6201 memset(dev, 0, sizeof(struct net_device));
6202
6203 /* make sure we BUG if trying to hit standard
6204 * register/unregister code path
6205 */
6206 dev->reg_state = NETREG_DUMMY;
6207
6208 /* NAPI wants this */
6209 INIT_LIST_HEAD(&dev->napi_list);
6210
6211 /* a dummy interface is started by default */
6212 set_bit(__LINK_STATE_PRESENT, &dev->state);
6213 set_bit(__LINK_STATE_START, &dev->state);
6214
6215 /* Note: We don't allocate pcpu_refcnt for dummy devices,
6216 * because users of this 'device' don't need to change
6217 * its refcount.
6218 */
6219
6220 return 0;
6221 }
6222 EXPORT_SYMBOL_GPL(init_dummy_netdev);
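
/*
 * Illustrative sketch, not part of this file: a driver that must share
 * one NAPI context across several hardware ports can host the NAPI
 * instance on a dummy netdev that is never registered. The structure,
 * the function and the poll callback passed in are hypothetical.
 */
struct example_adapter {
	struct net_device napi_dev;	/* dummy device, never registered */
	struct napi_struct napi;
};

static void example_adapter_napi_init(struct example_adapter *adap,
				      int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&adap->napi_dev);
	netif_napi_add(&adap->napi_dev, &adap->napi, poll, NAPI_POLL_WEIGHT);
	napi_enable(&adap->napi);
}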
6223
6224
6225 /**
6226 * register_netdev - register a network device
6227 * @dev: device to register
6228 *
6229 * Take a completed network device structure and add it to the kernel
6230 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6231 * chain. 0 is returned on success. A negative errno code is returned
6232 * on a failure to set up the device, or if the name is a duplicate.
6233 *
6234 * This is a wrapper around register_netdevice that takes the rtnl semaphore
6235 * and expands the device name if you passed a format string to
6236 * alloc_netdev.
6237 */
6238 int register_netdev(struct net_device *dev)
6239 {
6240 int err;
6241
6242 rtnl_lock();
6243 err = register_netdevice(dev);
6244 rtnl_unlock();
6245 return err;
6246 }
6247 EXPORT_SYMBOL(register_netdev);
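
/*
 * Illustrative sketch, not part of this file: the usual driver-side
 * pairing of alloc_etherdev(), register_netdev() and free_netdev() in a
 * probe routine. register_netdev() takes the rtnl lock itself. The
 * private structure and all names below are hypothetical.
 */
struct example_priv {
	void __iomem *regs;
};

static int example_probe(struct device *parent)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	SET_NETDEV_DEV(dev, parent);	/* sysfs parent for udev naming rules */
	/* dev->netdev_ops, ethtool_ops, MAC address, etc. would be set here */

	err = register_netdev(dev);	/* takes and releases rtnl_lock */
	if (err) {
		free_netdev(dev);	/* never registered, so plain free */
		return err;
	}
	return 0;
}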
6248
6249 int netdev_refcnt_read(const struct net_device *dev)
6250 {
6251 int i, refcnt = 0;
6252
6253 for_each_possible_cpu(i)
6254 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6255 return refcnt;
6256 }
6257 EXPORT_SYMBOL(netdev_refcnt_read);
6258
6259 /**
6260 * netdev_wait_allrefs - wait until all references are gone.
6261 * @dev: target net_device
6262 *
6263 * This is called when unregistering network devices.
6264 *
6265 * Any protocol or device that holds a reference should register
6266 * for netdevice notification, and clean up and release the
6267 * reference if they receive an UNREGISTER event.
6268 * We can get stuck here if buggy protocols don't correctly
6269 * call dev_put.
6270 */
6271 static void netdev_wait_allrefs(struct net_device *dev)
6272 {
6273 unsigned long rebroadcast_time, warning_time;
6274 int refcnt;
6275
6276 linkwatch_forget_dev(dev);
6277
6278 rebroadcast_time = warning_time = jiffies;
6279 refcnt = netdev_refcnt_read(dev);
6280
6281 while (refcnt != 0) {
6282 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6283 rtnl_lock();
6284
6285 /* Rebroadcast unregister notification */
6286 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6287
6288 __rtnl_unlock();
6289 rcu_barrier();
6290 rtnl_lock();
6291
6292 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6293 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6294 &dev->state)) {
6295 /* We must not have linkwatch events
6296 * pending on unregister. If this
6297 * happens, we simply run the queue
6298 * unscheduled, resulting in a noop
6299 * for this device.
6300 */
6301 linkwatch_run_queue();
6302 }
6303
6304 __rtnl_unlock();
6305
6306 rebroadcast_time = jiffies;
6307 }
6308
6309 msleep(250);
6310
6311 refcnt = netdev_refcnt_read(dev);
6312
6313 if (time_after(jiffies, warning_time + 10 * HZ)) {
6314 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6315 dev->name, refcnt);
6316 warning_time = jiffies;
6317 }
6318 }
6319 }
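
/*
 * Illustrative sketch, not part of this file: a protocol that caches a
 * net_device pointer with dev_hold() should drop the reference when it
 * sees NETDEV_UNREGISTER, otherwise netdev_wait_allrefs() above keeps
 * logging "waiting for <dev> to become free". The cached pointer and
 * names below are hypothetical.
 */
static struct net_device *example_cached_dev;	/* holds a dev_hold() reference */

static int example_proto_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UNREGISTER && dev == example_cached_dev) {
		example_cached_dev = NULL;
		dev_put(dev);		/* let the unregister complete */
	}
	return NOTIFY_DONE;
}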
6320
6321 /* The sequence is:
6322 *
6323 * rtnl_lock();
6324 * ...
6325 * register_netdevice(x1);
6326 * register_netdevice(x2);
6327 * ...
6328 * unregister_netdevice(y1);
6329 * unregister_netdevice(y2);
6330 * ...
6331 * rtnl_unlock();
6332 * free_netdev(y1);
6333 * free_netdev(y2);
6334 *
6335 * We are invoked by rtnl_unlock().
6336 * This allows us to deal with problems:
6337 * 1) We can delete sysfs objects which invoke hotplug
6338 * without deadlocking with linkwatch via keventd.
6339 * 2) Since we run with the RTNL semaphore not held, we can sleep
6340 * safely in order to wait for the netdev refcnt to drop to zero.
6341 *
6342 * We must not return until all unregister events added during
6343 * the interval the lock was held have been completed.
6344 */
6345 void netdev_run_todo(void)
6346 {
6347 struct list_head list;
6348
6349 /* Snapshot list, allow later requests */
6350 list_replace_init(&net_todo_list, &list);
6351
6352 __rtnl_unlock();
6353
6354
6355 /* Wait for rcu callbacks to finish before next phase */
6356 if (!list_empty(&list))
6357 rcu_barrier();
6358
6359 while (!list_empty(&list)) {
6360 struct net_device *dev
6361 = list_first_entry(&list, struct net_device, todo_list);
6362 list_del(&dev->todo_list);
6363
6364 rtnl_lock();
6365 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6366 __rtnl_unlock();
6367
6368 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6369 pr_err("network todo '%s' but state %d\n",
6370 dev->name, dev->reg_state);
6371 dump_stack();
6372 continue;
6373 }
6374
6375 dev->reg_state = NETREG_UNREGISTERED;
6376
6377 on_each_cpu(flush_backlog, dev, 1);
6378
6379 netdev_wait_allrefs(dev);
6380
6381 /* paranoia */
6382 BUG_ON(netdev_refcnt_read(dev));
6383 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6384 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6385 WARN_ON(dev->dn_ptr);
6386
6387 if (dev->destructor)
6388 dev->destructor(dev);
6389
6390 /* Report a network device has been unregistered */
6391 rtnl_lock();
6392 dev_net(dev)->dev_unreg_count--;
6393 __rtnl_unlock();
6394 wake_up(&netdev_unregistering_wq);
6395
6396 /* Free network device */
6397 kobject_put(&dev->dev.kobj);
6398 }
6399 }
6400
6401 /* Convert net_device_stats to rtnl_link_stats64. They have the same
6402 * fields in the same order, with only the type differing.
6403 */
6404 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6405 const struct net_device_stats *netdev_stats)
6406 {
6407 #if BITS_PER_LONG == 64
6408 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6409 memcpy(stats64, netdev_stats, sizeof(*stats64));
6410 #else
6411 size_t i, n = sizeof(*stats64) / sizeof(u64);
6412 const unsigned long *src = (const unsigned long *)netdev_stats;
6413 u64 *dst = (u64 *)stats64;
6414
6415 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6416 sizeof(*stats64) / sizeof(u64));
6417 for (i = 0; i < n; i++)
6418 dst[i] = src[i];
6419 #endif
6420 }
6421 EXPORT_SYMBOL(netdev_stats_to_stats64);
6422
6423 /**
6424 * dev_get_stats - get network device statistics
6425 * @dev: device to get statistics from
6426 * @storage: place to store stats
6427 *
6428 * Get network statistics from device. Return @storage.
6429 * The device driver may provide its own method by setting
6430 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6431 * otherwise the internal statistics structure is used.
6432 */
6433 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6434 struct rtnl_link_stats64 *storage)
6435 {
6436 const struct net_device_ops *ops = dev->netdev_ops;
6437
6438 if (ops->ndo_get_stats64) {
6439 memset(storage, 0, sizeof(*storage));
6440 ops->ndo_get_stats64(dev, storage);
6441 } else if (ops->ndo_get_stats) {
6442 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6443 } else {
6444 netdev_stats_to_stats64(storage, &dev->stats);
6445 }
6446 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6447 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6448 return storage;
6449 }
6450 EXPORT_SYMBOL(dev_get_stats);
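
/*
 * Illustrative sketch, not part of this file: a driver-side
 * ndo_get_stats64 implementation of the kind dev_get_stats() calls
 * above. The core pre-zeroes @storage, so the driver fills in only what
 * it tracks. The private counters and names are hypothetical.
 */
struct example_stats_priv {
	u64 rx_packets;
	u64 tx_packets;
	u64 rx_dropped;
};

static struct rtnl_link_stats64 *
example_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
{
	struct example_stats_priv *priv = netdev_priv(dev);

	/* storage was zeroed by dev_get_stats(); copy only tracked fields */
	storage->rx_packets = priv->rx_packets;
	storage->tx_packets = priv->tx_packets;
	storage->rx_dropped = priv->rx_dropped;
	return storage;
}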
6451
6452 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6453 {
6454 struct netdev_queue *queue = dev_ingress_queue(dev);
6455
6456 #ifdef CONFIG_NET_CLS_ACT
6457 if (queue)
6458 return queue;
6459 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6460 if (!queue)
6461 return NULL;
6462 netdev_init_one_queue(dev, queue, NULL);
6463 queue->qdisc = &noop_qdisc;
6464 queue->qdisc_sleeping = &noop_qdisc;
6465 rcu_assign_pointer(dev->ingress_queue, queue);
6466 #endif
6467 return queue;
6468 }
6469
6470 static const struct ethtool_ops default_ethtool_ops;
6471
6472 void netdev_set_default_ethtool_ops(struct net_device *dev,
6473 const struct ethtool_ops *ops)
6474 {
6475 if (dev->ethtool_ops == &default_ethtool_ops)
6476 dev->ethtool_ops = ops;
6477 }
6478 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6479
6480 void netdev_freemem(struct net_device *dev)
6481 {
6482 char *addr = (char *)dev - dev->padded;
6483
6484 kvfree(addr);
6485 }
6486
6487 /**
6488 * alloc_netdev_mqs - allocate network device
6489 * @sizeof_priv: size of private data to allocate space for
6490 * @name: device name format string
6491 * @name_assign_type: origin of device name
6492 * @setup: callback to initialize device
6493 * @txqs: the number of TX subqueues to allocate
6494 * @rxqs: the number of RX subqueues to allocate
6495 *
6496 * Allocates a struct net_device with private data area for driver use
6497 * and performs basic initialization. Also allocates subqueue structs
6498 * for each queue on the device.
6499 */
6500 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6501 unsigned char name_assign_type,
6502 void (*setup)(struct net_device *),
6503 unsigned int txqs, unsigned int rxqs)
6504 {
6505 struct net_device *dev;
6506 size_t alloc_size;
6507 struct net_device *p;
6508
6509 BUG_ON(strlen(name) >= sizeof(dev->name));
6510
6511 if (txqs < 1) {
6512 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6513 return NULL;
6514 }
6515
6516 #ifdef CONFIG_SYSFS
6517 if (rxqs < 1) {
6518 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6519 return NULL;
6520 }
6521 #endif
6522
6523 alloc_size = sizeof(struct net_device);
6524 if (sizeof_priv) {
6525 /* ensure 32-byte alignment of private area */
6526 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6527 alloc_size += sizeof_priv;
6528 }
6529 /* ensure 32-byte alignment of whole construct */
6530 alloc_size += NETDEV_ALIGN - 1;
6531
6532 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6533 if (!p)
6534 p = vzalloc(alloc_size);
6535 if (!p)
6536 return NULL;
6537
6538 dev = PTR_ALIGN(p, NETDEV_ALIGN);
6539 dev->padded = (char *)dev - (char *)p;
6540
6541 dev->pcpu_refcnt = alloc_percpu(int);
6542 if (!dev->pcpu_refcnt)
6543 goto free_dev;
6544
6545 if (dev_addr_init(dev))
6546 goto free_pcpu;
6547
6548 dev_mc_init(dev);
6549 dev_uc_init(dev);
6550
6551 dev_net_set(dev, &init_net);
6552
6553 dev->gso_max_size = GSO_MAX_SIZE;
6554 dev->gso_max_segs = GSO_MAX_SEGS;
6555
6556 INIT_LIST_HEAD(&dev->napi_list);
6557 INIT_LIST_HEAD(&dev->unreg_list);
6558 INIT_LIST_HEAD(&dev->close_list);
6559 INIT_LIST_HEAD(&dev->link_watch_list);
6560 INIT_LIST_HEAD(&dev->adj_list.upper);
6561 INIT_LIST_HEAD(&dev->adj_list.lower);
6562 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6563 INIT_LIST_HEAD(&dev->all_adj_list.lower);
6564 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6565 setup(dev);
6566
6567 dev->num_tx_queues = txqs;
6568 dev->real_num_tx_queues = txqs;
6569 if (netif_alloc_netdev_queues(dev))
6570 goto free_all;
6571
6572 #ifdef CONFIG_SYSFS
6573 dev->num_rx_queues = rxqs;
6574 dev->real_num_rx_queues = rxqs;
6575 if (netif_alloc_rx_queues(dev))
6576 goto free_all;
6577 #endif
6578
6579 strcpy(dev->name, name);
6580 dev->name_assign_type = name_assign_type;
6581 dev->group = INIT_NETDEV_GROUP;
6582 if (!dev->ethtool_ops)
6583 dev->ethtool_ops = &default_ethtool_ops;
6584 return dev;
6585
6586 free_all:
6587 free_netdev(dev);
6588 return NULL;
6589
6590 free_pcpu:
6591 free_percpu(dev->pcpu_refcnt);
6592 free_dev:
6593 netdev_freemem(dev);
6594 return NULL;
6595 }
6596 EXPORT_SYMBOL(alloc_netdev_mqs);
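
/*
 * Illustrative sketch, not part of this file: allocating a multiqueue
 * Ethernet-style device with a format-string name that register_netdev()
 * will expand later. The queue counts, the private struct and the
 * "exmp%d" name below are hypothetical; ether_setup() is the stock
 * Ethernet setup helper.
 */
struct example_mq_priv {
	unsigned int id;
};

static struct net_device *example_alloc_mq(unsigned int ntx, unsigned int nrx)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(sizeof(struct example_mq_priv), "exmp%d",
			       NET_NAME_UNKNOWN, ether_setup, ntx, nrx);
	if (!dev)
		return NULL;

	/* driver-specific init of netdev_priv(dev) would go here */
	return dev;
}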
6597
6598 /**
6599 * free_netdev - free network device
6600 * @dev: device
6601 *
6602 * This function does the last stage of destroying an allocated device
6603 * interface. The reference to the device object is released.
6604 * If this is the last reference then it will be freed.
6605 */
6606 void free_netdev(struct net_device *dev)
6607 {
6608 struct napi_struct *p, *n;
6609
6610 release_net(dev_net(dev));
6611
6612 netif_free_tx_queues(dev);
6613 #ifdef CONFIG_SYSFS
6614 kfree(dev->_rx);
6615 #endif
6616
6617 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6618
6619 /* Flush device addresses */
6620 dev_addr_flush(dev);
6621
6622 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6623 netif_napi_del(p);
6624
6625 free_percpu(dev->pcpu_refcnt);
6626 dev->pcpu_refcnt = NULL;
6627
6628 /* Compatibility with error handling in drivers */
6629 if (dev->reg_state == NETREG_UNINITIALIZED) {
6630 netdev_freemem(dev);
6631 return;
6632 }
6633
6634 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6635 dev->reg_state = NETREG_RELEASED;
6636
6637 /* will free via device release */
6638 put_device(&dev->dev);
6639 }
6640 EXPORT_SYMBOL(free_netdev);
6641
6642 /**
6643 * synchronize_net - Synchronize with packet receive processing
6644 *
6645 * Wait for packets currently being received to be done.
6646 * Does not block later packets from starting.
6647 */
6648 void synchronize_net(void)
6649 {
6650 might_sleep();
6651 if (rtnl_is_locked())
6652 synchronize_rcu_expedited();
6653 else
6654 synchronize_rcu();
6655 }
6656 EXPORT_SYMBOL(synchronize_net);
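
/*
 * Illustrative sketch, not part of this file: a common teardown pattern
 * is to unpublish an object from the receive path and then call
 * synchronize_net() before freeing it, so no packet still being
 * received can touch it. The RCU-protected pointer and names below are
 * hypothetical; the caller is assumed to hold RTNL.
 */
struct example_hook {
	int id;
};

static struct example_hook __rcu *example_hook_ptr;

static void example_hook_remove(void)
{
	struct example_hook *hook;

	hook = rtnl_dereference(example_hook_ptr);
	RCU_INIT_POINTER(example_hook_ptr, NULL);

	synchronize_net();	/* wait out in-flight receive processing */
	kfree(hook);
}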
6657
6658 /**
6659 * unregister_netdevice_queue - remove device from the kernel
6660 * @dev: device
6661 * @head: list
6662 *
6663 * This function shuts down a device interface and removes it
6664 * from the kernel tables.
6665 * If head is not NULL, the device is queued to be unregistered later.
6666 *
6667 * Callers must hold the rtnl semaphore. You may want
6668 * unregister_netdev() instead of this.
6669 */
6670
6671 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6672 {
6673 ASSERT_RTNL();
6674
6675 if (head) {
6676 list_move_tail(&dev->unreg_list, head);
6677 } else {
6678 rollback_registered(dev);
6679 /* Finish processing unregister after unlock */
6680 net_set_todo(dev);
6681 }
6682 }
6683 EXPORT_SYMBOL(unregister_netdevice_queue);
6684
6685 /**
6686 * unregister_netdevice_many - unregister many devices
6687 * @head: list of devices
6688 *
6689 * Note: As most callers use a stack-allocated list_head,
6690 * we force a list_del() to make sure the stack won't be corrupted later.
6691 */
6692 void unregister_netdevice_many(struct list_head *head)
6693 {
6694 struct net_device *dev;
6695
6696 if (!list_empty(head)) {
6697 rollback_registered_many(head);
6698 list_for_each_entry(dev, head, unreg_list)
6699 net_set_todo(dev);
6700 list_del(head);
6701 }
6702 }
6703 EXPORT_SYMBOL(unregister_netdevice_many);
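
/*
 * Illustrative sketch, not part of this file: batching several
 * unregistrations under one rtnl_lock()/rtnl_unlock() cycle with a
 * stack-allocated list, as the note above describes. The device array
 * and names below are hypothetical.
 */
static void example_unregister_batch(struct net_device **devs, int count)
{
	LIST_HEAD(kill_list);	/* on the stack; list_del()'d by the core */
	int i;

	rtnl_lock();
	for (i = 0; i < count; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}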
6704
6705 /**
6706 * unregister_netdev - remove device from the kernel
6707 * @dev: device
6708 *
6709 * This function shuts down a device interface and removes it
6710 * from the kernel tables.
6711 *
6712 * This is just a wrapper for unregister_netdevice that takes
6713 * the rtnl semaphore. In general you want to use this and not
6714 * unregister_netdevice.
6715 */
6716 void unregister_netdev(struct net_device *dev)
6717 {
6718 rtnl_lock();
6719 unregister_netdevice(dev);
6720 rtnl_unlock();
6721 }
6722 EXPORT_SYMBOL(unregister_netdev);
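
/*
 * Illustrative sketch, not part of this file: the driver-side remove
 * path that mirrors the probe sketch shown after register_netdev()
 * above. unregister_netdev() takes the rtnl lock itself; free_netdev()
 * must only be called afterwards. All names are hypothetical.
 */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes and releases rtnl_lock */
	/* driver-private teardown (IRQs, DMA rings, ...) would go here */
	free_netdev(dev);		/* drops the final device reference */
}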
6723
6724 /**
6725 * dev_change_net_namespace - move device to a different network namespace
6726 * @dev: device
6727 * @net: network namespace
6728 * @pat: If not NULL, name pattern to try if the current device name
6729 * is already taken in the destination network namespace.
6730 *
6731 * This function shuts down a device interface and moves it
6732 * to a new network namespace. On success 0 is returned, on
6733 * a failure a negative errno code is returned.
6734 *
6735 * Callers must hold the rtnl semaphore.
6736 */
6737
6738 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6739 {
6740 int err;
6741
6742 ASSERT_RTNL();
6743
6744 /* Don't allow namespace local devices to be moved. */
6745 err = -EINVAL;
6746 if (dev->features & NETIF_F_NETNS_LOCAL)
6747 goto out;
6748
6749 /* Ensure the device has been registered */
6750 if (dev->reg_state != NETREG_REGISTERED)
6751 goto out;
6752
6753 /* Get out if there is nothing to do */
6754 err = 0;
6755 if (net_eq(dev_net(dev), net))
6756 goto out;
6757
6758 /* Pick the destination device name, and ensure
6759 * we can use it in the destination network namespace.
6760 */
6761 err = -EEXIST;
6762 if (__dev_get_by_name(net, dev->name)) {
6763 /* We get here if we can't use the current device name */
6764 if (!pat)
6765 goto out;
6766 if (dev_get_valid_name(net, dev, pat) < 0)
6767 goto out;
6768 }
6769
6770 /*
6771 * And now a mini version of register_netdevice and unregister_netdevice.
6772 */
6773
6774 /* If device is running close it first. */
6775 dev_close(dev);
6776
6777 /* And unlink it from device chain */
6778 err = -ENODEV;
6779 unlist_netdevice(dev);
6780
6781 synchronize_net();
6782
6783 /* Shutdown queueing discipline. */
6784 dev_shutdown(dev);
6785
6786 /* Notify protocols that we are about to destroy
6787 this device. They should clean up all their state.
6788
6789 Note that dev->reg_state stays at NETREG_REGISTERED.
6790 This is wanted because this way 8021q and macvlan know
6791 the device is just moving and can keep their slaves up.
6792 */
6793 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6794 rcu_barrier();
6795 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6796 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6797
6798 /*
6799 * Flush the unicast and multicast chains
6800 */
6801 dev_uc_flush(dev);
6802 dev_mc_flush(dev);
6803
6804 /* Send a netdev-removed uevent to the old namespace */
6805 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6806
6807 /* Actually switch the network namespace */
6808 dev_net_set(dev, net);
6809
6810 /* If there is an ifindex conflict, assign a new one */
6811 if (__dev_get_by_index(net, dev->ifindex)) {
6812 int iflink = (dev->iflink == dev->ifindex);
6813 dev->ifindex = dev_new_index(net);
6814 if (iflink)
6815 dev->iflink = dev->ifindex;
6816 }
6817
6818 /* Send a netdev-add uevent to the new namespace */
6819 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6820
6821 /* Fixup kobjects */
6822 err = device_rename(&dev->dev, dev->name);
6823 WARN_ON(err);
6824
6825 /* Add the device back in the hashes */
6826 list_netdevice(dev);
6827
6828 /* Notify protocols that a new device appeared. */
6829 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6830
6831 /*
6832 * Prevent userspace races by waiting until the network
6833 * device is fully setup before sending notifications.
6834 */
6835 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6836
6837 synchronize_net();
6838 err = 0;
6839 out:
6840 return err;
6841 }
6842 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
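
/*
 * Illustrative sketch, not part of this file: moving a device into
 * another namespace under RTNL, falling back to the "dev%d" pattern if
 * its current name is already taken there (the same pattern
 * default_device_exit() below uses). The target namespace is assumed to
 * be held by the caller; all names are hypothetical.
 */
static int example_move_to_ns(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "dev%d");
	rtnl_unlock();
	return err;
}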
6843
6844 static int dev_cpu_callback(struct notifier_block *nfb,
6845 unsigned long action,
6846 void *ocpu)
6847 {
6848 struct sk_buff **list_skb;
6849 struct sk_buff *skb;
6850 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6851 struct softnet_data *sd, *oldsd;
6852
6853 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6854 return NOTIFY_OK;
6855
6856 local_irq_disable();
6857 cpu = smp_processor_id();
6858 sd = &per_cpu(softnet_data, cpu);
6859 oldsd = &per_cpu(softnet_data, oldcpu);
6860
6861 /* Find end of our completion_queue. */
6862 list_skb = &sd->completion_queue;
6863 while (*list_skb)
6864 list_skb = &(*list_skb)->next;
6865 /* Append completion queue from offline CPU. */
6866 *list_skb = oldsd->completion_queue;
6867 oldsd->completion_queue = NULL;
6868
6869 /* Append output queue from offline CPU. */
6870 if (oldsd->output_queue) {
6871 *sd->output_queue_tailp = oldsd->output_queue;
6872 sd->output_queue_tailp = oldsd->output_queue_tailp;
6873 oldsd->output_queue = NULL;
6874 oldsd->output_queue_tailp = &oldsd->output_queue;
6875 }
6876 /* Append NAPI poll list from offline CPU. */
6877 if (!list_empty(&oldsd->poll_list)) {
6878 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6879 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6880 }
6881
6882 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6883 local_irq_enable();
6884
6885 /* Process offline CPU's input_pkt_queue */
6886 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6887 netif_rx_internal(skb);
6888 input_queue_head_incr(oldsd);
6889 }
6890 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6891 netif_rx_internal(skb);
6892 input_queue_head_incr(oldsd);
6893 }
6894
6895 return NOTIFY_OK;
6896 }
6897
6898
6899 /**
6900 * netdev_increment_features - increment feature set by one
6901 * @all: current feature set
6902 * @one: new feature set
6903 * @mask: mask feature set
6904 *
6905 * Computes a new feature set after adding a device with feature set
6906 * @one to the master device with current feature set @all. Will not
6907 * enable anything that is off in @mask. Returns the new feature set.
6908 */
6909 netdev_features_t netdev_increment_features(netdev_features_t all,
6910 netdev_features_t one, netdev_features_t mask)
6911 {
6912 if (mask & NETIF_F_GEN_CSUM)
6913 mask |= NETIF_F_ALL_CSUM;
6914 mask |= NETIF_F_VLAN_CHALLENGED;
6915
6916 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6917 all &= one | ~NETIF_F_ALL_FOR_ALL;
6918
6919 /* If one device supports hw checksumming, set for all. */
6920 if (all & NETIF_F_GEN_CSUM)
6921 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6922
6923 return all;
6924 }
6925 EXPORT_SYMBOL(netdev_increment_features);
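
/*
 * Illustrative sketch, not part of this file: a bonding-style master
 * recomputing its feature set by folding in each slave with
 * netdev_increment_features() and then letting the core re-run the
 * fixups. The slave array, the candidate mask and all names below are
 * hypothetical; the caller is assumed to hold RTNL.
 */
static void example_master_compute_features(struct net_device *master,
					    struct net_device **slaves,
					    int nslaves)
{
	/* hypothetical candidate set a master might advertise */
	const netdev_features_t mask = NETIF_F_SG | NETIF_F_ALL_CSUM |
				       NETIF_F_HIGHDMA | NETIF_F_ALL_TSO;
	netdev_features_t features = mask;
	int i;

	for (i = 0; i < nslaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     mask);

	master->vlan_features = features;
	netdev_update_features(master);
}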
6926
6927 static struct hlist_head * __net_init netdev_create_hash(void)
6928 {
6929 int i;
6930 struct hlist_head *hash;
6931
6932 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6933 if (hash != NULL)
6934 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6935 INIT_HLIST_HEAD(&hash[i]);
6936
6937 return hash;
6938 }
6939
6940 /* Initialize per network namespace state */
6941 static int __net_init netdev_init(struct net *net)
6942 {
6943 if (net != &init_net)
6944 INIT_LIST_HEAD(&net->dev_base_head);
6945
6946 net->dev_name_head = netdev_create_hash();
6947 if (net->dev_name_head == NULL)
6948 goto err_name;
6949
6950 net->dev_index_head = netdev_create_hash();
6951 if (net->dev_index_head == NULL)
6952 goto err_idx;
6953
6954 return 0;
6955
6956 err_idx:
6957 kfree(net->dev_name_head);
6958 err_name:
6959 return -ENOMEM;
6960 }
6961
6962 /**
6963 * netdev_drivername - network driver for the device
6964 * @dev: network device
6965 *
6966 * Determine network driver for device.
6967 */
6968 const char *netdev_drivername(const struct net_device *dev)
6969 {
6970 const struct device_driver *driver;
6971 const struct device *parent;
6972 const char *empty = "";
6973
6974 parent = dev->dev.parent;
6975 if (!parent)
6976 return empty;
6977
6978 driver = parent->driver;
6979 if (driver && driver->name)
6980 return driver->name;
6981 return empty;
6982 }
6983
6984 static int __netdev_printk(const char *level, const struct net_device *dev,
6985 struct va_format *vaf)
6986 {
6987 int r;
6988
6989 if (dev && dev->dev.parent) {
6990 r = dev_printk_emit(level[1] - '0',
6991 dev->dev.parent,
6992 "%s %s %s%s: %pV",
6993 dev_driver_string(dev->dev.parent),
6994 dev_name(dev->dev.parent),
6995 netdev_name(dev), netdev_reg_state(dev),
6996 vaf);
6997 } else if (dev) {
6998 r = printk("%s%s%s: %pV", level, netdev_name(dev),
6999 netdev_reg_state(dev), vaf);
7000 } else {
7001 r = printk("%s(NULL net_device): %pV", level, vaf);
7002 }
7003
7004 return r;
7005 }
7006
7007 int netdev_printk(const char *level, const struct net_device *dev,
7008 const char *format, ...)
7009 {
7010 struct va_format vaf;
7011 va_list args;
7012 int r;
7013
7014 va_start(args, format);
7015
7016 vaf.fmt = format;
7017 vaf.va = &args;
7018
7019 r = __netdev_printk(level, dev, &vaf);
7020
7021 va_end(args);
7022
7023 return r;
7024 }
7025 EXPORT_SYMBOL(netdev_printk);
7026
7027 #define define_netdev_printk_level(func, level) \
7028 int func(const struct net_device *dev, const char *fmt, ...) \
7029 { \
7030 int r; \
7031 struct va_format vaf; \
7032 va_list args; \
7033 \
7034 va_start(args, fmt); \
7035 \
7036 vaf.fmt = fmt; \
7037 vaf.va = &args; \
7038 \
7039 r = __netdev_printk(level, dev, &vaf); \
7040 \
7041 va_end(args); \
7042 \
7043 return r; \
7044 } \
7045 EXPORT_SYMBOL(func);
7046
7047 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7048 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7049 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7050 define_netdev_printk_level(netdev_err, KERN_ERR);
7051 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7052 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7053 define_netdev_printk_level(netdev_info, KERN_INFO);
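
/*
 * Illustrative sketch, not part of this file: the level-specific helpers
 * generated above are used like dev_info()/dev_err(), but prefix the
 * message with driver, bus and interface names. The link-speed reporting
 * below is hypothetical.
 */
static void example_report_link(struct net_device *dev, bool up, int mbps)
{
	if (up)
		netdev_info(dev, "link up, %d Mbps\n", mbps);
	else
		netdev_warn(dev, "link down\n");
}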
7054
7055 static void __net_exit netdev_exit(struct net *net)
7056 {
7057 kfree(net->dev_name_head);
7058 kfree(net->dev_index_head);
7059 }
7060
7061 static struct pernet_operations __net_initdata netdev_net_ops = {
7062 .init = netdev_init,
7063 .exit = netdev_exit,
7064 };
7065
7066 static void __net_exit default_device_exit(struct net *net)
7067 {
7068 struct net_device *dev, *aux;
7069 /*
7070 * Push all migratable network devices back to the
7071 * initial network namespace
7072 */
7073 rtnl_lock();
7074 for_each_netdev_safe(net, dev, aux) {
7075 int err;
7076 char fb_name[IFNAMSIZ];
7077
7078 /* Ignore unmovable devices (e.g. loopback) */
7079 if (dev->features & NETIF_F_NETNS_LOCAL)
7080 continue;
7081
7082 /* Leave virtual devices for the generic cleanup */
7083 if (dev->rtnl_link_ops)
7084 continue;
7085
7086 /* Push remaining network devices to init_net */
7087 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7088 err = dev_change_net_namespace(dev, &init_net, fb_name);
7089 if (err) {
7090 pr_emerg("%s: failed to move %s to init_net: %d\n",
7091 __func__, dev->name, err);
7092 BUG();
7093 }
7094 }
7095 rtnl_unlock();
7096 }
7097
7098 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7099 {
7100 /* Return with the rtnl_lock held when there are no network
7101 * devices unregistering in any network namespace in net_list.
7102 */
7103 struct net *net;
7104 bool unregistering;
7105 DEFINE_WAIT(wait);
7106
7107 for (;;) {
7108 prepare_to_wait(&netdev_unregistering_wq, &wait,
7109 TASK_UNINTERRUPTIBLE);
7110 unregistering = false;
7111 rtnl_lock();
7112 list_for_each_entry(net, net_list, exit_list) {
7113 if (net->dev_unreg_count > 0) {
7114 unregistering = true;
7115 break;
7116 }
7117 }
7118 if (!unregistering)
7119 break;
7120 __rtnl_unlock();
7121 schedule();
7122 }
7123 finish_wait(&netdev_unregistering_wq, &wait);
7124 }
7125
7126 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7127 {
7128 /* At exit, all network devices must be removed from a network
7129 * namespace. Do this in the reverse order of registration.
7130 * Do this across as many network namespaces as possible to
7131 * improve batching efficiency.
7132 */
7133 struct net_device *dev;
7134 struct net *net;
7135 LIST_HEAD(dev_kill_list);
7136
7137 /* To prevent network device cleanup code from dereferencing
7138 * loopback devices or network devices that have been freed,
7139 * wait here for all pending unregistrations to complete
7140 * before unregistering the loopback device and allowing the
7141 * network namespace to be freed.
7142 *
7143 * The netdev todo list containing all network device
7144 * unregistrations that happen in default_device_exit_batch
7145 * will run in the rtnl_unlock() at the end of
7146 * default_device_exit_batch.
7147 */
7148 rtnl_lock_unregistering(net_list);
7149 list_for_each_entry(net, net_list, exit_list) {
7150 for_each_netdev_reverse(net, dev) {
7151 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7152 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7153 else
7154 unregister_netdevice_queue(dev, &dev_kill_list);
7155 }
7156 }
7157 unregister_netdevice_many(&dev_kill_list);
7158 rtnl_unlock();
7159 }
7160
7161 static struct pernet_operations __net_initdata default_device_ops = {
7162 .exit = default_device_exit,
7163 .exit_batch = default_device_exit_batch,
7164 };
7165
7166 /*
7167 * Initialize the DEV module. At boot time this walks the device list and
7168 * unhooks any devices that fail to initialise (normally hardware not
7169 * present) and leaves us with a valid list of present and active devices.
7170 *
7171 */
7172
7173 /*
7174 * This is called single threaded during boot, so no need
7175 * to take the rtnl semaphore.
7176 */
7177 static int __init net_dev_init(void)
7178 {
7179 int i, rc = -ENOMEM;
7180
7181 BUG_ON(!dev_boot_phase);
7182
7183 if (dev_proc_init())
7184 goto out;
7185
7186 if (netdev_kobject_init())
7187 goto out;
7188
7189 INIT_LIST_HEAD(&ptype_all);
7190 for (i = 0; i < PTYPE_HASH_SIZE; i++)
7191 INIT_LIST_HEAD(&ptype_base[i]);
7192
7193 INIT_LIST_HEAD(&offload_base);
7194
7195 if (register_pernet_subsys(&netdev_net_ops))
7196 goto out;
7197
7198 /*
7199 * Initialise the packet receive queues.
7200 */
7201
7202 for_each_possible_cpu(i) {
7203 struct softnet_data *sd = &per_cpu(softnet_data, i);
7204
7205 skb_queue_head_init(&sd->input_pkt_queue);
7206 skb_queue_head_init(&sd->process_queue);
7207 INIT_LIST_HEAD(&sd->poll_list);
7208 sd->output_queue_tailp = &sd->output_queue;
7209 #ifdef CONFIG_RPS
7210 sd->csd.func = rps_trigger_softirq;
7211 sd->csd.info = sd;
7212 sd->cpu = i;
7213 #endif
7214
7215 sd->backlog.poll = process_backlog;
7216 sd->backlog.weight = weight_p;
7217 }
7218
7219 dev_boot_phase = 0;
7220
7221 /* The loopback device is special: if any other network device
7222 * is present in a network namespace, the loopback device must
7223 * be present. Since we now dynamically allocate and free the
7224 * loopback device, ensure this invariant is maintained by
7225 * keeping the loopback device as the first device on the
7226 * list of network devices, so that the loopback device
7227 * is the first device that appears and the last network device
7228 * that disappears.
7229 */
7230 if (register_pernet_device(&loopback_net_ops))
7231 goto out;
7232
7233 if (register_pernet_device(&default_device_ops))
7234 goto out;
7235
7236 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7237 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7238
7239 hotcpu_notifier(dev_cpu_callback, 0);
7240 dst_init();
7241 rc = 0;
7242 out:
7243 return rc;
7244 }
7245
7246 subsys_initcall(net_dev_init);