1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/sctp.h>
143 #include <linux/crash_dump.h>
144
145 #include "net-sysfs.h"
146
147 /* Instead of increasing this, you should create a hash table. */
148 #define MAX_GRO_SKBS 8
149
150 /* This should be increased if a protocol with a bigger head is added. */
151 #define GRO_MAX_HEAD (MAX_HEADER + 128)
152
153 static DEFINE_SPINLOCK(ptype_lock);
154 static DEFINE_SPINLOCK(offload_lock);
155 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
156 struct list_head ptype_all __read_mostly; /* Taps */
157 static struct list_head offload_base __read_mostly;
158
159 static int netif_rx_internal(struct sk_buff *skb);
160 static int call_netdevice_notifiers_info(unsigned long val,
161 struct net_device *dev,
162 struct netdev_notifier_info *info);
163
164 /*
165 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
166 * semaphore.
167 *
168 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
169 *
170 * Writers must hold the rtnl semaphore while they loop through the
171 * dev_base_head list, and hold dev_base_lock for writing when they do the
172 * actual updates. This allows pure readers to access the list even
173 * while a writer is preparing to update it.
174 *
175 * To put it another way, dev_base_lock is held for writing only to
176 * protect against pure readers; the rtnl semaphore provides the
177 * protection against other writers.
178 *
179 * See, for example usages, register_netdevice() and
180 * unregister_netdevice(), which must be called with the rtnl
181 * semaphore held.
182 */
183 DEFINE_RWLOCK(dev_base_lock);
184 EXPORT_SYMBOL(dev_base_lock);
185
186 /* protects napi_hash addition/deletion and napi_gen_id */
187 static DEFINE_SPINLOCK(napi_hash_lock);
188
189 static unsigned int napi_gen_id = NR_CPUS;
190 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
191
192 static seqcount_t devnet_rename_seq;
193
194 static inline void dev_base_seq_inc(struct net *net)
195 {
196 while (++net->dev_base_seq == 0);
197 }
198
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
202
203 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
204 }
205
206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
207 {
208 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
209 }
210
211 static inline void rps_lock(struct softnet_data *sd)
212 {
213 #ifdef CONFIG_RPS
214 spin_lock(&sd->input_pkt_queue.lock);
215 #endif
216 }
217
218 static inline void rps_unlock(struct softnet_data *sd)
219 {
220 #ifdef CONFIG_RPS
221 spin_unlock(&sd->input_pkt_queue.lock);
222 #endif
223 }
224
225 /* Device list insertion */
226 static void list_netdevice(struct net_device *dev)
227 {
228 struct net *net = dev_net(dev);
229
230 ASSERT_RTNL();
231
232 write_lock_bh(&dev_base_lock);
233 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
234 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
235 hlist_add_head_rcu(&dev->index_hlist,
236 dev_index_hash(net, dev->ifindex));
237 write_unlock_bh(&dev_base_lock);
238
239 dev_base_seq_inc(net);
240 }
241
242 /* Device list removal
 243  * caller must respect an RCU grace period before freeing/reusing dev
244 */
245 static void unlist_netdevice(struct net_device *dev)
246 {
247 ASSERT_RTNL();
248
249 /* Unlink dev from the device chain */
250 write_lock_bh(&dev_base_lock);
251 list_del_rcu(&dev->dev_list);
252 hlist_del_rcu(&dev->name_hlist);
253 hlist_del_rcu(&dev->index_hlist);
254 write_unlock_bh(&dev_base_lock);
255
256 dev_base_seq_inc(dev_net(dev));
257 }
258
259 /*
260 * Our notifier list
261 */
262
263 static RAW_NOTIFIER_HEAD(netdev_chain);
264
265 /*
266 * Device drivers call our routines to queue packets here. We empty the
267 * queue in the local softnet handler.
268 */
269
270 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
271 EXPORT_PER_CPU_SYMBOL(softnet_data);
272
273 #ifdef CONFIG_LOCKDEP
274 /*
275 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
276 * according to dev->type
277 */
278 static const unsigned short netdev_lock_type[] =
279 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
280 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
281 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
282 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
283 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
284 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
285 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
286 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
287 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
288 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
289 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
290 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
291 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
292 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
293 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
294
295 static const char *const netdev_lock_name[] =
296 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
309 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
310 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
311
312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
314
315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
316 {
317 int i;
318
319 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
320 if (netdev_lock_type[i] == dev_type)
321 return i;
322 /* the last key is used by default */
323 return ARRAY_SIZE(netdev_lock_type) - 1;
324 }
325
326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
327 unsigned short dev_type)
328 {
329 int i;
330
331 i = netdev_lock_pos(dev_type);
332 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
333 netdev_lock_name[i]);
334 }
335
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 int i;
339
340 i = netdev_lock_pos(dev->type);
341 lockdep_set_class_and_name(&dev->addr_list_lock,
342 &netdev_addr_lock_key[i],
343 netdev_lock_name[i]);
344 }
345 #else
346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
347 unsigned short dev_type)
348 {
349 }
350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
351 {
352 }
353 #endif
354
355 /*******************************************************************************
356
357 Protocol management and registration routines
358
359 *******************************************************************************/
360
361 /*
362 * Add a protocol ID to the list. Now that the input handler is
363 * smarter we can dispense with all the messy stuff that used to be
364 * here.
365 *
 366  *      BEWARE!!! Protocol handlers that mangle input packets
 367  *      MUST BE last in the hash buckets, and checking of protocol handlers
 368  *      MUST start from the promiscuous ptype_all chain in net_bh.
 369  *      This is true now; do not change it.
 370  *      Explanation follows: if a protocol handler that mangles packets
 371  *      were first on the list, it could not tell that the packet
 372  *      is cloned and should be copied-on-write, so it would
 373  *      change it and subsequent readers would get a broken packet.
374 * --ANK (980803)
375 */
376
377 static inline struct list_head *ptype_head(const struct packet_type *pt)
378 {
379 if (pt->type == htons(ETH_P_ALL))
380 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
381 else
382 return pt->dev ? &pt->dev->ptype_specific :
383 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385
386 /**
387 * dev_add_pack - add packet handler
388 * @pt: packet type declaration
389 *
390 * Add a protocol handler to the networking stack. The passed &packet_type
391 * is linked into kernel lists and may not be freed until it has been
392 * removed from the kernel lists.
393 *
 394  *      This call does not sleep, therefore it cannot
 395  *      guarantee that all CPUs that are in the middle of receiving packets
396 * will see the new packet type (until the next received packet).
397 */
398
399 void dev_add_pack(struct packet_type *pt)
400 {
401 struct list_head *head = ptype_head(pt);
402
403 spin_lock(&ptype_lock);
404 list_add_rcu(&pt->list, head);
405 spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
408
409 /**
410 * __dev_remove_pack - remove packet handler
411 * @pt: packet type declaration
412 *
413 * Remove a protocol handler that was previously added to the kernel
414 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
415 * from the kernel lists and can be freed or reused once this function
416 * returns.
417 *
418 * The packet type might still be in use by receivers
 419  *      and must not be freed until after all the CPUs have gone
420 * through a quiescent state.
421 */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 struct list_head *head = ptype_head(pt);
425 struct packet_type *pt1;
426
427 spin_lock(&ptype_lock);
428
429 list_for_each_entry(pt1, head, list) {
430 if (pt == pt1) {
431 list_del_rcu(&pt->list);
432 goto out;
433 }
434 }
435
436 pr_warn("dev_remove_pack: %p not found\n", pt);
437 out:
438 spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441
442 /**
443 * dev_remove_pack - remove packet handler
444 * @pt: packet type declaration
445 *
446 * Remove a protocol handler that was previously added to the kernel
447 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
448 * from the kernel lists and can be freed or reused once this function
449 * returns.
450 *
451 * This call sleeps to guarantee that no CPU is looking at the packet
452 * type after return.
453 */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 __dev_remove_pack(pt);
457
458 synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
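/*
 * Illustrative sketch (not part of this file): a minimal tap built on
 * dev_add_pack()/dev_remove_pack(). The handler and variable names are
 * hypothetical; ETH_P_ALL taps every protocol, while a specific
 * cpu_to_be16(ETH_P_xxx) value lands in one ptype_base hash bucket.
 *
 *	static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *				   struct packet_type *pt,
 *				   struct net_device *orig_dev)
 *	{
 *		// the skb handed to a tap is a clone owned by the handler
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_tap __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = example_tap_rcv,
 *	};
 *
 *	dev_add_pack(&example_tap);
 *	...
 *	dev_remove_pack(&example_tap);	// sleeps via synchronize_net()
 */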
461
462
463 /**
464 * dev_add_offload - register offload handlers
465 * @po: protocol offload declaration
466 *
467 * Add protocol offload handlers to the networking stack. The passed
468 * &proto_offload is linked into kernel lists and may not be freed until
469 * it has been removed from the kernel lists.
470 *
 471  *      This call does not sleep, therefore it cannot
 472  *      guarantee that all CPUs that are in the middle of receiving packets
473 * will see the new offload handlers (until the next received packet).
474 */
475 void dev_add_offload(struct packet_offload *po)
476 {
477 struct packet_offload *elem;
478
479 spin_lock(&offload_lock);
480 list_for_each_entry(elem, &offload_base, list) {
481 if (po->priority < elem->priority)
482 break;
483 }
484 list_add_rcu(&po->list, elem->list.prev);
485 spin_unlock(&offload_lock);
486 }
487 EXPORT_SYMBOL(dev_add_offload);
488
489 /**
490 * __dev_remove_offload - remove offload handler
491 * @po: packet offload declaration
492 *
493 * Remove a protocol offload handler that was previously added to the
494 * kernel offload handlers by dev_add_offload(). The passed &offload_type
495 * is removed from the kernel lists and can be freed or reused once this
496 * function returns.
497 *
498 * The packet type might still be in use by receivers
 499  *      and must not be freed until after all the CPUs have gone
500 * through a quiescent state.
501 */
502 static void __dev_remove_offload(struct packet_offload *po)
503 {
504 struct list_head *head = &offload_base;
505 struct packet_offload *po1;
506
507 spin_lock(&offload_lock);
508
509 list_for_each_entry(po1, head, list) {
510 if (po == po1) {
511 list_del_rcu(&po->list);
512 goto out;
513 }
514 }
515
516 pr_warn("dev_remove_offload: %p not found\n", po);
517 out:
518 spin_unlock(&offload_lock);
519 }
520
521 /**
522 * dev_remove_offload - remove packet offload handler
523 * @po: packet offload declaration
524 *
525 * Remove a packet offload handler that was previously added to the kernel
526 * offload handlers by dev_add_offload(). The passed &offload_type is
527 * removed from the kernel lists and can be freed or reused once this
528 * function returns.
529 *
530 * This call sleeps to guarantee that no CPU is looking at the packet
531 * type after return.
532 */
533 void dev_remove_offload(struct packet_offload *po)
534 {
535 __dev_remove_offload(po);
536
537 synchronize_net();
538 }
539 EXPORT_SYMBOL(dev_remove_offload);
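/*
 * Illustrative sketch (not part of this file): registering protocol
 * offload (GRO/GSO) callbacks with dev_add_offload(). The callback
 * names are hypothetical placeholders; per the insertion loop above,
 * entries with a lower .priority value end up earlier in offload_base.
 *
 *	static struct packet_offload example_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.priority = 10,
 *		.callbacks = {
 *			.gso_segment  = example_gso_segment,
 *			.gro_receive  = example_gro_receive,
 *			.gro_complete = example_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&example_offload);
 *	...
 *	dev_remove_offload(&example_offload);	// sleeps via synchronize_net()
 */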
540
541 /******************************************************************************
542
543 Device Boot-time Settings Routines
544
545 *******************************************************************************/
546
547 /* Boot time configuration table */
548 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
549
550 /**
551 * netdev_boot_setup_add - add new setup entry
552 * @name: name of the device
553 * @map: configured settings for the device
554 *
 555  *      Adds a new setup entry to the dev_boot_setup list.  The function
 556  *      returns 0 on error and 1 on success.  This is a generic routine for
557 * all netdevices.
558 */
559 static int netdev_boot_setup_add(char *name, struct ifmap *map)
560 {
561 struct netdev_boot_setup *s;
562 int i;
563
564 s = dev_boot_setup;
565 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
566 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
567 memset(s[i].name, 0, sizeof(s[i].name));
568 strlcpy(s[i].name, name, IFNAMSIZ);
569 memcpy(&s[i].map, map, sizeof(s[i].map));
570 break;
571 }
572 }
573
574 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
575 }
576
577 /**
578 * netdev_boot_setup_check - check boot time settings
579 * @dev: the netdevice
580 *
581 * Check boot time settings for the device.
 582  *      Any settings found are applied to the device for use
 583  *      later during device probing.
 584  *      Returns 0 if no settings are found, 1 if they are.
585 */
586 int netdev_boot_setup_check(struct net_device *dev)
587 {
588 struct netdev_boot_setup *s = dev_boot_setup;
589 int i;
590
591 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
592 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
593 !strcmp(dev->name, s[i].name)) {
594 dev->irq = s[i].map.irq;
595 dev->base_addr = s[i].map.base_addr;
596 dev->mem_start = s[i].map.mem_start;
597 dev->mem_end = s[i].map.mem_end;
598 return 1;
599 }
600 }
601 return 0;
602 }
603 EXPORT_SYMBOL(netdev_boot_setup_check);
604
605
606 /**
607 * netdev_boot_base - get address from boot time settings
608 * @prefix: prefix for network device
609 * @unit: id for network device
610 *
 611  *      Check boot time settings for the base address of the device.
 612  *      Returns the configured base address if one is found, 1 if the
 613  *      device is already registered (so it should not be probed), or
 614  *      0 if no settings are found.
615 */
616 unsigned long netdev_boot_base(const char *prefix, int unit)
617 {
618 const struct netdev_boot_setup *s = dev_boot_setup;
619 char name[IFNAMSIZ];
620 int i;
621
622 sprintf(name, "%s%d", prefix, unit);
623
624 /*
625 * If device already registered then return base of 1
626 * to indicate not to probe for this interface
627 */
628 if (__dev_get_by_name(&init_net, name))
629 return 1;
630
631 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
632 if (!strcmp(name, s[i].name))
633 return s[i].map.base_addr;
634 return 0;
635 }
636
637 /*
638 * Saves at boot time configured settings for any netdevice.
639 */
640 int __init netdev_boot_setup(char *str)
641 {
642 int ints[5];
643 struct ifmap map;
644
645 str = get_options(str, ARRAY_SIZE(ints), ints);
646 if (!str || !*str)
647 return 0;
648
649 /* Save settings */
650 memset(&map, 0, sizeof(map));
651 if (ints[0] > 0)
652 map.irq = ints[1];
653 if (ints[0] > 1)
654 map.base_addr = ints[2];
655 if (ints[0] > 2)
656 map.mem_start = ints[3];
657 if (ints[0] > 3)
658 map.mem_end = ints[4];
659
660 /* Add new entry to the list */
661 return netdev_boot_setup_add(str, &map);
662 }
663
664 __setup("netdev=", netdev_boot_setup);
665
666 /*******************************************************************************
667
668 Device Interface Subroutines
669
670 *******************************************************************************/
671
672 /**
 673  *      dev_get_iflink - get 'iflink' value of an interface
674 * @dev: targeted interface
675 *
676 * Indicates the ifindex the interface is linked to.
677 * Physical interfaces have the same 'ifindex' and 'iflink' values.
678 */
679
680 int dev_get_iflink(const struct net_device *dev)
681 {
682 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
683 return dev->netdev_ops->ndo_get_iflink(dev);
684
685 return dev->ifindex;
686 }
687 EXPORT_SYMBOL(dev_get_iflink);
688
689 /**
690 * dev_fill_metadata_dst - Retrieve tunnel egress information.
691 * @dev: targeted interface
692 * @skb: The packet.
693 *
 694  *      For better visibility of tunnel traffic, OVS needs to retrieve
 695  *      egress tunnel information for a packet. The following API allows
 696  *      the user to get this info.
697 */
698 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
699 {
700 struct ip_tunnel_info *info;
701
702 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
703 return -EINVAL;
704
705 info = skb_tunnel_info_unclone(skb);
706 if (!info)
707 return -ENOMEM;
708 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
709 return -EINVAL;
710
711 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
712 }
713 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
714
715 /**
716 * __dev_get_by_name - find a device by its name
717 * @net: the applicable net namespace
718 * @name: name to find
719 *
 720  *      Find an interface by name. Must be called under the RTNL semaphore
721 * or @dev_base_lock. If the name is found a pointer to the device
722 * is returned. If the name is not found then %NULL is returned. The
723 * reference counters are not incremented so the caller must be
724 * careful with locks.
725 */
726
727 struct net_device *__dev_get_by_name(struct net *net, const char *name)
728 {
729 struct net_device *dev;
730 struct hlist_head *head = dev_name_hash(net, name);
731
732 hlist_for_each_entry(dev, head, name_hlist)
733 if (!strncmp(dev->name, name, IFNAMSIZ))
734 return dev;
735
736 return NULL;
737 }
738 EXPORT_SYMBOL(__dev_get_by_name);
739
740 /**
741 * dev_get_by_name_rcu - find a device by its name
742 * @net: the applicable net namespace
743 * @name: name to find
744 *
745 * Find an interface by name.
746 * If the name is found a pointer to the device is returned.
747 * If the name is not found then %NULL is returned.
748 * The reference counters are not incremented so the caller must be
749 * careful with locks. The caller must hold RCU lock.
750 */
751
752 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
753 {
754 struct net_device *dev;
755 struct hlist_head *head = dev_name_hash(net, name);
756
757 hlist_for_each_entry_rcu(dev, head, name_hlist)
758 if (!strncmp(dev->name, name, IFNAMSIZ))
759 return dev;
760
761 return NULL;
762 }
763 EXPORT_SYMBOL(dev_get_by_name_rcu);
764
765 /**
766 * dev_get_by_name - find a device by its name
767 * @net: the applicable net namespace
768 * @name: name to find
769 *
770 * Find an interface by name. This can be called from any
771 * context and does its own locking. The returned handle has
772 * the usage count incremented and the caller must use dev_put() to
773 * release it when it is no longer needed. %NULL is returned if no
774 * matching device is found.
775 */
776
777 struct net_device *dev_get_by_name(struct net *net, const char *name)
778 {
779 struct net_device *dev;
780
781 rcu_read_lock();
782 dev = dev_get_by_name_rcu(net, name);
783 if (dev)
784 dev_hold(dev);
785 rcu_read_unlock();
786 return dev;
787 }
788 EXPORT_SYMBOL(dev_get_by_name);
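/*
 * Illustrative sketch (not part of this file) of the two lookup styles
 * above; "eth0" and the critical-section bodies are placeholders.
 *
 *	// RCU variant: no refcount taken, the pointer is only valid
 *	// inside the read-side critical section
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		netdev_info(dev, "found, ifindex %d\n", dev->ifindex);
 *	rcu_read_unlock();
 *
 *	// refcounted variant: usable after return, must be released
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 */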
789
790 /**
791 * __dev_get_by_index - find a device by its ifindex
792 * @net: the applicable net namespace
793 * @ifindex: index of device
794 *
 795  *      Search for an interface by index. Returns a pointer to the device,
 796  *      or %NULL if it is not found. The device has not
797 * had its reference counter increased so the caller must be careful
798 * about locking. The caller must hold either the RTNL semaphore
799 * or @dev_base_lock.
800 */
801
802 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
803 {
804 struct net_device *dev;
805 struct hlist_head *head = dev_index_hash(net, ifindex);
806
807 hlist_for_each_entry(dev, head, index_hlist)
808 if (dev->ifindex == ifindex)
809 return dev;
810
811 return NULL;
812 }
813 EXPORT_SYMBOL(__dev_get_by_index);
814
815 /**
816 * dev_get_by_index_rcu - find a device by its ifindex
817 * @net: the applicable net namespace
818 * @ifindex: index of device
819 *
 820  *      Search for an interface by index. Returns a pointer to the device,
 821  *      or %NULL if it is not found. The device has not
822 * had its reference counter increased so the caller must be careful
823 * about locking. The caller must hold RCU lock.
824 */
825
826 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
827 {
828 struct net_device *dev;
829 struct hlist_head *head = dev_index_hash(net, ifindex);
830
831 hlist_for_each_entry_rcu(dev, head, index_hlist)
832 if (dev->ifindex == ifindex)
833 return dev;
834
835 return NULL;
836 }
837 EXPORT_SYMBOL(dev_get_by_index_rcu);
838
839
840 /**
841 * dev_get_by_index - find a device by its ifindex
842 * @net: the applicable net namespace
843 * @ifindex: index of device
844 *
 845  *      Search for an interface by index. Returns a pointer to the device,
 846  *      or NULL if it is not found. The device returned has
847 * had a reference added and the pointer is safe until the user calls
848 * dev_put to indicate they have finished with it.
849 */
850
851 struct net_device *dev_get_by_index(struct net *net, int ifindex)
852 {
853 struct net_device *dev;
854
855 rcu_read_lock();
856 dev = dev_get_by_index_rcu(net, ifindex);
857 if (dev)
858 dev_hold(dev);
859 rcu_read_unlock();
860 return dev;
861 }
862 EXPORT_SYMBOL(dev_get_by_index);
863
864 /**
865 * netdev_get_name - get a netdevice name, knowing its ifindex.
866 * @net: network namespace
867 * @name: a pointer to the buffer where the name will be stored.
868 * @ifindex: the ifindex of the interface to get the name from.
869 *
870 * The use of raw_seqcount_begin() and cond_resched() before
871 * retrying is required as we want to give the writers a chance
872 * to complete when CONFIG_PREEMPT is not set.
873 */
874 int netdev_get_name(struct net *net, char *name, int ifindex)
875 {
876 struct net_device *dev;
877 unsigned int seq;
878
879 retry:
880 seq = raw_seqcount_begin(&devnet_rename_seq);
881 rcu_read_lock();
882 dev = dev_get_by_index_rcu(net, ifindex);
883 if (!dev) {
884 rcu_read_unlock();
885 return -ENODEV;
886 }
887
888 strcpy(name, dev->name);
889 rcu_read_unlock();
890 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
891 cond_resched();
892 goto retry;
893 }
894
895 return 0;
896 }
897
898 /**
899 * dev_getbyhwaddr_rcu - find a device by its hardware address
900 * @net: the applicable net namespace
901 * @type: media type of device
902 * @ha: hardware address
903 *
 904  *      Search for an interface by MAC address. Returns a pointer to the
 905  *      device, or NULL if it is not found.
906 * The caller must hold RCU or RTNL.
907 * The returned device has not had its ref count increased
908 * and the caller must therefore be careful about locking
909 *
910 */
911
912 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
913 const char *ha)
914 {
915 struct net_device *dev;
916
917 for_each_netdev_rcu(net, dev)
918 if (dev->type == type &&
919 !memcmp(dev->dev_addr, ha, dev->addr_len))
920 return dev;
921
922 return NULL;
923 }
924 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
925
926 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
927 {
928 struct net_device *dev;
929
930 ASSERT_RTNL();
931 for_each_netdev(net, dev)
932 if (dev->type == type)
933 return dev;
934
935 return NULL;
936 }
937 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
938
939 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
940 {
941 struct net_device *dev, *ret = NULL;
942
943 rcu_read_lock();
944 for_each_netdev_rcu(net, dev)
945 if (dev->type == type) {
946 dev_hold(dev);
947 ret = dev;
948 break;
949 }
950 rcu_read_unlock();
951 return ret;
952 }
953 EXPORT_SYMBOL(dev_getfirstbyhwtype);
954
955 /**
956 * __dev_get_by_flags - find any device with given flags
957 * @net: the applicable net namespace
958 * @if_flags: IFF_* values
959 * @mask: bitmask of bits in if_flags to check
960 *
 961  *      Search for any interface with the given flags. Returns a pointer to
 962  *      the first matching device, or NULL if none is found. Must be called inside
963 * rtnl_lock(), and result refcount is unchanged.
964 */
965
966 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
967 unsigned short mask)
968 {
969 struct net_device *dev, *ret;
970
971 ASSERT_RTNL();
972
973 ret = NULL;
974 for_each_netdev(net, dev) {
975 if (((dev->flags ^ if_flags) & mask) == 0) {
976 ret = dev;
977 break;
978 }
979 }
980 return ret;
981 }
982 EXPORT_SYMBOL(__dev_get_by_flags);
983
984 /**
985 * dev_valid_name - check if name is okay for network device
986 * @name: name string
987 *
988 * Network device names need to be valid file names to
 989  *      allow sysfs to work.  We also disallow any kind of
990 * whitespace.
991 */
992 bool dev_valid_name(const char *name)
993 {
994 if (*name == '\0')
995 return false;
996 if (strlen(name) >= IFNAMSIZ)
997 return false;
998 if (!strcmp(name, ".") || !strcmp(name, ".."))
999 return false;
1000
1001 while (*name) {
1002 if (*name == '/' || *name == ':' || isspace(*name))
1003 return false;
1004 name++;
1005 }
1006 return true;
1007 }
1008 EXPORT_SYMBOL(dev_valid_name);
1009
1010 /**
1011 * __dev_alloc_name - allocate a name for a device
1012 * @net: network namespace to allocate the device name in
1013 * @name: name format string
1014 * @buf: scratch buffer and result name string
1015 *
1016 * Passed a format string - eg "lt%d" it will try and find a suitable
1017 * id. It scans list of devices to build up a free map, then chooses
1018 * the first empty slot. The caller must hold the dev_base or rtnl lock
1019 * while allocating the name and adding the device in order to avoid
1020 * duplicates.
1021 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022 * Returns the number of the unit assigned or a negative errno code.
1023 */
1024
1025 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1026 {
1027 int i = 0;
1028 const char *p;
1029 const int max_netdevices = 8*PAGE_SIZE;
1030 unsigned long *inuse;
1031 struct net_device *d;
1032
1033 p = strnchr(name, IFNAMSIZ-1, '%');
1034 if (p) {
1035 /*
1036 * Verify the string as this thing may have come from
1037  *      the user. There must be exactly one "%d" and no other "%"
1038 * characters.
1039 */
1040 if (p[1] != 'd' || strchr(p + 2, '%'))
1041 return -EINVAL;
1042
1043 /* Use one page as a bit array of possible slots */
1044 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1045 if (!inuse)
1046 return -ENOMEM;
1047
1048 for_each_netdev(net, d) {
1049 if (!sscanf(d->name, name, &i))
1050 continue;
1051 if (i < 0 || i >= max_netdevices)
1052 continue;
1053
1054 /* avoid cases where sscanf is not exact inverse of printf */
1055 snprintf(buf, IFNAMSIZ, name, i);
1056 if (!strncmp(buf, d->name, IFNAMSIZ))
1057 set_bit(i, inuse);
1058 }
1059
1060 i = find_first_zero_bit(inuse, max_netdevices);
1061 free_page((unsigned long) inuse);
1062 }
1063
1064 if (buf != name)
1065 snprintf(buf, IFNAMSIZ, name, i);
1066 if (!__dev_get_by_name(net, buf))
1067 return i;
1068
1069 /* It is possible to run out of possible slots
1070 * when the name is long and there isn't enough space left
1071 * for the digits, or if all bits are used.
1072 */
1073 return -ENFILE;
1074 }
1075
1076 /**
1077 * dev_alloc_name - allocate a name for a device
1078 * @dev: device
1079 * @name: name format string
1080 *
1081 * Passed a format string - eg "lt%d" it will try and find a suitable
1082 * id. It scans list of devices to build up a free map, then chooses
1083 * the first empty slot. The caller must hold the dev_base or rtnl lock
1084 * while allocating the name and adding the device in order to avoid
1085 * duplicates.
1086 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1087 * Returns the number of the unit assigned or a negative errno code.
1088 */
1089
1090 int dev_alloc_name(struct net_device *dev, const char *name)
1091 {
1092 char buf[IFNAMSIZ];
1093 struct net *net;
1094 int ret;
1095
1096 BUG_ON(!dev_net(dev));
1097 net = dev_net(dev);
1098 ret = __dev_alloc_name(net, name, buf);
1099 if (ret >= 0)
1100 strlcpy(dev->name, buf, IFNAMSIZ);
1101 return ret;
1102 }
1103 EXPORT_SYMBOL(dev_alloc_name);
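/*
 * Example (illustrative): asking for the next free unit of a
 * hypothetical "dummy%d" name.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		return err;
 *	// dev->name is now e.g. "dummy0" and err holds the unit number
 */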
1104
1105 static int dev_alloc_name_ns(struct net *net,
1106 struct net_device *dev,
1107 const char *name)
1108 {
1109 char buf[IFNAMSIZ];
1110 int ret;
1111
1112 ret = __dev_alloc_name(net, name, buf);
1113 if (ret >= 0)
1114 strlcpy(dev->name, buf, IFNAMSIZ);
1115 return ret;
1116 }
1117
1118 static int dev_get_valid_name(struct net *net,
1119 struct net_device *dev,
1120 const char *name)
1121 {
1122 BUG_ON(!net);
1123
1124 if (!dev_valid_name(name))
1125 return -EINVAL;
1126
1127 if (strchr(name, '%'))
1128 return dev_alloc_name_ns(net, dev, name);
1129 else if (__dev_get_by_name(net, name))
1130 return -EEXIST;
1131 else if (dev->name != name)
1132 strlcpy(dev->name, name, IFNAMSIZ);
1133
1134 return 0;
1135 }
1136
1137 /**
1138 * dev_change_name - change name of a device
1139 * @dev: device
1140 * @newname: name (or format string) must be at least IFNAMSIZ
1141 *
1142  *      Change name of a device, can pass format strings "eth%d"
1143  *      for wildcarding.
1144 */
1145 int dev_change_name(struct net_device *dev, const char *newname)
1146 {
1147 unsigned char old_assign_type;
1148 char oldname[IFNAMSIZ];
1149 int err = 0;
1150 int ret;
1151 struct net *net;
1152
1153 ASSERT_RTNL();
1154 BUG_ON(!dev_net(dev));
1155
1156 net = dev_net(dev);
1157 if (dev->flags & IFF_UP)
1158 return -EBUSY;
1159
1160 write_seqcount_begin(&devnet_rename_seq);
1161
1162 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1163 write_seqcount_end(&devnet_rename_seq);
1164 return 0;
1165 }
1166
1167 memcpy(oldname, dev->name, IFNAMSIZ);
1168
1169 err = dev_get_valid_name(net, dev, newname);
1170 if (err < 0) {
1171 write_seqcount_end(&devnet_rename_seq);
1172 return err;
1173 }
1174
1175 if (oldname[0] && !strchr(oldname, '%'))
1176 netdev_info(dev, "renamed from %s\n", oldname);
1177
1178 old_assign_type = dev->name_assign_type;
1179 dev->name_assign_type = NET_NAME_RENAMED;
1180
1181 rollback:
1182 ret = device_rename(&dev->dev, dev->name);
1183 if (ret) {
1184 memcpy(dev->name, oldname, IFNAMSIZ);
1185 dev->name_assign_type = old_assign_type;
1186 write_seqcount_end(&devnet_rename_seq);
1187 return ret;
1188 }
1189
1190 write_seqcount_end(&devnet_rename_seq);
1191
1192 netdev_adjacent_rename_links(dev, oldname);
1193
1194 write_lock_bh(&dev_base_lock);
1195 hlist_del_rcu(&dev->name_hlist);
1196 write_unlock_bh(&dev_base_lock);
1197
1198 synchronize_rcu();
1199
1200 write_lock_bh(&dev_base_lock);
1201 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1202 write_unlock_bh(&dev_base_lock);
1203
1204 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1205 ret = notifier_to_errno(ret);
1206
1207 if (ret) {
1208 /* err >= 0 after dev_alloc_name() or stores the first errno */
1209 if (err >= 0) {
1210 err = ret;
1211 write_seqcount_begin(&devnet_rename_seq);
1212 memcpy(dev->name, oldname, IFNAMSIZ);
1213 memcpy(oldname, newname, IFNAMSIZ);
1214 dev->name_assign_type = old_assign_type;
1215 old_assign_type = NET_NAME_RENAMED;
1216 goto rollback;
1217 } else {
1218 pr_err("%s: name change rollback failed: %d\n",
1219 dev->name, ret);
1220 }
1221 }
1222
1223 return err;
1224 }
1225
1226 /**
1227 * dev_set_alias - change ifalias of a device
1228 * @dev: device
1229 * @alias: name up to IFALIASZ
1230  *      @len: limit of bytes to copy from @alias
1231  *
1232  *      Set the ifalias for a device.
1233 */
1234 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1235 {
1236 char *new_ifalias;
1237
1238 ASSERT_RTNL();
1239
1240 if (len >= IFALIASZ)
1241 return -EINVAL;
1242
1243 if (!len) {
1244 kfree(dev->ifalias);
1245 dev->ifalias = NULL;
1246 return 0;
1247 }
1248
1249 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1250 if (!new_ifalias)
1251 return -ENOMEM;
1252 dev->ifalias = new_ifalias;
1253
1254 strlcpy(dev->ifalias, alias, len+1);
1255 return len;
1256 }
1257
1258
1259 /**
1260 * netdev_features_change - device changes features
1261 * @dev: device to cause notification
1262 *
1263 * Called to indicate a device has changed features.
1264 */
1265 void netdev_features_change(struct net_device *dev)
1266 {
1267 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1268 }
1269 EXPORT_SYMBOL(netdev_features_change);
1270
1271 /**
1272 * netdev_state_change - device changes state
1273 * @dev: device to cause notification
1274 *
1275 * Called to indicate a device has changed state. This function calls
1276 * the notifier chains for netdev_chain and sends a NEWLINK message
1277 * to the routing socket.
1278 */
1279 void netdev_state_change(struct net_device *dev)
1280 {
1281 if (dev->flags & IFF_UP) {
1282 struct netdev_notifier_change_info change_info;
1283
1284 change_info.flags_changed = 0;
1285 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1286 &change_info.info);
1287 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1288 }
1289 }
1290 EXPORT_SYMBOL(netdev_state_change);
1291
1292 /**
1293 * netdev_notify_peers - notify network peers about existence of @dev
1294 * @dev: network device
1295 *
1296 * Generate traffic such that interested network peers are aware of
1297 * @dev, such as by generating a gratuitous ARP. This may be used when
1298 * a device wants to inform the rest of the network about some sort of
1299 * reconfiguration such as a failover event or virtual machine
1300 * migration.
1301 */
1302 void netdev_notify_peers(struct net_device *dev)
1303 {
1304 rtnl_lock();
1305 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1306 rtnl_unlock();
1307 }
1308 EXPORT_SYMBOL(netdev_notify_peers);
1309
1310 static int __dev_open(struct net_device *dev)
1311 {
1312 const struct net_device_ops *ops = dev->netdev_ops;
1313 int ret;
1314
1315 ASSERT_RTNL();
1316
1317 if (!netif_device_present(dev))
1318 return -ENODEV;
1319
1320 /* Block netpoll from trying to do any rx path servicing.
1321 * If we don't do this there is a chance ndo_poll_controller
1322 * or ndo_poll may be running while we open the device
1323 */
1324 netpoll_poll_disable(dev);
1325
1326 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1327 ret = notifier_to_errno(ret);
1328 if (ret)
1329 return ret;
1330
1331 set_bit(__LINK_STATE_START, &dev->state);
1332
1333 if (ops->ndo_validate_addr)
1334 ret = ops->ndo_validate_addr(dev);
1335
1336 if (!ret && ops->ndo_open)
1337 ret = ops->ndo_open(dev);
1338
1339 netpoll_poll_enable(dev);
1340
1341 if (ret)
1342 clear_bit(__LINK_STATE_START, &dev->state);
1343 else {
1344 dev->flags |= IFF_UP;
1345 dev_set_rx_mode(dev);
1346 dev_activate(dev);
1347 add_device_randomness(dev->dev_addr, dev->addr_len);
1348 }
1349
1350 return ret;
1351 }
1352
1353 /**
1354 * dev_open - prepare an interface for use.
1355 * @dev: device to open
1356 *
1357 * Takes a device from down to up state. The device's private open
1358 * function is invoked and then the multicast lists are loaded. Finally
1359 * the device is moved into the up state and a %NETDEV_UP message is
1360 * sent to the netdev notifier chain.
1361 *
1362 * Calling this function on an active interface is a nop. On a failure
1363 * a negative errno code is returned.
1364 */
1365 int dev_open(struct net_device *dev)
1366 {
1367 int ret;
1368
1369 if (dev->flags & IFF_UP)
1370 return 0;
1371
1372 ret = __dev_open(dev);
1373 if (ret < 0)
1374 return ret;
1375
1376 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1377 call_netdevice_notifiers(NETDEV_UP, dev);
1378
1379 return ret;
1380 }
1381 EXPORT_SYMBOL(dev_open);
1382
1383 static int __dev_close_many(struct list_head *head)
1384 {
1385 struct net_device *dev;
1386
1387 ASSERT_RTNL();
1388 might_sleep();
1389
1390 list_for_each_entry(dev, head, close_list) {
1391 /* Temporarily disable netpoll until the interface is down */
1392 netpoll_poll_disable(dev);
1393
1394 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1395
1396 clear_bit(__LINK_STATE_START, &dev->state);
1397
1398                 /* Synchronize to scheduled poll. We cannot touch poll list; it
1399                  * can even be on a different CPU. So just clear netif_running().
1400                  *
1401                  * dev->stop() will invoke napi_disable() on all of its
1402 * napi_struct instances on this device.
1403 */
1404 smp_mb__after_atomic(); /* Commit netif_running(). */
1405 }
1406
1407 dev_deactivate_many(head);
1408
1409 list_for_each_entry(dev, head, close_list) {
1410 const struct net_device_ops *ops = dev->netdev_ops;
1411
1412 /*
1413 * Call the device specific close. This cannot fail.
1414 * Only if device is UP
1415 *
1416 * We allow it to be called even after a DETACH hot-plug
1417 * event.
1418 */
1419 if (ops->ndo_stop)
1420 ops->ndo_stop(dev);
1421
1422 dev->flags &= ~IFF_UP;
1423 netpoll_poll_enable(dev);
1424 }
1425
1426 return 0;
1427 }
1428
1429 static int __dev_close(struct net_device *dev)
1430 {
1431 int retval;
1432 LIST_HEAD(single);
1433
1434 list_add(&dev->close_list, &single);
1435 retval = __dev_close_many(&single);
1436 list_del(&single);
1437
1438 return retval;
1439 }
1440
1441 int dev_close_many(struct list_head *head, bool unlink)
1442 {
1443 struct net_device *dev, *tmp;
1444
1445 /* Remove the devices that don't need to be closed */
1446 list_for_each_entry_safe(dev, tmp, head, close_list)
1447 if (!(dev->flags & IFF_UP))
1448 list_del_init(&dev->close_list);
1449
1450 __dev_close_many(head);
1451
1452 list_for_each_entry_safe(dev, tmp, head, close_list) {
1453 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1454 call_netdevice_notifiers(NETDEV_DOWN, dev);
1455 if (unlink)
1456 list_del_init(&dev->close_list);
1457 }
1458
1459 return 0;
1460 }
1461 EXPORT_SYMBOL(dev_close_many);
1462
1463 /**
1464 * dev_close - shutdown an interface.
1465 * @dev: device to shutdown
1466 *
1467 * This function moves an active device into down state. A
1468 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1469 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1470 * chain.
1471 */
1472 int dev_close(struct net_device *dev)
1473 {
1474 if (dev->flags & IFF_UP) {
1475 LIST_HEAD(single);
1476
1477 list_add(&dev->close_list, &single);
1478 dev_close_many(&single, true);
1479 list_del(&single);
1480 }
1481 return 0;
1482 }
1483 EXPORT_SYMBOL(dev_close);
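/*
 * Illustrative sketch (not part of this file): bringing an interface up
 * and down from kernel code. Both calls assume the RTNL lock is held,
 * as ASSERT_RTNL() in __dev_open()/__dev_close_many() above requires.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);		// no-op if already IFF_UP
 *	if (!err) {
 *		...
 *		dev_close(dev);		// always returns 0
 *	}
 *	rtnl_unlock();
 */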
1484
1485
1486 /**
1487 * dev_disable_lro - disable Large Receive Offload on a device
1488 * @dev: device
1489 *
1490 * Disable Large Receive Offload (LRO) on a net device. Must be
1491 * called under RTNL. This is needed if received packets may be
1492 * forwarded to another interface.
1493 */
1494 void dev_disable_lro(struct net_device *dev)
1495 {
1496 struct net_device *lower_dev;
1497 struct list_head *iter;
1498
1499 dev->wanted_features &= ~NETIF_F_LRO;
1500 netdev_update_features(dev);
1501
1502 if (unlikely(dev->features & NETIF_F_LRO))
1503 netdev_WARN(dev, "failed to disable LRO!\n");
1504
1505 netdev_for_each_lower_dev(dev, lower_dev, iter)
1506 dev_disable_lro(lower_dev);
1507 }
1508 EXPORT_SYMBOL(dev_disable_lro);
1509
1510 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1511 struct net_device *dev)
1512 {
1513 struct netdev_notifier_info info;
1514
1515 netdev_notifier_info_init(&info, dev);
1516 return nb->notifier_call(nb, val, &info);
1517 }
1518
1519 static int dev_boot_phase = 1;
1520
1521 /**
1522 * register_netdevice_notifier - register a network notifier block
1523 * @nb: notifier
1524 *
1525 * Register a notifier to be called when network device events occur.
1526 * The notifier passed is linked into the kernel structures and must
1527 * not be reused until it has been unregistered. A negative errno code
1528 * is returned on a failure.
1529 *
1530  *      When registered, all registration and up events are replayed
1531  *      to the new notifier to allow it to have a race-free
1532  *      view of the network device list.
1533 */
1534
1535 int register_netdevice_notifier(struct notifier_block *nb)
1536 {
1537 struct net_device *dev;
1538 struct net_device *last;
1539 struct net *net;
1540 int err;
1541
1542 rtnl_lock();
1543 err = raw_notifier_chain_register(&netdev_chain, nb);
1544 if (err)
1545 goto unlock;
1546 if (dev_boot_phase)
1547 goto unlock;
1548 for_each_net(net) {
1549 for_each_netdev(net, dev) {
1550 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1551 err = notifier_to_errno(err);
1552 if (err)
1553 goto rollback;
1554
1555 if (!(dev->flags & IFF_UP))
1556 continue;
1557
1558 call_netdevice_notifier(nb, NETDEV_UP, dev);
1559 }
1560 }
1561
1562 unlock:
1563 rtnl_unlock();
1564 return err;
1565
1566 rollback:
1567 last = dev;
1568 for_each_net(net) {
1569 for_each_netdev(net, dev) {
1570 if (dev == last)
1571 goto outroll;
1572
1573 if (dev->flags & IFF_UP) {
1574 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1575 dev);
1576 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1577 }
1578 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1579 }
1580 }
1581
1582 outroll:
1583 raw_notifier_chain_unregister(&netdev_chain, nb);
1584 goto unlock;
1585 }
1586 EXPORT_SYMBOL(register_netdevice_notifier);
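/*
 * Illustrative sketch (not part of this file): a minimal netdevice
 * notifier. The callback and block names are hypothetical; as noted
 * above, NETDEV_REGISTER/NETDEV_UP events are replayed for already
 * existing devices at registration time.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			netdev_info(dev, "is up\n");
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&example_netdev_nb);
 */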
1587
1588 /**
1589 * unregister_netdevice_notifier - unregister a network notifier block
1590 * @nb: notifier
1591 *
1592 * Unregister a notifier previously registered by
1593  *      register_netdevice_notifier(). The notifier is unlinked from the
1594 * kernel structures and may then be reused. A negative errno code
1595 * is returned on a failure.
1596 *
1597  *      After unregistering, unregister and down device events are synthesized
1598 * for all devices on the device list to the removed notifier to remove
1599 * the need for special case cleanup code.
1600 */
1601
1602 int unregister_netdevice_notifier(struct notifier_block *nb)
1603 {
1604 struct net_device *dev;
1605 struct net *net;
1606 int err;
1607
1608 rtnl_lock();
1609 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1610 if (err)
1611 goto unlock;
1612
1613 for_each_net(net) {
1614 for_each_netdev(net, dev) {
1615 if (dev->flags & IFF_UP) {
1616 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1617 dev);
1618 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1619 }
1620 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1621 }
1622 }
1623 unlock:
1624 rtnl_unlock();
1625 return err;
1626 }
1627 EXPORT_SYMBOL(unregister_netdevice_notifier);
1628
1629 /**
1630 * call_netdevice_notifiers_info - call all network notifier blocks
1631 * @val: value passed unmodified to notifier function
1632 * @dev: net_device pointer passed unmodified to notifier function
1633 * @info: notifier information data
1634 *
1635 * Call all network notifier blocks. Parameters and return value
1636 * are as for raw_notifier_call_chain().
1637 */
1638
1639 static int call_netdevice_notifiers_info(unsigned long val,
1640 struct net_device *dev,
1641 struct netdev_notifier_info *info)
1642 {
1643 ASSERT_RTNL();
1644 netdev_notifier_info_init(info, dev);
1645 return raw_notifier_call_chain(&netdev_chain, val, info);
1646 }
1647
1648 /**
1649 * call_netdevice_notifiers - call all network notifier blocks
1650 * @val: value passed unmodified to notifier function
1651 * @dev: net_device pointer passed unmodified to notifier function
1652 *
1653 * Call all network notifier blocks. Parameters and return value
1654 * are as for raw_notifier_call_chain().
1655 */
1656
1657 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1658 {
1659 struct netdev_notifier_info info;
1660
1661 return call_netdevice_notifiers_info(val, dev, &info);
1662 }
1663 EXPORT_SYMBOL(call_netdevice_notifiers);
1664
1665 #ifdef CONFIG_NET_INGRESS
1666 static struct static_key ingress_needed __read_mostly;
1667
1668 void net_inc_ingress_queue(void)
1669 {
1670 static_key_slow_inc(&ingress_needed);
1671 }
1672 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1673
1674 void net_dec_ingress_queue(void)
1675 {
1676 static_key_slow_dec(&ingress_needed);
1677 }
1678 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1679 #endif
1680
1681 #ifdef CONFIG_NET_EGRESS
1682 static struct static_key egress_needed __read_mostly;
1683
1684 void net_inc_egress_queue(void)
1685 {
1686 static_key_slow_inc(&egress_needed);
1687 }
1688 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1689
1690 void net_dec_egress_queue(void)
1691 {
1692 static_key_slow_dec(&egress_needed);
1693 }
1694 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1695 #endif
1696
1697 static struct static_key netstamp_needed __read_mostly;
1698 #ifdef HAVE_JUMP_LABEL
1699 /* We are not allowed to call static_key_slow_dec() from irq context
1700 * If net_disable_timestamp() is called from irq context, defer the
1701 * static_key_slow_dec() calls.
1702 */
1703 static atomic_t netstamp_needed_deferred;
1704 #endif
1705
1706 void net_enable_timestamp(void)
1707 {
1708 #ifdef HAVE_JUMP_LABEL
1709 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1710
1711 if (deferred) {
1712 while (--deferred)
1713 static_key_slow_dec(&netstamp_needed);
1714 return;
1715 }
1716 #endif
1717 static_key_slow_inc(&netstamp_needed);
1718 }
1719 EXPORT_SYMBOL(net_enable_timestamp);
1720
1721 void net_disable_timestamp(void)
1722 {
1723 #ifdef HAVE_JUMP_LABEL
1724 if (in_interrupt()) {
1725 atomic_inc(&netstamp_needed_deferred);
1726 return;
1727 }
1728 #endif
1729 static_key_slow_dec(&netstamp_needed);
1730 }
1731 EXPORT_SYMBOL(net_disable_timestamp);
1732
1733 static inline void net_timestamp_set(struct sk_buff *skb)
1734 {
1735 skb->tstamp.tv64 = 0;
1736 if (static_key_false(&netstamp_needed))
1737 __net_timestamp(skb);
1738 }
1739
1740 #define net_timestamp_check(COND, SKB) \
1741 if (static_key_false(&netstamp_needed)) { \
1742 if ((COND) && !(SKB)->tstamp.tv64) \
1743 __net_timestamp(SKB); \
1744 } \
1745
1746 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1747 {
1748 unsigned int len;
1749
1750 if (!(dev->flags & IFF_UP))
1751 return false;
1752
1753 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1754 if (skb->len <= len)
1755 return true;
1756
1757 /* if TSO is enabled, we don't care about the length as the packet
1758 * could be forwarded without being segmented before
1759 */
1760 if (skb_is_gso(skb))
1761 return true;
1762
1763 return false;
1764 }
1765 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1766
1767 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1768 {
1769 int ret = ____dev_forward_skb(dev, skb);
1770
1771 if (likely(!ret)) {
1772 skb->protocol = eth_type_trans(skb, dev);
1773 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1774 }
1775
1776 return ret;
1777 }
1778 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1779
1780 /**
1781 * dev_forward_skb - loopback an skb to another netif
1782 *
1783 * @dev: destination network device
1784 * @skb: buffer to forward
1785 *
1786 * return values:
1787 * NET_RX_SUCCESS (no congestion)
1788 * NET_RX_DROP (packet was dropped, but freed)
1789 *
1790 * dev_forward_skb can be used for injecting an skb from the
1791 * start_xmit function of one device into the receive queue
1792 * of another device.
1793 *
1794 * The receiving device may be in another namespace, so
1795 * we have to clear all information in the skb that could
1796 * impact namespace isolation.
1797 */
1798 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1799 {
1800 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1801 }
1802 EXPORT_SYMBOL_GPL(dev_forward_skb);
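/*
 * Illustrative sketch (not part of this file): how a virtual driver's
 * start_xmit might loop packets to a peer device, in the style of veth.
 * "example_peer_of()" is a hypothetical driver-private lookup.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = example_peer_of(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */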
1803
1804 static inline int deliver_skb(struct sk_buff *skb,
1805 struct packet_type *pt_prev,
1806 struct net_device *orig_dev)
1807 {
1808 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1809 return -ENOMEM;
1810 atomic_inc(&skb->users);
1811 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1812 }
1813
1814 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1815 struct packet_type **pt,
1816 struct net_device *orig_dev,
1817 __be16 type,
1818 struct list_head *ptype_list)
1819 {
1820 struct packet_type *ptype, *pt_prev = *pt;
1821
1822 list_for_each_entry_rcu(ptype, ptype_list, list) {
1823 if (ptype->type != type)
1824 continue;
1825 if (pt_prev)
1826 deliver_skb(skb, pt_prev, orig_dev);
1827 pt_prev = ptype;
1828 }
1829 *pt = pt_prev;
1830 }
1831
1832 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1833 {
1834 if (!ptype->af_packet_priv || !skb->sk)
1835 return false;
1836
1837 if (ptype->id_match)
1838 return ptype->id_match(ptype, skb->sk);
1839 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1840 return true;
1841
1842 return false;
1843 }
1844
1845 /*
1846 * Support routine. Sends outgoing frames to any network
1847 * taps currently in use.
1848 */
1849
1850 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1851 {
1852 struct packet_type *ptype;
1853 struct sk_buff *skb2 = NULL;
1854 struct packet_type *pt_prev = NULL;
1855 struct list_head *ptype_list = &ptype_all;
1856
1857 rcu_read_lock();
1858 again:
1859 list_for_each_entry_rcu(ptype, ptype_list, list) {
1860 /* Never send packets back to the socket
1861 * they originated from - MvS (miquels@drinkel.ow.org)
1862 */
1863 if (skb_loop_sk(ptype, skb))
1864 continue;
1865
1866 if (pt_prev) {
1867 deliver_skb(skb2, pt_prev, skb->dev);
1868 pt_prev = ptype;
1869 continue;
1870 }
1871
1872 /* need to clone skb, done only once */
1873 skb2 = skb_clone(skb, GFP_ATOMIC);
1874 if (!skb2)
1875 goto out_unlock;
1876
1877 net_timestamp_set(skb2);
1878
1879 /* skb->nh should be correctly
1880 * set by sender, so that the second statement is
1881 * just protection against buggy protocols.
1882 */
1883 skb_reset_mac_header(skb2);
1884
1885 if (skb_network_header(skb2) < skb2->data ||
1886 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1887 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1888 ntohs(skb2->protocol),
1889 dev->name);
1890 skb_reset_network_header(skb2);
1891 }
1892
1893 skb2->transport_header = skb2->network_header;
1894 skb2->pkt_type = PACKET_OUTGOING;
1895 pt_prev = ptype;
1896 }
1897
1898 if (ptype_list == &ptype_all) {
1899 ptype_list = &dev->ptype_all;
1900 goto again;
1901 }
1902 out_unlock:
1903 if (pt_prev)
1904 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1905 rcu_read_unlock();
1906 }
1907 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1908
1909 /**
1910 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1911 * @dev: Network device
1912 * @txq: number of queues available
1913 *
1914 * If real_num_tx_queues is changed the tc mappings may no longer be
1915 * valid. To resolve this verify that each tc mapping remains valid
1916 * and, if it is not, zero the mapping. With no priorities mapping to
1917 * an offset/count pair, that pair will no longer be used. In the worst
1918 * case, if TC0 is invalid, nothing can be done, so priority mappings
1919 * are disabled entirely. It is expected that drivers will fix this
1920 * mapping, if they can, before calling netif_set_real_num_tx_queues.
1921 */
1922 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1923 {
1924 int i;
1925 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1926
1927 /* If TC0 is invalidated disable TC mapping */
1928 if (tc->offset + tc->count > txq) {
1929 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1930 dev->num_tc = 0;
1931 return;
1932 }
1933
1934 /* Invalidated prio to tc mappings set to TC0 */
1935 for (i = 1; i < TC_BITMASK + 1; i++) {
1936 int q = netdev_get_prio_tc_map(dev, i);
1937
1938 tc = &dev->tc_to_txq[q];
1939 if (tc->offset + tc->count > txq) {
1940 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1941 i, q);
1942 netdev_set_prio_tc_map(dev, i, 0);
1943 }
1944 }
1945 }
1946
1947 #ifdef CONFIG_XPS
1948 static DEFINE_MUTEX(xps_map_mutex);
1949 #define xmap_dereference(P) \
1950 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1951
1952 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1953 int cpu, u16 index)
1954 {
1955 struct xps_map *map = NULL;
1956 int pos;
1957
1958 if (dev_maps)
1959 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1960
1961 for (pos = 0; map && pos < map->len; pos++) {
1962 if (map->queues[pos] == index) {
1963 if (map->len > 1) {
1964 map->queues[pos] = map->queues[--map->len];
1965 } else {
1966 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1967 kfree_rcu(map, rcu);
1968 map = NULL;
1969 }
1970 break;
1971 }
1972 }
1973
1974 return map;
1975 }
1976
1977 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1978 {
1979 struct xps_dev_maps *dev_maps;
1980 int cpu, i;
1981 bool active = false;
1982
1983 mutex_lock(&xps_map_mutex);
1984 dev_maps = xmap_dereference(dev->xps_maps);
1985
1986 if (!dev_maps)
1987 goto out_no_maps;
1988
1989 for_each_possible_cpu(cpu) {
1990 for (i = index; i < dev->num_tx_queues; i++) {
1991 if (!remove_xps_queue(dev_maps, cpu, i))
1992 break;
1993 }
1994 if (i == dev->num_tx_queues)
1995 active = true;
1996 }
1997
1998 if (!active) {
1999 RCU_INIT_POINTER(dev->xps_maps, NULL);
2000 kfree_rcu(dev_maps, rcu);
2001 }
2002
2003 for (i = index; i < dev->num_tx_queues; i++)
2004 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2005 NUMA_NO_NODE);
2006
2007 out_no_maps:
2008 mutex_unlock(&xps_map_mutex);
2009 }
2010
2011 static struct xps_map *expand_xps_map(struct xps_map *map,
2012 int cpu, u16 index)
2013 {
2014 struct xps_map *new_map;
2015 int alloc_len = XPS_MIN_MAP_ALLOC;
2016 int i, pos;
2017
2018 for (pos = 0; map && pos < map->len; pos++) {
2019 if (map->queues[pos] != index)
2020 continue;
2021 return map;
2022 }
2023
2024 /* Need to add queue to this CPU's existing map */
2025 if (map) {
2026 if (pos < map->alloc_len)
2027 return map;
2028
2029 alloc_len = map->alloc_len * 2;
2030 }
2031
2032 /* Need to allocate new map to store queue on this CPU's map */
2033 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2034 cpu_to_node(cpu));
2035 if (!new_map)
2036 return NULL;
2037
2038 for (i = 0; i < pos; i++)
2039 new_map->queues[i] = map->queues[i];
2040 new_map->alloc_len = alloc_len;
2041 new_map->len = pos;
2042
2043 return new_map;
2044 }
2045
2046 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2047 u16 index)
2048 {
2049 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2050 struct xps_map *map, *new_map;
2051 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2052 int cpu, numa_node_id = -2;
2053 bool active = false;
2054
2055 mutex_lock(&xps_map_mutex);
2056
2057 dev_maps = xmap_dereference(dev->xps_maps);
2058
2059 /* allocate memory for queue storage */
2060 for_each_online_cpu(cpu) {
2061 if (!cpumask_test_cpu(cpu, mask))
2062 continue;
2063
2064 if (!new_dev_maps)
2065 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2066 if (!new_dev_maps) {
2067 mutex_unlock(&xps_map_mutex);
2068 return -ENOMEM;
2069 }
2070
2071 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2072 NULL;
2073
2074 map = expand_xps_map(map, cpu, index);
2075 if (!map)
2076 goto error;
2077
2078 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2079 }
2080
2081 if (!new_dev_maps)
2082 goto out_no_new_maps;
2083
2084 for_each_possible_cpu(cpu) {
2085 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2086 /* add queue to CPU maps */
2087 int pos = 0;
2088
2089 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2090 while ((pos < map->len) && (map->queues[pos] != index))
2091 pos++;
2092
2093 if (pos == map->len)
2094 map->queues[map->len++] = index;
2095 #ifdef CONFIG_NUMA
2096 if (numa_node_id == -2)
2097 numa_node_id = cpu_to_node(cpu);
2098 else if (numa_node_id != cpu_to_node(cpu))
2099 numa_node_id = -1;
2100 #endif
2101 } else if (dev_maps) {
2102 /* fill in the new device map from the old device map */
2103 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2104 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2105 }
2106
2107 }
2108
2109 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2110
2111 /* Cleanup old maps */
2112 if (dev_maps) {
2113 for_each_possible_cpu(cpu) {
2114 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2115 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2116 if (map && map != new_map)
2117 kfree_rcu(map, rcu);
2118 }
2119
2120 kfree_rcu(dev_maps, rcu);
2121 }
2122
2123 dev_maps = new_dev_maps;
2124 active = true;
2125
2126 out_no_new_maps:
2127 /* update Tx queue numa node */
2128 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2129 (numa_node_id >= 0) ? numa_node_id :
2130 NUMA_NO_NODE);
2131
2132 if (!dev_maps)
2133 goto out_no_maps;
2134
2135 /* removes queue from unused CPUs */
2136 for_each_possible_cpu(cpu) {
2137 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2138 continue;
2139
2140 if (remove_xps_queue(dev_maps, cpu, index))
2141 active = true;
2142 }
2143
2144 /* free map if not active */
2145 if (!active) {
2146 RCU_INIT_POINTER(dev->xps_maps, NULL);
2147 kfree_rcu(dev_maps, rcu);
2148 }
2149
2150 out_no_maps:
2151 mutex_unlock(&xps_map_mutex);
2152
2153 return 0;
2154 error:
2155 /* remove any maps that we added */
2156 for_each_possible_cpu(cpu) {
2157 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2158 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2159 NULL;
2160 if (new_map && new_map != map)
2161 kfree(new_map);
2162 }
2163
2164 mutex_unlock(&xps_map_mutex);
2165
2166 kfree(new_dev_maps);
2167 return -ENOMEM;
2168 }
2169 EXPORT_SYMBOL(netif_set_xps_queue);
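/*
 * Illustrative sketch, not part of this file: a multiqueue driver might
 * spread its TX queues across the online CPUs with netif_set_xps_queue()
 * while bringing the device up. The hypothetical demo_setup_xps() helper
 * assumes CPU ids 0..num_online_cpus()-1 are online and ignores errors
 * for brevity.
 *
 *	static void demo_setup_xps(struct net_device *dev, unsigned int nq)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < nq; i++)
 *			netif_set_xps_queue(dev,
 *					    cpumask_of(i % num_online_cpus()), i);
 *	}
 */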
2170
2171 #endif
2172 /*
2173 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2174 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2175 */
2176 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2177 {
2178 int rc;
2179
2180 if (txq < 1 || txq > dev->num_tx_queues)
2181 return -EINVAL;
2182
2183 if (dev->reg_state == NETREG_REGISTERED ||
2184 dev->reg_state == NETREG_UNREGISTERING) {
2185 ASSERT_RTNL();
2186
2187 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2188 txq);
2189 if (rc)
2190 return rc;
2191
2192 if (dev->num_tc)
2193 netif_setup_tc(dev, txq);
2194
2195 if (txq < dev->real_num_tx_queues) {
2196 qdisc_reset_all_tx_gt(dev, txq);
2197 #ifdef CONFIG_XPS
2198 netif_reset_xps_queues_gt(dev, txq);
2199 #endif
2200 }
2201 }
2202
2203 dev->real_num_tx_queues = txq;
2204 return 0;
2205 }
2206 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
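/*
 * Illustrative sketch, not part of this file: when reacting to a channel
 * count change (e.g. from an ethtool handler, which runs under rtnl_lock),
 * a driver adjusts the number of queues actually in use. The hypothetical
 * demo_set_channels() below is an assumption made only for this example.
 *
 *	static int demo_set_channels(struct net_device *dev, unsigned int count)
 *	{
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = netif_set_real_num_tx_queues(dev, count);
 *		if (err)
 *			return err;
 *		return netif_set_real_num_rx_queues(dev, count);
 *	}
 */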
2207
2208 #ifdef CONFIG_SYSFS
2209 /**
2210 * netif_set_real_num_rx_queues - set actual number of RX queues used
2211 * @dev: Network device
2212 * @rxq: Actual number of RX queues
2213 *
2214 * This must be called either with the rtnl_lock held or before
2215 * registration of the net device. Returns 0 on success, or a
2216 * negative error code. If called before registration, it always
2217 * succeeds.
2218 */
2219 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2220 {
2221 int rc;
2222
2223 if (rxq < 1 || rxq > dev->num_rx_queues)
2224 return -EINVAL;
2225
2226 if (dev->reg_state == NETREG_REGISTERED) {
2227 ASSERT_RTNL();
2228
2229 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2230 rxq);
2231 if (rc)
2232 return rc;
2233 }
2234
2235 dev->real_num_rx_queues = rxq;
2236 return 0;
2237 }
2238 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2239 #endif
2240
2241 /**
2242 * netif_get_num_default_rss_queues - default number of RSS queues
2243 *
2244 * This routine returns the default upper limit on the number of RSS
2245 * queues that multiqueue devices should use.
2246 */
2247 int netif_get_num_default_rss_queues(void)
2248 {
2249 return is_kdump_kernel() ?
2250 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2251 }
2252 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
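/*
 * Illustrative sketch, not part of this file: at probe time a driver
 * commonly caps its requested queue count with this helper before
 * allocating the netdev. DEMO_MAX_QUEUES and struct demo_priv are
 * assumptions made only for this example.
 *
 *	static struct net_device *demo_alloc_netdev(void)
 *	{
 *		unsigned int nq = min_t(unsigned int, DEMO_MAX_QUEUES,
 *					netif_get_num_default_rss_queues());
 *
 *		return alloc_etherdev_mq(sizeof(struct demo_priv), nq);
 *	}
 */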
2253
2254 static void __netif_reschedule(struct Qdisc *q)
2255 {
2256 struct softnet_data *sd;
2257 unsigned long flags;
2258
2259 local_irq_save(flags);
2260 sd = this_cpu_ptr(&softnet_data);
2261 q->next_sched = NULL;
2262 *sd->output_queue_tailp = q;
2263 sd->output_queue_tailp = &q->next_sched;
2264 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2265 local_irq_restore(flags);
2266 }
2267
2268 void __netif_schedule(struct Qdisc *q)
2269 {
2270 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2271 __netif_reschedule(q);
2272 }
2273 EXPORT_SYMBOL(__netif_schedule);
2274
2275 struct dev_kfree_skb_cb {
2276 enum skb_free_reason reason;
2277 };
2278
2279 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2280 {
2281 return (struct dev_kfree_skb_cb *)skb->cb;
2282 }
2283
2284 void netif_schedule_queue(struct netdev_queue *txq)
2285 {
2286 rcu_read_lock();
2287 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2288 struct Qdisc *q = rcu_dereference(txq->qdisc);
2289
2290 __netif_schedule(q);
2291 }
2292 rcu_read_unlock();
2293 }
2294 EXPORT_SYMBOL(netif_schedule_queue);
2295
2296 /**
2297 * netif_wake_subqueue - allow sending packets on subqueue
2298 * @dev: network device
2299 * @queue_index: sub queue index
2300 *
2301 * Resume individual transmit queue of a device with multiple transmit queues.
2302 */
2303 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2304 {
2305 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2306
2307 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2308 struct Qdisc *q;
2309
2310 rcu_read_lock();
2311 q = rcu_dereference(txq->qdisc);
2312 __netif_schedule(q);
2313 rcu_read_unlock();
2314 }
2315 }
2316 EXPORT_SYMBOL(netif_wake_subqueue);
2317
2318 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2319 {
2320 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2321 struct Qdisc *q;
2322
2323 rcu_read_lock();
2324 q = rcu_dereference(dev_queue->qdisc);
2325 __netif_schedule(q);
2326 rcu_read_unlock();
2327 }
2328 }
2329 EXPORT_SYMBOL(netif_tx_wake_queue);
2330
2331 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2332 {
2333 unsigned long flags;
2334
2335 if (likely(atomic_read(&skb->users) == 1)) {
2336 smp_rmb();
2337 atomic_set(&skb->users, 0);
2338 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2339 return;
2340 }
2341 get_kfree_skb_cb(skb)->reason = reason;
2342 local_irq_save(flags);
2343 skb->next = __this_cpu_read(softnet_data.completion_queue);
2344 __this_cpu_write(softnet_data.completion_queue, skb);
2345 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2346 local_irq_restore(flags);
2347 }
2348 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2349
2350 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2351 {
2352 if (in_irq() || irqs_disabled())
2353 __dev_kfree_skb_irq(skb, reason);
2354 else
2355 dev_kfree_skb(skb);
2356 }
2357 EXPORT_SYMBOL(__dev_kfree_skb_any);
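/*
 * Illustrative sketch, not part of this file: because dev_kfree_skb_any()
 * and dev_consume_skb_any() are wrappers around __dev_kfree_skb_any(), a
 * TX completion routine that may run in either hardirq or process context
 * can free buffers without knowing which. demo_clean_tx() and
 * demo_ring_next_completed() are assumptions made only for this example.
 *
 *	static void demo_clean_tx(struct demo_ring *ring)
 *	{
 *		struct sk_buff *skb;
 *
 *		while ((skb = demo_ring_next_completed(ring)) != NULL)
 *			dev_consume_skb_any(skb);	// sent, not dropped
 *	}
 */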
2358
2359
2360 /**
2361 * netif_device_detach - mark device as removed
2362 * @dev: network device
2363 *
2364 * Mark device as removed from system and therefore no longer available.
2365 */
2366 void netif_device_detach(struct net_device *dev)
2367 {
2368 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2369 netif_running(dev)) {
2370 netif_tx_stop_all_queues(dev);
2371 }
2372 }
2373 EXPORT_SYMBOL(netif_device_detach);
2374
2375 /**
2376 * netif_device_attach - mark device as attached
2377 * @dev: network device
2378 *
2379 * Mark device as attached to the system and restart it if needed.
2380 */
2381 void netif_device_attach(struct net_device *dev)
2382 {
2383 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2384 netif_running(dev)) {
2385 netif_tx_wake_all_queues(dev);
2386 __netdev_watchdog_up(dev);
2387 }
2388 }
2389 EXPORT_SYMBOL(netif_device_attach);
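/*
 * Illustrative sketch, not part of this file: suspend/resume callbacks
 * often bracket the power transition with netif_device_detach() and
 * netif_device_attach(). demo_suspend()/demo_resume() and the hardware
 * helpers are assumptions made only for this example.
 *
 *	static int demo_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);	// stops TX queues if running
 *		demo_hw_power_down(dev);
 *		return 0;
 *	}
 *
 *	static int demo_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		demo_hw_power_up(dev);
 *		netif_device_attach(dev);	// wakes queues and watchdog
 *		return 0;
 *	}
 */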
2390
2391 /*
2392 * Returns a Tx hash based on the given packet descriptor and a Tx queue
2393 * count to be used as a distribution range.
2394 */
2395 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2396 unsigned int num_tx_queues)
2397 {
2398 u32 hash;
2399 u16 qoffset = 0;
2400 u16 qcount = num_tx_queues;
2401
2402 if (skb_rx_queue_recorded(skb)) {
2403 hash = skb_get_rx_queue(skb);
2404 while (unlikely(hash >= num_tx_queues))
2405 hash -= num_tx_queues;
2406 return hash;
2407 }
2408
2409 if (dev->num_tc) {
2410 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2411 qoffset = dev->tc_to_txq[tc].offset;
2412 qcount = dev->tc_to_txq[tc].count;
2413 }
2414
2415 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2416 }
2417 EXPORT_SYMBOL(__skb_tx_hash);
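/*
 * Illustrative sketch, not part of this file: a driver providing
 * .ndo_select_queue but with no special steering policy could fall back
 * to the same hashing via skb_tx_hash(), which wraps __skb_tx_hash() with
 * dev->real_num_tx_queues. demo_select_queue() is an assumption made only
 * for this example.
 *
 *	static u16 demo_select_queue(struct net_device *dev, struct sk_buff *skb,
 *				     void *accel_priv,
 *				     select_queue_fallback_t fallback)
 *	{
 *		return skb_tx_hash(dev, skb);
 *	}
 */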
2418
2419 static void skb_warn_bad_offload(const struct sk_buff *skb)
2420 {
2421 static const netdev_features_t null_features;
2422 struct net_device *dev = skb->dev;
2423 const char *name = "";
2424
2425 if (!net_ratelimit())
2426 return;
2427
2428 if (dev) {
2429 if (dev->dev.parent)
2430 name = dev_driver_string(dev->dev.parent);
2431 else
2432 name = netdev_name(dev);
2433 }
2434 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2435 "gso_type=%d ip_summed=%d\n",
2436 name, dev ? &dev->features : &null_features,
2437 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2438 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2439 skb_shinfo(skb)->gso_type, skb->ip_summed);
2440 }
2441
2442 /*
2443 * Invalidate hardware checksum when packet is to be mangled, and
2444 * complete checksum manually on outgoing path.
2445 */
2446 int skb_checksum_help(struct sk_buff *skb)
2447 {
2448 __wsum csum;
2449 int ret = 0, offset;
2450
2451 if (skb->ip_summed == CHECKSUM_COMPLETE)
2452 goto out_set_summed;
2453
2454 if (unlikely(skb_shinfo(skb)->gso_size)) {
2455 skb_warn_bad_offload(skb);
2456 return -EINVAL;
2457 }
2458
2459 /* Before computing a checksum, we should make sure no frag could
2460 * be modified by an external entity: the checksum could otherwise be wrong.
2461 */
2462 if (skb_has_shared_frag(skb)) {
2463 ret = __skb_linearize(skb);
2464 if (ret)
2465 goto out;
2466 }
2467
2468 offset = skb_checksum_start_offset(skb);
2469 BUG_ON(offset >= skb_headlen(skb));
2470 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2471
2472 offset += skb->csum_offset;
2473 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2474
2475 if (skb_cloned(skb) &&
2476 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2477 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2478 if (ret)
2479 goto out;
2480 }
2481
2482 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2483 out_set_summed:
2484 skb->ip_summed = CHECKSUM_NONE;
2485 out:
2486 return ret;
2487 }
2488 EXPORT_SYMBOL(skb_checksum_help);
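/*
 * Illustrative sketch, not part of this file: a driver whose hardware
 * cannot checksum a particular frame typically falls back to
 * skb_checksum_help() in its xmit path. demo_tx_csum() and
 * demo_hw_can_csum() are assumptions made only for this example.
 *
 *	static int demo_tx_csum(struct sk_buff *skb)
 *	{
 *		if (skb->ip_summed != CHECKSUM_PARTIAL)
 *			return 0;		// nothing left to offload
 *		if (demo_hw_can_csum(skb))
 *			return 0;		// hardware will handle it
 *		return skb_checksum_help(skb);	// resolve in software
 *	}
 */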
2489
2490 /* __skb_csum_offload_chk - Driver helper function to determine if a device
2491 * with limited checksum offload capabilities is able to offload the checksum
2492 * for a given packet.
2493 *
2494 * Arguments:
2495 * skb - sk_buff for the packet in question
2496 * spec - contains the description of what device can offload
2497 * csum_encapped - returns true if the checksum being offloaded is
2498 * encapsulated. That is, it is the checksum for the transport header
2499 * in the inner headers.
2500 * checksum_help - when set, indicates that the helper function should
2501 * call skb_checksum_help if the offload checks fail
2502 *
2503 * Returns:
2504 * true: Packet has passed the checksum checks and should be offloadable to
2505 * the device (a driver may still need to check for additional
2506 * restrictions of its device)
2507 * false: Checksum is not offloadable. If checksum_help was set then
2508 * skb_checksum_help was called to resolve checksum for non-GSO
2509 * packets and when IP protocol is not SCTP
2510 */
2511 bool __skb_csum_offload_chk(struct sk_buff *skb,
2512 const struct skb_csum_offl_spec *spec,
2513 bool *csum_encapped,
2514 bool csum_help)
2515 {
2516 struct iphdr *iph;
2517 struct ipv6hdr *ipv6;
2518 void *nhdr;
2519 int protocol;
2520 u8 ip_proto;
2521
2522 if (skb->protocol == htons(ETH_P_8021Q) ||
2523 skb->protocol == htons(ETH_P_8021AD)) {
2524 if (!spec->vlan_okay)
2525 goto need_help;
2526 }
2527
2528 /* We check whether the checksum refers to a transport layer checksum in
2529 * the outermost header or an encapsulated transport layer checksum that
2530 * corresponds to the inner headers of the skb. If the checksum is for
2531 * something else in the packet we need help.
2532 */
2533 if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2534 /* Non-encapsulated checksum */
2535 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2536 nhdr = skb_network_header(skb);
2537 *csum_encapped = false;
2538 if (spec->no_not_encapped)
2539 goto need_help;
2540 } else if (skb->encapsulation && spec->encap_okay &&
2541 skb_checksum_start_offset(skb) ==
2542 skb_inner_transport_offset(skb)) {
2543 /* Encapsulated checksum */
2544 *csum_encapped = true;
2545 switch (skb->inner_protocol_type) {
2546 case ENCAP_TYPE_ETHER:
2547 protocol = eproto_to_ipproto(skb->inner_protocol);
2548 break;
2549 case ENCAP_TYPE_IPPROTO:
2550 protocol = skb->inner_protocol;
2551 break;
2552 }
2553 nhdr = skb_inner_network_header(skb);
2554 } else {
2555 goto need_help;
2556 }
2557
2558 switch (protocol) {
2559 case IPPROTO_IP:
2560 if (!spec->ipv4_okay)
2561 goto need_help;
2562 iph = nhdr;
2563 ip_proto = iph->protocol;
2564 if (iph->ihl != 5 && !spec->ip_options_okay)
2565 goto need_help;
2566 break;
2567 case IPPROTO_IPV6:
2568 if (!spec->ipv6_okay)
2569 goto need_help;
2570 if (spec->no_encapped_ipv6 && *csum_encapped)
2571 goto need_help;
2572 ipv6 = nhdr;
2573 nhdr += sizeof(*ipv6);
2574 ip_proto = ipv6->nexthdr;
2575 break;
2576 default:
2577 goto need_help;
2578 }
2579
2580 ip_proto_again:
2581 switch (ip_proto) {
2582 case IPPROTO_TCP:
2583 if (!spec->tcp_okay ||
2584 skb->csum_offset != offsetof(struct tcphdr, check))
2585 goto need_help;
2586 break;
2587 case IPPROTO_UDP:
2588 if (!spec->udp_okay ||
2589 skb->csum_offset != offsetof(struct udphdr, check))
2590 goto need_help;
2591 break;
2592 case IPPROTO_SCTP:
2593 if (!spec->sctp_okay ||
2594 skb->csum_offset != offsetof(struct sctphdr, checksum))
2595 goto cant_help;
2596 break;
2597 case NEXTHDR_HOP:
2598 case NEXTHDR_ROUTING:
2599 case NEXTHDR_DEST: {
2600 u8 *opthdr = nhdr;
2601
2602 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2603 goto need_help;
2604
2605 ip_proto = opthdr[0];
2606 nhdr += (opthdr[1] + 1) << 3;
2607
2608 goto ip_proto_again;
2609 }
2610 default:
2611 goto need_help;
2612 }
2613
2614 /* Passed the tests for offloading checksum */
2615 return true;
2616
2617 need_help:
2618 if (csum_help && !skb_shinfo(skb)->gso_size)
2619 skb_checksum_help(skb);
2620 cant_help:
2621 return false;
2622 }
2623 EXPORT_SYMBOL(__skb_csum_offload_chk);
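/*
 * Illustrative sketch, not part of this file: a driver with limited
 * checksum offload could describe its capabilities in a
 * struct skb_csum_offl_spec and let this helper decide, asking it to fall
 * back to skb_checksum_help() when offload is not possible. The spec
 * values below are assumptions made only for this example.
 *
 *	static const struct skb_csum_offl_spec demo_csum_spec = {
 *		.ipv4_okay = 1,
 *		.ipv6_okay = 1,
 *		.tcp_okay = 1,
 *		.udp_okay = 1,
 *	};
 *
 *	static bool demo_try_csum_offload(struct sk_buff *skb)
 *	{
 *		bool encapped;
 *
 *		// csum_help = true: resolve in software on failure
 *		return __skb_csum_offload_chk(skb, &demo_csum_spec,
 *					      &encapped, true);
 *	}
 */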
2624
2625 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2626 {
2627 __be16 type = skb->protocol;
2628
2629 /* Tunnel gso handlers can set protocol to ethernet. */
2630 if (type == htons(ETH_P_TEB)) {
2631 struct ethhdr *eth;
2632
2633 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2634 return 0;
2635
2636 eth = (struct ethhdr *)skb_mac_header(skb);
2637 type = eth->h_proto;
2638 }
2639
2640 return __vlan_get_protocol(skb, type, depth);
2641 }
2642
2643 /**
2644 * skb_mac_gso_segment - mac layer segmentation handler.
2645 * @skb: buffer to segment
2646 * @features: features for the output path (see dev->features)
2647 */
2648 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2649 netdev_features_t features)
2650 {
2651 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2652 struct packet_offload *ptype;
2653 int vlan_depth = skb->mac_len;
2654 __be16 type = skb_network_protocol(skb, &vlan_depth);
2655
2656 if (unlikely(!type))
2657 return ERR_PTR(-EINVAL);
2658
2659 __skb_pull(skb, vlan_depth);
2660
2661 rcu_read_lock();
2662 list_for_each_entry_rcu(ptype, &offload_base, list) {
2663 if (ptype->type == type && ptype->callbacks.gso_segment) {
2664 segs = ptype->callbacks.gso_segment(skb, features);
2665 break;
2666 }
2667 }
2668 rcu_read_unlock();
2669
2670 __skb_push(skb, skb->data - skb_mac_header(skb));
2671
2672 return segs;
2673 }
2674 EXPORT_SYMBOL(skb_mac_gso_segment);
2675
2676
2677 /* openvswitch calls this on rx path, so we need a different check.
2678 */
2679 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2680 {
2681 if (tx_path)
2682 return skb->ip_summed != CHECKSUM_PARTIAL;
2683 else
2684 return skb->ip_summed == CHECKSUM_NONE;
2685 }
2686
2687 /**
2688 * __skb_gso_segment - Perform segmentation on skb.
2689 * @skb: buffer to segment
2690 * @features: features for the output path (see dev->features)
2691 * @tx_path: whether it is called in TX path
2692 *
2693 * This function segments the given skb and returns a list of segments.
2694 *
2695 * It may return NULL if the skb requires no segmentation. This is
2696 * only possible when GSO is used for verifying header integrity.
2697 *
2698 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2699 */
2700 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2701 netdev_features_t features, bool tx_path)
2702 {
2703 if (unlikely(skb_needs_check(skb, tx_path))) {
2704 int err;
2705
2706 skb_warn_bad_offload(skb);
2707
2708 err = skb_cow_head(skb, 0);
2709 if (err < 0)
2710 return ERR_PTR(err);
2711 }
2712
2713 /* Only report GSO partial support if it will enable us to
2714 * support segmentation on this frame without needing additional
2715 * work.
2716 */
2717 if (features & NETIF_F_GSO_PARTIAL) {
2718 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2719 struct net_device *dev = skb->dev;
2720
2721 partial_features |= dev->features & dev->gso_partial_features;
2722 if (!skb_gso_ok(skb, features | partial_features))
2723 features &= ~NETIF_F_GSO_PARTIAL;
2724 }
2725
2726 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2727 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2728
2729 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2730 SKB_GSO_CB(skb)->encap_level = 0;
2731
2732 skb_reset_mac_header(skb);
2733 skb_reset_mac_len(skb);
2734
2735 return skb_mac_gso_segment(skb, features);
2736 }
2737 EXPORT_SYMBOL(__skb_gso_segment);
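/*
 * Illustrative sketch, not part of this file: code that must push a GSO
 * skb through a path without GSO support can segment it with
 * skb_gso_segment() (the tx_path wrapper around __skb_gso_segment()) and
 * send the resulting list one segment at a time. demo_xmit_one() is an
 * assumption made only for this example.
 *
 *	static int demo_xmit_gso(struct sk_buff *skb, netdev_features_t features)
 *	{
 *		struct sk_buff *segs, *next;
 *
 *		segs = skb_gso_segment(skb, features);
 *		if (IS_ERR(segs))
 *			return PTR_ERR(segs);
 *		if (!segs)			// no segmentation was needed
 *			return demo_xmit_one(skb);
 *		consume_skb(skb);		// the segment list replaces skb
 *		while (segs) {
 *			next = segs->next;
 *			segs->next = NULL;
 *			demo_xmit_one(segs);
 *			segs = next;
 *		}
 *		return 0;
 *	}
 */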
2738
2739 /* Take action when hardware reception checksum errors are detected. */
2740 #ifdef CONFIG_BUG
2741 void netdev_rx_csum_fault(struct net_device *dev)
2742 {
2743 if (net_ratelimit()) {
2744 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2745 dump_stack();
2746 }
2747 }
2748 EXPORT_SYMBOL(netdev_rx_csum_fault);
2749 #endif
2750
2751 /* Actually, we should eliminate this check as soon as we know that:
2752 * 1. An IOMMU is present and can map all the memory.
2753 * 2. No high memory really exists on this machine.
2754 */
2755
2756 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2757 {
2758 #ifdef CONFIG_HIGHMEM
2759 int i;
2760 if (!(dev->features & NETIF_F_HIGHDMA)) {
2761 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2762 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2763 if (PageHighMem(skb_frag_page(frag)))
2764 return 1;
2765 }
2766 }
2767
2768 if (PCI_DMA_BUS_IS_PHYS) {
2769 struct device *pdev = dev->dev.parent;
2770
2771 if (!pdev)
2772 return 0;
2773 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2774 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2775 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2776 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2777 return 1;
2778 }
2779 }
2780 #endif
2781 return 0;
2782 }
2783
2784 /* If MPLS offload request, verify we are testing hardware MPLS features
2785 * instead of standard features for the netdev.
2786 */
2787 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2788 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2789 netdev_features_t features,
2790 __be16 type)
2791 {
2792 if (eth_p_mpls(type))
2793 features &= skb->dev->mpls_features;
2794
2795 return features;
2796 }
2797 #else
2798 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2799 netdev_features_t features,
2800 __be16 type)
2801 {
2802 return features;
2803 }
2804 #endif
2805
2806 static netdev_features_t harmonize_features(struct sk_buff *skb,
2807 netdev_features_t features)
2808 {
2809 int tmp;
2810 __be16 type;
2811
2812 type = skb_network_protocol(skb, &tmp);
2813 features = net_mpls_features(skb, features, type);
2814
2815 if (skb->ip_summed != CHECKSUM_NONE &&
2816 !can_checksum_protocol(features, type)) {
2817 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2818 } else if (illegal_highdma(skb->dev, skb)) {
2819 features &= ~NETIF_F_SG;
2820 }
2821
2822 return features;
2823 }
2824
2825 netdev_features_t passthru_features_check(struct sk_buff *skb,
2826 struct net_device *dev,
2827 netdev_features_t features)
2828 {
2829 return features;
2830 }
2831 EXPORT_SYMBOL(passthru_features_check);
2832
2833 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2834 struct net_device *dev,
2835 netdev_features_t features)
2836 {
2837 return vlan_features_check(skb, features);
2838 }
2839
2840 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2841 struct net_device *dev,
2842 netdev_features_t features)
2843 {
2844 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2845
2846 if (gso_segs > dev->gso_max_segs)
2847 return features & ~NETIF_F_GSO_MASK;
2848
2849 /* Support for GSO partial features requires software
2850 * intervention before we can actually process the packets,
2851 * so we need to strip support for any partial features now;
2852 * we can pull them back in after we have partially
2853 * segmented the frame.
2854 */
2855 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2856 features &= ~dev->gso_partial_features;
2857
2858 /* Make sure to clear the IPv4 ID mangling feature if the
2859 * IPv4 header has the potential to be fragmented.
2860 */
2861 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2862 struct iphdr *iph = skb->encapsulation ?
2863 inner_ip_hdr(skb) : ip_hdr(skb);
2864
2865 if (!(iph->frag_off & htons(IP_DF)))
2866 features &= ~NETIF_F_TSO_MANGLEID;
2867 }
2868
2869 return features;
2870 }
2871
2872 netdev_features_t netif_skb_features(struct sk_buff *skb)
2873 {
2874 struct net_device *dev = skb->dev;
2875 netdev_features_t features = dev->features;
2876
2877 if (skb_is_gso(skb))
2878 features = gso_features_check(skb, dev, features);
2879
2880 /* If encapsulation offload request, verify we are testing
2881 * hardware encapsulation features instead of standard
2882 * features for the netdev
2883 */
2884 if (skb->encapsulation)
2885 features &= dev->hw_enc_features;
2886
2887 if (skb_vlan_tagged(skb))
2888 features = netdev_intersect_features(features,
2889 dev->vlan_features |
2890 NETIF_F_HW_VLAN_CTAG_TX |
2891 NETIF_F_HW_VLAN_STAG_TX);
2892
2893 if (dev->netdev_ops->ndo_features_check)
2894 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2895 features);
2896 else
2897 features &= dflt_features_check(skb, dev, features);
2898
2899 return harmonize_features(skb, features);
2900 }
2901 EXPORT_SYMBOL(netif_skb_features);
2902
2903 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2904 struct netdev_queue *txq, bool more)
2905 {
2906 unsigned int len;
2907 int rc;
2908
2909 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2910 dev_queue_xmit_nit(skb, dev);
2911
2912 len = skb->len;
2913 trace_net_dev_start_xmit(skb, dev);
2914 rc = netdev_start_xmit(skb, dev, txq, more);
2915 trace_net_dev_xmit(skb, rc, dev, len);
2916
2917 return rc;
2918 }
2919
2920 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2921 struct netdev_queue *txq, int *ret)
2922 {
2923 struct sk_buff *skb = first;
2924 int rc = NETDEV_TX_OK;
2925
2926 while (skb) {
2927 struct sk_buff *next = skb->next;
2928
2929 skb->next = NULL;
2930 rc = xmit_one(skb, dev, txq, next != NULL);
2931 if (unlikely(!dev_xmit_complete(rc))) {
2932 skb->next = next;
2933 goto out;
2934 }
2935
2936 skb = next;
2937 if (netif_xmit_stopped(txq) && skb) {
2938 rc = NETDEV_TX_BUSY;
2939 break;
2940 }
2941 }
2942
2943 out:
2944 *ret = rc;
2945 return skb;
2946 }
2947
2948 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2949 netdev_features_t features)
2950 {
2951 if (skb_vlan_tag_present(skb) &&
2952 !vlan_hw_offload_capable(features, skb->vlan_proto))
2953 skb = __vlan_hwaccel_push_inside(skb);
2954 return skb;
2955 }
2956
2957 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2958 {
2959 netdev_features_t features;
2960
2961 features = netif_skb_features(skb);
2962 skb = validate_xmit_vlan(skb, features);
2963 if (unlikely(!skb))
2964 goto out_null;
2965
2966 if (netif_needs_gso(skb, features)) {
2967 struct sk_buff *segs;
2968
2969 segs = skb_gso_segment(skb, features);
2970 if (IS_ERR(segs)) {
2971 goto out_kfree_skb;
2972 } else if (segs) {
2973 consume_skb(skb);
2974 skb = segs;
2975 }
2976 } else {
2977 if (skb_needs_linearize(skb, features) &&
2978 __skb_linearize(skb))
2979 goto out_kfree_skb;
2980
2981 /* If packet is not checksummed and device does not
2982 * support checksumming for this protocol, complete
2983 * checksumming here.
2984 */
2985 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2986 if (skb->encapsulation)
2987 skb_set_inner_transport_header(skb,
2988 skb_checksum_start_offset(skb));
2989 else
2990 skb_set_transport_header(skb,
2991 skb_checksum_start_offset(skb));
2992 if (!(features & NETIF_F_CSUM_MASK) &&
2993 skb_checksum_help(skb))
2994 goto out_kfree_skb;
2995 }
2996 }
2997
2998 return skb;
2999
3000 out_kfree_skb:
3001 kfree_skb(skb);
3002 out_null:
3003 atomic_long_inc(&dev->tx_dropped);
3004 return NULL;
3005 }
3006
3007 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3008 {
3009 struct sk_buff *next, *head = NULL, *tail;
3010
3011 for (; skb != NULL; skb = next) {
3012 next = skb->next;
3013 skb->next = NULL;
3014
3015 /* in case skb won't be segmented, point to itself */
3016 skb->prev = skb;
3017
3018 skb = validate_xmit_skb(skb, dev);
3019 if (!skb)
3020 continue;
3021
3022 if (!head)
3023 head = skb;
3024 else
3025 tail->next = skb;
3026 /* If skb was segmented, skb->prev points to
3027 * the last segment. If not, it still points to skb itself.
3028 */
3029 tail = skb->prev;
3030 }
3031 return head;
3032 }
3033 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3034
3035 static void qdisc_pkt_len_init(struct sk_buff *skb)
3036 {
3037 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3038
3039 qdisc_skb_cb(skb)->pkt_len = skb->len;
3040
3041 /* To get more precise estimation of bytes sent on wire,
3042 * we add to pkt_len the headers size of all segments
3043 */
3044 if (shinfo->gso_size) {
3045 unsigned int hdr_len;
3046 u16 gso_segs = shinfo->gso_segs;
3047
3048 /* mac layer + network layer */
3049 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3050
3051 /* + transport layer */
3052 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3053 hdr_len += tcp_hdrlen(skb);
3054 else
3055 hdr_len += sizeof(struct udphdr);
3056
3057 if (shinfo->gso_type & SKB_GSO_DODGY)
3058 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3059 shinfo->gso_size);
3060
3061 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3062 }
3063 }
3064
3065 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3066 struct net_device *dev,
3067 struct netdev_queue *txq)
3068 {
3069 spinlock_t *root_lock = qdisc_lock(q);
3070 struct sk_buff *to_free = NULL;
3071 bool contended;
3072 int rc;
3073
3074 qdisc_calculate_pkt_len(skb, q);
3075 /*
3076 * Heuristic to force contended enqueues to serialize on a
3077 * separate lock before trying to get the qdisc main lock.
3078 * This permits the qdisc->running owner to get the lock more
3079 * often and dequeue packets faster.
3080 */
3081 contended = qdisc_is_running(q);
3082 if (unlikely(contended))
3083 spin_lock(&q->busylock);
3084
3085 spin_lock(root_lock);
3086 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3087 __qdisc_drop(skb, &to_free);
3088 rc = NET_XMIT_DROP;
3089 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3090 qdisc_run_begin(q)) {
3091 /*
3092 * This is a work-conserving queue; there are no old skbs
3093 * waiting to be sent out; and the qdisc is not running -
3094 * xmit the skb directly.
3095 */
3096
3097 qdisc_bstats_update(q, skb);
3098
3099 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3100 if (unlikely(contended)) {
3101 spin_unlock(&q->busylock);
3102 contended = false;
3103 }
3104 __qdisc_run(q);
3105 } else
3106 qdisc_run_end(q);
3107
3108 rc = NET_XMIT_SUCCESS;
3109 } else {
3110 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3111 if (qdisc_run_begin(q)) {
3112 if (unlikely(contended)) {
3113 spin_unlock(&q->busylock);
3114 contended = false;
3115 }
3116 __qdisc_run(q);
3117 }
3118 }
3119 spin_unlock(root_lock);
3120 if (unlikely(to_free))
3121 kfree_skb_list(to_free);
3122 if (unlikely(contended))
3123 spin_unlock(&q->busylock);
3124 return rc;
3125 }
3126
3127 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3128 static void skb_update_prio(struct sk_buff *skb)
3129 {
3130 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3131
3132 if (!skb->priority && skb->sk && map) {
3133 unsigned int prioidx =
3134 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3135
3136 if (prioidx < map->priomap_len)
3137 skb->priority = map->priomap[prioidx];
3138 }
3139 }
3140 #else
3141 #define skb_update_prio(skb)
3142 #endif
3143
3144 DEFINE_PER_CPU(int, xmit_recursion);
3145 EXPORT_SYMBOL(xmit_recursion);
3146
3147 /**
3148 * dev_loopback_xmit - loop back @skb
3149 * @net: network namespace this loopback is happening in
3150 * @sk: sk needed to be a netfilter okfn
3151 * @skb: buffer to transmit
3152 */
3153 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3154 {
3155 skb_reset_mac_header(skb);
3156 __skb_pull(skb, skb_network_offset(skb));
3157 skb->pkt_type = PACKET_LOOPBACK;
3158 skb->ip_summed = CHECKSUM_UNNECESSARY;
3159 WARN_ON(!skb_dst(skb));
3160 skb_dst_force(skb);
3161 netif_rx_ni(skb);
3162 return 0;
3163 }
3164 EXPORT_SYMBOL(dev_loopback_xmit);
3165
3166 #ifdef CONFIG_NET_EGRESS
3167 static struct sk_buff *
3168 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3169 {
3170 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3171 struct tcf_result cl_res;
3172
3173 if (!cl)
3174 return skb;
3175
3176 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3177 * earlier by the caller.
3178 */
3179 qdisc_bstats_cpu_update(cl->q, skb);
3180
3181 switch (tc_classify(skb, cl, &cl_res, false)) {
3182 case TC_ACT_OK:
3183 case TC_ACT_RECLASSIFY:
3184 skb->tc_index = TC_H_MIN(cl_res.classid);
3185 break;
3186 case TC_ACT_SHOT:
3187 qdisc_qstats_cpu_drop(cl->q);
3188 *ret = NET_XMIT_DROP;
3189 kfree_skb(skb);
3190 return NULL;
3191 case TC_ACT_STOLEN:
3192 case TC_ACT_QUEUED:
3193 *ret = NET_XMIT_SUCCESS;
3194 consume_skb(skb);
3195 return NULL;
3196 case TC_ACT_REDIRECT:
3197 /* No need to push/pop skb's mac_header here on egress! */
3198 skb_do_redirect(skb);
3199 *ret = NET_XMIT_SUCCESS;
3200 return NULL;
3201 default:
3202 break;
3203 }
3204
3205 return skb;
3206 }
3207 #endif /* CONFIG_NET_EGRESS */
3208
3209 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3210 {
3211 #ifdef CONFIG_XPS
3212 struct xps_dev_maps *dev_maps;
3213 struct xps_map *map;
3214 int queue_index = -1;
3215
3216 rcu_read_lock();
3217 dev_maps = rcu_dereference(dev->xps_maps);
3218 if (dev_maps) {
3219 map = rcu_dereference(
3220 dev_maps->cpu_map[skb->sender_cpu - 1]);
3221 if (map) {
3222 if (map->len == 1)
3223 queue_index = map->queues[0];
3224 else
3225 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3226 map->len)];
3227 if (unlikely(queue_index >= dev->real_num_tx_queues))
3228 queue_index = -1;
3229 }
3230 }
3231 rcu_read_unlock();
3232
3233 return queue_index;
3234 #else
3235 return -1;
3236 #endif
3237 }
3238
3239 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3240 {
3241 struct sock *sk = skb->sk;
3242 int queue_index = sk_tx_queue_get(sk);
3243
3244 if (queue_index < 0 || skb->ooo_okay ||
3245 queue_index >= dev->real_num_tx_queues) {
3246 int new_index = get_xps_queue(dev, skb);
3247 if (new_index < 0)
3248 new_index = skb_tx_hash(dev, skb);
3249
3250 if (queue_index != new_index && sk &&
3251 sk_fullsock(sk) &&
3252 rcu_access_pointer(sk->sk_dst_cache))
3253 sk_tx_queue_set(sk, new_index);
3254
3255 queue_index = new_index;
3256 }
3257
3258 return queue_index;
3259 }
3260
3261 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3262 struct sk_buff *skb,
3263 void *accel_priv)
3264 {
3265 int queue_index = 0;
3266
3267 #ifdef CONFIG_XPS
3268 u32 sender_cpu = skb->sender_cpu - 1;
3269
3270 if (sender_cpu >= (u32)NR_CPUS)
3271 skb->sender_cpu = raw_smp_processor_id() + 1;
3272 #endif
3273
3274 if (dev->real_num_tx_queues != 1) {
3275 const struct net_device_ops *ops = dev->netdev_ops;
3276 if (ops->ndo_select_queue)
3277 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3278 __netdev_pick_tx);
3279 else
3280 queue_index = __netdev_pick_tx(dev, skb);
3281
3282 if (!accel_priv)
3283 queue_index = netdev_cap_txqueue(dev, queue_index);
3284 }
3285
3286 skb_set_queue_mapping(skb, queue_index);
3287 return netdev_get_tx_queue(dev, queue_index);
3288 }
3289
3290 /**
3291 * __dev_queue_xmit - transmit a buffer
3292 * @skb: buffer to transmit
3293 * @accel_priv: private data used for L2 forwarding offload
3294 *
3295 * Queue a buffer for transmission to a network device. The caller must
3296 * have set the device and priority and built the buffer before calling
3297 * this function. The function can be called from an interrupt.
3298 *
3299 * A negative errno code is returned on a failure. A success does not
3300 * guarantee the frame will be transmitted as it may be dropped due
3301 * to congestion or traffic shaping.
3302 *
3303 * -----------------------------------------------------------------------------------
3304 * I notice this method can also return errors from the queue disciplines,
3305 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3306 * be positive.
3307 *
3308 * Regardless of the return value, the skb is consumed, so it is currently
3309 * difficult to retry a send to this method. (You can bump the ref count
3310 * before sending to hold a reference for retry if you are careful.)
3311 *
3312 * When calling this method, interrupts MUST be enabled. This is because
3313 * the BH enable code must have IRQs enabled so that it will not deadlock.
3314 * --BLG
3315 */
3316 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3317 {
3318 struct net_device *dev = skb->dev;
3319 struct netdev_queue *txq;
3320 struct Qdisc *q;
3321 int rc = -ENOMEM;
3322
3323 skb_reset_mac_header(skb);
3324
3325 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3326 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3327
3328 /* Disable soft irqs for various locks below. Also
3329 * stops preemption for RCU.
3330 */
3331 rcu_read_lock_bh();
3332
3333 skb_update_prio(skb);
3334
3335 qdisc_pkt_len_init(skb);
3336 #ifdef CONFIG_NET_CLS_ACT
3337 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3338 # ifdef CONFIG_NET_EGRESS
3339 if (static_key_false(&egress_needed)) {
3340 skb = sch_handle_egress(skb, &rc, dev);
3341 if (!skb)
3342 goto out;
3343 }
3344 # endif
3345 #endif
3346 /* If device/qdisc don't need skb->dst, release it right now while
3347 * it's hot in this cpu's cache.
3348 */
3349 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3350 skb_dst_drop(skb);
3351 else
3352 skb_dst_force(skb);
3353
3354 txq = netdev_pick_tx(dev, skb, accel_priv);
3355 q = rcu_dereference_bh(txq->qdisc);
3356
3357 trace_net_dev_queue(skb);
3358 if (q->enqueue) {
3359 rc = __dev_xmit_skb(skb, q, dev, txq);
3360 goto out;
3361 }
3362
3363 /* The device has no queue. Common case for software devices:
3364 loopback, all sorts of tunnels...
3365
3366 Really, it is unlikely that netif_tx_lock protection is necessary
3367 here. (f.e. loopback and IP tunnels are clean, ignoring statistics
3368 counters.)
3369 However, it is possible that they rely on the protection
3370 made by us here.
3371
3372 Check this and take the lock. It is not prone to deadlocks.
3373 Taking it for the noqueue qdisc is even simpler 8)
3374 */
3375 if (dev->flags & IFF_UP) {
3376 int cpu = smp_processor_id(); /* ok because BHs are off */
3377
3378 if (txq->xmit_lock_owner != cpu) {
3379 if (unlikely(__this_cpu_read(xmit_recursion) >
3380 XMIT_RECURSION_LIMIT))
3381 goto recursion_alert;
3382
3383 skb = validate_xmit_skb(skb, dev);
3384 if (!skb)
3385 goto out;
3386
3387 HARD_TX_LOCK(dev, txq, cpu);
3388
3389 if (!netif_xmit_stopped(txq)) {
3390 __this_cpu_inc(xmit_recursion);
3391 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3392 __this_cpu_dec(xmit_recursion);
3393 if (dev_xmit_complete(rc)) {
3394 HARD_TX_UNLOCK(dev, txq);
3395 goto out;
3396 }
3397 }
3398 HARD_TX_UNLOCK(dev, txq);
3399 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3400 dev->name);
3401 } else {
3402 /* Recursion is detected! It is possible,
3403 * unfortunately
3404 */
3405 recursion_alert:
3406 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3407 dev->name);
3408 }
3409 }
3410
3411 rc = -ENETDOWN;
3412 rcu_read_unlock_bh();
3413
3414 atomic_long_inc(&dev->tx_dropped);
3415 kfree_skb_list(skb);
3416 return rc;
3417 out:
3418 rcu_read_unlock_bh();
3419 return rc;
3420 }
3421
3422 int dev_queue_xmit(struct sk_buff *skb)
3423 {
3424 return __dev_queue_xmit(skb, NULL);
3425 }
3426 EXPORT_SYMBOL(dev_queue_xmit);
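/*
 * Illustrative sketch, not part of this file: a kernel user that builds
 * its own frame hands it to the stack with dev_queue_xmit(); the skb is
 * consumed whatever the return value. demo_build_frame() is a
 * hypothetical helper assumed to return a fully formed skb with skb->dev
 * already set.
 *
 *	static void demo_send(struct net_device *dev)
 *	{
 *		struct sk_buff *skb = demo_build_frame(dev);
 *
 *		if (skb)
 *			dev_queue_xmit(skb);	// >0 return means dropped by qdisc
 *	}
 */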
3427
3428 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3429 {
3430 return __dev_queue_xmit(skb, accel_priv);
3431 }
3432 EXPORT_SYMBOL(dev_queue_xmit_accel);
3433
3434
3435 /*=======================================================================
3436 Receiver routines
3437 =======================================================================*/
3438
3439 int netdev_max_backlog __read_mostly = 1000;
3440 EXPORT_SYMBOL(netdev_max_backlog);
3441
3442 int netdev_tstamp_prequeue __read_mostly = 1;
3443 int netdev_budget __read_mostly = 300;
3444 int weight_p __read_mostly = 64; /* old backlog weight */
3445
3446 /* Called with irq disabled */
3447 static inline void ____napi_schedule(struct softnet_data *sd,
3448 struct napi_struct *napi)
3449 {
3450 list_add_tail(&napi->poll_list, &sd->poll_list);
3451 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3452 }
3453
3454 #ifdef CONFIG_RPS
3455
3456 /* One global table that all flow-based protocols share. */
3457 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3458 EXPORT_SYMBOL(rps_sock_flow_table);
3459 u32 rps_cpu_mask __read_mostly;
3460 EXPORT_SYMBOL(rps_cpu_mask);
3461
3462 struct static_key rps_needed __read_mostly;
3463 EXPORT_SYMBOL(rps_needed);
3464
3465 static struct rps_dev_flow *
3466 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3467 struct rps_dev_flow *rflow, u16 next_cpu)
3468 {
3469 if (next_cpu < nr_cpu_ids) {
3470 #ifdef CONFIG_RFS_ACCEL
3471 struct netdev_rx_queue *rxqueue;
3472 struct rps_dev_flow_table *flow_table;
3473 struct rps_dev_flow *old_rflow;
3474 u32 flow_id;
3475 u16 rxq_index;
3476 int rc;
3477
3478 /* Should we steer this flow to a different hardware queue? */
3479 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3480 !(dev->features & NETIF_F_NTUPLE))
3481 goto out;
3482 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3483 if (rxq_index == skb_get_rx_queue(skb))
3484 goto out;
3485
3486 rxqueue = dev->_rx + rxq_index;
3487 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3488 if (!flow_table)
3489 goto out;
3490 flow_id = skb_get_hash(skb) & flow_table->mask;
3491 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3492 rxq_index, flow_id);
3493 if (rc < 0)
3494 goto out;
3495 old_rflow = rflow;
3496 rflow = &flow_table->flows[flow_id];
3497 rflow->filter = rc;
3498 if (old_rflow->filter == rflow->filter)
3499 old_rflow->filter = RPS_NO_FILTER;
3500 out:
3501 #endif
3502 rflow->last_qtail =
3503 per_cpu(softnet_data, next_cpu).input_queue_head;
3504 }
3505
3506 rflow->cpu = next_cpu;
3507 return rflow;
3508 }
3509
3510 /*
3511 * get_rps_cpu is called from netif_receive_skb and returns the target
3512 * CPU from the RPS map of the receiving queue for a given skb.
3513 * rcu_read_lock must be held on entry.
3514 */
3515 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3516 struct rps_dev_flow **rflowp)
3517 {
3518 const struct rps_sock_flow_table *sock_flow_table;
3519 struct netdev_rx_queue *rxqueue = dev->_rx;
3520 struct rps_dev_flow_table *flow_table;
3521 struct rps_map *map;
3522 int cpu = -1;
3523 u32 tcpu;
3524 u32 hash;
3525
3526 if (skb_rx_queue_recorded(skb)) {
3527 u16 index = skb_get_rx_queue(skb);
3528
3529 if (unlikely(index >= dev->real_num_rx_queues)) {
3530 WARN_ONCE(dev->real_num_rx_queues > 1,
3531 "%s received packet on queue %u, but number "
3532 "of RX queues is %u\n",
3533 dev->name, index, dev->real_num_rx_queues);
3534 goto done;
3535 }
3536 rxqueue += index;
3537 }
3538
3539 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3540
3541 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3542 map = rcu_dereference(rxqueue->rps_map);
3543 if (!flow_table && !map)
3544 goto done;
3545
3546 skb_reset_network_header(skb);
3547 hash = skb_get_hash(skb);
3548 if (!hash)
3549 goto done;
3550
3551 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3552 if (flow_table && sock_flow_table) {
3553 struct rps_dev_flow *rflow;
3554 u32 next_cpu;
3555 u32 ident;
3556
3557 /* First check into global flow table if there is a match */
3558 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3559 if ((ident ^ hash) & ~rps_cpu_mask)
3560 goto try_rps;
3561
3562 next_cpu = ident & rps_cpu_mask;
3563
3564 /* OK, now we know there is a match,
3565 * we can look at the local (per receive queue) flow table
3566 */
3567 rflow = &flow_table->flows[hash & flow_table->mask];
3568 tcpu = rflow->cpu;
3569
3570 /*
3571 * If the desired CPU (where last recvmsg was done) is
3572 * different from current CPU (one in the rx-queue flow
3573 * table entry), switch if one of the following holds:
3574 * - Current CPU is unset (>= nr_cpu_ids).
3575 * - Current CPU is offline.
3576 * - The current CPU's queue tail has advanced beyond the
3577 * last packet that was enqueued using this table entry.
3578 * This guarantees that all previous packets for the flow
3579 * have been dequeued, thus preserving in order delivery.
3580 */
3581 if (unlikely(tcpu != next_cpu) &&
3582 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3583 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3584 rflow->last_qtail)) >= 0)) {
3585 tcpu = next_cpu;
3586 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3587 }
3588
3589 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3590 *rflowp = rflow;
3591 cpu = tcpu;
3592 goto done;
3593 }
3594 }
3595
3596 try_rps:
3597
3598 if (map) {
3599 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3600 if (cpu_online(tcpu)) {
3601 cpu = tcpu;
3602 goto done;
3603 }
3604 }
3605
3606 done:
3607 return cpu;
3608 }
3609
3610 #ifdef CONFIG_RFS_ACCEL
3611
3612 /**
3613 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3614 * @dev: Device on which the filter was set
3615 * @rxq_index: RX queue index
3616 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3617 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3618 *
3619 * Drivers that implement ndo_rx_flow_steer() should periodically call
3620 * this function for each installed filter and remove the filters for
3621 * which it returns %true.
3622 */
3623 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3624 u32 flow_id, u16 filter_id)
3625 {
3626 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3627 struct rps_dev_flow_table *flow_table;
3628 struct rps_dev_flow *rflow;
3629 bool expire = true;
3630 unsigned int cpu;
3631
3632 rcu_read_lock();
3633 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3634 if (flow_table && flow_id <= flow_table->mask) {
3635 rflow = &flow_table->flows[flow_id];
3636 cpu = ACCESS_ONCE(rflow->cpu);
3637 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3638 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3639 rflow->last_qtail) <
3640 (int)(10 * flow_table->mask)))
3641 expire = false;
3642 }
3643 rcu_read_unlock();
3644 return expire;
3645 }
3646 EXPORT_SYMBOL(rps_may_expire_flow);
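/*
 * Illustrative sketch, not part of this file: a driver implementing
 * ndo_rx_flow_steer() might poll its installed filters from a work item
 * and tear down the ones the stack no longer cares about. The filter
 * table layout and demo_remove_filter() are assumptions made only for
 * this example.
 *
 *	static void demo_expire_filters(struct demo_priv *priv)
 *	{
 *		u16 i;
 *
 *		for (i = 0; i < priv->n_filters; i++) {
 *			struct demo_filter *f = &priv->filters[i];
 *
 *			if (!f->in_use)
 *				continue;
 *			if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *						f->flow_id, i))
 *				demo_remove_filter(priv, f);
 *		}
 *	}
 */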
3647
3648 #endif /* CONFIG_RFS_ACCEL */
3649
3650 /* Called from hardirq (IPI) context */
3651 static void rps_trigger_softirq(void *data)
3652 {
3653 struct softnet_data *sd = data;
3654
3655 ____napi_schedule(sd, &sd->backlog);
3656 sd->received_rps++;
3657 }
3658
3659 #endif /* CONFIG_RPS */
3660
3661 /*
3662 * Check if this softnet_data structure is another cpu's.
3663 * If yes, queue it to our IPI list and return 1.
3664 * If no, return 0.
3665 */
3666 static int rps_ipi_queued(struct softnet_data *sd)
3667 {
3668 #ifdef CONFIG_RPS
3669 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3670
3671 if (sd != mysd) {
3672 sd->rps_ipi_next = mysd->rps_ipi_list;
3673 mysd->rps_ipi_list = sd;
3674
3675 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3676 return 1;
3677 }
3678 #endif /* CONFIG_RPS */
3679 return 0;
3680 }
3681
3682 #ifdef CONFIG_NET_FLOW_LIMIT
3683 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3684 #endif
3685
3686 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3687 {
3688 #ifdef CONFIG_NET_FLOW_LIMIT
3689 struct sd_flow_limit *fl;
3690 struct softnet_data *sd;
3691 unsigned int old_flow, new_flow;
3692
3693 if (qlen < (netdev_max_backlog >> 1))
3694 return false;
3695
3696 sd = this_cpu_ptr(&softnet_data);
3697
3698 rcu_read_lock();
3699 fl = rcu_dereference(sd->flow_limit);
3700 if (fl) {
3701 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3702 old_flow = fl->history[fl->history_head];
3703 fl->history[fl->history_head] = new_flow;
3704
3705 fl->history_head++;
3706 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3707
3708 if (likely(fl->buckets[old_flow]))
3709 fl->buckets[old_flow]--;
3710
3711 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3712 fl->count++;
3713 rcu_read_unlock();
3714 return true;
3715 }
3716 }
3717 rcu_read_unlock();
3718 #endif
3719 return false;
3720 }
3721
3722 /*
3723 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3724 * queue (may be a remote CPU queue).
3725 */
3726 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3727 unsigned int *qtail)
3728 {
3729 struct softnet_data *sd;
3730 unsigned long flags;
3731 unsigned int qlen;
3732
3733 sd = &per_cpu(softnet_data, cpu);
3734
3735 local_irq_save(flags);
3736
3737 rps_lock(sd);
3738 if (!netif_running(skb->dev))
3739 goto drop;
3740 qlen = skb_queue_len(&sd->input_pkt_queue);
3741 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3742 if (qlen) {
3743 enqueue:
3744 __skb_queue_tail(&sd->input_pkt_queue, skb);
3745 input_queue_tail_incr_save(sd, qtail);
3746 rps_unlock(sd);
3747 local_irq_restore(flags);
3748 return NET_RX_SUCCESS;
3749 }
3750
3751 /* Schedule NAPI for the backlog device.
3752 * We can use a non-atomic operation since we own the queue lock.
3753 */
3754 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3755 if (!rps_ipi_queued(sd))
3756 ____napi_schedule(sd, &sd->backlog);
3757 }
3758 goto enqueue;
3759 }
3760
3761 drop:
3762 sd->dropped++;
3763 rps_unlock(sd);
3764
3765 local_irq_restore(flags);
3766
3767 atomic_long_inc(&skb->dev->rx_dropped);
3768 kfree_skb(skb);
3769 return NET_RX_DROP;
3770 }
3771
3772 static int netif_rx_internal(struct sk_buff *skb)
3773 {
3774 int ret;
3775
3776 net_timestamp_check(netdev_tstamp_prequeue, skb);
3777
3778 trace_netif_rx(skb);
3779 #ifdef CONFIG_RPS
3780 if (static_key_false(&rps_needed)) {
3781 struct rps_dev_flow voidflow, *rflow = &voidflow;
3782 int cpu;
3783
3784 preempt_disable();
3785 rcu_read_lock();
3786
3787 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3788 if (cpu < 0)
3789 cpu = smp_processor_id();
3790
3791 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3792
3793 rcu_read_unlock();
3794 preempt_enable();
3795 } else
3796 #endif
3797 {
3798 unsigned int qtail;
3799 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3800 put_cpu();
3801 }
3802 return ret;
3803 }
3804
3805 /**
3806 * netif_rx - post buffer to the network code
3807 * @skb: buffer to post
3808 *
3809 * This function receives a packet from a device driver and queues it for
3810 * the upper (protocol) levels to process. It always succeeds. The buffer
3811 * may be dropped during processing for congestion control or by the
3812 * protocol layers.
3813 *
3814 * return values:
3815 * NET_RX_SUCCESS (no congestion)
3816 * NET_RX_DROP (packet was dropped)
3817 *
3818 */
3819
3820 int netif_rx(struct sk_buff *skb)
3821 {
3822 trace_netif_rx_entry(skb);
3823
3824 return netif_rx_internal(skb);
3825 }
3826 EXPORT_SYMBOL(netif_rx);
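/*
 * Illustrative sketch, not part of this file: a non-NAPI driver's RX
 * interrupt handler typically wraps the received data in an skb and posts
 * it with netif_rx(). demo_rx_frame() and its arguments are assumptions
 * made only for this example.
 *
 *	static void demo_rx_frame(struct net_device *dev, const void *data, int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb(dev, len);
 *
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), data, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);			// queues onto the per-CPU backlog
 *	}
 */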
3827
3828 int netif_rx_ni(struct sk_buff *skb)
3829 {
3830 int err;
3831
3832 trace_netif_rx_ni_entry(skb);
3833
3834 preempt_disable();
3835 err = netif_rx_internal(skb);
3836 if (local_softirq_pending())
3837 do_softirq();
3838 preempt_enable();
3839
3840 return err;
3841 }
3842 EXPORT_SYMBOL(netif_rx_ni);
3843
3844 static __latent_entropy void net_tx_action(struct softirq_action *h)
3845 {
3846 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3847
3848 if (sd->completion_queue) {
3849 struct sk_buff *clist;
3850
3851 local_irq_disable();
3852 clist = sd->completion_queue;
3853 sd->completion_queue = NULL;
3854 local_irq_enable();
3855
3856 while (clist) {
3857 struct sk_buff *skb = clist;
3858 clist = clist->next;
3859
3860 WARN_ON(atomic_read(&skb->users));
3861 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3862 trace_consume_skb(skb);
3863 else
3864 trace_kfree_skb(skb, net_tx_action);
3865
3866 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3867 __kfree_skb(skb);
3868 else
3869 __kfree_skb_defer(skb);
3870 }
3871
3872 __kfree_skb_flush();
3873 }
3874
3875 if (sd->output_queue) {
3876 struct Qdisc *head;
3877
3878 local_irq_disable();
3879 head = sd->output_queue;
3880 sd->output_queue = NULL;
3881 sd->output_queue_tailp = &sd->output_queue;
3882 local_irq_enable();
3883
3884 while (head) {
3885 struct Qdisc *q = head;
3886 spinlock_t *root_lock;
3887
3888 head = head->next_sched;
3889
3890 root_lock = qdisc_lock(q);
3891 spin_lock(root_lock);
3892 /* We need to make sure head->next_sched is read
3893 * before clearing __QDISC_STATE_SCHED
3894 */
3895 smp_mb__before_atomic();
3896 clear_bit(__QDISC_STATE_SCHED, &q->state);
3897 qdisc_run(q);
3898 spin_unlock(root_lock);
3899 }
3900 }
3901 }
3902
3903 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3904 /* This hook is defined here for ATM LANE */
3905 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3906 unsigned char *addr) __read_mostly;
3907 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3908 #endif
3909
3910 static inline struct sk_buff *
3911 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3912 struct net_device *orig_dev)
3913 {
3914 #ifdef CONFIG_NET_CLS_ACT
3915 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3916 struct tcf_result cl_res;
3917
3918 /* If there's at least one ingress present somewhere (so
3919 * we get here via enabled static key), remaining devices
3920 * that are not configured with an ingress qdisc will bail
3921 * out here.
3922 */
3923 if (!cl)
3924 return skb;
3925 if (*pt_prev) {
3926 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3927 *pt_prev = NULL;
3928 }
3929
3930 qdisc_skb_cb(skb)->pkt_len = skb->len;
3931 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3932 qdisc_bstats_cpu_update(cl->q, skb);
3933
3934 switch (tc_classify(skb, cl, &cl_res, false)) {
3935 case TC_ACT_OK:
3936 case TC_ACT_RECLASSIFY:
3937 skb->tc_index = TC_H_MIN(cl_res.classid);
3938 break;
3939 case TC_ACT_SHOT:
3940 qdisc_qstats_cpu_drop(cl->q);
3941 kfree_skb(skb);
3942 return NULL;
3943 case TC_ACT_STOLEN:
3944 case TC_ACT_QUEUED:
3945 consume_skb(skb);
3946 return NULL;
3947 case TC_ACT_REDIRECT:
3948 /* skb_mac_header check was done by cls/act_bpf, so
3949 * we can safely push the L2 header back before
3950 * redirecting to another netdev
3951 */
3952 __skb_push(skb, skb->mac_len);
3953 skb_do_redirect(skb);
3954 return NULL;
3955 default:
3956 break;
3957 }
3958 #endif /* CONFIG_NET_CLS_ACT */
3959 return skb;
3960 }
3961
3962 /**
3963 * netdev_is_rx_handler_busy - check if receive handler is registered
3964 * @dev: device to check
3965 *
3966 * Check if a receive handler is already registered for a given device.
3967 * Return true if there is one.
3968 *
3969 * The caller must hold the rtnl_mutex.
3970 */
3971 bool netdev_is_rx_handler_busy(struct net_device *dev)
3972 {
3973 ASSERT_RTNL();
3974 return dev && rtnl_dereference(dev->rx_handler);
3975 }
3976 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3977
3978 /**
3979 * netdev_rx_handler_register - register receive handler
3980 * @dev: device to register a handler for
3981 * @rx_handler: receive handler to register
3982 * @rx_handler_data: data pointer that is used by rx handler
3983 *
3984 * Register a receive handler for a device. This handler will then be
3985 * called from __netif_receive_skb. A negative errno code is returned
3986 * on a failure.
3987 *
3988 * The caller must hold the rtnl_mutex.
3989 *
3990 * For a general description of rx_handler, see enum rx_handler_result.
3991 */
3992 int netdev_rx_handler_register(struct net_device *dev,
3993 rx_handler_func_t *rx_handler,
3994 void *rx_handler_data)
3995 {
3996 ASSERT_RTNL();
3997
3998 if (dev->rx_handler)
3999 return -EBUSY;
4000
4001 /* Note: rx_handler_data must be set before rx_handler */
4002 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4003 rcu_assign_pointer(dev->rx_handler, rx_handler);
4004
4005 return 0;
4006 }
4007 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
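
/* A minimal sketch of a hypothetical upper device (bridge/macvlan style)
 * claiming a port's receive path; "my_handler", "port_dev" and "my_port"
 * are illustrative names only.
 *
 *	static rx_handler_result_t my_handler(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *
 *		(inspect, consume or redirect skb here)
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(port_dev, my_handler, my_port);
 *	rtnl_unlock();
 */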
4008
4009 /**
4010 * netdev_rx_handler_unregister - unregister receive handler
4011 * @dev: device to unregister a handler from
4012 *
4013 * Unregister a receive handler from a device.
4014 *
4015 * The caller must hold the rtnl_mutex.
4016 */
4017 void netdev_rx_handler_unregister(struct net_device *dev)
4018 {
4019
4020 ASSERT_RTNL();
4021 RCU_INIT_POINTER(dev->rx_handler, NULL);
4022 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4023 * section is guaranteed to see a non-NULL rx_handler_data
4024 * as well.
4025 */
4026 synchronize_net();
4027 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4028 }
4029 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4030
4031 /*
4032 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4033 * the special handling of PFMEMALLOC skbs.
4034 */
4035 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4036 {
4037 switch (skb->protocol) {
4038 case htons(ETH_P_ARP):
4039 case htons(ETH_P_IP):
4040 case htons(ETH_P_IPV6):
4041 case htons(ETH_P_8021Q):
4042 case htons(ETH_P_8021AD):
4043 return true;
4044 default:
4045 return false;
4046 }
4047 }
4048
4049 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4050 int *ret, struct net_device *orig_dev)
4051 {
4052 #ifdef CONFIG_NETFILTER_INGRESS
4053 if (nf_hook_ingress_active(skb)) {
4054 int ingress_retval;
4055
4056 if (*pt_prev) {
4057 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4058 *pt_prev = NULL;
4059 }
4060
4061 rcu_read_lock();
4062 ingress_retval = nf_hook_ingress(skb);
4063 rcu_read_unlock();
4064 return ingress_retval;
4065 }
4066 #endif /* CONFIG_NETFILTER_INGRESS */
4067 return 0;
4068 }
4069
4070 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4071 {
4072 struct packet_type *ptype, *pt_prev;
4073 rx_handler_func_t *rx_handler;
4074 struct net_device *orig_dev;
4075 bool deliver_exact = false;
4076 int ret = NET_RX_DROP;
4077 __be16 type;
4078
4079 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4080
4081 trace_netif_receive_skb(skb);
4082
4083 orig_dev = skb->dev;
4084
4085 skb_reset_network_header(skb);
4086 if (!skb_transport_header_was_set(skb))
4087 skb_reset_transport_header(skb);
4088 skb_reset_mac_len(skb);
4089
4090 pt_prev = NULL;
4091
4092 another_round:
4093 skb->skb_iif = skb->dev->ifindex;
4094
4095 __this_cpu_inc(softnet_data.processed);
4096
4097 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4098 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4099 skb = skb_vlan_untag(skb);
4100 if (unlikely(!skb))
4101 goto out;
4102 }
4103
4104 #ifdef CONFIG_NET_CLS_ACT
4105 if (skb->tc_verd & TC_NCLS) {
4106 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4107 goto ncls;
4108 }
4109 #endif
4110
4111 if (pfmemalloc)
4112 goto skip_taps;
4113
4114 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4115 if (pt_prev)
4116 ret = deliver_skb(skb, pt_prev, orig_dev);
4117 pt_prev = ptype;
4118 }
4119
4120 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4121 if (pt_prev)
4122 ret = deliver_skb(skb, pt_prev, orig_dev);
4123 pt_prev = ptype;
4124 }
4125
4126 skip_taps:
4127 #ifdef CONFIG_NET_INGRESS
4128 if (static_key_false(&ingress_needed)) {
4129 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4130 if (!skb)
4131 goto out;
4132
4133 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4134 goto out;
4135 }
4136 #endif
4137 #ifdef CONFIG_NET_CLS_ACT
4138 skb->tc_verd = 0;
4139 ncls:
4140 #endif
4141 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4142 goto drop;
4143
4144 if (skb_vlan_tag_present(skb)) {
4145 if (pt_prev) {
4146 ret = deliver_skb(skb, pt_prev, orig_dev);
4147 pt_prev = NULL;
4148 }
4149 if (vlan_do_receive(&skb))
4150 goto another_round;
4151 else if (unlikely(!skb))
4152 goto out;
4153 }
4154
4155 rx_handler = rcu_dereference(skb->dev->rx_handler);
4156 if (rx_handler) {
4157 if (pt_prev) {
4158 ret = deliver_skb(skb, pt_prev, orig_dev);
4159 pt_prev = NULL;
4160 }
4161 switch (rx_handler(&skb)) {
4162 case RX_HANDLER_CONSUMED:
4163 ret = NET_RX_SUCCESS;
4164 goto out;
4165 case RX_HANDLER_ANOTHER:
4166 goto another_round;
4167 case RX_HANDLER_EXACT:
4168 deliver_exact = true;
4169 case RX_HANDLER_PASS:
4170 break;
4171 default:
4172 BUG();
4173 }
4174 }
4175
4176 if (unlikely(skb_vlan_tag_present(skb))) {
4177 if (skb_vlan_tag_get_id(skb))
4178 skb->pkt_type = PACKET_OTHERHOST;
4179 /* Note: we might in the future use prio bits
4180 * and set skb->priority like in vlan_do_receive().
4181 * For the time being, just ignore the Priority Code Point.
4182 */
4183 skb->vlan_tci = 0;
4184 }
4185
4186 type = skb->protocol;
4187
4188 /* deliver only exact match when indicated */
4189 if (likely(!deliver_exact)) {
4190 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4191 &ptype_base[ntohs(type) &
4192 PTYPE_HASH_MASK]);
4193 }
4194
4195 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4196 &orig_dev->ptype_specific);
4197
4198 if (unlikely(skb->dev != orig_dev)) {
4199 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4200 &skb->dev->ptype_specific);
4201 }
4202
4203 if (pt_prev) {
4204 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4205 goto drop;
4206 else
4207 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4208 } else {
4209 drop:
4210 if (!deliver_exact)
4211 atomic_long_inc(&skb->dev->rx_dropped);
4212 else
4213 atomic_long_inc(&skb->dev->rx_nohandler);
4214 kfree_skb(skb);
4215 /* Jamal, now you will not be able to escape explaining
4216 * to me how you were going to use this. :-)
4217 */
4218 ret = NET_RX_DROP;
4219 }
4220
4221 out:
4222 return ret;
4223 }
4224
4225 static int __netif_receive_skb(struct sk_buff *skb)
4226 {
4227 int ret;
4228
4229 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4230 unsigned long pflags = current->flags;
4231
4232 /*
4233 * PFMEMALLOC skbs are special, they should
4234 * - be delivered to SOCK_MEMALLOC sockets only
4235 * - stay away from userspace
4236 * - have bounded memory usage
4237 *
4238 * Use PF_MEMALLOC as this saves us from propagating the allocation
4239 * context down to all allocation sites.
4240 */
4241 current->flags |= PF_MEMALLOC;
4242 ret = __netif_receive_skb_core(skb, true);
4243 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4244 } else
4245 ret = __netif_receive_skb_core(skb, false);
4246
4247 return ret;
4248 }
4249
4250 static int netif_receive_skb_internal(struct sk_buff *skb)
4251 {
4252 int ret;
4253
4254 net_timestamp_check(netdev_tstamp_prequeue, skb);
4255
4256 if (skb_defer_rx_timestamp(skb))
4257 return NET_RX_SUCCESS;
4258
4259 rcu_read_lock();
4260
4261 #ifdef CONFIG_RPS
4262 if (static_key_false(&rps_needed)) {
4263 struct rps_dev_flow voidflow, *rflow = &voidflow;
4264 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4265
4266 if (cpu >= 0) {
4267 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4268 rcu_read_unlock();
4269 return ret;
4270 }
4271 }
4272 #endif
4273 ret = __netif_receive_skb(skb);
4274 rcu_read_unlock();
4275 return ret;
4276 }
4277
4278 /**
4279 * netif_receive_skb - process receive buffer from network
4280 * @skb: buffer to process
4281 *
4282 * netif_receive_skb() is the main receive data processing function.
4283 * It always succeeds. The buffer may be dropped during processing
4284 * for congestion control or by the protocol layers.
4285 *
4286 * This function may only be called from softirq context and interrupts
4287 * should be enabled.
4288 *
4289 * Return values (usually ignored):
4290 * NET_RX_SUCCESS: no congestion
4291 * NET_RX_DROP: packet was dropped
4292 */
4293 int netif_receive_skb(struct sk_buff *skb)
4294 {
4295 trace_netif_receive_skb_entry(skb);
4296
4297 return netif_receive_skb_internal(skb);
4298 }
4299 EXPORT_SYMBOL(netif_receive_skb);
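
/* A minimal sketch: from a NAPI poll routine (softirq context) a driver
 * may push each completed frame up the stack with netif_receive_skb();
 * "my_dev" is an illustrative name. Drivers that want GRO would call
 * napi_gro_receive() instead (see below).
 *
 *	skb->protocol = eth_type_trans(skb, my_dev);
 *	netif_receive_skb(skb);
 */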
4300
4301 DEFINE_PER_CPU(struct work_struct, flush_works);
4302
4303 /* Network device is going away, flush any packets still pending */
4304 static void flush_backlog(struct work_struct *work)
4305 {
4306 struct sk_buff *skb, *tmp;
4307 struct softnet_data *sd;
4308
4309 local_bh_disable();
4310 sd = this_cpu_ptr(&softnet_data);
4311
4312 local_irq_disable();
4313 rps_lock(sd);
4314 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4315 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4316 __skb_unlink(skb, &sd->input_pkt_queue);
4317 kfree_skb(skb);
4318 input_queue_head_incr(sd);
4319 }
4320 }
4321 rps_unlock(sd);
4322 local_irq_enable();
4323
4324 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4325 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326 __skb_unlink(skb, &sd->process_queue);
4327 kfree_skb(skb);
4328 input_queue_head_incr(sd);
4329 }
4330 }
4331 local_bh_enable();
4332 }
4333
4334 static void flush_all_backlogs(void)
4335 {
4336 unsigned int cpu;
4337
4338 get_online_cpus();
4339
4340 for_each_online_cpu(cpu)
4341 queue_work_on(cpu, system_highpri_wq,
4342 per_cpu_ptr(&flush_works, cpu));
4343
4344 for_each_online_cpu(cpu)
4345 flush_work(per_cpu_ptr(&flush_works, cpu));
4346
4347 put_online_cpus();
4348 }
4349
4350 static int napi_gro_complete(struct sk_buff *skb)
4351 {
4352 struct packet_offload *ptype;
4353 __be16 type = skb->protocol;
4354 struct list_head *head = &offload_base;
4355 int err = -ENOENT;
4356
4357 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4358
4359 if (NAPI_GRO_CB(skb)->count == 1) {
4360 skb_shinfo(skb)->gso_size = 0;
4361 goto out;
4362 }
4363
4364 rcu_read_lock();
4365 list_for_each_entry_rcu(ptype, head, list) {
4366 if (ptype->type != type || !ptype->callbacks.gro_complete)
4367 continue;
4368
4369 err = ptype->callbacks.gro_complete(skb, 0);
4370 break;
4371 }
4372 rcu_read_unlock();
4373
4374 if (err) {
4375 WARN_ON(&ptype->list == head);
4376 kfree_skb(skb);
4377 return NET_RX_SUCCESS;
4378 }
4379
4380 out:
4381 return netif_receive_skb_internal(skb);
4382 }
4383
4384 /* napi->gro_list contains packets ordered by age, with the
4385 * youngest packets at the head of the list.
4386 * Complete skbs in reverse order to reduce latencies.
4387 */
4388 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4389 {
4390 struct sk_buff *skb, *prev = NULL;
4391
4392 /* scan list and build reverse chain */
4393 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4394 skb->prev = prev;
4395 prev = skb;
4396 }
4397
4398 for (skb = prev; skb; skb = prev) {
4399 skb->next = NULL;
4400
4401 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4402 return;
4403
4404 prev = skb->prev;
4405 napi_gro_complete(skb);
4406 napi->gro_count--;
4407 }
4408
4409 napi->gro_list = NULL;
4410 }
4411 EXPORT_SYMBOL(napi_gro_flush);
4412
4413 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4414 {
4415 struct sk_buff *p;
4416 unsigned int maclen = skb->dev->hard_header_len;
4417 u32 hash = skb_get_hash_raw(skb);
4418
4419 for (p = napi->gro_list; p; p = p->next) {
4420 unsigned long diffs;
4421
4422 NAPI_GRO_CB(p)->flush = 0;
4423
4424 if (hash != skb_get_hash_raw(p)) {
4425 NAPI_GRO_CB(p)->same_flow = 0;
4426 continue;
4427 }
4428
4429 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4430 diffs |= p->vlan_tci ^ skb->vlan_tci;
4431 diffs |= skb_metadata_dst_cmp(p, skb);
4432 if (maclen == ETH_HLEN)
4433 diffs |= compare_ether_header(skb_mac_header(p),
4434 skb_mac_header(skb));
4435 else if (!diffs)
4436 diffs = memcmp(skb_mac_header(p),
4437 skb_mac_header(skb),
4438 maclen);
4439 NAPI_GRO_CB(p)->same_flow = !diffs;
4440 }
4441 }
4442
4443 static void skb_gro_reset_offset(struct sk_buff *skb)
4444 {
4445 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4446 const skb_frag_t *frag0 = &pinfo->frags[0];
4447
4448 NAPI_GRO_CB(skb)->data_offset = 0;
4449 NAPI_GRO_CB(skb)->frag0 = NULL;
4450 NAPI_GRO_CB(skb)->frag0_len = 0;
4451
4452 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4453 pinfo->nr_frags &&
4454 !PageHighMem(skb_frag_page(frag0))) {
4455 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4456 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4457 }
4458 }
4459
4460 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4461 {
4462 struct skb_shared_info *pinfo = skb_shinfo(skb);
4463
4464 BUG_ON(skb->end - skb->tail < grow);
4465
4466 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4467
4468 skb->data_len -= grow;
4469 skb->tail += grow;
4470
4471 pinfo->frags[0].page_offset += grow;
4472 skb_frag_size_sub(&pinfo->frags[0], grow);
4473
4474 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4475 skb_frag_unref(skb, 0);
4476 memmove(pinfo->frags, pinfo->frags + 1,
4477 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4478 }
4479 }
4480
4481 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4482 {
4483 struct sk_buff **pp = NULL;
4484 struct packet_offload *ptype;
4485 __be16 type = skb->protocol;
4486 struct list_head *head = &offload_base;
4487 int same_flow;
4488 enum gro_result ret;
4489 int grow;
4490
4491 if (!(skb->dev->features & NETIF_F_GRO))
4492 goto normal;
4493
4494 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4495 goto normal;
4496
4497 gro_list_prepare(napi, skb);
4498
4499 rcu_read_lock();
4500 list_for_each_entry_rcu(ptype, head, list) {
4501 if (ptype->type != type || !ptype->callbacks.gro_receive)
4502 continue;
4503
4504 skb_set_network_header(skb, skb_gro_offset(skb));
4505 skb_reset_mac_len(skb);
4506 NAPI_GRO_CB(skb)->same_flow = 0;
4507 NAPI_GRO_CB(skb)->flush = 0;
4508 NAPI_GRO_CB(skb)->free = 0;
4509 NAPI_GRO_CB(skb)->encap_mark = 0;
4510 NAPI_GRO_CB(skb)->recursion_counter = 0;
4511 NAPI_GRO_CB(skb)->is_fou = 0;
4512 NAPI_GRO_CB(skb)->is_atomic = 1;
4513 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4514
4515 /* Setup for GRO checksum validation */
4516 switch (skb->ip_summed) {
4517 case CHECKSUM_COMPLETE:
4518 NAPI_GRO_CB(skb)->csum = skb->csum;
4519 NAPI_GRO_CB(skb)->csum_valid = 1;
4520 NAPI_GRO_CB(skb)->csum_cnt = 0;
4521 break;
4522 case CHECKSUM_UNNECESSARY:
4523 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4524 NAPI_GRO_CB(skb)->csum_valid = 0;
4525 break;
4526 default:
4527 NAPI_GRO_CB(skb)->csum_cnt = 0;
4528 NAPI_GRO_CB(skb)->csum_valid = 0;
4529 }
4530
4531 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4532 break;
4533 }
4534 rcu_read_unlock();
4535
4536 if (&ptype->list == head)
4537 goto normal;
4538
4539 same_flow = NAPI_GRO_CB(skb)->same_flow;
4540 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4541
4542 if (pp) {
4543 struct sk_buff *nskb = *pp;
4544
4545 *pp = nskb->next;
4546 nskb->next = NULL;
4547 napi_gro_complete(nskb);
4548 napi->gro_count--;
4549 }
4550
4551 if (same_flow)
4552 goto ok;
4553
4554 if (NAPI_GRO_CB(skb)->flush)
4555 goto normal;
4556
4557 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4558 struct sk_buff *nskb = napi->gro_list;
4559
4560 /* locate the end of the list to select the 'oldest' flow */
4561 while (nskb->next) {
4562 pp = &nskb->next;
4563 nskb = *pp;
4564 }
4565 *pp = NULL;
4566 nskb->next = NULL;
4567 napi_gro_complete(nskb);
4568 } else {
4569 napi->gro_count++;
4570 }
4571 NAPI_GRO_CB(skb)->count = 1;
4572 NAPI_GRO_CB(skb)->age = jiffies;
4573 NAPI_GRO_CB(skb)->last = skb;
4574 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4575 skb->next = napi->gro_list;
4576 napi->gro_list = skb;
4577 ret = GRO_HELD;
4578
4579 pull:
4580 grow = skb_gro_offset(skb) - skb_headlen(skb);
4581 if (grow > 0)
4582 gro_pull_from_frag0(skb, grow);
4583 ok:
4584 return ret;
4585
4586 normal:
4587 ret = GRO_NORMAL;
4588 goto pull;
4589 }
4590
4591 struct packet_offload *gro_find_receive_by_type(__be16 type)
4592 {
4593 struct list_head *offload_head = &offload_base;
4594 struct packet_offload *ptype;
4595
4596 list_for_each_entry_rcu(ptype, offload_head, list) {
4597 if (ptype->type != type || !ptype->callbacks.gro_receive)
4598 continue;
4599 return ptype;
4600 }
4601 return NULL;
4602 }
4603 EXPORT_SYMBOL(gro_find_receive_by_type);
4604
4605 struct packet_offload *gro_find_complete_by_type(__be16 type)
4606 {
4607 struct list_head *offload_head = &offload_base;
4608 struct packet_offload *ptype;
4609
4610 list_for_each_entry_rcu(ptype, offload_head, list) {
4611 if (ptype->type != type || !ptype->callbacks.gro_complete)
4612 continue;
4613 return ptype;
4614 }
4615 return NULL;
4616 }
4617 EXPORT_SYMBOL(gro_find_complete_by_type);
4618
4619 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4620 {
4621 switch (ret) {
4622 case GRO_NORMAL:
4623 if (netif_receive_skb_internal(skb))
4624 ret = GRO_DROP;
4625 break;
4626
4627 case GRO_DROP:
4628 kfree_skb(skb);
4629 break;
4630
4631 case GRO_MERGED_FREE:
4632 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4633 skb_dst_drop(skb);
4634 kmem_cache_free(skbuff_head_cache, skb);
4635 } else {
4636 __kfree_skb(skb);
4637 }
4638 break;
4639
4640 case GRO_HELD:
4641 case GRO_MERGED:
4642 break;
4643 }
4644
4645 return ret;
4646 }
4647
4648 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4649 {
4650 skb_mark_napi_id(skb, napi);
4651 trace_napi_gro_receive_entry(skb);
4652
4653 skb_gro_reset_offset(skb);
4654
4655 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4656 }
4657 EXPORT_SYMBOL(napi_gro_receive);
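
/* A minimal sketch of a NAPI poll loop feeding GRO; "my_priv",
 * "my_next_rx_skb" and "my_dev" are hypothetical driver-local names.
 *
 *	while (work < budget && (skb = my_next_rx_skb(my_priv))) {
 *		skb->protocol = eth_type_trans(skb, my_dev);
 *		napi_gro_receive(&my_priv->napi, skb);
 *		work++;
 *	}
 */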
4658
4659 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4660 {
4661 if (unlikely(skb->pfmemalloc)) {
4662 consume_skb(skb);
4663 return;
4664 }
4665 __skb_pull(skb, skb_headlen(skb));
4666 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4667 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4668 skb->vlan_tci = 0;
4669 skb->dev = napi->dev;
4670 skb->skb_iif = 0;
4671 skb->encapsulation = 0;
4672 skb_shinfo(skb)->gso_type = 0;
4673 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4674
4675 napi->skb = skb;
4676 }
4677
4678 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4679 {
4680 struct sk_buff *skb = napi->skb;
4681
4682 if (!skb) {
4683 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4684 if (skb) {
4685 napi->skb = skb;
4686 skb_mark_napi_id(skb, napi);
4687 }
4688 }
4689 return skb;
4690 }
4691 EXPORT_SYMBOL(napi_get_frags);
4692
4693 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4694 struct sk_buff *skb,
4695 gro_result_t ret)
4696 {
4697 switch (ret) {
4698 case GRO_NORMAL:
4699 case GRO_HELD:
4700 __skb_push(skb, ETH_HLEN);
4701 skb->protocol = eth_type_trans(skb, skb->dev);
4702 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4703 ret = GRO_DROP;
4704 break;
4705
4706 case GRO_DROP:
4707 case GRO_MERGED_FREE:
4708 napi_reuse_skb(napi, skb);
4709 break;
4710
4711 case GRO_MERGED:
4712 break;
4713 }
4714
4715 return ret;
4716 }
4717
4718 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4719 * Drivers could call both napi_gro_frags() and napi_gro_receive().
4720 * We copy the ethernet header into skb->data to have a common layout.
4721 */
4722 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4723 {
4724 struct sk_buff *skb = napi->skb;
4725 const struct ethhdr *eth;
4726 unsigned int hlen = sizeof(*eth);
4727
4728 napi->skb = NULL;
4729
4730 skb_reset_mac_header(skb);
4731 skb_gro_reset_offset(skb);
4732
4733 eth = skb_gro_header_fast(skb, 0);
4734 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4735 eth = skb_gro_header_slow(skb, hlen, 0);
4736 if (unlikely(!eth)) {
4737 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4738 __func__, napi->dev->name);
4739 napi_reuse_skb(napi, skb);
4740 return NULL;
4741 }
4742 } else {
4743 gro_pull_from_frag0(skb, hlen);
4744 NAPI_GRO_CB(skb)->frag0 += hlen;
4745 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4746 }
4747 __skb_pull(skb, hlen);
4748
4749 /*
4750 * This works because the only protocols we care about don't require
4751 * special handling.
4752 * We'll fix it up properly in napi_frags_finish()
4753 */
4754 skb->protocol = eth->h_proto;
4755
4756 return skb;
4757 }
4758
4759 gro_result_t napi_gro_frags(struct napi_struct *napi)
4760 {
4761 struct sk_buff *skb = napi_frags_skb(napi);
4762
4763 if (!skb)
4764 return GRO_DROP;
4765
4766 trace_napi_gro_frags_entry(skb);
4767
4768 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4769 }
4770 EXPORT_SYMBOL(napi_gro_frags);
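
/* A minimal sketch of the page-based receive variant: drivers that do not
 * build a linear skb pair napi_get_frags() with napi_gro_frags(). The
 * "page"/"offset"/"len"/"truesize" values come from the hypothetical
 * driver's RX descriptor.
 *
 *	skb = napi_get_frags(&my_priv->napi);
 *	if (likely(skb)) {
 *		skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 *		napi_gro_frags(&my_priv->napi);
 *	}
 */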
4771
4772 /* Compute the checksum from gro_offset and return the folded value
4773 * after adding in any pseudo checksum.
4774 */
4775 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4776 {
4777 __wsum wsum;
4778 __sum16 sum;
4779
4780 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4781
4782 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4783 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4784 if (likely(!sum)) {
4785 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4786 !skb->csum_complete_sw)
4787 netdev_rx_csum_fault(skb->dev);
4788 }
4789
4790 NAPI_GRO_CB(skb)->csum = wsum;
4791 NAPI_GRO_CB(skb)->csum_valid = 1;
4792
4793 return sum;
4794 }
4795 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4796
4797 /*
4798 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4799 * Note: called with local irq disabled, but exits with local irq enabled.
4800 */
4801 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4802 {
4803 #ifdef CONFIG_RPS
4804 struct softnet_data *remsd = sd->rps_ipi_list;
4805
4806 if (remsd) {
4807 sd->rps_ipi_list = NULL;
4808
4809 local_irq_enable();
4810
4811 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4812 while (remsd) {
4813 struct softnet_data *next = remsd->rps_ipi_next;
4814
4815 if (cpu_online(remsd->cpu))
4816 smp_call_function_single_async(remsd->cpu,
4817 &remsd->csd);
4818 remsd = next;
4819 }
4820 } else
4821 #endif
4822 local_irq_enable();
4823 }
4824
4825 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4826 {
4827 #ifdef CONFIG_RPS
4828 return sd->rps_ipi_list != NULL;
4829 #else
4830 return false;
4831 #endif
4832 }
4833
4834 static int process_backlog(struct napi_struct *napi, int quota)
4835 {
4836 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4837 bool again = true;
4838 int work = 0;
4839
4840 /* Check if we have pending IPIs; it's better to send them now
4841 * rather than waiting for net_rx_action() to end.
4842 */
4843 if (sd_has_rps_ipi_waiting(sd)) {
4844 local_irq_disable();
4845 net_rps_action_and_irq_enable(sd);
4846 }
4847
4848 napi->weight = weight_p;
4849 while (again) {
4850 struct sk_buff *skb;
4851
4852 while ((skb = __skb_dequeue(&sd->process_queue))) {
4853 rcu_read_lock();
4854 __netif_receive_skb(skb);
4855 rcu_read_unlock();
4856 input_queue_head_incr(sd);
4857 if (++work >= quota)
4858 return work;
4859
4860 }
4861
4862 local_irq_disable();
4863 rps_lock(sd);
4864 if (skb_queue_empty(&sd->input_pkt_queue)) {
4865 /*
4866 * Inline a custom version of __napi_complete().
4867 * Only the current CPU owns and manipulates this napi,
4868 * and NAPI_STATE_SCHED is the only possible flag set
4869 * on backlog.
4870 * We can use a plain write instead of clear_bit(),
4871 * and we don't need an smp_mb() memory barrier.
4872 */
4873 napi->state = 0;
4874 again = false;
4875 } else {
4876 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4877 &sd->process_queue);
4878 }
4879 rps_unlock(sd);
4880 local_irq_enable();
4881 }
4882
4883 return work;
4884 }
4885
4886 /**
4887 * __napi_schedule - schedule for receive
4888 * @n: entry to schedule
4889 *
4890 * The entry's receive function will be scheduled to run.
4891 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4892 */
4893 void __napi_schedule(struct napi_struct *n)
4894 {
4895 unsigned long flags;
4896
4897 local_irq_save(flags);
4898 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4899 local_irq_restore(flags);
4900 }
4901 EXPORT_SYMBOL(__napi_schedule);
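
/* A minimal sketch of the usual interrupt-side pattern: mask the device's
 * RX interrupt, then schedule the NAPI instance; "my_irq",
 * "my_disable_rx_irq" and "struct my_priv" are hypothetical driver names.
 *
 *	static irqreturn_t my_irq(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		my_disable_rx_irq(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */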
4902
4903 /**
4904 * __napi_schedule_irqoff - schedule for receive
4905 * @n: entry to schedule
4906 *
4907 * Variant of __napi_schedule() assuming hard irqs are masked
4908 */
4909 void __napi_schedule_irqoff(struct napi_struct *n)
4910 {
4911 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4912 }
4913 EXPORT_SYMBOL(__napi_schedule_irqoff);
4914
4915 void __napi_complete(struct napi_struct *n)
4916 {
4917 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4918
4919 list_del_init(&n->poll_list);
4920 smp_mb__before_atomic();
4921 clear_bit(NAPI_STATE_SCHED, &n->state);
4922 }
4923 EXPORT_SYMBOL(__napi_complete);
4924
4925 void napi_complete_done(struct napi_struct *n, int work_done)
4926 {
4927 unsigned long flags;
4928
4929 /*
4930 * Don't let napi dequeue from the CPU poll list,
4931 * just in case it's running on a different CPU.
4932 */
4933 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4934 return;
4935
4936 if (n->gro_list) {
4937 unsigned long timeout = 0;
4938
4939 if (work_done)
4940 timeout = n->dev->gro_flush_timeout;
4941
4942 if (timeout)
4943 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4944 HRTIMER_MODE_REL_PINNED);
4945 else
4946 napi_gro_flush(n, false);
4947 }
4948 if (likely(list_empty(&n->poll_list))) {
4949 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4950 } else {
4951 /* If n->poll_list is not empty, we need to mask irqs */
4952 local_irq_save(flags);
4953 __napi_complete(n);
4954 local_irq_restore(flags);
4955 }
4956 }
4957 EXPORT_SYMBOL(napi_complete_done);
4958
4959 /* must be called under rcu_read_lock(), as we dont take a reference */
4960 static struct napi_struct *napi_by_id(unsigned int napi_id)
4961 {
4962 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4963 struct napi_struct *napi;
4964
4965 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4966 if (napi->napi_id == napi_id)
4967 return napi;
4968
4969 return NULL;
4970 }
4971
4972 #if defined(CONFIG_NET_RX_BUSY_POLL)
4973 #define BUSY_POLL_BUDGET 8
4974 bool sk_busy_loop(struct sock *sk, int nonblock)
4975 {
4976 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4977 int (*busy_poll)(struct napi_struct *dev);
4978 struct napi_struct *napi;
4979 int rc = false;
4980
4981 rcu_read_lock();
4982
4983 napi = napi_by_id(sk->sk_napi_id);
4984 if (!napi)
4985 goto out;
4986
4987 /* Note: ndo_busy_poll method is optional in linux-4.5 */
4988 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4989
4990 do {
4991 rc = 0;
4992 local_bh_disable();
4993 if (busy_poll) {
4994 rc = busy_poll(napi);
4995 } else if (napi_schedule_prep(napi)) {
4996 void *have = netpoll_poll_lock(napi);
4997
4998 if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4999 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5000 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5001 if (rc == BUSY_POLL_BUDGET) {
5002 napi_complete_done(napi, rc);
5003 napi_schedule(napi);
5004 }
5005 }
5006 netpoll_poll_unlock(have);
5007 }
5008 if (rc > 0)
5009 __NET_ADD_STATS(sock_net(sk),
5010 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5011 local_bh_enable();
5012
5013 if (rc == LL_FLUSH_FAILED)
5014 break; /* permanent failure */
5015
5016 cpu_relax();
5017 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5018 !need_resched() && !busy_loop_timeout(end_time));
5019
5020 rc = !skb_queue_empty(&sk->sk_receive_queue);
5021 out:
5022 rcu_read_unlock();
5023 return rc;
5024 }
5025 EXPORT_SYMBOL(sk_busy_loop);
5026
5027 #endif /* CONFIG_NET_RX_BUSY_POLL */
5028
5029 void napi_hash_add(struct napi_struct *napi)
5030 {
5031 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5032 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5033 return;
5034
5035 spin_lock(&napi_hash_lock);
5036
5037 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5038 do {
5039 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5040 napi_gen_id = NR_CPUS + 1;
5041 } while (napi_by_id(napi_gen_id));
5042 napi->napi_id = napi_gen_id;
5043
5044 hlist_add_head_rcu(&napi->napi_hash_node,
5045 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5046
5047 spin_unlock(&napi_hash_lock);
5048 }
5049 EXPORT_SYMBOL_GPL(napi_hash_add);
5050
5051 /* Warning: the caller is responsible for making sure an RCU grace period
5052 * is respected before freeing the memory containing @napi.
5053 */
5054 bool napi_hash_del(struct napi_struct *napi)
5055 {
5056 bool rcu_sync_needed = false;
5057
5058 spin_lock(&napi_hash_lock);
5059
5060 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5061 rcu_sync_needed = true;
5062 hlist_del_rcu(&napi->napi_hash_node);
5063 }
5064 spin_unlock(&napi_hash_lock);
5065 return rcu_sync_needed;
5066 }
5067 EXPORT_SYMBOL_GPL(napi_hash_del);
5068
5069 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5070 {
5071 struct napi_struct *napi;
5072
5073 napi = container_of(timer, struct napi_struct, timer);
5074 if (napi->gro_list)
5075 napi_schedule(napi);
5076
5077 return HRTIMER_NORESTART;
5078 }
5079
5080 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5081 int (*poll)(struct napi_struct *, int), int weight)
5082 {
5083 INIT_LIST_HEAD(&napi->poll_list);
5084 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5085 napi->timer.function = napi_watchdog;
5086 napi->gro_count = 0;
5087 napi->gro_list = NULL;
5088 napi->skb = NULL;
5089 napi->poll = poll;
5090 if (weight > NAPI_POLL_WEIGHT)
5091 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5092 weight, dev->name);
5093 napi->weight = weight;
5094 list_add(&napi->dev_list, &dev->napi_list);
5095 napi->dev = dev;
5096 #ifdef CONFIG_NETPOLL
5097 spin_lock_init(&napi->poll_lock);
5098 napi->poll_owner = -1;
5099 #endif
5100 set_bit(NAPI_STATE_SCHED, &napi->state);
5101 napi_hash_add(napi);
5102 }
5103 EXPORT_SYMBOL(netif_napi_add);
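
/* A minimal sketch of setup and the matching poll callback, using
 * hypothetical names ("my_dev", "priv", "my_poll", "my_clean_rx",
 * "my_enable_rx_irq"):
 *
 *	netif_napi_add(my_dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	napi_enable(&priv->napi);
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = my_clean_rx(napi, budget);
 *
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			my_enable_rx_irq(napi);
 *		}
 *		return work;
 *	}
 */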
5104
5105 void napi_disable(struct napi_struct *n)
5106 {
5107 might_sleep();
5108 set_bit(NAPI_STATE_DISABLE, &n->state);
5109
5110 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5111 msleep(1);
5112 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5113 msleep(1);
5114
5115 hrtimer_cancel(&n->timer);
5116
5117 clear_bit(NAPI_STATE_DISABLE, &n->state);
5118 }
5119 EXPORT_SYMBOL(napi_disable);
5120
5121 /* Must be called in process context */
5122 void netif_napi_del(struct napi_struct *napi)
5123 {
5124 might_sleep();
5125 if (napi_hash_del(napi))
5126 synchronize_net();
5127 list_del_init(&napi->dev_list);
5128 napi_free_frags(napi);
5129
5130 kfree_skb_list(napi->gro_list);
5131 napi->gro_list = NULL;
5132 napi->gro_count = 0;
5133 }
5134 EXPORT_SYMBOL(netif_napi_del);
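
/* A minimal teardown sketch mirroring the setup above (process context);
 * "priv" is a hypothetical driver-private structure:
 *
 *	napi_disable(&priv->napi);
 *	netif_napi_del(&priv->napi);
 */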
5135
5136 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5137 {
5138 void *have;
5139 int work, weight;
5140
5141 list_del_init(&n->poll_list);
5142
5143 have = netpoll_poll_lock(n);
5144
5145 weight = n->weight;
5146
5147 /* This NAPI_STATE_SCHED test is for avoiding a race
5148 * with netpoll's poll_napi(). Only the entity which
5149 * obtains the lock and sees NAPI_STATE_SCHED set will
5150 * actually make the ->poll() call. Therefore we avoid
5151 * accidentally calling ->poll() when NAPI is not scheduled.
5152 */
5153 work = 0;
5154 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5155 work = n->poll(n, weight);
5156 trace_napi_poll(n, work, weight);
5157 }
5158
5159 WARN_ON_ONCE(work > weight);
5160
5161 if (likely(work < weight))
5162 goto out_unlock;
5163
5164 /* Drivers must not modify the NAPI state if they
5165 * consume the entire weight. In such cases this code
5166 * still "owns" the NAPI instance and therefore can
5167 * move the instance around on the list at-will.
5168 */
5169 if (unlikely(napi_disable_pending(n))) {
5170 napi_complete(n);
5171 goto out_unlock;
5172 }
5173
5174 if (n->gro_list) {
5175 /* Flush too-old packets.
5176 * If HZ < 1000, flush all packets.
5177 */
5178 napi_gro_flush(n, HZ >= 1000);
5179 }
5180
5181 /* Some drivers may have called napi_schedule
5182 * prior to exhausting their budget.
5183 */
5184 if (unlikely(!list_empty(&n->poll_list))) {
5185 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5186 n->dev ? n->dev->name : "backlog");
5187 goto out_unlock;
5188 }
5189
5190 list_add_tail(&n->poll_list, repoll);
5191
5192 out_unlock:
5193 netpoll_poll_unlock(have);
5194
5195 return work;
5196 }
5197
5198 static __latent_entropy void net_rx_action(struct softirq_action *h)
5199 {
5200 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5201 unsigned long time_limit = jiffies + 2;
5202 int budget = netdev_budget;
5203 LIST_HEAD(list);
5204 LIST_HEAD(repoll);
5205
5206 local_irq_disable();
5207 list_splice_init(&sd->poll_list, &list);
5208 local_irq_enable();
5209
5210 for (;;) {
5211 struct napi_struct *n;
5212
5213 if (list_empty(&list)) {
5214 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5215 return;
5216 break;
5217 }
5218
5219 n = list_first_entry(&list, struct napi_struct, poll_list);
5220 budget -= napi_poll(n, &repoll);
5221
5222 /* If the softirq window is exhausted then punt.
5223 * Allow this to run for 2 jiffies, which allows
5224 * an average latency of 1.5/HZ.
5225 */
5226 if (unlikely(budget <= 0 ||
5227 time_after_eq(jiffies, time_limit))) {
5228 sd->time_squeeze++;
5229 break;
5230 }
5231 }
5232
5233 __kfree_skb_flush();
5234 local_irq_disable();
5235
5236 list_splice_tail_init(&sd->poll_list, &list);
5237 list_splice_tail(&repoll, &list);
5238 list_splice(&list, &sd->poll_list);
5239 if (!list_empty(&sd->poll_list))
5240 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5241
5242 net_rps_action_and_irq_enable(sd);
5243 }
5244
5245 struct netdev_adjacent {
5246 struct net_device *dev;
5247
5248 /* upper master flag, there can only be one master device per list */
5249 bool master;
5250
5251 /* counter for the number of times this device was added to us */
5252 u16 ref_nr;
5253
5254 /* private field for the users */
5255 void *private;
5256
5257 struct list_head list;
5258 struct rcu_head rcu;
5259 };
5260
5261 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5262 struct list_head *adj_list)
5263 {
5264 struct netdev_adjacent *adj;
5265
5266 list_for_each_entry(adj, adj_list, list) {
5267 if (adj->dev == adj_dev)
5268 return adj;
5269 }
5270 return NULL;
5271 }
5272
5273 /**
5274 * netdev_has_upper_dev - Check if device is linked to an upper device
5275 * @dev: device
5276 * @upper_dev: upper device to check
5277 *
5278 * Find out if a device is linked to the specified upper device and return true
5279 * if it is. Note that this checks only the immediate upper device,
5280 * not the complete stack of devices. The caller must hold the RTNL lock.
5281 */
5282 bool netdev_has_upper_dev(struct net_device *dev,
5283 struct net_device *upper_dev)
5284 {
5285 ASSERT_RTNL();
5286
5287 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5288 }
5289 EXPORT_SYMBOL(netdev_has_upper_dev);
5290
5291 /**
5292 * netdev_has_any_upper_dev - Check if device is linked to some device
5293 * @dev: device
5294 *
5295 * Find out if a device is linked to an upper device and return true
5296 * if it is. The caller must hold the RTNL lock.
5297 */
5298 static bool netdev_has_any_upper_dev(struct net_device *dev)
5299 {
5300 ASSERT_RTNL();
5301
5302 return !list_empty(&dev->all_adj_list.upper);
5303 }
5304
5305 /**
5306 * netdev_master_upper_dev_get - Get master upper device
5307 * @dev: device
5308 *
5309 * Find a master upper device and return a pointer to it, or NULL if
5310 * there is none. The caller must hold the RTNL lock.
5311 */
5312 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5313 {
5314 struct netdev_adjacent *upper;
5315
5316 ASSERT_RTNL();
5317
5318 if (list_empty(&dev->adj_list.upper))
5319 return NULL;
5320
5321 upper = list_first_entry(&dev->adj_list.upper,
5322 struct netdev_adjacent, list);
5323 if (likely(upper->master))
5324 return upper->dev;
5325 return NULL;
5326 }
5327 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5328
5329 void *netdev_adjacent_get_private(struct list_head *adj_list)
5330 {
5331 struct netdev_adjacent *adj;
5332
5333 adj = list_entry(adj_list, struct netdev_adjacent, list);
5334
5335 return adj->private;
5336 }
5337 EXPORT_SYMBOL(netdev_adjacent_get_private);
5338
5339 /**
5340 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5341 * @dev: device
5342 * @iter: list_head ** of the current position
5343 *
5344 * Gets the next device from the dev's upper list, starting from iter
5345 * position. The caller must hold RCU read lock.
5346 */
5347 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5348 struct list_head **iter)
5349 {
5350 struct netdev_adjacent *upper;
5351
5352 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5353
5354 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5355
5356 if (&upper->list == &dev->adj_list.upper)
5357 return NULL;
5358
5359 *iter = &upper->list;
5360
5361 return upper->dev;
5362 }
5363 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5364
5365 /**
5366 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5367 * @dev: device
5368 * @iter: list_head ** of the current position
5369 *
5370 * Gets the next device from the dev's upper list, starting from iter
5371 * position. The caller must hold RCU read lock.
5372 */
5373 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5374 struct list_head **iter)
5375 {
5376 struct netdev_adjacent *upper;
5377
5378 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5379
5380 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5381
5382 if (&upper->list == &dev->all_adj_list.upper)
5383 return NULL;
5384
5385 *iter = &upper->list;
5386
5387 return upper->dev;
5388 }
5389 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5390
5391 /**
5392 * netdev_lower_get_next_private - Get the next ->private from the
5393 * lower neighbour list
5394 * @dev: device
5395 * @iter: list_head ** of the current position
5396 *
5397 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5398 * list, starting from iter position. The caller must either hold the
5399 * RTNL lock or its own locking that guarantees that the neighbour lower
5400 * list will remain unchanged.
5401 */
5402 void *netdev_lower_get_next_private(struct net_device *dev,
5403 struct list_head **iter)
5404 {
5405 struct netdev_adjacent *lower;
5406
5407 lower = list_entry(*iter, struct netdev_adjacent, list);
5408
5409 if (&lower->list == &dev->adj_list.lower)
5410 return NULL;
5411
5412 *iter = lower->list.next;
5413
5414 return lower->private;
5415 }
5416 EXPORT_SYMBOL(netdev_lower_get_next_private);
5417
5418 /**
5419 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5420 * lower neighbour list, RCU
5421 * variant
5422 * @dev: device
5423 * @iter: list_head ** of the current position
5424 *
5425 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5426 * list, starting from iter position. The caller must hold RCU read lock.
5427 */
5428 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5429 struct list_head **iter)
5430 {
5431 struct netdev_adjacent *lower;
5432
5433 WARN_ON_ONCE(!rcu_read_lock_held());
5434
5435 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5436
5437 if (&lower->list == &dev->adj_list.lower)
5438 return NULL;
5439
5440 *iter = &lower->list;
5441
5442 return lower->private;
5443 }
5444 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5445
5446 /**
5447 * netdev_lower_get_next - Get the next device from the lower neighbour
5448 * list
5449 * @dev: device
5450 * @iter: list_head ** of the current position
5451 *
5452 * Gets the next netdev_adjacent from the dev's lower neighbour
5453 * list, starting from iter position. The caller must hold RTNL lock or
5454 * its own locking that guarantees that the neighbour lower
5455 * list will remain unchanged.
5456 */
5457 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5458 {
5459 struct netdev_adjacent *lower;
5460
5461 lower = list_entry(*iter, struct netdev_adjacent, list);
5462
5463 if (&lower->list == &dev->adj_list.lower)
5464 return NULL;
5465
5466 *iter = lower->list.next;
5467
5468 return lower->dev;
5469 }
5470 EXPORT_SYMBOL(netdev_lower_get_next);
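
/* A minimal sketch of walking the lower list with this iterator under
 * RTNL; "handle_lower" is a hypothetical callback. (netdevice.h also has
 * a netdev_for_each_lower_dev() helper that wraps this pattern.)
 *
 *	struct list_head *iter = dev->adj_list.lower.next;
 *	struct net_device *ldev;
 *
 *	while ((ldev = netdev_lower_get_next(dev, &iter)))
 *		handle_lower(ldev);
 */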
5471
5472 /**
5473 * netdev_all_lower_get_next - Get the next device from all lower neighbour list
5474 * @dev: device
5475 * @iter: list_head ** of the current position
5476 *
5477 * Gets the next netdev_adjacent from the dev's all lower neighbour
5478 * list, starting from iter position. The caller must hold RTNL lock or
5479 * its own locking that guarantees that the neighbour all lower
5480 * list will remain unchanged.
5481 */
5482 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5483 {
5484 struct netdev_adjacent *lower;
5485
5486 lower = list_entry(*iter, struct netdev_adjacent, list);
5487
5488 if (&lower->list == &dev->all_adj_list.lower)
5489 return NULL;
5490
5491 *iter = lower->list.next;
5492
5493 return lower->dev;
5494 }
5495 EXPORT_SYMBOL(netdev_all_lower_get_next);
5496
5497 /**
5498 * netdev_all_lower_get_next_rcu - Get the next device from all
5499 * lower neighbour list, RCU variant
5500 * @dev: device
5501 * @iter: list_head ** of the current position
5502 *
5503 * Gets the next netdev_adjacent from the dev's all lower neighbour
5504 * list, starting from iter position. The caller must hold RCU read lock.
5505 */
5506 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
5507 struct list_head **iter)
5508 {
5509 struct netdev_adjacent *lower;
5510
5511 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5512
5513 if (&lower->list == &dev->all_adj_list.lower)
5514 return NULL;
5515
5516 *iter = &lower->list;
5517
5518 return lower->dev;
5519 }
5520 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
5521
5522 /**
5523 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5524 * lower neighbour list, RCU
5525 * variant
5526 * @dev: device
5527 *
5528 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5529 * list. The caller must hold RCU read lock.
5530 */
5531 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5532 {
5533 struct netdev_adjacent *lower;
5534
5535 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5536 struct netdev_adjacent, list);
5537 if (lower)
5538 return lower->private;
5539 return NULL;
5540 }
5541 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5542
5543 /**
5544 * netdev_master_upper_dev_get_rcu - Get master upper device
5545 * @dev: device
5546 *
5547 * Find a master upper device and return a pointer to it, or NULL if
5548 * there is none. The caller must hold the RCU read lock.
5549 */
5550 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5551 {
5552 struct netdev_adjacent *upper;
5553
5554 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5555 struct netdev_adjacent, list);
5556 if (upper && likely(upper->master))
5557 return upper->dev;
5558 return NULL;
5559 }
5560 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5561
5562 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5563 struct net_device *adj_dev,
5564 struct list_head *dev_list)
5565 {
5566 char linkname[IFNAMSIZ+7];
5567 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5568 "upper_%s" : "lower_%s", adj_dev->name);
5569 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5570 linkname);
5571 }
5572 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5573 char *name,
5574 struct list_head *dev_list)
5575 {
5576 char linkname[IFNAMSIZ+7];
5577 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5578 "upper_%s" : "lower_%s", name);
5579 sysfs_remove_link(&(dev->dev.kobj), linkname);
5580 }
5581
5582 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5583 struct net_device *adj_dev,
5584 struct list_head *dev_list)
5585 {
5586 return (dev_list == &dev->adj_list.upper ||
5587 dev_list == &dev->adj_list.lower) &&
5588 net_eq(dev_net(dev), dev_net(adj_dev));
5589 }
5590
5591 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5592 struct net_device *adj_dev,
5593 u16 ref_nr,
5594 struct list_head *dev_list,
5595 void *private, bool master)
5596 {
5597 struct netdev_adjacent *adj;
5598 int ret;
5599
5600 adj = __netdev_find_adj(adj_dev, dev_list);
5601
5602 if (adj) {
5603 adj->ref_nr += ref_nr;
5604 return 0;
5605 }
5606
5607 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5608 if (!adj)
5609 return -ENOMEM;
5610
5611 adj->dev = adj_dev;
5612 adj->master = master;
5613 adj->ref_nr = ref_nr;
5614 adj->private = private;
5615 dev_hold(adj_dev);
5616
5617 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5618 adj_dev->name, dev->name, adj_dev->name);
5619
5620 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5621 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5622 if (ret)
5623 goto free_adj;
5624 }
5625
5626 /* Ensure that the master link is always the first item in the list. */
5627 if (master) {
5628 ret = sysfs_create_link(&(dev->dev.kobj),
5629 &(adj_dev->dev.kobj), "master");
5630 if (ret)
5631 goto remove_symlinks;
5632
5633 list_add_rcu(&adj->list, dev_list);
5634 } else {
5635 list_add_tail_rcu(&adj->list, dev_list);
5636 }
5637
5638 return 0;
5639
5640 remove_symlinks:
5641 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5642 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5643 free_adj:
5644 kfree(adj);
5645 dev_put(adj_dev);
5646
5647 return ret;
5648 }
5649
5650 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5651 struct net_device *adj_dev,
5652 u16 ref_nr,
5653 struct list_head *dev_list)
5654 {
5655 struct netdev_adjacent *adj;
5656
5657 adj = __netdev_find_adj(adj_dev, dev_list);
5658
5659 if (!adj) {
5660 pr_err("tried to remove device %s from %s\n",
5661 dev->name, adj_dev->name);
5662 BUG();
5663 }
5664
5665 if (adj->ref_nr > ref_nr) {
5666 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5667 ref_nr, adj->ref_nr-ref_nr);
5668 adj->ref_nr -= ref_nr;
5669 return;
5670 }
5671
5672 if (adj->master)
5673 sysfs_remove_link(&(dev->dev.kobj), "master");
5674
5675 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5676 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5677
5678 list_del_rcu(&adj->list);
5679 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5680 adj_dev->name, dev->name, adj_dev->name);
5681 dev_put(adj_dev);
5682 kfree_rcu(adj, rcu);
5683 }
5684
5685 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5686 struct net_device *upper_dev,
5687 u16 ref_nr,
5688 struct list_head *up_list,
5689 struct list_head *down_list,
5690 void *private, bool master)
5691 {
5692 int ret;
5693
5694 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5695 private, master);
5696 if (ret)
5697 return ret;
5698
5699 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5700 private, false);
5701 if (ret) {
5702 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5703 return ret;
5704 }
5705
5706 return 0;
5707 }
5708
5709 static int __netdev_adjacent_dev_link(struct net_device *dev,
5710 struct net_device *upper_dev,
5711 u16 ref_nr)
5712 {
5713 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5714 &dev->all_adj_list.upper,
5715 &upper_dev->all_adj_list.lower,
5716 NULL, false);
5717 }
5718
5719 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5720 struct net_device *upper_dev,
5721 u16 ref_nr,
5722 struct list_head *up_list,
5723 struct list_head *down_list)
5724 {
5725 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5726 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5727 }
5728
5729 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5730 struct net_device *upper_dev,
5731 u16 ref_nr)
5732 {
5733 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5734 &dev->all_adj_list.upper,
5735 &upper_dev->all_adj_list.lower);
5736 }
5737
5738 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5739 struct net_device *upper_dev,
5740 void *private, bool master)
5741 {
5742 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5743
5744 if (ret)
5745 return ret;
5746
5747 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5748 &dev->adj_list.upper,
5749 &upper_dev->adj_list.lower,
5750 private, master);
5751 if (ret) {
5752 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5753 return ret;
5754 }
5755
5756 return 0;
5757 }
5758
5759 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5760 struct net_device *upper_dev)
5761 {
5762 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5763 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5764 &dev->adj_list.upper,
5765 &upper_dev->adj_list.lower);
5766 }
5767
5768 static int __netdev_upper_dev_link(struct net_device *dev,
5769 struct net_device *upper_dev, bool master,
5770 void *upper_priv, void *upper_info)
5771 {
5772 struct netdev_notifier_changeupper_info changeupper_info;
5773 struct netdev_adjacent *i, *j, *to_i, *to_j;
5774 int ret = 0;
5775
5776 ASSERT_RTNL();
5777
5778 if (dev == upper_dev)
5779 return -EBUSY;
5780
5781 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
5782 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5783 return -EBUSY;
5784
5785 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5786 return -EEXIST;
5787
5788 if (master && netdev_master_upper_dev_get(dev))
5789 return -EBUSY;
5790
5791 changeupper_info.upper_dev = upper_dev;
5792 changeupper_info.master = master;
5793 changeupper_info.linking = true;
5794 changeupper_info.upper_info = upper_info;
5795
5796 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5797 &changeupper_info.info);
5798 ret = notifier_to_errno(ret);
5799 if (ret)
5800 return ret;
5801
5802 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5803 master);
5804 if (ret)
5805 return ret;
5806
5807 /* Now that we linked these devs, make all of the upper_dev's
5808 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5809 * vice versa, and don't forget the devices themselves. All of these
5810 * links are non-neighbours.
5811 */
5812 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5813 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5814 pr_debug("Interlinking %s with %s, non-neighbour\n",
5815 i->dev->name, j->dev->name);
5816 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5817 if (ret)
5818 goto rollback_mesh;
5819 }
5820 }
5821
5822 /* add dev to every upper_dev's upper device */
5823 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5824 pr_debug("linking %s's upper device %s with %s\n",
5825 upper_dev->name, i->dev->name, dev->name);
5826 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5827 if (ret)
5828 goto rollback_upper_mesh;
5829 }
5830
5831 /* add upper_dev to every dev's lower device */
5832 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5833 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5834 i->dev->name, upper_dev->name);
5835 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5836 if (ret)
5837 goto rollback_lower_mesh;
5838 }
5839
5840 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5841 &changeupper_info.info);
5842 ret = notifier_to_errno(ret);
5843 if (ret)
5844 goto rollback_lower_mesh;
5845
5846 return 0;
5847
5848 rollback_lower_mesh:
5849 to_i = i;
5850 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5851 if (i == to_i)
5852 break;
5853 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5854 }
5855
5856 i = NULL;
5857
5858 rollback_upper_mesh:
5859 to_i = i;
5860 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5861 if (i == to_i)
5862 break;
5863 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5864 }
5865
5866 i = j = NULL;
5867
5868 rollback_mesh:
5869 to_i = i;
5870 to_j = j;
5871 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5872 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5873 if (i == to_i && j == to_j)
5874 break;
5875 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5876 }
5877 if (i == to_i)
5878 break;
5879 }
5880
5881 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5882
5883 return ret;
5884 }
5885
5886 /**
5887 * netdev_upper_dev_link - Add a link to the upper device
5888 * @dev: device
5889 * @upper_dev: new upper device
5890 *
5891 * Adds a link to a device which is upper to this one. The caller must hold
5892 * the RTNL lock. On a failure a negative errno code is returned.
5893 * On success the reference counts are adjusted and the function
5894 * returns zero.
5895 */
5896 int netdev_upper_dev_link(struct net_device *dev,
5897 struct net_device *upper_dev)
5898 {
5899 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5900 }
5901 EXPORT_SYMBOL(netdev_upper_dev_link);
5902
5903 /**
5904 * netdev_master_upper_dev_link - Add a master link to the upper device
5905 * @dev: device
5906 * @upper_dev: new upper device
5907 * @upper_priv: upper device private
5908 * @upper_info: upper info to be passed down via notifier
5909 *
5910 * Adds a link to a device which is upper to this one. In this case, only
5911 * one master upper device can be linked, although other non-master devices
5912 * might be linked as well. The caller must hold the RTNL lock.
5913 * On a failure a negative errno code is returned. On success the reference
5914 * counts are adjusted and the function returns zero.
5915 */
5916 int netdev_master_upper_dev_link(struct net_device *dev,
5917 struct net_device *upper_dev,
5918 void *upper_priv, void *upper_info)
5919 {
5920 return __netdev_upper_dev_link(dev, upper_dev, true,
5921 upper_priv, upper_info);
5922 }
5923 EXPORT_SYMBOL(netdev_master_upper_dev_link);
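
/* A minimal sketch of how a hypothetical master driver (bonding/team
 * style) might use this under RTNL when enslaving "slave_dev" to
 * "master_dev", and how the link is reversed on release:
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev,
 *					   slave_info, NULL);
 *	if (err)
 *		goto err_upper_link;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, master_dev);
 */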
5924
5925 /**
5926 * netdev_upper_dev_unlink - Removes a link to upper device
5927 * @dev: device
5928 * @upper_dev: upper device to unlink
5929 *
5930 * Removes a link to a device which is upper to this one. The caller must hold
5931 * the RTNL lock.
5932 */
5933 void netdev_upper_dev_unlink(struct net_device *dev,
5934 struct net_device *upper_dev)
5935 {
5936 struct netdev_notifier_changeupper_info changeupper_info;
5937 struct netdev_adjacent *i, *j;
5938 ASSERT_RTNL();
5939
5940 changeupper_info.upper_dev = upper_dev;
5941 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5942 changeupper_info.linking = false;
5943
5944 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5945 &changeupper_info.info);
5946
5947 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5948
5949 /* Here is the tricky part. We must remove all dev's lower
5950 * devices from all upper_dev's upper devices and vice
5951 * versa, to maintain the graph relationship.
5952 */
5953 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5954 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5955 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5956
5957 /* also remove the device itself from the lower/upper device
5958 * lists
5959 */
5960 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5961 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5962
5963 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5964 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5965
5966 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5967 &changeupper_info.info);
5968 }
5969 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5970
5971 /**
5972 * netdev_bonding_info_change - Dispatch event about slave change
5973 * @dev: device
5974 * @bonding_info: info to dispatch
5975 *
5976 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5977 * The caller must hold the RTNL lock.
5978 */
5979 void netdev_bonding_info_change(struct net_device *dev,
5980 struct netdev_bonding_info *bonding_info)
5981 {
5982 struct netdev_notifier_bonding_info info;
5983
5984 memcpy(&info.bonding_info, bonding_info,
5985 sizeof(struct netdev_bonding_info));
5986 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5987 &info.info);
5988 }
5989 EXPORT_SYMBOL(netdev_bonding_info_change);
5990
5991 static void netdev_adjacent_add_links(struct net_device *dev)
5992 {
5993 struct netdev_adjacent *iter;
5994
5995 struct net *net = dev_net(dev);
5996
5997 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5998 if (!net_eq(net, dev_net(iter->dev)))
5999 continue;
6000 netdev_adjacent_sysfs_add(iter->dev, dev,
6001 &iter->dev->adj_list.lower);
6002 netdev_adjacent_sysfs_add(dev, iter->dev,
6003 &dev->adj_list.upper);
6004 }
6005
6006 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6007 if (!net_eq(net, dev_net(iter->dev)))
6008 continue;
6009 netdev_adjacent_sysfs_add(iter->dev, dev,
6010 &iter->dev->adj_list.upper);
6011 netdev_adjacent_sysfs_add(dev, iter->dev,
6012 &dev->adj_list.lower);
6013 }
6014 }
6015
6016 static void netdev_adjacent_del_links(struct net_device *dev)
6017 {
6018 struct netdev_adjacent *iter;
6019
6020 struct net *net = dev_net(dev);
6021
6022 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6023 if (!net_eq(net, dev_net(iter->dev)))
6024 continue;
6025 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6026 &iter->dev->adj_list.lower);
6027 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6028 &dev->adj_list.upper);
6029 }
6030
6031 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6032 if (!net_eq(net, dev_net(iter->dev)))
6033 continue;
6034 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6035 &iter->dev->adj_list.upper);
6036 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6037 &dev->adj_list.lower);
6038 }
6039 }
6040
6041 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6042 {
6043 struct netdev_adjacent *iter;
6044
6045 struct net *net = dev_net(dev);
6046
6047 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6048 if (!net_eq(net, dev_net(iter->dev)))
6049 continue;
6050 netdev_adjacent_sysfs_del(iter->dev, oldname,
6051 &iter->dev->adj_list.lower);
6052 netdev_adjacent_sysfs_add(iter->dev, dev,
6053 &iter->dev->adj_list.lower);
6054 }
6055
6056 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6057 if (!net_eq(net, dev_net(iter->dev)))
6058 continue;
6059 netdev_adjacent_sysfs_del(iter->dev, oldname,
6060 &iter->dev->adj_list.upper);
6061 netdev_adjacent_sysfs_add(iter->dev, dev,
6062 &iter->dev->adj_list.upper);
6063 }
6064 }
6065
6066 void *netdev_lower_dev_get_private(struct net_device *dev,
6067 struct net_device *lower_dev)
6068 {
6069 struct netdev_adjacent *lower;
6070
6071 if (!lower_dev)
6072 return NULL;
6073 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6074 if (!lower)
6075 return NULL;
6076
6077 return lower->private;
6078 }
6079 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6080
6081
6082 int dev_get_nest_level(struct net_device *dev)
6083 {
6084 struct net_device *lower = NULL;
6085 struct list_head *iter;
6086 int max_nest = -1;
6087 int nest;
6088
6089 ASSERT_RTNL();
6090
6091 netdev_for_each_lower_dev(dev, lower, iter) {
6092 nest = dev_get_nest_level(lower);
6093 if (max_nest < nest)
6094 max_nest = nest;
6095 }
6096
6097 return max_nest + 1;
6098 }
6099 EXPORT_SYMBOL(dev_get_nest_level);
6100
6101 /**
6102 * netdev_lower_state_changed - Dispatch event about lower device state change
6103 * @lower_dev: device
6104 * @lower_state_info: state to dispatch
6105 *
6106 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6107 * The caller must hold the RTNL lock.
6108 */
6109 void netdev_lower_state_changed(struct net_device *lower_dev,
6110 void *lower_state_info)
6111 {
6112 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6113
6114 ASSERT_RTNL();
6115 changelowerstate_info.lower_state_info = lower_state_info;
6116 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6117 &changelowerstate_info.info);
6118 }
6119 EXPORT_SYMBOL(netdev_lower_state_changed);
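
/* Illustrative sketch (not part of this file): a LAG master such as bonding
 * reports per-slave state to interested uppers like this, under RTNL:
 *
 *	struct netdev_lag_lower_state_info info = {
 *		.link_up = true,
 *		.tx_enabled = true,
 *	};
 *
 *	netdev_lower_state_changed(slave_dev, &info);
 *
 * "slave_dev" is an assumed lower device; the info structure type comes from
 * include/linux/netdevice.h.
 */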
6120
6121 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6122 struct neighbour *n)
6123 {
6124 struct net_device *lower_dev, *stop_dev;
6125 struct list_head *iter;
6126 int err;
6127
6128 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6129 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6130 continue;
6131 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6132 if (err) {
6133 stop_dev = lower_dev;
6134 goto rollback;
6135 }
6136 }
6137 return 0;
6138
6139 rollback:
6140 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6141 if (lower_dev == stop_dev)
6142 break;
6143 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6144 continue;
6145 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6146 }
6147 return err;
6148 }
6149 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6150
6151 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6152 struct neighbour *n)
6153 {
6154 struct net_device *lower_dev;
6155 struct list_head *iter;
6156
6157 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6158 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6159 continue;
6160 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6161 }
6162 }
6163 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6164
6165 static void dev_change_rx_flags(struct net_device *dev, int flags)
6166 {
6167 const struct net_device_ops *ops = dev->netdev_ops;
6168
6169 if (ops->ndo_change_rx_flags)
6170 ops->ndo_change_rx_flags(dev, flags);
6171 }
6172
6173 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6174 {
6175 unsigned int old_flags = dev->flags;
6176 kuid_t uid;
6177 kgid_t gid;
6178
6179 ASSERT_RTNL();
6180
6181 dev->flags |= IFF_PROMISC;
6182 dev->promiscuity += inc;
6183 if (dev->promiscuity == 0) {
6184 /*
6185 * Avoid overflow.
6186 * If inc causes an overflow, leave promisc untouched and return an error.
6187 */
6188 if (inc < 0)
6189 dev->flags &= ~IFF_PROMISC;
6190 else {
6191 dev->promiscuity -= inc;
6192 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6193 dev->name);
6194 return -EOVERFLOW;
6195 }
6196 }
6197 if (dev->flags != old_flags) {
6198 pr_info("device %s %s promiscuous mode\n",
6199 dev->name,
6200 dev->flags & IFF_PROMISC ? "entered" : "left");
6201 if (audit_enabled) {
6202 current_uid_gid(&uid, &gid);
6203 audit_log(current->audit_context, GFP_ATOMIC,
6204 AUDIT_ANOM_PROMISCUOUS,
6205 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6206 dev->name, (dev->flags & IFF_PROMISC),
6207 (old_flags & IFF_PROMISC),
6208 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6209 from_kuid(&init_user_ns, uid),
6210 from_kgid(&init_user_ns, gid),
6211 audit_get_sessionid(current));
6212 }
6213
6214 dev_change_rx_flags(dev, IFF_PROMISC);
6215 }
6216 if (notify)
6217 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6218 return 0;
6219 }
6220
6221 /**
6222 * dev_set_promiscuity - update promiscuity count on a device
6223 * @dev: device
6224 * @inc: modifier
6225 *
6226 * Add or remove promiscuity from a device. While the count in the device
6227 * remains above zero the interface remains promiscuous. Once it hits zero
6228 * the device reverts back to normal filtering operation. A negative inc
6229 * value is used to drop promiscuity on the device.
6230 * Return 0 if successful or a negative errno code on error.
6231 */
6232 int dev_set_promiscuity(struct net_device *dev, int inc)
6233 {
6234 unsigned int old_flags = dev->flags;
6235 int err;
6236
6237 err = __dev_set_promiscuity(dev, inc, true);
6238 if (err < 0)
6239 return err;
6240 if (dev->flags != old_flags)
6241 dev_set_rx_mode(dev);
6242 return err;
6243 }
6244 EXPORT_SYMBOL(dev_set_promiscuity);
6245
6246 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6247 {
6248 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6249
6250 ASSERT_RTNL();
6251
6252 dev->flags |= IFF_ALLMULTI;
6253 dev->allmulti += inc;
6254 if (dev->allmulti == 0) {
6255 /*
6256 * Avoid overflow.
6257 * If inc causes an overflow, leave allmulti untouched and return an error.
6258 */
6259 if (inc < 0)
6260 dev->flags &= ~IFF_ALLMULTI;
6261 else {
6262 dev->allmulti -= inc;
6263 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6264 dev->name);
6265 return -EOVERFLOW;
6266 }
6267 }
6268 if (dev->flags ^ old_flags) {
6269 dev_change_rx_flags(dev, IFF_ALLMULTI);
6270 dev_set_rx_mode(dev);
6271 if (notify)
6272 __dev_notify_flags(dev, old_flags,
6273 dev->gflags ^ old_gflags);
6274 }
6275 return 0;
6276 }
6277
6278 /**
6279 * dev_set_allmulti - update allmulti count on a device
6280 * @dev: device
6281 * @inc: modifier
6282 *
6283 * Add or remove reception of all multicast frames to a device. While the
6284 * count in the device remains above zero the interface keeps listening
6285 * to all multicast frames. Once it hits zero the device reverts to normal
6286 * filtering operation. A negative @inc value is used to drop the counter
6287 * when releasing a resource needing all multicasts.
6288 * Return 0 if successful or a negative errno code on error.
6289 */
6290
6291 int dev_set_allmulti(struct net_device *dev, int inc)
6292 {
6293 return __dev_set_allmulti(dev, inc, true);
6294 }
6295 EXPORT_SYMBOL(dev_set_allmulti);
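
/* Illustrative sketch (not part of this file): the promiscuity and allmulti
 * counts are reference counts, so every increment must eventually be paired
 * with a matching decrement, under RTNL:
 *
 *	ASSERT_RTNL();
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err < 0)
 *		return err;
 *	...
 *	dev_set_promiscuity(dev, -1);
 *
 * dev_set_allmulti() follows the same pattern with its own counter.
 */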
6296
6297 /*
6298 * Upload unicast and multicast address lists to device and
6299 * configure RX filtering. When the device doesn't support unicast
6300 * filtering it is put in promiscuous mode while unicast addresses
6301 * are present.
6302 */
6303 void __dev_set_rx_mode(struct net_device *dev)
6304 {
6305 const struct net_device_ops *ops = dev->netdev_ops;
6306
6307 /* dev_open will call this function so the list will stay sane. */
6308 if (!(dev->flags&IFF_UP))
6309 return;
6310
6311 if (!netif_device_present(dev))
6312 return;
6313
6314 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6315 /* Unicast address changes may only happen under the rtnl,
6316 * therefore calling __dev_set_promiscuity here is safe.
6317 */
6318 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6319 __dev_set_promiscuity(dev, 1, false);
6320 dev->uc_promisc = true;
6321 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6322 __dev_set_promiscuity(dev, -1, false);
6323 dev->uc_promisc = false;
6324 }
6325 }
6326
6327 if (ops->ndo_set_rx_mode)
6328 ops->ndo_set_rx_mode(dev);
6329 }
6330
6331 void dev_set_rx_mode(struct net_device *dev)
6332 {
6333 netif_addr_lock_bh(dev);
6334 __dev_set_rx_mode(dev);
6335 netif_addr_unlock_bh(dev);
6336 }
6337
6338 /**
6339 * dev_get_flags - get flags reported to userspace
6340 * @dev: device
6341 *
6342 * Get the combination of flag bits exported through APIs to userspace.
6343 */
6344 unsigned int dev_get_flags(const struct net_device *dev)
6345 {
6346 unsigned int flags;
6347
6348 flags = (dev->flags & ~(IFF_PROMISC |
6349 IFF_ALLMULTI |
6350 IFF_RUNNING |
6351 IFF_LOWER_UP |
6352 IFF_DORMANT)) |
6353 (dev->gflags & (IFF_PROMISC |
6354 IFF_ALLMULTI));
6355
6356 if (netif_running(dev)) {
6357 if (netif_oper_up(dev))
6358 flags |= IFF_RUNNING;
6359 if (netif_carrier_ok(dev))
6360 flags |= IFF_LOWER_UP;
6361 if (netif_dormant(dev))
6362 flags |= IFF_DORMANT;
6363 }
6364
6365 return flags;
6366 }
6367 EXPORT_SYMBOL(dev_get_flags);
6368
6369 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6370 {
6371 unsigned int old_flags = dev->flags;
6372 int ret;
6373
6374 ASSERT_RTNL();
6375
6376 /*
6377 * Set the flags on our device.
6378 */
6379
6380 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6381 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6382 IFF_AUTOMEDIA)) |
6383 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6384 IFF_ALLMULTI));
6385
6386 /*
6387 * Load in the correct multicast list now that the flags have changed.
6388 */
6389
6390 if ((old_flags ^ flags) & IFF_MULTICAST)
6391 dev_change_rx_flags(dev, IFF_MULTICAST);
6392
6393 dev_set_rx_mode(dev);
6394
6395 /*
6396 * Have we downed the interface? We handle IFF_UP ourselves
6397 * according to user attempts to set it, rather than blindly
6398 * setting it.
6399 */
6400
6401 ret = 0;
6402 if ((old_flags ^ flags) & IFF_UP)
6403 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6404
6405 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6406 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6407 unsigned int old_flags = dev->flags;
6408
6409 dev->gflags ^= IFF_PROMISC;
6410
6411 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6412 if (dev->flags != old_flags)
6413 dev_set_rx_mode(dev);
6414 }
6415
6416 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6417 is important. Some (broken) drivers set IFF_PROMISC when
6418 IFF_ALLMULTI is requested, without asking us and without reporting it.
6419 */
6420 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6421 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6422
6423 dev->gflags ^= IFF_ALLMULTI;
6424 __dev_set_allmulti(dev, inc, false);
6425 }
6426
6427 return ret;
6428 }
6429
6430 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6431 unsigned int gchanges)
6432 {
6433 unsigned int changes = dev->flags ^ old_flags;
6434
6435 if (gchanges)
6436 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6437
6438 if (changes & IFF_UP) {
6439 if (dev->flags & IFF_UP)
6440 call_netdevice_notifiers(NETDEV_UP, dev);
6441 else
6442 call_netdevice_notifiers(NETDEV_DOWN, dev);
6443 }
6444
6445 if (dev->flags & IFF_UP &&
6446 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6447 struct netdev_notifier_change_info change_info;
6448
6449 change_info.flags_changed = changes;
6450 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6451 &change_info.info);
6452 }
6453 }
6454
6455 /**
6456 * dev_change_flags - change device settings
6457 * @dev: device
6458 * @flags: device state flags
6459 *
6460 * Change settings on a device based on the given state flags. The flags are
6461 * in the userspace exported format.
6462 */
6463 int dev_change_flags(struct net_device *dev, unsigned int flags)
6464 {
6465 int ret;
6466 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6467
6468 ret = __dev_change_flags(dev, flags);
6469 if (ret < 0)
6470 return ret;
6471
6472 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6473 __dev_notify_flags(dev, old_flags, changes);
6474 return ret;
6475 }
6476 EXPORT_SYMBOL(dev_change_flags);
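
/* Illustrative sketch (not part of this file): bringing an interface up from
 * kernel code by OR-ing IFF_UP into the userspace-visible flags, under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *	rtnl_unlock();
 */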
6477
6478 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6479 {
6480 const struct net_device_ops *ops = dev->netdev_ops;
6481
6482 if (ops->ndo_change_mtu)
6483 return ops->ndo_change_mtu(dev, new_mtu);
6484
6485 dev->mtu = new_mtu;
6486 return 0;
6487 }
6488
6489 /**
6490 * dev_set_mtu - Change maximum transfer unit
6491 * @dev: device
6492 * @new_mtu: new transfer unit
6493 *
6494 * Change the maximum transfer size of the network device.
6495 */
6496 int dev_set_mtu(struct net_device *dev, int new_mtu)
6497 {
6498 int err, orig_mtu;
6499
6500 if (new_mtu == dev->mtu)
6501 return 0;
6502
6503 /* MTU must be positive. */
6504 if (new_mtu < 0)
6505 return -EINVAL;
6506
6507 if (!netif_device_present(dev))
6508 return -ENODEV;
6509
6510 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6511 err = notifier_to_errno(err);
6512 if (err)
6513 return err;
6514
6515 orig_mtu = dev->mtu;
6516 err = __dev_set_mtu(dev, new_mtu);
6517
6518 if (!err) {
6519 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6520 err = notifier_to_errno(err);
6521 if (err) {
6522 /* setting mtu back and notifying everyone again,
6523 * so that they have a chance to revert changes.
6524 */
6525 __dev_set_mtu(dev, orig_mtu);
6526 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6527 }
6528 }
6529 return err;
6530 }
6531 EXPORT_SYMBOL(dev_set_mtu);
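
/* Illustrative sketch (not part of this file): callers change the MTU under
 * RTNL and must be prepared for the notifier chain to veto the change:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		pr_info("MTU change rejected: %d\n", err);
 *
 * The value 9000 is only an example; drivers bound the acceptable range in
 * their ndo_change_mtu() implementation.
 */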
6532
6533 /**
6534 * dev_set_group - Change group this device belongs to
6535 * @dev: device
6536 * @new_group: group this device should belong to
6537 */
6538 void dev_set_group(struct net_device *dev, int new_group)
6539 {
6540 dev->group = new_group;
6541 }
6542 EXPORT_SYMBOL(dev_set_group);
6543
6544 /**
6545 * dev_set_mac_address - Change Media Access Control Address
6546 * @dev: device
6547 * @sa: new address
6548 *
6549 * Change the hardware (MAC) address of the device
6550 */
6551 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6552 {
6553 const struct net_device_ops *ops = dev->netdev_ops;
6554 int err;
6555
6556 if (!ops->ndo_set_mac_address)
6557 return -EOPNOTSUPP;
6558 if (sa->sa_family != dev->type)
6559 return -EINVAL;
6560 if (!netif_device_present(dev))
6561 return -ENODEV;
6562 err = ops->ndo_set_mac_address(dev, sa);
6563 if (err)
6564 return err;
6565 dev->addr_assign_type = NET_ADDR_SET;
6566 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6567 add_device_randomness(dev->dev_addr, dev->addr_len);
6568 return 0;
6569 }
6570 EXPORT_SYMBOL(dev_set_mac_address);
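
/* Illustrative sketch (not part of this file): the new address is passed as a
 * struct sockaddr whose family must match dev->type, under RTNL:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa);
 *
 * "new_mac" is an assumed buffer of at least dev->addr_len bytes.
 */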
6571
6572 /**
6573 * dev_change_carrier - Change device carrier
6574 * @dev: device
6575 * @new_carrier: new value
6576 *
6577 * Change device carrier
6578 */
6579 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6580 {
6581 const struct net_device_ops *ops = dev->netdev_ops;
6582
6583 if (!ops->ndo_change_carrier)
6584 return -EOPNOTSUPP;
6585 if (!netif_device_present(dev))
6586 return -ENODEV;
6587 return ops->ndo_change_carrier(dev, new_carrier);
6588 }
6589 EXPORT_SYMBOL(dev_change_carrier);
6590
6591 /**
6592 * dev_get_phys_port_id - Get device physical port ID
6593 * @dev: device
6594 * @ppid: port ID
6595 *
6596 * Get device physical port ID
6597 */
6598 int dev_get_phys_port_id(struct net_device *dev,
6599 struct netdev_phys_item_id *ppid)
6600 {
6601 const struct net_device_ops *ops = dev->netdev_ops;
6602
6603 if (!ops->ndo_get_phys_port_id)
6604 return -EOPNOTSUPP;
6605 return ops->ndo_get_phys_port_id(dev, ppid);
6606 }
6607 EXPORT_SYMBOL(dev_get_phys_port_id);
6608
6609 /**
6610 * dev_get_phys_port_name - Get device physical port name
6611 * @dev: device
6612 * @name: port name
6613 * @len: limit of bytes to copy to name
6614 *
6615 * Get device physical port name
6616 */
6617 int dev_get_phys_port_name(struct net_device *dev,
6618 char *name, size_t len)
6619 {
6620 const struct net_device_ops *ops = dev->netdev_ops;
6621
6622 if (!ops->ndo_get_phys_port_name)
6623 return -EOPNOTSUPP;
6624 return ops->ndo_get_phys_port_name(dev, name, len);
6625 }
6626 EXPORT_SYMBOL(dev_get_phys_port_name);
6627
6628 /**
6629 * dev_change_proto_down - update protocol port state information
6630 * @dev: device
6631 * @proto_down: new value
6632 *
6633 * This info can be used by switch drivers to set the phys state of the
6634 * port.
6635 */
6636 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6637 {
6638 const struct net_device_ops *ops = dev->netdev_ops;
6639
6640 if (!ops->ndo_change_proto_down)
6641 return -EOPNOTSUPP;
6642 if (!netif_device_present(dev))
6643 return -ENODEV;
6644 return ops->ndo_change_proto_down(dev, proto_down);
6645 }
6646 EXPORT_SYMBOL(dev_change_proto_down);
6647
6648 /**
6649 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6650 * @dev: device
6651 * @fd: new program fd or negative value to clear
6652 *
6653 * Set or clear a bpf program for a device
6654 */
6655 int dev_change_xdp_fd(struct net_device *dev, int fd)
6656 {
6657 const struct net_device_ops *ops = dev->netdev_ops;
6658 struct bpf_prog *prog = NULL;
6659 struct netdev_xdp xdp = {};
6660 int err;
6661
6662 if (!ops->ndo_xdp)
6663 return -EOPNOTSUPP;
6664 if (fd >= 0) {
6665 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6666 if (IS_ERR(prog))
6667 return PTR_ERR(prog);
6668 }
6669
6670 xdp.command = XDP_SETUP_PROG;
6671 xdp.prog = prog;
6672 err = ops->ndo_xdp(dev, &xdp);
6673 if (err < 0 && prog)
6674 bpf_prog_put(prog);
6675
6676 return err;
6677 }
6678 EXPORT_SYMBOL(dev_change_xdp_fd);
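
/* Illustrative sketch (not part of this file): rtnetlink attaches an XDP
 * program from a bpf(2) file descriptor under RTNL, and detaches it by
 * passing a negative fd:
 *
 *	ASSERT_RTNL();
 *	err = dev_change_xdp_fd(dev, prog_fd);
 *	...
 *	dev_change_xdp_fd(dev, -1);
 *
 * "prog_fd" is an assumed descriptor returned by BPF_PROG_LOAD for a
 * BPF_PROG_TYPE_XDP program.
 */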
6679
6680 /**
6681 * dev_new_index - allocate an ifindex
6682 * @net: the applicable net namespace
6683 *
6684 * Returns a suitable unique value for a new device interface
6685 * number. The caller must hold the rtnl semaphore or the
6686 * dev_base_lock to be sure it remains unique.
6687 */
6688 static int dev_new_index(struct net *net)
6689 {
6690 int ifindex = net->ifindex;
6691 for (;;) {
6692 if (++ifindex <= 0)
6693 ifindex = 1;
6694 if (!__dev_get_by_index(net, ifindex))
6695 return net->ifindex = ifindex;
6696 }
6697 }
6698
6699 /* Delayed registration/unregisteration */
6700 static LIST_HEAD(net_todo_list);
6701 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6702
6703 static void net_set_todo(struct net_device *dev)
6704 {
6705 list_add_tail(&dev->todo_list, &net_todo_list);
6706 dev_net(dev)->dev_unreg_count++;
6707 }
6708
6709 static void rollback_registered_many(struct list_head *head)
6710 {
6711 struct net_device *dev, *tmp;
6712 LIST_HEAD(close_head);
6713
6714 BUG_ON(dev_boot_phase);
6715 ASSERT_RTNL();
6716
6717 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6718 /* Some devices call without registering
6719 * for initialization unwind. Remove those
6720 * devices and proceed with the remaining.
6721 */
6722 if (dev->reg_state == NETREG_UNINITIALIZED) {
6723 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6724 dev->name, dev);
6725
6726 WARN_ON(1);
6727 list_del(&dev->unreg_list);
6728 continue;
6729 }
6730 dev->dismantle = true;
6731 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6732 }
6733
6734 /* If device is running, close it first. */
6735 list_for_each_entry(dev, head, unreg_list)
6736 list_add_tail(&dev->close_list, &close_head);
6737 dev_close_many(&close_head, true);
6738
6739 list_for_each_entry(dev, head, unreg_list) {
6740 /* And unlink it from device chain. */
6741 unlist_netdevice(dev);
6742
6743 dev->reg_state = NETREG_UNREGISTERING;
6744 }
6745 flush_all_backlogs();
6746
6747 synchronize_net();
6748
6749 list_for_each_entry(dev, head, unreg_list) {
6750 struct sk_buff *skb = NULL;
6751
6752 /* Shutdown queueing discipline. */
6753 dev_shutdown(dev);
6754
6755
6756 /* Notify protocols, that we are about to destroy
6757 this device. They should clean all the things.
6758 */
6759 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6760
6761 if (!dev->rtnl_link_ops ||
6762 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6763 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6764 GFP_KERNEL);
6765
6766 /*
6767 * Flush the unicast and multicast chains
6768 */
6769 dev_uc_flush(dev);
6770 dev_mc_flush(dev);
6771
6772 if (dev->netdev_ops->ndo_uninit)
6773 dev->netdev_ops->ndo_uninit(dev);
6774
6775 if (skb)
6776 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6777
6778 /* Notifier chain MUST detach us all upper devices. */
6779 WARN_ON(netdev_has_any_upper_dev(dev));
6780
6781 /* Remove entries from kobject tree */
6782 netdev_unregister_kobject(dev);
6783 #ifdef CONFIG_XPS
6784 /* Remove XPS queueing entries */
6785 netif_reset_xps_queues_gt(dev, 0);
6786 #endif
6787 }
6788
6789 synchronize_net();
6790
6791 list_for_each_entry(dev, head, unreg_list)
6792 dev_put(dev);
6793 }
6794
6795 static void rollback_registered(struct net_device *dev)
6796 {
6797 LIST_HEAD(single);
6798
6799 list_add(&dev->unreg_list, &single);
6800 rollback_registered_many(&single);
6801 list_del(&single);
6802 }
6803
6804 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6805 struct net_device *upper, netdev_features_t features)
6806 {
6807 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6808 netdev_features_t feature;
6809 int feature_bit;
6810
6811 for_each_netdev_feature(&upper_disables, feature_bit) {
6812 feature = __NETIF_F_BIT(feature_bit);
6813 if (!(upper->wanted_features & feature)
6814 && (features & feature)) {
6815 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6816 &feature, upper->name);
6817 features &= ~feature;
6818 }
6819 }
6820
6821 return features;
6822 }
6823
6824 static void netdev_sync_lower_features(struct net_device *upper,
6825 struct net_device *lower, netdev_features_t features)
6826 {
6827 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6828 netdev_features_t feature;
6829 int feature_bit;
6830
6831 for_each_netdev_feature(&upper_disables, feature_bit) {
6832 feature = __NETIF_F_BIT(feature_bit);
6833 if (!(features & feature) && (lower->features & feature)) {
6834 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6835 &feature, lower->name);
6836 lower->wanted_features &= ~feature;
6837 netdev_update_features(lower);
6838
6839 if (unlikely(lower->features & feature))
6840 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6841 &feature, lower->name);
6842 }
6843 }
6844 }
6845
6846 static netdev_features_t netdev_fix_features(struct net_device *dev,
6847 netdev_features_t features)
6848 {
6849 /* Fix illegal checksum combinations */
6850 if ((features & NETIF_F_HW_CSUM) &&
6851 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6852 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6853 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6854 }
6855
6856 /* TSO requires that SG is present as well. */
6857 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6858 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6859 features &= ~NETIF_F_ALL_TSO;
6860 }
6861
6862 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6863 !(features & NETIF_F_IP_CSUM)) {
6864 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6865 features &= ~NETIF_F_TSO;
6866 features &= ~NETIF_F_TSO_ECN;
6867 }
6868
6869 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6870 !(features & NETIF_F_IPV6_CSUM)) {
6871 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6872 features &= ~NETIF_F_TSO6;
6873 }
6874
6875 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6876 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6877 features &= ~NETIF_F_TSO_MANGLEID;
6878
6879 /* TSO ECN requires that TSO is present as well. */
6880 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6881 features &= ~NETIF_F_TSO_ECN;
6882
6883 /* Software GSO depends on SG. */
6884 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6885 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6886 features &= ~NETIF_F_GSO;
6887 }
6888
6889 /* UFO needs SG and checksumming */
6890 if (features & NETIF_F_UFO) {
6891 /* maybe split UFO into V4 and V6? */
6892 if (!(features & NETIF_F_HW_CSUM) &&
6893 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6894 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6895 netdev_dbg(dev,
6896 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6897 features &= ~NETIF_F_UFO;
6898 }
6899
6900 if (!(features & NETIF_F_SG)) {
6901 netdev_dbg(dev,
6902 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6903 features &= ~NETIF_F_UFO;
6904 }
6905 }
6906
6907 /* GSO partial features require GSO partial be set */
6908 if ((features & dev->gso_partial_features) &&
6909 !(features & NETIF_F_GSO_PARTIAL)) {
6910 netdev_dbg(dev,
6911 "Dropping partially supported GSO features since no GSO partial.\n");
6912 features &= ~dev->gso_partial_features;
6913 }
6914
6915 #ifdef CONFIG_NET_RX_BUSY_POLL
6916 if (dev->netdev_ops->ndo_busy_poll)
6917 features |= NETIF_F_BUSY_POLL;
6918 else
6919 #endif
6920 features &= ~NETIF_F_BUSY_POLL;
6921
6922 return features;
6923 }
6924
6925 int __netdev_update_features(struct net_device *dev)
6926 {
6927 struct net_device *upper, *lower;
6928 netdev_features_t features;
6929 struct list_head *iter;
6930 int err = -1;
6931
6932 ASSERT_RTNL();
6933
6934 features = netdev_get_wanted_features(dev);
6935
6936 if (dev->netdev_ops->ndo_fix_features)
6937 features = dev->netdev_ops->ndo_fix_features(dev, features);
6938
6939 /* driver might be less strict about feature dependencies */
6940 features = netdev_fix_features(dev, features);
6941
6942 /* some features can't be enabled if they're off on an upper device */
6943 netdev_for_each_upper_dev_rcu(dev, upper, iter)
6944 features = netdev_sync_upper_features(dev, upper, features);
6945
6946 if (dev->features == features)
6947 goto sync_lower;
6948
6949 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6950 &dev->features, &features);
6951
6952 if (dev->netdev_ops->ndo_set_features)
6953 err = dev->netdev_ops->ndo_set_features(dev, features);
6954 else
6955 err = 0;
6956
6957 if (unlikely(err < 0)) {
6958 netdev_err(dev,
6959 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6960 err, &features, &dev->features);
6961 /* return non-0 since some features might have changed and
6962 * it's better to fire a spurious notification than miss it
6963 */
6964 return -1;
6965 }
6966
6967 sync_lower:
6968 /* some features must be disabled on lower devices when disabled
6969 * on an upper device (think: bonding master or bridge)
6970 */
6971 netdev_for_each_lower_dev(dev, lower, iter)
6972 netdev_sync_lower_features(dev, lower, features);
6973
6974 if (!err)
6975 dev->features = features;
6976
6977 return err < 0 ? 0 : 1;
6978 }
6979
6980 /**
6981 * netdev_update_features - recalculate device features
6982 * @dev: the device to check
6983 *
6984 * Recalculate dev->features set and send notifications if it
6985 * has changed. Should be called after driver or hardware dependent
6986 * conditions might have changed that influence the features.
6987 */
6988 void netdev_update_features(struct net_device *dev)
6989 {
6990 if (__netdev_update_features(dev))
6991 netdev_features_change(dev);
6992 }
6993 EXPORT_SYMBOL(netdev_update_features);
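
/* Illustrative sketch (not part of this file): a driver whose offload
 * constraints just changed (for example after reconfiguring its hardware)
 * re-runs the feature negotiation while holding RTNL:
 *
 *	ASSERT_RTNL();
 *	netdev_update_features(dev);
 *
 * ndo_fix_features() and netdev_fix_features() are then re-evaluated, and a
 * NETDEV_FEAT_CHANGE notification is sent only if dev->features changed.
 */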
6994
6995 /**
6996 * netdev_change_features - recalculate device features
6997 * @dev: the device to check
6998 *
6999 * Recalculate dev->features set and send notifications even
7000 * if they have not changed. Should be called instead of
7001 * netdev_update_features() if also dev->vlan_features might
7002 * have changed to allow the changes to be propagated to stacked
7003 * VLAN devices.
7004 */
7005 void netdev_change_features(struct net_device *dev)
7006 {
7007 __netdev_update_features(dev);
7008 netdev_features_change(dev);
7009 }
7010 EXPORT_SYMBOL(netdev_change_features);
7011
7012 /**
7013 * netif_stacked_transfer_operstate - transfer operstate
7014 * @rootdev: the root or lower level device to transfer state from
7015 * @dev: the device to transfer operstate to
7016 *
7017 * Transfer operational state from root to device. This is normally
7018 * called when a stacking relationship exists between the root
7019 * device and the device (a leaf device).
7020 */
7021 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7022 struct net_device *dev)
7023 {
7024 if (rootdev->operstate == IF_OPER_DORMANT)
7025 netif_dormant_on(dev);
7026 else
7027 netif_dormant_off(dev);
7028
7029 if (netif_carrier_ok(rootdev)) {
7030 if (!netif_carrier_ok(dev))
7031 netif_carrier_on(dev);
7032 } else {
7033 if (netif_carrier_ok(dev))
7034 netif_carrier_off(dev);
7035 }
7036 }
7037 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7038
7039 #ifdef CONFIG_SYSFS
7040 static int netif_alloc_rx_queues(struct net_device *dev)
7041 {
7042 unsigned int i, count = dev->num_rx_queues;
7043 struct netdev_rx_queue *rx;
7044 size_t sz = count * sizeof(*rx);
7045
7046 BUG_ON(count < 1);
7047
7048 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7049 if (!rx) {
7050 rx = vzalloc(sz);
7051 if (!rx)
7052 return -ENOMEM;
7053 }
7054 dev->_rx = rx;
7055
7056 for (i = 0; i < count; i++)
7057 rx[i].dev = dev;
7058 return 0;
7059 }
7060 #endif
7061
7062 static void netdev_init_one_queue(struct net_device *dev,
7063 struct netdev_queue *queue, void *_unused)
7064 {
7065 /* Initialize queue lock */
7066 spin_lock_init(&queue->_xmit_lock);
7067 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7068 queue->xmit_lock_owner = -1;
7069 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7070 queue->dev = dev;
7071 #ifdef CONFIG_BQL
7072 dql_init(&queue->dql, HZ);
7073 #endif
7074 }
7075
7076 static void netif_free_tx_queues(struct net_device *dev)
7077 {
7078 kvfree(dev->_tx);
7079 }
7080
7081 static int netif_alloc_netdev_queues(struct net_device *dev)
7082 {
7083 unsigned int count = dev->num_tx_queues;
7084 struct netdev_queue *tx;
7085 size_t sz = count * sizeof(*tx);
7086
7087 if (count < 1 || count > 0xffff)
7088 return -EINVAL;
7089
7090 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7091 if (!tx) {
7092 tx = vzalloc(sz);
7093 if (!tx)
7094 return -ENOMEM;
7095 }
7096 dev->_tx = tx;
7097
7098 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7099 spin_lock_init(&dev->tx_global_lock);
7100
7101 return 0;
7102 }
7103
7104 void netif_tx_stop_all_queues(struct net_device *dev)
7105 {
7106 unsigned int i;
7107
7108 for (i = 0; i < dev->num_tx_queues; i++) {
7109 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7110 netif_tx_stop_queue(txq);
7111 }
7112 }
7113 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7114
7115 /**
7116 * register_netdevice - register a network device
7117 * @dev: device to register
7118 *
7119 * Take a completed network device structure and add it to the kernel
7120 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7121 * chain. 0 is returned on success. A negative errno code is returned
7122 * on a failure to set up the device, or if the name is a duplicate.
7123 *
7124 * Callers must hold the rtnl semaphore. You may want
7125 * register_netdev() instead of this.
7126 *
7127 * BUGS:
7128 * The locking appears insufficient to guarantee two parallel registers
7129 * will not get the same name.
7130 */
7131
7132 int register_netdevice(struct net_device *dev)
7133 {
7134 int ret;
7135 struct net *net = dev_net(dev);
7136
7137 BUG_ON(dev_boot_phase);
7138 ASSERT_RTNL();
7139
7140 might_sleep();
7141
7142 /* When net_device structures are persistent, this will be fatal. */
7143 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7144 BUG_ON(!net);
7145
7146 spin_lock_init(&dev->addr_list_lock);
7147 netdev_set_addr_lockdep_class(dev);
7148
7149 ret = dev_get_valid_name(net, dev, dev->name);
7150 if (ret < 0)
7151 goto out;
7152
7153 /* Init, if this function is available */
7154 if (dev->netdev_ops->ndo_init) {
7155 ret = dev->netdev_ops->ndo_init(dev);
7156 if (ret) {
7157 if (ret > 0)
7158 ret = -EIO;
7159 goto out;
7160 }
7161 }
7162
7163 if (((dev->hw_features | dev->features) &
7164 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7165 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7166 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7167 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7168 ret = -EINVAL;
7169 goto err_uninit;
7170 }
7171
7172 ret = -EBUSY;
7173 if (!dev->ifindex)
7174 dev->ifindex = dev_new_index(net);
7175 else if (__dev_get_by_index(net, dev->ifindex))
7176 goto err_uninit;
7177
7178 /* Transfer changeable features to wanted_features and enable
7179 * software offloads (GSO and GRO).
7180 */
7181 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7182 dev->features |= NETIF_F_SOFT_FEATURES;
7183 dev->wanted_features = dev->features & dev->hw_features;
7184
7185 if (!(dev->flags & IFF_LOOPBACK))
7186 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7187
7188 /* If IPv4 TCP segmentation offload is supported we should also
7189 * allow the device to enable segmenting the frame with the option
7190 * of ignoring a static IP ID value. This doesn't enable the
7191 * feature itself but allows the user to enable it later.
7192 */
7193 if (dev->hw_features & NETIF_F_TSO)
7194 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7195 if (dev->vlan_features & NETIF_F_TSO)
7196 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7197 if (dev->mpls_features & NETIF_F_TSO)
7198 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7199 if (dev->hw_enc_features & NETIF_F_TSO)
7200 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7201
7202 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7203 */
7204 dev->vlan_features |= NETIF_F_HIGHDMA;
7205
7206 /* Make NETIF_F_SG inheritable to tunnel devices.
7207 */
7208 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7209
7210 /* Make NETIF_F_SG inheritable to MPLS.
7211 */
7212 dev->mpls_features |= NETIF_F_SG;
7213
7214 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7215 ret = notifier_to_errno(ret);
7216 if (ret)
7217 goto err_uninit;
7218
7219 ret = netdev_register_kobject(dev);
7220 if (ret)
7221 goto err_uninit;
7222 dev->reg_state = NETREG_REGISTERED;
7223
7224 __netdev_update_features(dev);
7225
7226 /*
7227 * Default initial state at registration is that the
7228 * device is present.
7229 */
7230
7231 set_bit(__LINK_STATE_PRESENT, &dev->state);
7232
7233 linkwatch_init_dev(dev);
7234
7235 dev_init_scheduler(dev);
7236 dev_hold(dev);
7237 list_netdevice(dev);
7238 add_device_randomness(dev->dev_addr, dev->addr_len);
7239
7240 /* If the device has a permanent device address, the driver should
7241 * set dev_addr, and addr_assign_type should be left as
7242 * NET_ADDR_PERM (the default value).
7243 */
7244 if (dev->addr_assign_type == NET_ADDR_PERM)
7245 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7246
7247 /* Notify protocols, that a new device appeared. */
7248 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7249 ret = notifier_to_errno(ret);
7250 if (ret) {
7251 rollback_registered(dev);
7252 dev->reg_state = NETREG_UNREGISTERED;
7253 }
7254 /*
7255 * Prevent userspace races by waiting until the network
7256 * device is fully setup before sending notifications.
7257 */
7258 if (!dev->rtnl_link_ops ||
7259 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7260 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7261
7262 out:
7263 return ret;
7264
7265 err_uninit:
7266 if (dev->netdev_ops->ndo_uninit)
7267 dev->netdev_ops->ndo_uninit(dev);
7268 goto out;
7269 }
7270 EXPORT_SYMBOL(register_netdevice);
7271
7272 /**
7273 * init_dummy_netdev - init a dummy network device for NAPI
7274 * @dev: device to init
7275 *
7276 * This takes a network device structure and initializes the minimum
7277 * number of fields so it can be used to schedule NAPI polls without
7278 * registering a full blown interface. This is to be used by drivers
7279 * that need to tie several hardware interfaces to a single NAPI
7280 * poll scheduler due to HW limitations.
7281 */
7282 int init_dummy_netdev(struct net_device *dev)
7283 {
7284 /* Clear everything. Note we don't initialize spinlocks
7285 * as they aren't supposed to be taken by any of the
7286 * NAPI code and this dummy netdev is supposed to be
7287 * only ever used for NAPI polls
7288 */
7289 memset(dev, 0, sizeof(struct net_device));
7290
7291 /* make sure we BUG if trying to hit standard
7292 * register/unregister code path
7293 */
7294 dev->reg_state = NETREG_DUMMY;
7295
7296 /* NAPI wants this */
7297 INIT_LIST_HEAD(&dev->napi_list);
7298
7299 /* a dummy interface is started by default */
7300 set_bit(__LINK_STATE_PRESENT, &dev->state);
7301 set_bit(__LINK_STATE_START, &dev->state);
7302
7303 /* Note : We don't allocate pcpu_refcnt for dummy devices,
7304 * because users of this 'device' don't need to change
7305 * its refcount.
7306 */
7307
7308 return 0;
7309 }
7310 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7311
7312
7313 /**
7314 * register_netdev - register a network device
7315 * @dev: device to register
7316 *
7317 * Take a completed network device structure and add it to the kernel
7318 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7319 * chain. 0 is returned on success. A negative errno code is returned
7320 * on a failure to set up the device, or if the name is a duplicate.
7321 *
7322 * This is a wrapper around register_netdevice that takes the rtnl semaphore
7323 * and expands the device name if you passed a format string to
7324 * alloc_netdev.
7325 */
7326 int register_netdev(struct net_device *dev)
7327 {
7328 int err;
7329
7330 rtnl_lock();
7331 err = register_netdevice(dev);
7332 rtnl_unlock();
7333 return err;
7334 }
7335 EXPORT_SYMBOL(register_netdev);
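
/* Illustrative sketch (not part of this file): the usual driver probe pattern
 * pairs an allocation helper with register_netdev() and unwinds with
 * free_netdev() on failure:
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *
 * "struct my_priv" and "my_netdev_ops" are assumed driver-local names.
 */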
7336
7337 int netdev_refcnt_read(const struct net_device *dev)
7338 {
7339 int i, refcnt = 0;
7340
7341 for_each_possible_cpu(i)
7342 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7343 return refcnt;
7344 }
7345 EXPORT_SYMBOL(netdev_refcnt_read);
7346
7347 /**
7348 * netdev_wait_allrefs - wait until all references are gone.
7349 * @dev: target net_device
7350 *
7351 * This is called when unregistering network devices.
7352 *
7353 * Any protocol or device that holds a reference should register
7354 * for netdevice notification, and clean up and put back the
7355 * reference if they receive an UNREGISTER event.
7356 * We can get stuck here if buggy protocols don't correctly
7357 * call dev_put.
7358 */
7359 static void netdev_wait_allrefs(struct net_device *dev)
7360 {
7361 unsigned long rebroadcast_time, warning_time;
7362 int refcnt;
7363
7364 linkwatch_forget_dev(dev);
7365
7366 rebroadcast_time = warning_time = jiffies;
7367 refcnt = netdev_refcnt_read(dev);
7368
7369 while (refcnt != 0) {
7370 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7371 rtnl_lock();
7372
7373 /* Rebroadcast unregister notification */
7374 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7375
7376 __rtnl_unlock();
7377 rcu_barrier();
7378 rtnl_lock();
7379
7380 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7381 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7382 &dev->state)) {
7383 /* We must not have linkwatch events
7384 * pending on unregister. If this
7385 * happens, we simply run the queue
7386 * unscheduled, resulting in a noop
7387 * for this device.
7388 */
7389 linkwatch_run_queue();
7390 }
7391
7392 __rtnl_unlock();
7393
7394 rebroadcast_time = jiffies;
7395 }
7396
7397 msleep(250);
7398
7399 refcnt = netdev_refcnt_read(dev);
7400
7401 if (time_after(jiffies, warning_time + 10 * HZ)) {
7402 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7403 dev->name, refcnt);
7404 warning_time = jiffies;
7405 }
7406 }
7407 }
7408
7409 /* The sequence is:
7410 *
7411 * rtnl_lock();
7412 * ...
7413 * register_netdevice(x1);
7414 * register_netdevice(x2);
7415 * ...
7416 * unregister_netdevice(y1);
7417 * unregister_netdevice(y2);
7418 * ...
7419 * rtnl_unlock();
7420 * free_netdev(y1);
7421 * free_netdev(y2);
7422 *
7423 * We are invoked by rtnl_unlock().
7424 * This allows us to deal with problems:
7425 * 1) We can delete sysfs objects which invoke hotplug
7426 * without deadlocking with linkwatch via keventd.
7427 * 2) Since we run with the RTNL semaphore not held, we can sleep
7428 * safely in order to wait for the netdev refcnt to drop to zero.
7429 *
7430 * We must not return until all unregister events added during
7431 * the interval the lock was held have been completed.
7432 */
7433 void netdev_run_todo(void)
7434 {
7435 struct list_head list;
7436
7437 /* Snapshot list, allow later requests */
7438 list_replace_init(&net_todo_list, &list);
7439
7440 __rtnl_unlock();
7441
7442
7443 /* Wait for rcu callbacks to finish before next phase */
7444 if (!list_empty(&list))
7445 rcu_barrier();
7446
7447 while (!list_empty(&list)) {
7448 struct net_device *dev
7449 = list_first_entry(&list, struct net_device, todo_list);
7450 list_del(&dev->todo_list);
7451
7452 rtnl_lock();
7453 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7454 __rtnl_unlock();
7455
7456 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7457 pr_err("network todo '%s' but state %d\n",
7458 dev->name, dev->reg_state);
7459 dump_stack();
7460 continue;
7461 }
7462
7463 dev->reg_state = NETREG_UNREGISTERED;
7464
7465 netdev_wait_allrefs(dev);
7466
7467 /* paranoia */
7468 BUG_ON(netdev_refcnt_read(dev));
7469 BUG_ON(!list_empty(&dev->ptype_all));
7470 BUG_ON(!list_empty(&dev->ptype_specific));
7471 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7472 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7473 WARN_ON(dev->dn_ptr);
7474
7475 if (dev->destructor)
7476 dev->destructor(dev);
7477
7478 /* Report a network device has been unregistered */
7479 rtnl_lock();
7480 dev_net(dev)->dev_unreg_count--;
7481 __rtnl_unlock();
7482 wake_up(&netdev_unregistering_wq);
7483
7484 /* Free network device */
7485 kobject_put(&dev->dev.kobj);
7486 }
7487 }
7488
7489 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7490 * all the same fields in the same order as net_device_stats, with only
7491 * the type differing, but rtnl_link_stats64 may have additional fields
7492 * at the end for newer counters.
7493 */
7494 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7495 const struct net_device_stats *netdev_stats)
7496 {
7497 #if BITS_PER_LONG == 64
7498 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7499 memcpy(stats64, netdev_stats, sizeof(*stats64));
7500 /* zero out counters that only exist in rtnl_link_stats64 */
7501 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7502 sizeof(*stats64) - sizeof(*netdev_stats));
7503 #else
7504 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7505 const unsigned long *src = (const unsigned long *)netdev_stats;
7506 u64 *dst = (u64 *)stats64;
7507
7508 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7509 for (i = 0; i < n; i++)
7510 dst[i] = src[i];
7511 /* zero out counters that only exist in rtnl_link_stats64 */
7512 memset((char *)stats64 + n * sizeof(u64), 0,
7513 sizeof(*stats64) - n * sizeof(u64));
7514 #endif
7515 }
7516 EXPORT_SYMBOL(netdev_stats_to_stats64);
7517
7518 /**
7519 * dev_get_stats - get network device statistics
7520 * @dev: device to get statistics from
7521 * @storage: place to store stats
7522 *
7523 * Get network statistics from device. Return @storage.
7524 * The device driver may provide its own method by setting
7525 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7526 * otherwise the internal statistics structure is used.
7527 */
7528 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7529 struct rtnl_link_stats64 *storage)
7530 {
7531 const struct net_device_ops *ops = dev->netdev_ops;
7532
7533 if (ops->ndo_get_stats64) {
7534 memset(storage, 0, sizeof(*storage));
7535 ops->ndo_get_stats64(dev, storage);
7536 } else if (ops->ndo_get_stats) {
7537 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7538 } else {
7539 netdev_stats_to_stats64(storage, &dev->stats);
7540 }
7541 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7542 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7543 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7544 return storage;
7545 }
7546 EXPORT_SYMBOL(dev_get_stats);
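
/* Illustrative sketch (not part of this file): readers supply their own
 * rtnl_link_stats64 and consume the filled-in copy:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_debug("%s: %llu rx packets\n", dev->name, stats.rx_packets);
 *
 * The device must be kept alive across the call (e.g. via dev_hold() or by
 * holding RTNL/RCU), as the in-kernel callers do.
 */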
7547
7548 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7549 {
7550 struct netdev_queue *queue = dev_ingress_queue(dev);
7551
7552 #ifdef CONFIG_NET_CLS_ACT
7553 if (queue)
7554 return queue;
7555 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7556 if (!queue)
7557 return NULL;
7558 netdev_init_one_queue(dev, queue, NULL);
7559 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7560 queue->qdisc_sleeping = &noop_qdisc;
7561 rcu_assign_pointer(dev->ingress_queue, queue);
7562 #endif
7563 return queue;
7564 }
7565
7566 static const struct ethtool_ops default_ethtool_ops;
7567
7568 void netdev_set_default_ethtool_ops(struct net_device *dev,
7569 const struct ethtool_ops *ops)
7570 {
7571 if (dev->ethtool_ops == &default_ethtool_ops)
7572 dev->ethtool_ops = ops;
7573 }
7574 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7575
7576 void netdev_freemem(struct net_device *dev)
7577 {
7578 char *addr = (char *)dev - dev->padded;
7579
7580 kvfree(addr);
7581 }
7582
7583 /**
7584 * alloc_netdev_mqs - allocate network device
7585 * @sizeof_priv: size of private data to allocate space for
7586 * @name: device name format string
7587 * @name_assign_type: origin of device name
7588 * @setup: callback to initialize device
7589 * @txqs: the number of TX subqueues to allocate
7590 * @rxqs: the number of RX subqueues to allocate
7591 *
7592 * Allocates a struct net_device with private data area for driver use
7593 * and performs basic initialization. Also allocates subqueue structs
7594 * for each queue on the device.
7595 */
7596 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7597 unsigned char name_assign_type,
7598 void (*setup)(struct net_device *),
7599 unsigned int txqs, unsigned int rxqs)
7600 {
7601 struct net_device *dev;
7602 size_t alloc_size;
7603 struct net_device *p;
7604
7605 BUG_ON(strlen(name) >= sizeof(dev->name));
7606
7607 if (txqs < 1) {
7608 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7609 return NULL;
7610 }
7611
7612 #ifdef CONFIG_SYSFS
7613 if (rxqs < 1) {
7614 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7615 return NULL;
7616 }
7617 #endif
7618
7619 alloc_size = sizeof(struct net_device);
7620 if (sizeof_priv) {
7621 /* ensure 32-byte alignment of private area */
7622 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7623 alloc_size += sizeof_priv;
7624 }
7625 /* ensure 32-byte alignment of whole construct */
7626 alloc_size += NETDEV_ALIGN - 1;
7627
7628 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7629 if (!p)
7630 p = vzalloc(alloc_size);
7631 if (!p)
7632 return NULL;
7633
7634 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7635 dev->padded = (char *)dev - (char *)p;
7636
7637 dev->pcpu_refcnt = alloc_percpu(int);
7638 if (!dev->pcpu_refcnt)
7639 goto free_dev;
7640
7641 if (dev_addr_init(dev))
7642 goto free_pcpu;
7643
7644 dev_mc_init(dev);
7645 dev_uc_init(dev);
7646
7647 dev_net_set(dev, &init_net);
7648
7649 dev->gso_max_size = GSO_MAX_SIZE;
7650 dev->gso_max_segs = GSO_MAX_SEGS;
7651
7652 INIT_LIST_HEAD(&dev->napi_list);
7653 INIT_LIST_HEAD(&dev->unreg_list);
7654 INIT_LIST_HEAD(&dev->close_list);
7655 INIT_LIST_HEAD(&dev->link_watch_list);
7656 INIT_LIST_HEAD(&dev->adj_list.upper);
7657 INIT_LIST_HEAD(&dev->adj_list.lower);
7658 INIT_LIST_HEAD(&dev->all_adj_list.upper);
7659 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7660 INIT_LIST_HEAD(&dev->ptype_all);
7661 INIT_LIST_HEAD(&dev->ptype_specific);
7662 #ifdef CONFIG_NET_SCHED
7663 hash_init(dev->qdisc_hash);
7664 #endif
7665 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7666 setup(dev);
7667
7668 if (!dev->tx_queue_len) {
7669 dev->priv_flags |= IFF_NO_QUEUE;
7670 dev->tx_queue_len = 1;
7671 }
7672
7673 dev->num_tx_queues = txqs;
7674 dev->real_num_tx_queues = txqs;
7675 if (netif_alloc_netdev_queues(dev))
7676 goto free_all;
7677
7678 #ifdef CONFIG_SYSFS
7679 dev->num_rx_queues = rxqs;
7680 dev->real_num_rx_queues = rxqs;
7681 if (netif_alloc_rx_queues(dev))
7682 goto free_all;
7683 #endif
7684
7685 strcpy(dev->name, name);
7686 dev->name_assign_type = name_assign_type;
7687 dev->group = INIT_NETDEV_GROUP;
7688 if (!dev->ethtool_ops)
7689 dev->ethtool_ops = &default_ethtool_ops;
7690
7691 nf_hook_ingress_init(dev);
7692
7693 return dev;
7694
7695 free_all:
7696 free_netdev(dev);
7697 return NULL;
7698
7699 free_pcpu:
7700 free_percpu(dev->pcpu_refcnt);
7701 free_dev:
7702 netdev_freemem(dev);
7703 return NULL;
7704 }
7705 EXPORT_SYMBOL(alloc_netdev_mqs);
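
/* Illustrative sketch (not part of this file): callers that need more than
 * one queue pair call this directly instead of the single-queue wrappers:
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *
 * "struct my_priv" and the "myeth%d" format are assumed names; ether_setup()
 * is the standard Ethernet setup callback.
 */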
7706
7707 /**
7708 * free_netdev - free network device
7709 * @dev: device
7710 *
7711 * This function does the last stage of destroying an allocated device
7712 * interface. The reference to the device object is released.
7713 * If this is the last reference then it will be freed.
7714 * Must be called in process context.
7715 */
7716 void free_netdev(struct net_device *dev)
7717 {
7718 struct napi_struct *p, *n;
7719
7720 might_sleep();
7721 netif_free_tx_queues(dev);
7722 #ifdef CONFIG_SYSFS
7723 kvfree(dev->_rx);
7724 #endif
7725
7726 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7727
7728 /* Flush device addresses */
7729 dev_addr_flush(dev);
7730
7731 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7732 netif_napi_del(p);
7733
7734 free_percpu(dev->pcpu_refcnt);
7735 dev->pcpu_refcnt = NULL;
7736
7737 /* Compatibility with error handling in drivers */
7738 if (dev->reg_state == NETREG_UNINITIALIZED) {
7739 netdev_freemem(dev);
7740 return;
7741 }
7742
7743 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7744 dev->reg_state = NETREG_RELEASED;
7745
7746 /* will free via device release */
7747 put_device(&dev->dev);
7748 }
7749 EXPORT_SYMBOL(free_netdev);
7750
7751 /**
7752 * synchronize_net - Synchronize with packet receive processing
7753 *
7754 * Wait for packets currently being received to be done.
7755 * Does not block later packets from starting.
7756 */
7757 void synchronize_net(void)
7758 {
7759 might_sleep();
7760 if (rtnl_is_locked())
7761 synchronize_rcu_expedited();
7762 else
7763 synchronize_rcu();
7764 }
7765 EXPORT_SYMBOL(synchronize_net);
7766
7767 /**
7768 * unregister_netdevice_queue - remove device from the kernel
7769 * @dev: device
7770 * @head: list
7771 *
7772 * This function shuts down a device interface and removes it
7773 * from the kernel tables.
7774 * If head is not NULL, the device is queued to be unregistered later.
7775 *
7776 * Callers must hold the rtnl semaphore. You may want
7777 * unregister_netdev() instead of this.
7778 */
7779
7780 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7781 {
7782 ASSERT_RTNL();
7783
7784 if (head) {
7785 list_move_tail(&dev->unreg_list, head);
7786 } else {
7787 rollback_registered(dev);
7788 /* Finish processing unregister after unlock */
7789 net_set_todo(dev);
7790 }
7791 }
7792 EXPORT_SYMBOL(unregister_netdevice_queue);
7793
7794 /**
7795 * unregister_netdevice_many - unregister many devices
7796 * @head: list of devices
7797 *
7798 * Note: As most callers use a stack-allocated list_head,
7799 * we force a list_del() to make sure the stack won't be corrupted later.
7800 */
7801 void unregister_netdevice_many(struct list_head *head)
7802 {
7803 struct net_device *dev;
7804
7805 if (!list_empty(head)) {
7806 rollback_registered_many(head);
7807 list_for_each_entry(dev, head, unreg_list)
7808 net_set_todo(dev);
7809 list_del(head);
7810 }
7811 }
7812 EXPORT_SYMBOL(unregister_netdevice_many);
7813
7814 /**
7815 * unregister_netdev - remove device from the kernel
7816 * @dev: device
7817 *
7818 * This function shuts down a device interface and removes it
7819 * from the kernel tables.
7820 *
7821 * This is just a wrapper for unregister_netdevice that takes
7822 * the rtnl semaphore. In general you want to use this and not
7823 * unregister_netdevice.
7824 */
7825 void unregister_netdev(struct net_device *dev)
7826 {
7827 rtnl_lock();
7828 unregister_netdevice(dev);
7829 rtnl_unlock();
7830 }
7831 EXPORT_SYMBOL(unregister_netdev);
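
/* Illustrative sketch (not part of this file): teardown mirrors the probe
 * path; unregister first, then free once the last reference is gone:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */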
7832
7833 /**
7834 * dev_change_net_namespace - move device to a different network namespace
7835 * @dev: device
7836 * @net: network namespace
7837 * @pat: If not NULL name pattern to try if the current device name
7838 * is already taken in the destination network namespace.
7839 *
7840 * This function shuts down a device interface and moves it
7841 * to a new network namespace. On success 0 is returned, on
7842 * a failure a negative errno code is returned.
7843 *
7844 * Callers must hold the rtnl semaphore.
7845 */
7846
7847 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7848 {
7849 int err;
7850
7851 ASSERT_RTNL();
7852
7853 /* Don't allow namespace local devices to be moved. */
7854 err = -EINVAL;
7855 if (dev->features & NETIF_F_NETNS_LOCAL)
7856 goto out;
7857
7858 /* Ensure the device has been registered */
7859 if (dev->reg_state != NETREG_REGISTERED)
7860 goto out;
7861
7862 /* Get out if there is nothing to do */
7863 err = 0;
7864 if (net_eq(dev_net(dev), net))
7865 goto out;
7866
7867 /* Pick the destination device name, and ensure
7868 * we can use it in the destination network namespace.
7869 */
7870 err = -EEXIST;
7871 if (__dev_get_by_name(net, dev->name)) {
7872 /* We get here if we can't use the current device name */
7873 if (!pat)
7874 goto out;
7875 if (dev_get_valid_name(net, dev, pat) < 0)
7876 goto out;
7877 }
7878
7879 /*
7880 * And now a mini version of register_netdevice() and unregister_netdevice().
7881 */
7882
7883 /* If the device is running, close it first. */
7884 dev_close(dev);
7885
7886 /* And unlink it from device chain */
7887 err = -ENODEV;
7888 unlist_netdevice(dev);
7889
7890 synchronize_net();
7891
7892 /* Shutdown queueing discipline. */
7893 dev_shutdown(dev);
7894
7895 /* Notify protocols that we are about to destroy
7896 this device. They should clean up all of their state.
7897
7898 Note that dev->reg_state stays at NETREG_REGISTERED:
7899 this way 8021q and macvlan know the device is just
7900 moving and can keep their slaves up.
7901 */
7902 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7903 rcu_barrier();
7904 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7905 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7906
7907 /*
7908 * Flush the unicast and multicast chains
7909 */
7910 dev_uc_flush(dev);
7911 dev_mc_flush(dev);
7912
7913 /* Send a netdev-removed uevent to the old namespace */
7914 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7915 netdev_adjacent_del_links(dev);
7916
7917 /* Actually switch the network namespace */
7918 dev_net_set(dev, net);
7919
7920 /* If there is an ifindex conflict assign a new one */
7921 if (__dev_get_by_index(net, dev->ifindex))
7922 dev->ifindex = dev_new_index(net);
7923
7924 /* Send a netdev-add uevent to the new namespace */
7925 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7926 netdev_adjacent_add_links(dev);
7927
7928 /* Fixup kobjects */
7929 err = device_rename(&dev->dev, dev->name);
7930 WARN_ON(err);
7931
7932 /* Add the device back in the hashes */
7933 list_netdevice(dev);
7934
7935 /* Notify protocols, that a new device appeared. */
7936 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7937
7938 /*
7939 * Prevent userspace races by waiting until the network
7940 * device is fully setup before sending notifications.
7941 */
7942 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7943
7944 synchronize_net();
7945 err = 0;
7946 out:
7947 return err;
7948 }
7949 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
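
/*
 * Illustrative sketch (editor's addition, not part of dev.c): moving a
 * device into a namespace identified by a process id, falling back to a
 * "dev%d" name if the current name is taken there.
 * "example_move_to_pid_ns" is a hypothetical helper; get_net_ns_by_pid()
 * and put_net() are the regular namespace lookup/refcount primitives.
 */
static int example_move_to_pid_ns(struct net_device *dev, pid_t pid)
{
	struct net *net;
	int err;

	net = get_net_ns_by_pid(pid);
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();

	put_net(net);
	return err;
}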
7950
7951 static int dev_cpu_callback(struct notifier_block *nfb,
7952 unsigned long action,
7953 void *ocpu)
7954 {
7955 struct sk_buff **list_skb;
7956 struct sk_buff *skb;
7957 unsigned int cpu, oldcpu = (unsigned long)ocpu;
7958 struct softnet_data *sd, *oldsd;
7959
7960 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7961 return NOTIFY_OK;
7962
7963 local_irq_disable();
7964 cpu = smp_processor_id();
7965 sd = &per_cpu(softnet_data, cpu);
7966 oldsd = &per_cpu(softnet_data, oldcpu);
7967
7968 /* Find end of our completion_queue. */
7969 list_skb = &sd->completion_queue;
7970 while (*list_skb)
7971 list_skb = &(*list_skb)->next;
7972 /* Append completion queue from offline CPU. */
7973 *list_skb = oldsd->completion_queue;
7974 oldsd->completion_queue = NULL;
7975
7976 /* Append output queue from offline CPU. */
7977 if (oldsd->output_queue) {
7978 *sd->output_queue_tailp = oldsd->output_queue;
7979 sd->output_queue_tailp = oldsd->output_queue_tailp;
7980 oldsd->output_queue = NULL;
7981 oldsd->output_queue_tailp = &oldsd->output_queue;
7982 }
7983 /* Append NAPI poll list from offline CPU, with one exception:
7984 * process_backlog() must be called by the CPU owning the per-CPU backlog.
7985 * We properly handle process_queue & input_pkt_queue later.
7986 */
7987 while (!list_empty(&oldsd->poll_list)) {
7988 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7989 struct napi_struct,
7990 poll_list);
7991
7992 list_del_init(&napi->poll_list);
7993 if (napi->poll == process_backlog)
7994 napi->state = 0;
7995 else
7996 ____napi_schedule(sd, napi);
7997 }
7998
7999 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8000 local_irq_enable();
8001
8002 /* Process offline CPU's input_pkt_queue */
8003 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8004 netif_rx_ni(skb);
8005 input_queue_head_incr(oldsd);
8006 }
8007 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8008 netif_rx_ni(skb);
8009 input_queue_head_incr(oldsd);
8010 }
8011
8012 return NOTIFY_OK;
8013 }
8014
8015
8016 /**
8017 * netdev_increment_features - increment feature set by one
8018 * @all: current feature set
8019 * @one: new feature set
8020 * @mask: mask feature set
8021 *
8022 * Computes a new feature set after adding a device with feature set
8023 * @one to the master device with current feature set @all. Will not
8024 * enable anything that is off in @mask. Returns the new feature set.
8025 */
8026 netdev_features_t netdev_increment_features(netdev_features_t all,
8027 netdev_features_t one, netdev_features_t mask)
8028 {
8029 if (mask & NETIF_F_HW_CSUM)
8030 mask |= NETIF_F_CSUM_MASK;
8031 mask |= NETIF_F_VLAN_CHALLENGED;
8032
8033 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8034 all &= one | ~NETIF_F_ALL_FOR_ALL;
8035
8036 /* If one device supports hw checksumming, set for all. */
8037 if (all & NETIF_F_HW_CSUM)
8038 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8039
8040 return all;
8041 }
8042 EXPORT_SYMBOL(netdev_increment_features);
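
/*
 * Illustrative sketch (editor's addition, not part of dev.c): how a
 * master device (bridge, bond, team) might recompute its feature set from
 * its slaves.  Start from the mask and fold in each slave; a real driver
 * also handles the empty-slave case.  "EXAMPLE_FEATURE_MASK" and
 * "struct example_slave" are hypothetical.
 */
#define EXAMPLE_FEATURE_MASK	(NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA)

struct example_slave {			/* hypothetical slave bookkeeping */
	struct list_head list;
	struct net_device *dev;
};

static netdev_features_t example_compute_features(struct list_head *slaves)
{
	netdev_features_t features = EXAMPLE_FEATURE_MASK;
	struct example_slave *s;

	list_for_each_entry(s, slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features,
						     EXAMPLE_FEATURE_MASK);
	return features;
}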
8043
8044 static struct hlist_head * __net_init netdev_create_hash(void)
8045 {
8046 int i;
8047 struct hlist_head *hash;
8048
8049 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8050 if (hash != NULL)
8051 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8052 INIT_HLIST_HEAD(&hash[i]);
8053
8054 return hash;
8055 }
8056
8057 /* Initialize per network namespace state */
8058 static int __net_init netdev_init(struct net *net)
8059 {
8060 if (net != &init_net)
8061 INIT_LIST_HEAD(&net->dev_base_head);
8062
8063 net->dev_name_head = netdev_create_hash();
8064 if (net->dev_name_head == NULL)
8065 goto err_name;
8066
8067 net->dev_index_head = netdev_create_hash();
8068 if (net->dev_index_head == NULL)
8069 goto err_idx;
8070
8071 return 0;
8072
8073 err_idx:
8074 kfree(net->dev_name_head);
8075 err_name:
8076 return -ENOMEM;
8077 }
8078
8079 /**
8080 * netdev_drivername - network driver for the device
8081 * @dev: network device
8082 *
8083 * Determine network driver for device.
8084 */
8085 const char *netdev_drivername(const struct net_device *dev)
8086 {
8087 const struct device_driver *driver;
8088 const struct device *parent;
8089 const char *empty = "";
8090
8091 parent = dev->dev.parent;
8092 if (!parent)
8093 return empty;
8094
8095 driver = parent->driver;
8096 if (driver && driver->name)
8097 return driver->name;
8098 return empty;
8099 }
8100
8101 static void __netdev_printk(const char *level, const struct net_device *dev,
8102 struct va_format *vaf)
8103 {
8104 if (dev && dev->dev.parent) {
8105 dev_printk_emit(level[1] - '0',
8106 dev->dev.parent,
8107 "%s %s %s%s: %pV",
8108 dev_driver_string(dev->dev.parent),
8109 dev_name(dev->dev.parent),
8110 netdev_name(dev), netdev_reg_state(dev),
8111 vaf);
8112 } else if (dev) {
8113 printk("%s%s%s: %pV",
8114 level, netdev_name(dev), netdev_reg_state(dev), vaf);
8115 } else {
8116 printk("%s(NULL net_device): %pV", level, vaf);
8117 }
8118 }
8119
8120 void netdev_printk(const char *level, const struct net_device *dev,
8121 const char *format, ...)
8122 {
8123 struct va_format vaf;
8124 va_list args;
8125
8126 va_start(args, format);
8127
8128 vaf.fmt = format;
8129 vaf.va = &args;
8130
8131 __netdev_printk(level, dev, &vaf);
8132
8133 va_end(args);
8134 }
8135 EXPORT_SYMBOL(netdev_printk);
8136
8137 #define define_netdev_printk_level(func, level) \
8138 void func(const struct net_device *dev, const char *fmt, ...) \
8139 { \
8140 struct va_format vaf; \
8141 va_list args; \
8142 \
8143 va_start(args, fmt); \
8144 \
8145 vaf.fmt = fmt; \
8146 vaf.va = &args; \
8147 \
8148 __netdev_printk(level, dev, &vaf); \
8149 \
8150 va_end(args); \
8151 } \
8152 EXPORT_SYMBOL(func);
8153
8154 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8155 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8156 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8157 define_netdev_printk_level(netdev_err, KERN_ERR);
8158 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8159 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8160 define_netdev_printk_level(netdev_info, KERN_INFO);
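
/*
 * Illustrative sketch (editor's addition, not part of dev.c): the
 * per-level helpers defined above are what drivers normally use for
 * device-prefixed log messages.  "example_open" is a hypothetical
 * ndo_open implementation.
 */
static int example_open(struct net_device *dev)
{
	if (!netif_device_present(dev)) {
		netdev_err(dev, "device not present, cannot open\n");
		return -ENODEV;
	}

	netdev_info(dev, "link configured, starting queues\n");
	netif_start_queue(dev);
	return 0;
}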
8161
8162 static void __net_exit netdev_exit(struct net *net)
8163 {
8164 kfree(net->dev_name_head);
8165 kfree(net->dev_index_head);
8166 }
8167
8168 static struct pernet_operations __net_initdata netdev_net_ops = {
8169 .init = netdev_init,
8170 .exit = netdev_exit,
8171 };
8172
8173 static void __net_exit default_device_exit(struct net *net)
8174 {
8175 struct net_device *dev, *aux;
8176 /*
8177 * Push all migratable network devices back to the
8178 * initial network namespace
8179 */
8180 rtnl_lock();
8181 for_each_netdev_safe(net, dev, aux) {
8182 int err;
8183 char fb_name[IFNAMSIZ];
8184
8185 /* Ignore unmovable devices (e.g. loopback) */
8186 if (dev->features & NETIF_F_NETNS_LOCAL)
8187 continue;
8188
8189 /* Leave virtual devices for the generic cleanup */
8190 if (dev->rtnl_link_ops)
8191 continue;
8192
8193 /* Push remaining network devices to init_net */
8194 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8195 err = dev_change_net_namespace(dev, &init_net, fb_name);
8196 if (err) {
8197 pr_emerg("%s: failed to move %s to init_net: %d\n",
8198 __func__, dev->name, err);
8199 BUG();
8200 }
8201 }
8202 rtnl_unlock();
8203 }
8204
8205 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8206 {
8207 /* Return with the rtnl_lock held when there are no network
8208 * devices unregistering in any network namespace in net_list.
8209 */
8210 struct net *net;
8211 bool unregistering;
8212 DEFINE_WAIT_FUNC(wait, woken_wake_function);
8213
8214 add_wait_queue(&netdev_unregistering_wq, &wait);
8215 for (;;) {
8216 unregistering = false;
8217 rtnl_lock();
8218 list_for_each_entry(net, net_list, exit_list) {
8219 if (net->dev_unreg_count > 0) {
8220 unregistering = true;
8221 break;
8222 }
8223 }
8224 if (!unregistering)
8225 break;
8226 __rtnl_unlock();
8227
8228 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8229 }
8230 remove_wait_queue(&netdev_unregistering_wq, &wait);
8231 }
8232
8233 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8234 {
8235 /* At exit all network devices must be removed from a network
8236 * namespace. Do this in the reverse order of registration.
8237 * Do this across as many network namespaces as possible to
8238 * improve batching efficiency.
8239 */
8240 struct net_device *dev;
8241 struct net *net;
8242 LIST_HEAD(dev_kill_list);
8243
8244 /* To prevent network device cleanup code from dereferencing
8245 * loopback devices or network devices that have been freed,
8246 * wait here for all pending unregistrations to complete
8247 * before unregistering the loopback device and allowing the
8248 * network namespace to be freed.
8249 *
8250 * The netdev todo list containing all network device
8251 * unregistrations that happen in default_device_exit_batch
8252 * will run in the rtnl_unlock() at the end of
8253 * default_device_exit_batch.
8254 */
8255 rtnl_lock_unregistering(net_list);
8256 list_for_each_entry(net, net_list, exit_list) {
8257 for_each_netdev_reverse(net, dev) {
8258 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8259 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8260 else
8261 unregister_netdevice_queue(dev, &dev_kill_list);
8262 }
8263 }
8264 unregister_netdevice_many(&dev_kill_list);
8265 rtnl_unlock();
8266 }
8267
8268 static struct pernet_operations __net_initdata default_device_ops = {
8269 .exit = default_device_exit,
8270 .exit_batch = default_device_exit_batch,
8271 };
8272
8273 /*
8274 * Initialize the DEV module. At boot time this walks the device list and
8275 * unhooks any devices that fail to initialise (normally hardware not
8276 * present) and leaves us with a valid list of present and active devices.
8277 *
8278 */
8279
8280 /*
8281 * This is called single threaded during boot, so no need
8282 * to take the rtnl semaphore.
8283 */
8284 static int __init net_dev_init(void)
8285 {
8286 int i, rc = -ENOMEM;
8287
8288 BUG_ON(!dev_boot_phase);
8289
8290 if (dev_proc_init())
8291 goto out;
8292
8293 if (netdev_kobject_init())
8294 goto out;
8295
8296 INIT_LIST_HEAD(&ptype_all);
8297 for (i = 0; i < PTYPE_HASH_SIZE; i++)
8298 INIT_LIST_HEAD(&ptype_base[i]);
8299
8300 INIT_LIST_HEAD(&offload_base);
8301
8302 if (register_pernet_subsys(&netdev_net_ops))
8303 goto out;
8304
8305 /*
8306 * Initialise the packet receive queues.
8307 */
8308
8309 for_each_possible_cpu(i) {
8310 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8311 struct softnet_data *sd = &per_cpu(softnet_data, i);
8312
8313 INIT_WORK(flush, flush_backlog);
8314
8315 skb_queue_head_init(&sd->input_pkt_queue);
8316 skb_queue_head_init(&sd->process_queue);
8317 INIT_LIST_HEAD(&sd->poll_list);
8318 sd->output_queue_tailp = &sd->output_queue;
8319 #ifdef CONFIG_RPS
8320 sd->csd.func = rps_trigger_softirq;
8321 sd->csd.info = sd;
8322 sd->cpu = i;
8323 #endif
8324
8325 sd->backlog.poll = process_backlog;
8326 sd->backlog.weight = weight_p;
8327 }
8328
8329 dev_boot_phase = 0;
8330
8331 /* The loopback device is special: if any other network device
8332 * is present in a network namespace, the loopback device must
8333 * be present too. Since we now dynamically allocate and free the
8334 * loopback device, ensure this invariant is maintained by
8335 * keeping the loopback device as the first device on the
8336 * list of network devices, so that it is the first device
8337 * that appears and the last network device
8338 * that disappears.
8339 */
8340 if (register_pernet_device(&loopback_net_ops))
8341 goto out;
8342
8343 if (register_pernet_device(&default_device_ops))
8344 goto out;
8345
8346 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8347 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8348
8349 hotcpu_notifier(dev_cpu_callback, 0);
8350 dst_subsys_init();
8351 rc = 0;
8352 out:
8353 return rc;
8354 }
8355
8356 subsys_initcall(net_dev_init);