1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <linux/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/crash_dump.h>
143
144 #include "net-sysfs.h"
145
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly; /* Taps */
156 static struct list_head offload_base __read_mostly;
157
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160 struct net_device *dev,
161 struct netdev_notifier_info *info);
162
163 /*
164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165 * semaphore.
166 *
167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168 *
169 * Writers must hold the rtnl semaphore while they loop through the
170 * dev_base_head list, and hold dev_base_lock for writing when they do the
171 * actual updates. This allows pure readers to access the list even
172 * while a writer is preparing to update it.
173 *
174 * To put it another way, dev_base_lock is held for writing only to
175 * protect against pure readers; the rtnl semaphore provides the
176 * protection against other writers.
177 *
178 * See, for example usages, register_netdevice() and
179 * unregister_netdevice(), which must be called with the rtnl
180 * semaphore held.
181 */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
184
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190
191 static seqcount_t devnet_rename_seq;
192
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195 while (++net->dev_base_seq == 0);
196 }
197
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201
202 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227 struct net *net = dev_net(dev);
228
229 ASSERT_RTNL();
230
231 write_lock_bh(&dev_base_lock);
232 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 hlist_add_head_rcu(&dev->index_hlist,
235 dev_index_hash(net, dev->ifindex));
236 write_unlock_bh(&dev_base_lock);
237
238 dev_base_seq_inc(net);
239 }
240
241 /* Device list removal
242 * caller must respect a RCU grace period before freeing/reusing dev
243 */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246 ASSERT_RTNL();
247
248 /* Unlink dev from the device chain */
249 write_lock_bh(&dev_base_lock);
250 list_del_rcu(&dev->dev_list);
251 hlist_del_rcu(&dev->name_hlist);
252 hlist_del_rcu(&dev->index_hlist);
253 write_unlock_bh(&dev_base_lock);
254
255 dev_base_seq_inc(dev_net(dev));
256 }
257
258 /*
259 * Our notifier list
260 */
261
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263
264 /*
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
267 */
268
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272 #ifdef CONFIG_LOCKDEP
273 /*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
277 static const unsigned short netdev_lock_type[] =
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293
294 static const char *const netdev_lock_name[] =
295 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316 int i;
317
318 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 if (netdev_lock_type[i] == dev_type)
320 return i;
321 /* the last key is used by default */
322 return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 unsigned short dev_type)
327 {
328 int i;
329
330 i = netdev_lock_pos(dev_type);
331 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 netdev_lock_name[i]);
333 }
334
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337 int i;
338
339 i = netdev_lock_pos(dev->type);
340 lockdep_set_class_and_name(&dev->addr_list_lock,
341 &netdev_addr_lock_key[i],
342 netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353
354 /*******************************************************************************
355
356 Protocol management and registration routines
357
358 *******************************************************************************/
359
360 /*
361 * Add a protocol ID to the list. Now that the input handler is
362 * smarter we can dispense with all the messy stuff that used to be
363 * here.
364 *
365 * BEWARE!!! Protocol handlers, mangling input packets,
366 * MUST BE last in hash buckets and checking protocol handlers
367 * MUST start from promiscuous ptype_all chain in net_bh.
368 * It is true now, do not change it.
369 * Explanation follows: if protocol handler, mangling packet, will
370 * be the first on list, it is not able to sense, that packet
371 * is cloned and should be copied-on-write, so that it will
372 * change it and subsequent readers will get broken packet.
373 * --ANK (980803)
374 */
375
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378 if (pt->type == htons(ETH_P_ALL))
379 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 else
381 return pt->dev ? &pt->dev->ptype_specific :
382 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384
385 /**
386 * dev_add_pack - add packet handler
387 * @pt: packet type declaration
388 *
389 * Add a protocol handler to the networking stack. The passed &packet_type
390 * is linked into kernel lists and may not be freed until it has been
391 * removed from the kernel lists.
392 *
 393  * This call does not sleep, therefore it cannot guarantee that
 394  * all CPUs that are in the middle of receiving packets will see
 395  * the new packet type (until the next received packet).
396 */
397
398 void dev_add_pack(struct packet_type *pt)
399 {
400 struct list_head *head = ptype_head(pt);
401
402 spin_lock(&ptype_lock);
403 list_add_rcu(&pt->list, head);
404 spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407
408 /**
409 * __dev_remove_pack - remove packet handler
410 * @pt: packet type declaration
411 *
412 * Remove a protocol handler that was previously added to the kernel
413 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
414 * from the kernel lists and can be freed or reused once this function
415 * returns.
416 *
417 * The packet type might still be in use by receivers
418 * and must not be freed until after all the CPU's have gone
419 * through a quiescent state.
420 */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 struct list_head *head = ptype_head(pt);
424 struct packet_type *pt1;
425
426 spin_lock(&ptype_lock);
427
428 list_for_each_entry(pt1, head, list) {
429 if (pt == pt1) {
430 list_del_rcu(&pt->list);
431 goto out;
432 }
433 }
434
435 pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437 spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440
441 /**
442 * dev_remove_pack - remove packet handler
443 * @pt: packet type declaration
444 *
445 * Remove a protocol handler that was previously added to the kernel
446 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
447 * from the kernel lists and can be freed or reused once this function
448 * returns.
449 *
450 * This call sleeps to guarantee that no CPU is looking at the packet
451 * type after return.
452 */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 __dev_remove_pack(pt);
456
457 synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
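/*
 * Illustrative sketch (not part of the original file): how a module might
 * register and later remove a packet handler with dev_add_pack() and
 * dev_remove_pack(). The handler, struct and helper names below are
 * hypothetical; the ethertype is just an example. A real handler must
 * consume or free the skb it is handed.
 */
static int example_pack_rcv(struct sk_buff *skb, struct net_device *dev,
			    struct packet_type *pt, struct net_device *orig_dev)
{
	/* Inspect the (possibly shared) skb here, then release our reference. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pack __maybe_unused = {
	.type = cpu_to_be16(ETH_P_IP),	/* or htons(ETH_P_ALL) to tap everything */
	.func = example_pack_rcv,
};

static void __maybe_unused example_pack_toggle(bool on)
{
	if (on)
		dev_add_pack(&example_pack);	/* linked until removed */
	else
		dev_remove_pack(&example_pack);	/* sleeps via synchronize_net() */
}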
460
461
462 /**
463 * dev_add_offload - register offload handlers
464 * @po: protocol offload declaration
465 *
466 * Add protocol offload handlers to the networking stack. The passed
467 * &proto_offload is linked into kernel lists and may not be freed until
468 * it has been removed from the kernel lists.
469 *
 470  * This call does not sleep, therefore it cannot guarantee that
 471  * all CPUs that are in the middle of receiving packets will see
 472  * the new offload handlers (until the next received packet).
473 */
474 void dev_add_offload(struct packet_offload *po)
475 {
476 struct packet_offload *elem;
477
478 spin_lock(&offload_lock);
479 list_for_each_entry(elem, &offload_base, list) {
480 if (po->priority < elem->priority)
481 break;
482 }
483 list_add_rcu(&po->list, elem->list.prev);
484 spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487
488 /**
489 * __dev_remove_offload - remove offload handler
490 * @po: packet offload declaration
491 *
492 * Remove a protocol offload handler that was previously added to the
493 * kernel offload handlers by dev_add_offload(). The passed &offload_type
494 * is removed from the kernel lists and can be freed or reused once this
495 * function returns.
496 *
497 * The packet type might still be in use by receivers
498 * and must not be freed until after all the CPU's have gone
499 * through a quiescent state.
500 */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503 struct list_head *head = &offload_base;
504 struct packet_offload *po1;
505
506 spin_lock(&offload_lock);
507
508 list_for_each_entry(po1, head, list) {
509 if (po == po1) {
510 list_del_rcu(&po->list);
511 goto out;
512 }
513 }
514
515 pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517 spin_unlock(&offload_lock);
518 }
519
520 /**
521 * dev_remove_offload - remove packet offload handler
522 * @po: packet offload declaration
523 *
524 * Remove a packet offload handler that was previously added to the kernel
525 * offload handlers by dev_add_offload(). The passed &offload_type is
526 * removed from the kernel lists and can be freed or reused once this
527 * function returns.
528 *
529 * This call sleeps to guarantee that no CPU is looking at the packet
530 * type after return.
531 */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534 __dev_remove_offload(po);
535
536 synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
539
540 /******************************************************************************
541
542 Device Boot-time Settings Routines
543
544 *******************************************************************************/
545
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549 /**
550 * netdev_boot_setup_add - add new setup entry
551 * @name: name of the device
552 * @map: configured settings for the device
553 *
 554  * Adds a new setup entry to the dev_boot_setup list. The function
 555  * returns 0 on error and 1 on success. This is a generic routine for
 556  * all netdevices.
557 */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560 struct netdev_boot_setup *s;
561 int i;
562
563 s = dev_boot_setup;
564 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 memset(s[i].name, 0, sizeof(s[i].name));
567 strlcpy(s[i].name, name, IFNAMSIZ);
568 memcpy(&s[i].map, map, sizeof(s[i].map));
569 break;
570 }
571 }
572
573 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575
576 /**
577 * netdev_boot_setup_check - check boot time settings
578 * @dev: the netdevice
579 *
580 * Check boot time settings for the device.
 581  * The found settings are applied to the device so they can be used
 582  * later during device probing.
 583  * Returns 0 if no settings are found, 1 if they are.
584 */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587 struct netdev_boot_setup *s = dev_boot_setup;
588 int i;
589
590 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 !strcmp(dev->name, s[i].name)) {
593 dev->irq = s[i].map.irq;
594 dev->base_addr = s[i].map.base_addr;
595 dev->mem_start = s[i].map.mem_start;
596 dev->mem_end = s[i].map.mem_end;
597 return 1;
598 }
599 }
600 return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603
604
605 /**
606 * netdev_boot_base - get address from boot time settings
607 * @prefix: prefix for network device
608 * @unit: id for network device
609 *
 610  * Check boot time settings for the base address of the device.
 611  * The found settings are set for the device to be used
 612  * later during device probing.
 613  * Returns 0 if no settings are found.
614 */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617 const struct netdev_boot_setup *s = dev_boot_setup;
618 char name[IFNAMSIZ];
619 int i;
620
621 sprintf(name, "%s%d", prefix, unit);
622
623 /*
624 * If device already registered then return base of 1
625 * to indicate not to probe for this interface
626 */
627 if (__dev_get_by_name(&init_net, name))
628 return 1;
629
630 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 if (!strcmp(name, s[i].name))
632 return s[i].map.base_addr;
633 return 0;
634 }
635
636 /*
637 * Saves at boot time configured settings for any netdevice.
638 */
639 int __init netdev_boot_setup(char *str)
640 {
641 int ints[5];
642 struct ifmap map;
643
644 str = get_options(str, ARRAY_SIZE(ints), ints);
645 if (!str || !*str)
646 return 0;
647
648 /* Save settings */
649 memset(&map, 0, sizeof(map));
650 if (ints[0] > 0)
651 map.irq = ints[1];
652 if (ints[0] > 1)
653 map.base_addr = ints[2];
654 if (ints[0] > 2)
655 map.mem_start = ints[3];
656 if (ints[0] > 3)
657 map.mem_end = ints[4];
658
659 /* Add new entry to the list */
660 return netdev_boot_setup_add(str, &map);
661 }
662
663 __setup("netdev=", netdev_boot_setup);
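/*
 * Example (illustrative, values hypothetical): with the parser above, a
 * kernel command line such as
 *
 *	netdev=5,0x240,0,0,eth0
 *
 * stores irq=5, base_addr=0x240, mem_start=0 and mem_end=0 for the device
 * named "eth0"; the string left over by get_options() is taken as the name
 * and handed to netdev_boot_setup_add().
 */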
664
665 /*******************************************************************************
666
667 Device Interface Subroutines
668
669 *******************************************************************************/
670
671 /**
 672  * dev_get_iflink - get 'iflink' value of an interface
673 * @dev: targeted interface
674 *
675 * Indicates the ifindex the interface is linked to.
676 * Physical interfaces have the same 'ifindex' and 'iflink' values.
677 */
678
679 int dev_get_iflink(const struct net_device *dev)
680 {
681 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 return dev->netdev_ops->ndo_get_iflink(dev);
683
684 return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687
688 /**
689 * dev_fill_metadata_dst - Retrieve tunnel egress information.
690 * @dev: targeted interface
691 * @skb: The packet.
692 *
 693  * For better visibility of tunnel traffic, OVS needs to retrieve
 694  * egress tunnel information for a packet. The following API allows
 695  * the user to get this info.
696 */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699 struct ip_tunnel_info *info;
700
701 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
702 return -EINVAL;
703
704 info = skb_tunnel_info_unclone(skb);
705 if (!info)
706 return -ENOMEM;
707 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 return -EINVAL;
709
710 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
714 /**
715 * __dev_get_by_name - find a device by its name
716 * @net: the applicable net namespace
717 * @name: name to find
718 *
719 * Find an interface by name. Must be called under RTNL semaphore
720 * or @dev_base_lock. If the name is found a pointer to the device
721 * is returned. If the name is not found then %NULL is returned. The
722 * reference counters are not incremented so the caller must be
723 * careful with locks.
724 */
725
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728 struct net_device *dev;
729 struct hlist_head *head = dev_name_hash(net, name);
730
731 hlist_for_each_entry(dev, head, name_hlist)
732 if (!strncmp(dev->name, name, IFNAMSIZ))
733 return dev;
734
735 return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738
739 /**
740 * dev_get_by_name_rcu - find a device by its name
741 * @net: the applicable net namespace
742 * @name: name to find
743 *
744 * Find an interface by name.
745 * If the name is found a pointer to the device is returned.
746 * If the name is not found then %NULL is returned.
747 * The reference counters are not incremented so the caller must be
748 * careful with locks. The caller must hold RCU lock.
749 */
750
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753 struct net_device *dev;
754 struct hlist_head *head = dev_name_hash(net, name);
755
756 hlist_for_each_entry_rcu(dev, head, name_hlist)
757 if (!strncmp(dev->name, name, IFNAMSIZ))
758 return dev;
759
760 return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763
764 /**
765 * dev_get_by_name - find a device by its name
766 * @net: the applicable net namespace
767 * @name: name to find
768 *
769 * Find an interface by name. This can be called from any
770 * context and does its own locking. The returned handle has
771 * the usage count incremented and the caller must use dev_put() to
772 * release it when it is no longer needed. %NULL is returned if no
773 * matching device is found.
774 */
775
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778 struct net_device *dev;
779
780 rcu_read_lock();
781 dev = dev_get_by_name_rcu(net, name);
782 if (dev)
783 dev_hold(dev);
784 rcu_read_unlock();
785 return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
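/*
 * Illustrative sketch (not part of the original file): the two lookup styles
 * above. dev_get_by_name() takes a reference that must be dropped with
 * dev_put(); dev_get_by_name_rcu() takes none but is only valid inside an
 * RCU read-side critical section. The function and interface names used
 * here are hypothetical.
 */
static bool __maybe_unused example_lookup_by_name(struct net *net)
{
	struct net_device *dev;
	bool up;

	dev = dev_get_by_name(net, "eth0");
	if (!dev)
		return false;
	up = netif_running(dev);
	dev_put(dev);				/* release the reference we took */

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		up = netif_running(dev);	/* dev only valid until unlock */
	rcu_read_unlock();

	return up;
}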
788
789 /**
790 * __dev_get_by_index - find a device by its ifindex
791 * @net: the applicable net namespace
792 * @ifindex: index of device
793 *
794 * Search for an interface by index. Returns %NULL if the device
795 * is not found or a pointer to the device. The device has not
796 * had its reference counter increased so the caller must be careful
797 * about locking. The caller must hold either the RTNL semaphore
798 * or @dev_base_lock.
799 */
800
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803 struct net_device *dev;
804 struct hlist_head *head = dev_index_hash(net, ifindex);
805
806 hlist_for_each_entry(dev, head, index_hlist)
807 if (dev->ifindex == ifindex)
808 return dev;
809
810 return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813
814 /**
815 * dev_get_by_index_rcu - find a device by its ifindex
816 * @net: the applicable net namespace
817 * @ifindex: index of device
818 *
819 * Search for an interface by index. Returns %NULL if the device
820 * is not found or a pointer to the device. The device has not
821 * had its reference counter increased so the caller must be careful
822 * about locking. The caller must hold RCU lock.
823 */
824
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827 struct net_device *dev;
828 struct hlist_head *head = dev_index_hash(net, ifindex);
829
830 hlist_for_each_entry_rcu(dev, head, index_hlist)
831 if (dev->ifindex == ifindex)
832 return dev;
833
834 return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837
838
839 /**
840 * dev_get_by_index - find a device by its ifindex
841 * @net: the applicable net namespace
842 * @ifindex: index of device
843 *
844 * Search for an interface by index. Returns NULL if the device
845 * is not found or a pointer to the device. The device returned has
846 * had a reference added and the pointer is safe until the user calls
847 * dev_put to indicate they have finished with it.
848 */
849
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852 struct net_device *dev;
853
854 rcu_read_lock();
855 dev = dev_get_by_index_rcu(net, ifindex);
856 if (dev)
857 dev_hold(dev);
858 rcu_read_unlock();
859 return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862
863 /**
864 * netdev_get_name - get a netdevice name, knowing its ifindex.
865 * @net: network namespace
866 * @name: a pointer to the buffer where the name will be stored.
867 * @ifindex: the ifindex of the interface to get the name from.
868 *
869 * The use of raw_seqcount_begin() and cond_resched() before
870 * retrying is required as we want to give the writers a chance
871 * to complete when CONFIG_PREEMPT is not set.
872 */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875 struct net_device *dev;
876 unsigned int seq;
877
878 retry:
879 seq = raw_seqcount_begin(&devnet_rename_seq);
880 rcu_read_lock();
881 dev = dev_get_by_index_rcu(net, ifindex);
882 if (!dev) {
883 rcu_read_unlock();
884 return -ENODEV;
885 }
886
887 strcpy(name, dev->name);
888 rcu_read_unlock();
889 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 cond_resched();
891 goto retry;
892 }
893
894 return 0;
895 }
896
897 /**
898 * dev_getbyhwaddr_rcu - find a device by its hardware address
899 * @net: the applicable net namespace
900 * @type: media type of device
901 * @ha: hardware address
902 *
903 * Search for an interface by MAC address. Returns NULL if the device
904 * is not found or a pointer to the device.
905 * The caller must hold RCU or RTNL.
906 * The returned device has not had its ref count increased
907 * and the caller must therefore be careful about locking
908 *
909 */
910
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 const char *ha)
913 {
914 struct net_device *dev;
915
916 for_each_netdev_rcu(net, dev)
917 if (dev->type == type &&
918 !memcmp(dev->dev_addr, ha, dev->addr_len))
919 return dev;
920
921 return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
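/*
 * Illustrative sketch (not part of the original file): looking up an Ethernet
 * device by MAC address under RCU, as the kerneldoc above requires. The
 * function name and the address bytes are hypothetical.
 */
static int __maybe_unused example_lookup_by_hwaddr(struct net *net)
{
	static const unsigned char mac[ETH_ALEN] = {
		0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };
	struct net_device *dev;
	int ifindex = 0;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, (const char *)mac);
	if (dev)
		ifindex = dev->ifindex;	/* no reference held; copy what we need */
	rcu_read_unlock();

	return ifindex;
}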
924
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927 struct net_device *dev;
928
929 ASSERT_RTNL();
930 for_each_netdev(net, dev)
931 if (dev->type == type)
932 return dev;
933
934 return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940 struct net_device *dev, *ret = NULL;
941
942 rcu_read_lock();
943 for_each_netdev_rcu(net, dev)
944 if (dev->type == type) {
945 dev_hold(dev);
946 ret = dev;
947 break;
948 }
949 rcu_read_unlock();
950 return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954 /**
955 * __dev_get_by_flags - find any device with given flags
956 * @net: the applicable net namespace
957 * @if_flags: IFF_* values
958 * @mask: bitmask of bits in if_flags to check
959 *
960 * Search for any interface with the given flags. Returns NULL if a device
961 * is not found or a pointer to the device. Must be called inside
962 * rtnl_lock(), and result refcount is unchanged.
963 */
964
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 unsigned short mask)
967 {
968 struct net_device *dev, *ret;
969
970 ASSERT_RTNL();
971
972 ret = NULL;
973 for_each_netdev(net, dev) {
974 if (((dev->flags ^ if_flags) & mask) == 0) {
975 ret = dev;
976 break;
977 }
978 }
979 return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982
983 /**
984 * dev_valid_name - check if name is okay for network device
985 * @name: name string
986 *
 987  * Network device names need to be valid file names to
 988  * allow sysfs to work. We also disallow any kind of
989 * whitespace.
990 */
991 bool dev_valid_name(const char *name)
992 {
993 if (*name == '\0')
994 return false;
995 if (strlen(name) >= IFNAMSIZ)
996 return false;
997 if (!strcmp(name, ".") || !strcmp(name, ".."))
998 return false;
999
1000 while (*name) {
1001 if (*name == '/' || *name == ':' || isspace(*name))
1002 return false;
1003 name++;
1004 }
1005 return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008
1009 /**
1010 * __dev_alloc_name - allocate a name for a device
1011 * @net: network namespace to allocate the device name in
1012 * @name: name format string
1013 * @buf: scratch buffer and result name string
1014 *
1015  * Passed a format string - eg "lt%d" - it will try to find a suitable
1016  * id. It scans the list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid
1019 * duplicates.
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026 int i = 0;
1027 const char *p;
1028 const int max_netdevices = 8*PAGE_SIZE;
1029 unsigned long *inuse;
1030 struct net_device *d;
1031
1032 p = strnchr(name, IFNAMSIZ-1, '%');
1033 if (p) {
1034 /*
1035 * Verify the string as this thing may have come from
1036 * the user. There must be either one "%d" and no other "%"
1037 * characters.
1038 */
1039 if (p[1] != 'd' || strchr(p + 2, '%'))
1040 return -EINVAL;
1041
1042 /* Use one page as a bit array of possible slots */
1043 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044 if (!inuse)
1045 return -ENOMEM;
1046
1047 for_each_netdev(net, d) {
1048 if (!sscanf(d->name, name, &i))
1049 continue;
1050 if (i < 0 || i >= max_netdevices)
1051 continue;
1052
1053 /* avoid cases where sscanf is not exact inverse of printf */
1054 snprintf(buf, IFNAMSIZ, name, i);
1055 if (!strncmp(buf, d->name, IFNAMSIZ))
1056 set_bit(i, inuse);
1057 }
1058
1059 i = find_first_zero_bit(inuse, max_netdevices);
1060 free_page((unsigned long) inuse);
1061 }
1062
1063 if (buf != name)
1064 snprintf(buf, IFNAMSIZ, name, i);
1065 if (!__dev_get_by_name(net, buf))
1066 return i;
1067
1068 /* It is possible to run out of possible slots
1069 * when the name is long and there isn't enough space left
1070 * for the digits, or if all bits are used.
1071 */
1072 return -ENFILE;
1073 }
1074
1075 /**
1076 * dev_alloc_name - allocate a name for a device
1077 * @dev: device
1078 * @name: name format string
1079 *
1080  * Passed a format string - eg "lt%d" - it will try to find a suitable
1081  * id. It scans the list of devices to build up a free map, then chooses
1082 * the first empty slot. The caller must hold the dev_base or rtnl lock
1083 * while allocating the name and adding the device in order to avoid
1084 * duplicates.
1085 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 * Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091 char buf[IFNAMSIZ];
1092 struct net *net;
1093 int ret;
1094
1095 BUG_ON(!dev_net(dev));
1096 net = dev_net(dev);
1097 ret = __dev_alloc_name(net, name, buf);
1098 if (ret >= 0)
1099 strlcpy(dev->name, buf, IFNAMSIZ);
1100 return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
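/*
 * Illustrative sketch (not part of the original file): picking a free unit
 * number for a freshly allocated device before registering it, under the
 * RTNL lock as the kerneldoc above requires. The function name and the
 * "foo%d" template are hypothetical.
 */
static int __maybe_unused example_name_device(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();

	unit = dev_alloc_name(dev, "foo%d");	/* fills in dev->name on success */
	if (unit < 0)
		return unit;			/* -EINVAL or -ENFILE */

	netdev_info(dev, "assigned unit %d\n", unit);
	return 0;
}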
1103
1104 static int dev_alloc_name_ns(struct net *net,
1105 struct net_device *dev,
1106 const char *name)
1107 {
1108 char buf[IFNAMSIZ];
1109 int ret;
1110
1111 ret = __dev_alloc_name(net, name, buf);
1112 if (ret >= 0)
1113 strlcpy(dev->name, buf, IFNAMSIZ);
1114 return ret;
1115 }
1116
1117 static int dev_get_valid_name(struct net *net,
1118 struct net_device *dev,
1119 const char *name)
1120 {
1121 BUG_ON(!net);
1122
1123 if (!dev_valid_name(name))
1124 return -EINVAL;
1125
1126 if (strchr(name, '%'))
1127 return dev_alloc_name_ns(net, dev, name);
1128 else if (__dev_get_by_name(net, name))
1129 return -EEXIST;
1130 else if (dev->name != name)
1131 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133 return 0;
1134 }
1135
1136 /**
1137 * dev_change_name - change name of a device
1138 * @dev: device
1139 * @newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141  * Change the name of a device. A format string such as "eth%d"
1142  * may be passed for wildcarding.
1143 */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146 unsigned char old_assign_type;
1147 char oldname[IFNAMSIZ];
1148 int err = 0;
1149 int ret;
1150 struct net *net;
1151
1152 ASSERT_RTNL();
1153 BUG_ON(!dev_net(dev));
1154
1155 net = dev_net(dev);
1156 if (dev->flags & IFF_UP)
1157 return -EBUSY;
1158
1159 write_seqcount_begin(&devnet_rename_seq);
1160
1161 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162 write_seqcount_end(&devnet_rename_seq);
1163 return 0;
1164 }
1165
1166 memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168 err = dev_get_valid_name(net, dev, newname);
1169 if (err < 0) {
1170 write_seqcount_end(&devnet_rename_seq);
1171 return err;
1172 }
1173
1174 if (oldname[0] && !strchr(oldname, '%'))
1175 netdev_info(dev, "renamed from %s\n", oldname);
1176
1177 old_assign_type = dev->name_assign_type;
1178 dev->name_assign_type = NET_NAME_RENAMED;
1179
1180 rollback:
1181 ret = device_rename(&dev->dev, dev->name);
1182 if (ret) {
1183 memcpy(dev->name, oldname, IFNAMSIZ);
1184 dev->name_assign_type = old_assign_type;
1185 write_seqcount_end(&devnet_rename_seq);
1186 return ret;
1187 }
1188
1189 write_seqcount_end(&devnet_rename_seq);
1190
1191 netdev_adjacent_rename_links(dev, oldname);
1192
1193 write_lock_bh(&dev_base_lock);
1194 hlist_del_rcu(&dev->name_hlist);
1195 write_unlock_bh(&dev_base_lock);
1196
1197 synchronize_rcu();
1198
1199 write_lock_bh(&dev_base_lock);
1200 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201 write_unlock_bh(&dev_base_lock);
1202
1203 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204 ret = notifier_to_errno(ret);
1205
1206 if (ret) {
1207 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208 if (err >= 0) {
1209 err = ret;
1210 write_seqcount_begin(&devnet_rename_seq);
1211 memcpy(dev->name, oldname, IFNAMSIZ);
1212 memcpy(oldname, newname, IFNAMSIZ);
1213 dev->name_assign_type = old_assign_type;
1214 old_assign_type = NET_NAME_RENAMED;
1215 goto rollback;
1216 } else {
1217 pr_err("%s: name change rollback failed: %d\n",
1218 dev->name, ret);
1219 }
1220 }
1221
1222 return err;
1223 }
1224
1225 /**
1226 * dev_set_alias - change ifalias of a device
1227 * @dev: device
1228 * @alias: name up to IFALIASZ
1229  * @len: limit of bytes to copy from @alias
1230  *
1231  * Set the ifalias for a device.
1232 */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235 char *new_ifalias;
1236
1237 ASSERT_RTNL();
1238
1239 if (len >= IFALIASZ)
1240 return -EINVAL;
1241
1242 if (!len) {
1243 kfree(dev->ifalias);
1244 dev->ifalias = NULL;
1245 return 0;
1246 }
1247
1248 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 if (!new_ifalias)
1250 return -ENOMEM;
1251 dev->ifalias = new_ifalias;
1252
1253 strlcpy(dev->ifalias, alias, len+1);
1254 return len;
1255 }
1256
1257
1258 /**
1259 * netdev_features_change - device changes features
1260 * @dev: device to cause notification
1261 *
1262 * Called to indicate a device has changed features.
1263 */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269
1270 /**
1271 * netdev_state_change - device changes state
1272 * @dev: device to cause notification
1273 *
1274 * Called to indicate a device has changed state. This function calls
1275 * the notifier chains for netdev_chain and sends a NEWLINK message
1276 * to the routing socket.
1277 */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280 if (dev->flags & IFF_UP) {
1281 struct netdev_notifier_change_info change_info;
1282
1283 change_info.flags_changed = 0;
1284 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 &change_info.info);
1286 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 }
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290
1291 /**
1292 * netdev_notify_peers - notify network peers about existence of @dev
1293 * @dev: network device
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303 rtnl_lock();
1304 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308
1309 static int __dev_open(struct net_device *dev)
1310 {
1311 const struct net_device_ops *ops = dev->netdev_ops;
1312 int ret;
1313
1314 ASSERT_RTNL();
1315
1316 if (!netif_device_present(dev))
1317 return -ENODEV;
1318
1319 /* Block netpoll from trying to do any rx path servicing.
1320 * If we don't do this there is a chance ndo_poll_controller
1321 * or ndo_poll may be running while we open the device
1322 */
1323 netpoll_poll_disable(dev);
1324
1325 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 ret = notifier_to_errno(ret);
1327 if (ret)
1328 return ret;
1329
1330 set_bit(__LINK_STATE_START, &dev->state);
1331
1332 if (ops->ndo_validate_addr)
1333 ret = ops->ndo_validate_addr(dev);
1334
1335 if (!ret && ops->ndo_open)
1336 ret = ops->ndo_open(dev);
1337
1338 netpoll_poll_enable(dev);
1339
1340 if (ret)
1341 clear_bit(__LINK_STATE_START, &dev->state);
1342 else {
1343 dev->flags |= IFF_UP;
1344 dev_set_rx_mode(dev);
1345 dev_activate(dev);
1346 add_device_randomness(dev->dev_addr, dev->addr_len);
1347 }
1348
1349 return ret;
1350 }
1351
1352 /**
1353 * dev_open - prepare an interface for use.
1354 * @dev: device to open
1355 *
1356 * Takes a device from down to up state. The device's private open
1357 * function is invoked and then the multicast lists are loaded. Finally
1358 * the device is moved into the up state and a %NETDEV_UP message is
1359 * sent to the netdev notifier chain.
1360 *
1361 * Calling this function on an active interface is a nop. On a failure
1362 * a negative errno code is returned.
1363 */
1364 int dev_open(struct net_device *dev)
1365 {
1366 int ret;
1367
1368 if (dev->flags & IFF_UP)
1369 return 0;
1370
1371 ret = __dev_open(dev);
1372 if (ret < 0)
1373 return ret;
1374
1375 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378 return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384 struct net_device *dev;
1385
1386 ASSERT_RTNL();
1387 might_sleep();
1388
1389 list_for_each_entry(dev, head, close_list) {
1390 /* Temporarily disable netpoll until the interface is down */
1391 netpoll_poll_disable(dev);
1392
1393 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395 clear_bit(__LINK_STATE_START, &dev->state);
1396
1397 /* Synchronize to scheduled poll. We cannot touch poll list, it
1398 * can be even on different cpu. So just clear netif_running().
1399 *
1400  * dev->stop() will invoke napi_disable() on all of its
1401 * napi_struct instances on this device.
1402 */
1403 smp_mb__after_atomic(); /* Commit netif_running(). */
1404 }
1405
1406 dev_deactivate_many(head);
1407
1408 list_for_each_entry(dev, head, close_list) {
1409 const struct net_device_ops *ops = dev->netdev_ops;
1410
1411 /*
1412 * Call the device specific close. This cannot fail.
1413 * Only if device is UP
1414 *
1415 * We allow it to be called even after a DETACH hot-plug
1416 * event.
1417 */
1418 if (ops->ndo_stop)
1419 ops->ndo_stop(dev);
1420
1421 dev->flags &= ~IFF_UP;
1422 netpoll_poll_enable(dev);
1423 }
1424
1425 return 0;
1426 }
1427
1428 static int __dev_close(struct net_device *dev)
1429 {
1430 int retval;
1431 LIST_HEAD(single);
1432
1433 list_add(&dev->close_list, &single);
1434 retval = __dev_close_many(&single);
1435 list_del(&single);
1436
1437 return retval;
1438 }
1439
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442 struct net_device *dev, *tmp;
1443
1444 /* Remove the devices that don't need to be closed */
1445 list_for_each_entry_safe(dev, tmp, head, close_list)
1446 if (!(dev->flags & IFF_UP))
1447 list_del_init(&dev->close_list);
1448
1449 __dev_close_many(head);
1450
1451 list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 if (unlink)
1455 list_del_init(&dev->close_list);
1456 }
1457
1458 return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461
1462 /**
1463 * dev_close - shutdown an interface.
1464 * @dev: device to shutdown
1465 *
1466 * This function moves an active device into down state. A
1467 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 * chain.
1470 */
1471 int dev_close(struct net_device *dev)
1472 {
1473 if (dev->flags & IFF_UP) {
1474 LIST_HEAD(single);
1475
1476 list_add(&dev->close_list, &single);
1477 dev_close_many(&single, true);
1478 list_del(&single);
1479 }
1480 return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
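/*
 * Illustrative sketch (not part of the original file): toggling an interface
 * administratively up or down from process context. Both dev_open() and
 * dev_close() must run under the RTNL lock; the function name and interface
 * name are hypothetical.
 */
static int __maybe_unused example_set_if_up(struct net *net, bool up)
{
	struct net_device *dev;
	int err = 0;

	rtnl_lock();
	dev = __dev_get_by_name(net, "eth0");	/* no ref needed under RTNL */
	if (!dev)
		err = -ENODEV;
	else if (up)
		err = dev_open(dev);		/* nop if already IFF_UP */
	else
		err = dev_close(dev);		/* always returns 0 */
	rtnl_unlock();

	return err;
}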
1483
1484
1485 /**
1486 * dev_disable_lro - disable Large Receive Offload on a device
1487 * @dev: device
1488 *
1489 * Disable Large Receive Offload (LRO) on a net device. Must be
1490 * called under RTNL. This is needed if received packets may be
1491 * forwarded to another interface.
1492 */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495 struct net_device *lower_dev;
1496 struct list_head *iter;
1497
1498 dev->wanted_features &= ~NETIF_F_LRO;
1499 netdev_update_features(dev);
1500
1501 if (unlikely(dev->features & NETIF_F_LRO))
1502 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504 netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 struct net_device *dev)
1511 {
1512 struct netdev_notifier_info info;
1513
1514 netdev_notifier_info_init(&info, dev);
1515 return nb->notifier_call(nb, val, &info);
1516 }
1517
1518 static int dev_boot_phase = 1;
1519
1520 /**
1521 * register_netdevice_notifier - register a network notifier block
1522 * @nb: notifier
1523 *
1524 * Register a notifier to be called when network device events occur.
1525 * The notifier passed is linked into the kernel structures and must
1526 * not be reused until it has been unregistered. A negative errno code
1527 * is returned on a failure.
1528 *
1529  * When registered, all registration and up events are replayed
1530  * to the new notifier so that it has a race-free
1531  * view of the network device list.
1532 */
1533
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536 struct net_device *dev;
1537 struct net_device *last;
1538 struct net *net;
1539 int err;
1540
1541 rtnl_lock();
1542 err = raw_notifier_chain_register(&netdev_chain, nb);
1543 if (err)
1544 goto unlock;
1545 if (dev_boot_phase)
1546 goto unlock;
1547 for_each_net(net) {
1548 for_each_netdev(net, dev) {
1549 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550 err = notifier_to_errno(err);
1551 if (err)
1552 goto rollback;
1553
1554 if (!(dev->flags & IFF_UP))
1555 continue;
1556
1557 call_netdevice_notifier(nb, NETDEV_UP, dev);
1558 }
1559 }
1560
1561 unlock:
1562 rtnl_unlock();
1563 return err;
1564
1565 rollback:
1566 last = dev;
1567 for_each_net(net) {
1568 for_each_netdev(net, dev) {
1569 if (dev == last)
1570 goto outroll;
1571
1572 if (dev->flags & IFF_UP) {
1573 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 dev);
1575 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576 }
1577 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578 }
1579 }
1580
1581 outroll:
1582 raw_notifier_chain_unregister(&netdev_chain, nb);
1583 goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
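/*
 * Illustrative sketch (not part of the original file): a minimal notifier
 * that logs devices coming up, used with the register/unregister helpers in
 * this file. As the kerneldoc above notes, REGISTER/UP events are replayed
 * for devices that already exist at registration time. Names below are
 * hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		netdev_info(dev, "interface is up\n");

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier __maybe_unused = {
	.notifier_call = example_netdev_event,
};

/* Call register_netdevice_notifier(&example_netdev_notifier) to start
 * receiving events and unregister_netdevice_notifier() to stop. */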
1586
1587 /**
1588 * unregister_netdevice_notifier - unregister a network notifier block
1589 * @nb: notifier
1590 *
1591 * Unregister a notifier previously registered by
1592  * register_netdevice_notifier(). The notifier is unlinked from the
1593 * kernel structures and may then be reused. A negative errno code
1594 * is returned on a failure.
1595 *
1596 * After unregistering unregister and down device events are synthesized
1597 * for all devices on the device list to the removed notifier to remove
1598 * the need for special case cleanup code.
1599 */
1600
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603 struct net_device *dev;
1604 struct net *net;
1605 int err;
1606
1607 rtnl_lock();
1608 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 if (err)
1610 goto unlock;
1611
1612 for_each_net(net) {
1613 for_each_netdev(net, dev) {
1614 if (dev->flags & IFF_UP) {
1615 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 dev);
1617 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 }
1619 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 }
1621 }
1622 unlock:
1623 rtnl_unlock();
1624 return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628 /**
1629 * call_netdevice_notifiers_info - call all network notifier blocks
1630 * @val: value passed unmodified to notifier function
1631 * @dev: net_device pointer passed unmodified to notifier function
1632 * @info: notifier information data
1633 *
1634 * Call all network notifier blocks. Parameters and return value
1635 * are as for raw_notifier_call_chain().
1636 */
1637
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639 struct net_device *dev,
1640 struct netdev_notifier_info *info)
1641 {
1642 ASSERT_RTNL();
1643 netdev_notifier_info_init(info, dev);
1644 return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646
1647 /**
1648 * call_netdevice_notifiers - call all network notifier blocks
1649 * @val: value passed unmodified to notifier function
1650 * @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 * Call all network notifier blocks. Parameters and return value
1653 * are as for raw_notifier_call_chain().
1654 */
1655
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658 struct netdev_notifier_info info;
1659
1660 return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666
1667 void net_inc_ingress_queue(void)
1668 {
1669 static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672
1673 void net_dec_ingress_queue(void)
1674 {
1675 static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682
1683 void net_inc_egress_queue(void)
1684 {
1685 static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688
1689 void net_dec_egress_queue(void)
1690 {
1691 static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 static atomic_t netstamp_needed_deferred;
1699 static atomic_t netstamp_wanted;
1700 static void netstamp_clear(struct work_struct *work)
1701 {
1702 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1703 int wanted;
1704
1705 wanted = atomic_add_return(deferred, &netstamp_wanted);
1706 if (wanted > 0)
1707 static_key_enable(&netstamp_needed);
1708 else
1709 static_key_disable(&netstamp_needed);
1710 }
1711 static DECLARE_WORK(netstamp_work, netstamp_clear);
1712 #endif
1713
1714 void net_enable_timestamp(void)
1715 {
1716 #ifdef HAVE_JUMP_LABEL
1717 int wanted;
1718
1719 while (1) {
1720 wanted = atomic_read(&netstamp_wanted);
1721 if (wanted <= 0)
1722 break;
1723 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724 return;
1725 }
1726 atomic_inc(&netstamp_needed_deferred);
1727 schedule_work(&netstamp_work);
1728 #else
1729 static_key_slow_inc(&netstamp_needed);
1730 #endif
1731 }
1732 EXPORT_SYMBOL(net_enable_timestamp);
1733
1734 void net_disable_timestamp(void)
1735 {
1736 #ifdef HAVE_JUMP_LABEL
1737 int wanted;
1738
1739 while (1) {
1740 wanted = atomic_read(&netstamp_wanted);
1741 if (wanted <= 1)
1742 break;
1743 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744 return;
1745 }
1746 atomic_dec(&netstamp_needed_deferred);
1747 schedule_work(&netstamp_work);
1748 #else
1749 static_key_slow_dec(&netstamp_needed);
1750 #endif
1751 }
1752 EXPORT_SYMBOL(net_disable_timestamp);
1753
1754 static inline void net_timestamp_set(struct sk_buff *skb)
1755 {
1756 skb->tstamp = 0;
1757 if (static_key_false(&netstamp_needed))
1758 __net_timestamp(skb);
1759 }
1760
1761 #define net_timestamp_check(COND, SKB) \
1762 if (static_key_false(&netstamp_needed)) { \
1763 if ((COND) && !(SKB)->tstamp) \
1764 __net_timestamp(SKB); \
1765 } \
1766
1767 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768 {
1769 unsigned int len;
1770
1771 if (!(dev->flags & IFF_UP))
1772 return false;
1773
1774 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775 if (skb->len <= len)
1776 return true;
1777
1778 /* if TSO is enabled, we don't care about the length as the packet
1779 * could be forwarded without being segmented before
1780 */
1781 if (skb_is_gso(skb))
1782 return true;
1783
1784 return false;
1785 }
1786 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1789 {
1790 int ret = ____dev_forward_skb(dev, skb);
1791
1792 if (likely(!ret)) {
1793 skb->protocol = eth_type_trans(skb, dev);
1794 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795 }
1796
1797 return ret;
1798 }
1799 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
1801 /**
1802 * dev_forward_skb - loopback an skb to another netif
1803 *
1804 * @dev: destination network device
1805 * @skb: buffer to forward
1806 *
1807 * return values:
1808 * NET_RX_SUCCESS (no congestion)
1809 * NET_RX_DROP (packet was dropped, but freed)
1810 *
1811 * dev_forward_skb can be used for injecting an skb from the
1812 * start_xmit function of one device into the receive queue
1813 * of another device.
1814 *
1815 * The receiving device may be in another namespace, so
1816 * we have to clear all information in the skb that could
1817 * impact namespace isolation.
1818 */
1819 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1820 {
1821 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1822 }
1823 EXPORT_SYMBOL_GPL(dev_forward_skb);
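/*
 * Illustrative sketch (not part of the original file): injecting a frame
 * into a peer device from a transmit path, the pattern veth-like drivers
 * use. A real ndo_start_xmit receives the transmitting device and would
 * look up its peer itself; here the peer is passed in directly and the
 * function name is hypothetical.
 */
static netdev_tx_t __maybe_unused example_xmit_to_peer(struct sk_buff *skb,
							struct net_device *peer)
{
	/* __dev_forward_skb() scrubs and retargets the skb, then
	 * netif_rx_internal() queues it - exactly what dev_forward_skb()
	 * wraps up above.
	 */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
		return NETDEV_TX_OK;

	/* NET_RX_DROP: the skb has already been freed for us; a real driver
	 * would bump its tx_dropped counter here.
	 */
	return NETDEV_TX_OK;
}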
1824
1825 static inline int deliver_skb(struct sk_buff *skb,
1826 struct packet_type *pt_prev,
1827 struct net_device *orig_dev)
1828 {
1829 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1830 return -ENOMEM;
1831 atomic_inc(&skb->users);
1832 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1833 }
1834
1835 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836 struct packet_type **pt,
1837 struct net_device *orig_dev,
1838 __be16 type,
1839 struct list_head *ptype_list)
1840 {
1841 struct packet_type *ptype, *pt_prev = *pt;
1842
1843 list_for_each_entry_rcu(ptype, ptype_list, list) {
1844 if (ptype->type != type)
1845 continue;
1846 if (pt_prev)
1847 deliver_skb(skb, pt_prev, orig_dev);
1848 pt_prev = ptype;
1849 }
1850 *pt = pt_prev;
1851 }
1852
1853 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854 {
1855 if (!ptype->af_packet_priv || !skb->sk)
1856 return false;
1857
1858 if (ptype->id_match)
1859 return ptype->id_match(ptype, skb->sk);
1860 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861 return true;
1862
1863 return false;
1864 }
1865
1866 /*
1867 * Support routine. Sends outgoing frames to any network
1868 * taps currently in use.
1869 */
1870
1871 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1872 {
1873 struct packet_type *ptype;
1874 struct sk_buff *skb2 = NULL;
1875 struct packet_type *pt_prev = NULL;
1876 struct list_head *ptype_list = &ptype_all;
1877
1878 rcu_read_lock();
1879 again:
1880 list_for_each_entry_rcu(ptype, ptype_list, list) {
1881 /* Never send packets back to the socket
1882 * they originated from - MvS (miquels@drinkel.ow.org)
1883 */
1884 if (skb_loop_sk(ptype, skb))
1885 continue;
1886
1887 if (pt_prev) {
1888 deliver_skb(skb2, pt_prev, skb->dev);
1889 pt_prev = ptype;
1890 continue;
1891 }
1892
1893 /* need to clone skb, done only once */
1894 skb2 = skb_clone(skb, GFP_ATOMIC);
1895 if (!skb2)
1896 goto out_unlock;
1897
1898 net_timestamp_set(skb2);
1899
1900 /* skb->nh should be correctly
1901 * set by sender, so that the second statement is
1902 * just protection against buggy protocols.
1903 */
1904 skb_reset_mac_header(skb2);
1905
1906 if (skb_network_header(skb2) < skb2->data ||
1907 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1908 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1909 ntohs(skb2->protocol),
1910 dev->name);
1911 skb_reset_network_header(skb2);
1912 }
1913
1914 skb2->transport_header = skb2->network_header;
1915 skb2->pkt_type = PACKET_OUTGOING;
1916 pt_prev = ptype;
1917 }
1918
1919 if (ptype_list == &ptype_all) {
1920 ptype_list = &dev->ptype_all;
1921 goto again;
1922 }
1923 out_unlock:
1924 if (pt_prev)
1925 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1926 rcu_read_unlock();
1927 }
1928 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930 /**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
1935 * If real_num_tx_queues is changed the tc mappings may no longer be
1936 * valid. To resolve this, verify the tc mapping remains valid and, if
1937 * not, NULL the mapping. With no priorities mapping to this
1938 * offset/count pair it will no longer be used. In the worst case, TC0
1939 * is invalid and nothing can be done, so disable priority mappings. It is
1940 * expected that drivers will fix this mapping if they can before
1941 * calling netif_set_real_num_tx_queues.
1942 */
1943 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944 {
1945 int i;
1946 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948 /* If TC0 is invalidated disable TC mapping */
1949 if (tc->offset + tc->count > txq) {
1950 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951 dev->num_tc = 0;
1952 return;
1953 }
1954
1955 /* Invalidated prio to tc mappings set to TC0 */
1956 for (i = 1; i < TC_BITMASK + 1; i++) {
1957 int q = netdev_get_prio_tc_map(dev, i);
1958
1959 tc = &dev->tc_to_txq[q];
1960 if (tc->offset + tc->count > txq) {
1961 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962 i, q);
1963 netdev_set_prio_tc_map(dev, i, 0);
1964 }
1965 }
1966 }
1967
1968 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969 {
1970 if (dev->num_tc) {
1971 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972 int i;
1973
1974 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975 if ((txq - tc->offset) < tc->count)
1976 return i;
1977 }
1978
1979 return -1;
1980 }
1981
1982 return 0;
1983 }
1984
1985 #ifdef CONFIG_XPS
1986 static DEFINE_MUTEX(xps_map_mutex);
1987 #define xmap_dereference(P) \
1988 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
1990 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1991 int tci, u16 index)
1992 {
1993 struct xps_map *map = NULL;
1994 int pos;
1995
1996 if (dev_maps)
1997 map = xmap_dereference(dev_maps->cpu_map[tci]);
1998 if (!map)
1999 return false;
2000
2001 for (pos = map->len; pos--;) {
2002 if (map->queues[pos] != index)
2003 continue;
2004
2005 if (map->len > 1) {
2006 map->queues[pos] = map->queues[--map->len];
2007 break;
2008 }
2009
2010 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2011 kfree_rcu(map, rcu);
2012 return false;
2013 }
2014
2015 return true;
2016 }
2017
2018 static bool remove_xps_queue_cpu(struct net_device *dev,
2019 struct xps_dev_maps *dev_maps,
2020 int cpu, u16 offset, u16 count)
2021 {
2022 int num_tc = dev->num_tc ? : 1;
2023 bool active = false;
2024 int tci;
2025
2026 for (tci = cpu * num_tc; num_tc--; tci++) {
2027 int i, j;
2028
2029 for (i = count, j = offset; i--; j++) {
2030 if (!remove_xps_queue(dev_maps, tci, j))
2031 break;
2032 }
2033
2034 active |= i < 0;
2035 }
2036
2037 return active;
2038 }
2039
2040 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041 u16 count)
2042 {
2043 struct xps_dev_maps *dev_maps;
2044 int cpu, i;
2045 bool active = false;
2046
2047 mutex_lock(&xps_map_mutex);
2048 dev_maps = xmap_dereference(dev->xps_maps);
2049
2050 if (!dev_maps)
2051 goto out_no_maps;
2052
2053 for_each_possible_cpu(cpu)
2054 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055 offset, count);
2056
2057 if (!active) {
2058 RCU_INIT_POINTER(dev->xps_maps, NULL);
2059 kfree_rcu(dev_maps, rcu);
2060 }
2061
2062 for (i = offset + (count - 1); count--; i--)
2063 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064 NUMA_NO_NODE);
2065
2066 out_no_maps:
2067 mutex_unlock(&xps_map_mutex);
2068 }
2069
2070 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071 {
2072 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073 }
2074
2075 static struct xps_map *expand_xps_map(struct xps_map *map,
2076 int cpu, u16 index)
2077 {
2078 struct xps_map *new_map;
2079 int alloc_len = XPS_MIN_MAP_ALLOC;
2080 int i, pos;
2081
2082 for (pos = 0; map && pos < map->len; pos++) {
2083 if (map->queues[pos] != index)
2084 continue;
2085 return map;
2086 }
2087
2088 /* Need to add queue to this CPU's existing map */
2089 if (map) {
2090 if (pos < map->alloc_len)
2091 return map;
2092
2093 alloc_len = map->alloc_len * 2;
2094 }
2095
2096 /* Need to allocate a new map to store the queue on this CPU */
2097 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098 cpu_to_node(cpu));
2099 if (!new_map)
2100 return NULL;
2101
2102 for (i = 0; i < pos; i++)
2103 new_map->queues[i] = map->queues[i];
2104 new_map->alloc_len = alloc_len;
2105 new_map->len = pos;
2106
2107 return new_map;
2108 }
2109
2110 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2111 u16 index)
2112 {
2113 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2114 int i, cpu, tci, numa_node_id = -2;
2115 int maps_sz, num_tc = 1, tc = 0;
2116 struct xps_map *map, *new_map;
2117 bool active = false;
2118
2119 if (dev->num_tc) {
2120 num_tc = dev->num_tc;
2121 tc = netdev_txq_to_tc(dev, index);
2122 if (tc < 0)
2123 return -EINVAL;
2124 }
2125
2126 maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2127 if (maps_sz < L1_CACHE_BYTES)
2128 maps_sz = L1_CACHE_BYTES;
2129
2130 mutex_lock(&xps_map_mutex);
2131
2132 dev_maps = xmap_dereference(dev->xps_maps);
2133
2134 /* allocate memory for queue storage */
2135 for_each_cpu_and(cpu, cpu_online_mask, mask) {
2136 if (!new_dev_maps)
2137 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2138 if (!new_dev_maps) {
2139 mutex_unlock(&xps_map_mutex);
2140 return -ENOMEM;
2141 }
2142
2143 tci = cpu * num_tc + tc;
2144 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2145 NULL;
2146
2147 map = expand_xps_map(map, cpu, index);
2148 if (!map)
2149 goto error;
2150
2151 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2152 }
2153
2154 if (!new_dev_maps)
2155 goto out_no_new_maps;
2156
2157 for_each_possible_cpu(cpu) {
2158 /* copy maps belonging to foreign traffic classes */
2159 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2160 /* fill in the new device map from the old device map */
2161 map = xmap_dereference(dev_maps->cpu_map[tci]);
2162 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163 }
2164
2165 * We need to explicitly update tci as the previous loop
2166 * could break out early if dev_maps is NULL.
2167 */
2168 tci = cpu * num_tc + tc;
2169
2170 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2171 /* add queue to CPU maps */
2172 int pos = 0;
2173
2174 map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2175 while ((pos < map->len) && (map->queues[pos] != index))
2176 pos++;
2177
2178 if (pos == map->len)
2179 map->queues[map->len++] = index;
2180 #ifdef CONFIG_NUMA
2181 if (numa_node_id == -2)
2182 numa_node_id = cpu_to_node(cpu);
2183 else if (numa_node_id != cpu_to_node(cpu))
2184 numa_node_id = -1;
2185 #endif
2186 } else if (dev_maps) {
2187 /* fill in the new device map from the old device map */
2188 map = xmap_dereference(dev_maps->cpu_map[tci]);
2189 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2190 }
2191
2192 /* copy maps belonging to foreign traffic classes */
2193 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2194 /* fill in the new device map from the old device map */
2195 map = xmap_dereference(dev_maps->cpu_map[tci]);
2196 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2197 }
2198 }
2199
2200 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2201
2202 /* Cleanup old maps */
2203 if (!dev_maps)
2204 goto out_no_old_maps;
2205
2206 for_each_possible_cpu(cpu) {
2207 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2208 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2209 map = xmap_dereference(dev_maps->cpu_map[tci]);
2210 if (map && map != new_map)
2211 kfree_rcu(map, rcu);
2212 }
2213 }
2214
2215 kfree_rcu(dev_maps, rcu);
2216
2217 out_no_old_maps:
2218 dev_maps = new_dev_maps;
2219 active = true;
2220
2221 out_no_new_maps:
2222 /* update Tx queue numa node */
2223 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2224 (numa_node_id >= 0) ? numa_node_id :
2225 NUMA_NO_NODE);
2226
2227 if (!dev_maps)
2228 goto out_no_maps;
2229
2230 /* removes queue from unused CPUs */
2231 for_each_possible_cpu(cpu) {
2232 for (i = tc, tci = cpu * num_tc; i--; tci++)
2233 active |= remove_xps_queue(dev_maps, tci, index);
2234 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2235 active |= remove_xps_queue(dev_maps, tci, index);
2236 for (i = num_tc - tc, tci++; --i; tci++)
2237 active |= remove_xps_queue(dev_maps, tci, index);
2238 }
2239
2240 /* free map if not active */
2241 if (!active) {
2242 RCU_INIT_POINTER(dev->xps_maps, NULL);
2243 kfree_rcu(dev_maps, rcu);
2244 }
2245
2246 out_no_maps:
2247 mutex_unlock(&xps_map_mutex);
2248
2249 return 0;
2250 error:
2251 /* remove any maps that we added */
2252 for_each_possible_cpu(cpu) {
2253 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2254 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2255 map = dev_maps ?
2256 xmap_dereference(dev_maps->cpu_map[tci]) :
2257 NULL;
2258 if (new_map && new_map != map)
2259 kfree(new_map);
2260 }
2261 }
2262
2263 mutex_unlock(&xps_map_mutex);
2264
2265 kfree(new_dev_maps);
2266 return -ENOMEM;
2267 }
2268 EXPORT_SYMBOL(netif_set_xps_queue);
2269
2270 #endif
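/* A minimal sketch, not part of the original file: a multiqueue driver
 * might pin each TX queue to a single CPU with netif_set_xps_queue().
 * "example_setup_xps" is an assumed helper name.
 */
static void example_setup_xps(struct net_device *dev)
{
        u16 qid = 0;
        int cpu, err;

        for_each_online_cpu(cpu) {
                if (qid >= dev->real_num_tx_queues)
                        break;
                /* steer transmits done on this CPU to TX queue "qid" */
                err = netif_set_xps_queue(dev, cpumask_of(cpu), qid);
                if (err)
                        netdev_warn(dev, "XPS setup for queue %u failed: %d\n",
                                    qid, err);
                qid++;
        }
}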
2271 void netdev_reset_tc(struct net_device *dev)
2272 {
2273 #ifdef CONFIG_XPS
2274 netif_reset_xps_queues_gt(dev, 0);
2275 #endif
2276 dev->num_tc = 0;
2277 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279 }
2280 EXPORT_SYMBOL(netdev_reset_tc);
2281
2282 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283 {
2284 if (tc >= dev->num_tc)
2285 return -EINVAL;
2286
2287 #ifdef CONFIG_XPS
2288 netif_reset_xps_queues(dev, offset, count);
2289 #endif
2290 dev->tc_to_txq[tc].count = count;
2291 dev->tc_to_txq[tc].offset = offset;
2292 return 0;
2293 }
2294 EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297 {
2298 if (num_tc > TC_MAX_QUEUE)
2299 return -EINVAL;
2300
2301 #ifdef CONFIG_XPS
2302 netif_reset_xps_queues_gt(dev, 0);
2303 #endif
2304 dev->num_tc = num_tc;
2305 return 0;
2306 }
2307 EXPORT_SYMBOL(netdev_set_num_tc);
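/* A minimal sketch, not part of the original file: an mqprio-style setup
 * that splits eight TX queues into two traffic classes of four queues each
 * and maps priorities 0-7 to TC0 and 8-15 to TC1. The helper name is an
 * assumption.
 */
static int example_setup_two_tcs(struct net_device *dev)
{
        int err, prio;

        err = netdev_set_num_tc(dev, 2);
        if (err)
                return err;

        err = netdev_set_tc_queue(dev, 0, 4, 0);        /* TC0: queues 0-3 */
        if (!err)
                err = netdev_set_tc_queue(dev, 1, 4, 4);        /* TC1: queues 4-7 */
        if (err)
                return err;

        for (prio = 0; prio <= TC_BITMASK; prio++)
                netdev_set_prio_tc_map(dev, prio, prio < 8 ? 0 : 1);

        return 0;
}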
2308
2309 /*
2310 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2311 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2312 */
2313 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314 {
2315 int rc;
2316
2317 if (txq < 1 || txq > dev->num_tx_queues)
2318 return -EINVAL;
2319
2320 if (dev->reg_state == NETREG_REGISTERED ||
2321 dev->reg_state == NETREG_UNREGISTERING) {
2322 ASSERT_RTNL();
2323
2324 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325 txq);
2326 if (rc)
2327 return rc;
2328
2329 if (dev->num_tc)
2330 netif_setup_tc(dev, txq);
2331
2332 if (txq < dev->real_num_tx_queues) {
2333 qdisc_reset_all_tx_gt(dev, txq);
2334 #ifdef CONFIG_XPS
2335 netif_reset_xps_queues_gt(dev, txq);
2336 #endif
2337 }
2338 }
2339
2340 dev->real_num_tx_queues = txq;
2341 return 0;
2342 }
2343 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2344
2345 #ifdef CONFIG_SYSFS
2346 /**
2347 * netif_set_real_num_rx_queues - set actual number of RX queues used
2348 * @dev: Network device
2349 * @rxq: Actual number of RX queues
2350 *
2351 * This must be called either with the rtnl_lock held or before
2352 * registration of the net device. Returns 0 on success, or a
2353 * negative error code. If called before registration, it always
2354 * succeeds.
2355 */
2356 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2357 {
2358 int rc;
2359
2360 if (rxq < 1 || rxq > dev->num_rx_queues)
2361 return -EINVAL;
2362
2363 if (dev->reg_state == NETREG_REGISTERED) {
2364 ASSERT_RTNL();
2365
2366 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2367 rxq);
2368 if (rc)
2369 return rc;
2370 }
2371
2372 dev->real_num_rx_queues = rxq;
2373 return 0;
2374 }
2375 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2376 #endif
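/* A minimal sketch, not part of the original file: shrinking or growing the
 * set of queues in active use, e.g. after an ethtool channel change. Once
 * the device is registered both calls need the rtnl lock;
 * "example_set_channels" is an assumed helper name.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
        int err;

        rtnl_lock();
        err = netif_set_real_num_tx_queues(dev, count);
        if (!err)
                err = netif_set_real_num_rx_queues(dev, count);
        rtnl_unlock();

        return err;
}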
2377
2378 /**
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384 int netif_get_num_default_rss_queues(void)
2385 {
2386 return is_kdump_kernel() ?
2387 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2388 }
2389 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
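/* A minimal sketch, not part of the original file: a driver capping its
 * queue count with the default RSS limit; "hw_max_queues" stands in for a
 * hardware-specific maximum.
 */
static unsigned int example_pick_queue_count(unsigned int hw_max_queues)
{
        return min_t(unsigned int, hw_max_queues,
                     netif_get_num_default_rss_queues());
}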
2390
2391 static void __netif_reschedule(struct Qdisc *q)
2392 {
2393 struct softnet_data *sd;
2394 unsigned long flags;
2395
2396 local_irq_save(flags);
2397 sd = this_cpu_ptr(&softnet_data);
2398 q->next_sched = NULL;
2399 *sd->output_queue_tailp = q;
2400 sd->output_queue_tailp = &q->next_sched;
2401 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402 local_irq_restore(flags);
2403 }
2404
2405 void __netif_schedule(struct Qdisc *q)
2406 {
2407 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408 __netif_reschedule(q);
2409 }
2410 EXPORT_SYMBOL(__netif_schedule);
2411
2412 struct dev_kfree_skb_cb {
2413 enum skb_free_reason reason;
2414 };
2415
2416 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417 {
2418 return (struct dev_kfree_skb_cb *)skb->cb;
2419 }
2420
2421 void netif_schedule_queue(struct netdev_queue *txq)
2422 {
2423 rcu_read_lock();
2424 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425 struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427 __netif_schedule(q);
2428 }
2429 rcu_read_unlock();
2430 }
2431 EXPORT_SYMBOL(netif_schedule_queue);
2432
2433 /**
2434 * netif_wake_subqueue - allow sending packets on subqueue
2435 * @dev: network device
2436 * @queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441 {
2442 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445 struct Qdisc *q;
2446
2447 rcu_read_lock();
2448 q = rcu_dereference(txq->qdisc);
2449 __netif_schedule(q);
2450 rcu_read_unlock();
2451 }
2452 }
2453 EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456 {
2457 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458 struct Qdisc *q;
2459
2460 rcu_read_lock();
2461 q = rcu_dereference(dev_queue->qdisc);
2462 __netif_schedule(q);
2463 rcu_read_unlock();
2464 }
2465 }
2466 EXPORT_SYMBOL(netif_tx_wake_queue);
2467
2468 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2469 {
2470 unsigned long flags;
2471
2472 if (likely(atomic_read(&skb->users) == 1)) {
2473 smp_rmb();
2474 atomic_set(&skb->users, 0);
2475 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2476 return;
2477 }
2478 get_kfree_skb_cb(skb)->reason = reason;
2479 local_irq_save(flags);
2480 skb->next = __this_cpu_read(softnet_data.completion_queue);
2481 __this_cpu_write(softnet_data.completion_queue, skb);
2482 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2483 local_irq_restore(flags);
2484 }
2485 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488 {
2489 if (in_irq() || irqs_disabled())
2490 __dev_kfree_skb_irq(skb, reason);
2491 else
2492 dev_kfree_skb(skb);
2493 }
2494 EXPORT_SYMBOL(__dev_kfree_skb_any);
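/* A minimal sketch, not part of the original file: a TX completion handler
 * that can run in hard-irq context, so it frees buffers with
 * dev_consume_skb_any() (which falls back to __dev_kfree_skb_irq() when
 * needed) and then restarts the queue with netif_tx_wake_queue(). The
 * "example_tx_ring" layout is an assumption.
 */
struct example_tx_ring {
        struct sk_buff **skbs;          /* completed buffers, NULL terminated */
        struct netdev_queue *txq;       /* the queue this ring feeds */
};

static void example_tx_complete(struct example_tx_ring *ring)
{
        struct sk_buff *skb;
        int i;

        for (i = 0; (skb = ring->skbs[i]) != NULL; i++) {
                ring->skbs[i] = NULL;
                dev_consume_skb_any(skb);       /* safe in any context */
        }

        netif_tx_wake_queue(ring->txq);
}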
2495
2496
2497 /**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503 void netif_device_detach(struct net_device *dev)
2504 {
2505 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506 netif_running(dev)) {
2507 netif_tx_stop_all_queues(dev);
2508 }
2509 }
2510 EXPORT_SYMBOL(netif_device_detach);
2511
2512 /**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
2516 * Mark device as attached to the system and restart if needed.
2517 */
2518 void netif_device_attach(struct net_device *dev)
2519 {
2520 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521 netif_running(dev)) {
2522 netif_tx_wake_all_queues(dev);
2523 __netdev_watchdog_up(dev);
2524 }
2525 }
2526 EXPORT_SYMBOL(netif_device_attach);
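/* A minimal sketch, not part of the original file, of how driver
 * suspend/resume callbacks typically pair these helpers; the PM callbacks
 * and the drvdata lookup are assumptions.
 */
static int example_suspend(struct device *d)
{
        struct net_device *dev = dev_get_drvdata(d);

        netif_device_detach(dev);       /* stops all TX queues if running */
        /* ... put the hardware to sleep ... */
        return 0;
}

static int example_resume(struct device *d)
{
        struct net_device *dev = dev_get_drvdata(d);

        /* ... wake the hardware up ... */
        netif_device_attach(dev);       /* restarts queues and the watchdog */
        return 0;
}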
2527
2528 /*
2529 * Returns a Tx hash based on the given packet descriptor and the number of
2530 * Tx queues to be used as a distribution range.
2531 */
2532 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533 unsigned int num_tx_queues)
2534 {
2535 u32 hash;
2536 u16 qoffset = 0;
2537 u16 qcount = num_tx_queues;
2538
2539 if (skb_rx_queue_recorded(skb)) {
2540 hash = skb_get_rx_queue(skb);
2541 while (unlikely(hash >= num_tx_queues))
2542 hash -= num_tx_queues;
2543 return hash;
2544 }
2545
2546 if (dev->num_tc) {
2547 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2548 qoffset = dev->tc_to_txq[tc].offset;
2549 qcount = dev->tc_to_txq[tc].count;
2550 }
2551
2552 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553 }
2554 EXPORT_SYMBOL(__skb_tx_hash);
2555
2556 static void skb_warn_bad_offload(const struct sk_buff *skb)
2557 {
2558 static const netdev_features_t null_features;
2559 struct net_device *dev = skb->dev;
2560 const char *name = "";
2561
2562 if (!net_ratelimit())
2563 return;
2564
2565 if (dev) {
2566 if (dev->dev.parent)
2567 name = dev_driver_string(dev->dev.parent);
2568 else
2569 name = netdev_name(dev);
2570 }
2571 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572 "gso_type=%d ip_summed=%d\n",
2573 name, dev ? &dev->features : &null_features,
2574 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576 skb_shinfo(skb)->gso_type, skb->ip_summed);
2577 }
2578
2579 /*
2580 * Invalidate hardware checksum when packet is to be mangled, and
2581 * complete checksum manually on outgoing path.
2582 */
2583 int skb_checksum_help(struct sk_buff *skb)
2584 {
2585 __wsum csum;
2586 int ret = 0, offset;
2587
2588 if (skb->ip_summed == CHECKSUM_COMPLETE)
2589 goto out_set_summed;
2590
2591 if (unlikely(skb_shinfo(skb)->gso_size)) {
2592 skb_warn_bad_offload(skb);
2593 return -EINVAL;
2594 }
2595
2596 /* Before computing a checksum, we should make sure no frag could
2597 * be modified by an external entity: the checksum could be wrong.
2598 */
2599 if (skb_has_shared_frag(skb)) {
2600 ret = __skb_linearize(skb);
2601 if (ret)
2602 goto out;
2603 }
2604
2605 offset = skb_checksum_start_offset(skb);
2606 BUG_ON(offset >= skb_headlen(skb));
2607 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2608
2609 offset += skb->csum_offset;
2610 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2611
2612 if (skb_cloned(skb) &&
2613 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2614 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2615 if (ret)
2616 goto out;
2617 }
2618
2619 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2620 out_set_summed:
2621 skb->ip_summed = CHECKSUM_NONE;
2622 out:
2623 return ret;
2624 }
2625 EXPORT_SYMBOL(skb_checksum_help);
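/* A minimal sketch, not part of the original file: a driver whose hardware
 * cannot checksum a particular packet can resolve the checksum in software
 * before handing the buffer to the NIC. "example_hw_can_csum" is an assumed
 * capability check.
 */
static int example_prep_csum(struct sk_buff *skb, bool example_hw_can_csum)
{
        if (skb->ip_summed != CHECKSUM_PARTIAL)
                return 0;               /* no offload was requested */

        if (example_hw_can_csum)
                return 0;               /* hardware will fill in the checksum */

        /* compute the checksum in software and clear the offload request */
        return skb_checksum_help(skb);
}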
2626
2627 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628 {
2629 __be16 type = skb->protocol;
2630
2631 /* Tunnel gso handlers can set protocol to ethernet. */
2632 if (type == htons(ETH_P_TEB)) {
2633 struct ethhdr *eth;
2634
2635 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636 return 0;
2637
2638 eth = (struct ethhdr *)skb_mac_header(skb);
2639 type = eth->h_proto;
2640 }
2641
2642 return __vlan_get_protocol(skb, type, depth);
2643 }
2644
2645 /**
2646 * skb_mac_gso_segment - mac layer segmentation handler.
2647 * @skb: buffer to segment
2648 * @features: features for the output path (see dev->features)
2649 */
2650 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651 netdev_features_t features)
2652 {
2653 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654 struct packet_offload *ptype;
2655 int vlan_depth = skb->mac_len;
2656 __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658 if (unlikely(!type))
2659 return ERR_PTR(-EINVAL);
2660
2661 __skb_pull(skb, vlan_depth);
2662
2663 rcu_read_lock();
2664 list_for_each_entry_rcu(ptype, &offload_base, list) {
2665 if (ptype->type == type && ptype->callbacks.gso_segment) {
2666 segs = ptype->callbacks.gso_segment(skb, features);
2667 break;
2668 }
2669 }
2670 rcu_read_unlock();
2671
2672 __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674 return segs;
2675 }
2676 EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679 /* openvswitch calls this on rx path, so we need a different check.
2680 */
2681 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682 {
2683 if (tx_path)
2684 return skb->ip_summed != CHECKSUM_PARTIAL;
2685 else
2686 return skb->ip_summed == CHECKSUM_NONE;
2687 }
2688
2689 /**
2690 * __skb_gso_segment - Perform segmentation on skb.
2691 * @skb: buffer to segment
2692 * @features: features for the output path (see dev->features)
2693 * @tx_path: whether it is called in TX path
2694 *
2695 * This function segments the given skb and returns a list of segments.
2696 *
2697 * It may return NULL if the skb requires no segmentation. This is
2698 * only possible when GSO is used for verifying header integrity.
2699 *
2700 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703 netdev_features_t features, bool tx_path)
2704 {
2705 struct sk_buff *segs;
2706
2707 if (unlikely(skb_needs_check(skb, tx_path))) {
2708 int err;
2709
2710 /* We're going to init ->check field in TCP or UDP header */
2711 err = skb_cow_head(skb, 0);
2712 if (err < 0)
2713 return ERR_PTR(err);
2714 }
2715
2716 /* Only report GSO partial support if it will enable us to
2717 * support segmentation on this frame without needing additional
2718 * work.
2719 */
2720 if (features & NETIF_F_GSO_PARTIAL) {
2721 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2722 struct net_device *dev = skb->dev;
2723
2724 partial_features |= dev->features & dev->gso_partial_features;
2725 if (!skb_gso_ok(skb, features | partial_features))
2726 features &= ~NETIF_F_GSO_PARTIAL;
2727 }
2728
2729 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2730 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2731
2732 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2733 SKB_GSO_CB(skb)->encap_level = 0;
2734
2735 skb_reset_mac_header(skb);
2736 skb_reset_mac_len(skb);
2737
2738 segs = skb_mac_gso_segment(skb, features);
2739
2740 if (unlikely(skb_needs_check(skb, tx_path)))
2741 skb_warn_bad_offload(skb);
2742
2743 return segs;
2744 }
2745 EXPORT_SYMBOL(__skb_gso_segment);
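/* A minimal sketch, not part of the original file: segmenting a GSO skb
 * (e.g. in a tunnel or driver fallback path) and walking the returned list.
 * "example_xmit_one" stands in for whatever transmits a single segment.
 */
static int example_gso_xmit(struct sk_buff *skb, netdev_features_t features,
                            int (*example_xmit_one)(struct sk_buff *))
{
        struct sk_buff *segs, *next;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)
                return example_xmit_one(skb);   /* no segmentation was needed */

        consume_skb(skb);                       /* the original is no longer needed */

        while (segs) {
                next = segs->next;
                segs->next = NULL;
                example_xmit_one(segs);
                segs = next;
        }

        return 0;
}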
2746
2747 /* Take action when hardware reception checksum errors are detected. */
2748 #ifdef CONFIG_BUG
2749 void netdev_rx_csum_fault(struct net_device *dev)
2750 {
2751 if (net_ratelimit()) {
2752 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2753 dump_stack();
2754 }
2755 }
2756 EXPORT_SYMBOL(netdev_rx_csum_fault);
2757 #endif
2758
2759 /* Actually, we should eliminate this check as soon as we know that:
2760 * 1. An IOMMU is present and allows mapping all of the memory.
2761 * 2. No high memory really exists on this machine.
2762 */
2763
2764 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2765 {
2766 #ifdef CONFIG_HIGHMEM
2767 int i;
2768 if (!(dev->features & NETIF_F_HIGHDMA)) {
2769 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2770 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2771 if (PageHighMem(skb_frag_page(frag)))
2772 return 1;
2773 }
2774 }
2775
2776 if (PCI_DMA_BUS_IS_PHYS) {
2777 struct device *pdev = dev->dev.parent;
2778
2779 if (!pdev)
2780 return 0;
2781 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2782 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2783 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2784 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2785 return 1;
2786 }
2787 }
2788 #endif
2789 return 0;
2790 }
2791
2792 /* If MPLS offload request, verify we are testing hardware MPLS features
2793 * instead of standard features for the netdev.
2794 */
2795 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2796 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2797 netdev_features_t features,
2798 __be16 type)
2799 {
2800 if (eth_p_mpls(type))
2801 features &= skb->dev->mpls_features;
2802
2803 return features;
2804 }
2805 #else
2806 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2807 netdev_features_t features,
2808 __be16 type)
2809 {
2810 return features;
2811 }
2812 #endif
2813
2814 static netdev_features_t harmonize_features(struct sk_buff *skb,
2815 netdev_features_t features)
2816 {
2817 int tmp;
2818 __be16 type;
2819
2820 type = skb_network_protocol(skb, &tmp);
2821 features = net_mpls_features(skb, features, type);
2822
2823 if (skb->ip_summed != CHECKSUM_NONE &&
2824 !can_checksum_protocol(features, type)) {
2825 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2826 }
2827 if (illegal_highdma(skb->dev, skb))
2828 features &= ~NETIF_F_SG;
2829
2830 return features;
2831 }
2832
2833 netdev_features_t passthru_features_check(struct sk_buff *skb,
2834 struct net_device *dev,
2835 netdev_features_t features)
2836 {
2837 return features;
2838 }
2839 EXPORT_SYMBOL(passthru_features_check);
2840
2841 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2842 struct net_device *dev,
2843 netdev_features_t features)
2844 {
2845 return vlan_features_check(skb, features);
2846 }
2847
2848 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2849 struct net_device *dev,
2850 netdev_features_t features)
2851 {
2852 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2853
2854 if (gso_segs > dev->gso_max_segs)
2855 return features & ~NETIF_F_GSO_MASK;
2856
2857 /* Support for GSO partial features requires software
2858 * intervention before we can actually process the packets,
2859 * so we need to strip support for any partial features now;
2860 * we can pull them back in after we have partially
2861 * segmented the frame.
2862 */
2863 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2864 features &= ~dev->gso_partial_features;
2865
2866 /* Make sure to clear the IPv4 ID mangling feature if the
2867 * IPv4 header has the potential to be fragmented.
2868 */
2869 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2870 struct iphdr *iph = skb->encapsulation ?
2871 inner_ip_hdr(skb) : ip_hdr(skb);
2872
2873 if (!(iph->frag_off & htons(IP_DF)))
2874 features &= ~NETIF_F_TSO_MANGLEID;
2875 }
2876
2877 return features;
2878 }
2879
2880 netdev_features_t netif_skb_features(struct sk_buff *skb)
2881 {
2882 struct net_device *dev = skb->dev;
2883 netdev_features_t features = dev->features;
2884
2885 if (skb_is_gso(skb))
2886 features = gso_features_check(skb, dev, features);
2887
2888 /* If encapsulation offload request, verify we are testing
2889 * hardware encapsulation features instead of standard
2890 * features for the netdev
2891 */
2892 if (skb->encapsulation)
2893 features &= dev->hw_enc_features;
2894
2895 if (skb_vlan_tagged(skb))
2896 features = netdev_intersect_features(features,
2897 dev->vlan_features |
2898 NETIF_F_HW_VLAN_CTAG_TX |
2899 NETIF_F_HW_VLAN_STAG_TX);
2900
2901 if (dev->netdev_ops->ndo_features_check)
2902 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2903 features);
2904 else
2905 features &= dflt_features_check(skb, dev, features);
2906
2907 return harmonize_features(skb, features);
2908 }
2909 EXPORT_SYMBOL(netif_skb_features);
2910
2911 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2912 struct netdev_queue *txq, bool more)
2913 {
2914 unsigned int len;
2915 int rc;
2916
2917 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2918 dev_queue_xmit_nit(skb, dev);
2919
2920 len = skb->len;
2921 trace_net_dev_start_xmit(skb, dev);
2922 rc = netdev_start_xmit(skb, dev, txq, more);
2923 trace_net_dev_xmit(skb, rc, dev, len);
2924
2925 return rc;
2926 }
2927
2928 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2929 struct netdev_queue *txq, int *ret)
2930 {
2931 struct sk_buff *skb = first;
2932 int rc = NETDEV_TX_OK;
2933
2934 while (skb) {
2935 struct sk_buff *next = skb->next;
2936
2937 skb->next = NULL;
2938 rc = xmit_one(skb, dev, txq, next != NULL);
2939 if (unlikely(!dev_xmit_complete(rc))) {
2940 skb->next = next;
2941 goto out;
2942 }
2943
2944 skb = next;
2945 if (netif_xmit_stopped(txq) && skb) {
2946 rc = NETDEV_TX_BUSY;
2947 break;
2948 }
2949 }
2950
2951 out:
2952 *ret = rc;
2953 return skb;
2954 }
2955
2956 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2957 netdev_features_t features)
2958 {
2959 if (skb_vlan_tag_present(skb) &&
2960 !vlan_hw_offload_capable(features, skb->vlan_proto))
2961 skb = __vlan_hwaccel_push_inside(skb);
2962 return skb;
2963 }
2964
2965 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2966 {
2967 netdev_features_t features;
2968
2969 features = netif_skb_features(skb);
2970 skb = validate_xmit_vlan(skb, features);
2971 if (unlikely(!skb))
2972 goto out_null;
2973
2974 if (netif_needs_gso(skb, features)) {
2975 struct sk_buff *segs;
2976
2977 segs = skb_gso_segment(skb, features);
2978 if (IS_ERR(segs)) {
2979 goto out_kfree_skb;
2980 } else if (segs) {
2981 consume_skb(skb);
2982 skb = segs;
2983 }
2984 } else {
2985 if (skb_needs_linearize(skb, features) &&
2986 __skb_linearize(skb))
2987 goto out_kfree_skb;
2988
2989 /* If packet is not checksummed and device does not
2990 * support checksumming for this protocol, complete
2991 * checksumming here.
2992 */
2993 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2994 if (skb->encapsulation)
2995 skb_set_inner_transport_header(skb,
2996 skb_checksum_start_offset(skb));
2997 else
2998 skb_set_transport_header(skb,
2999 skb_checksum_start_offset(skb));
3000 if (!(features & NETIF_F_CSUM_MASK) &&
3001 skb_checksum_help(skb))
3002 goto out_kfree_skb;
3003 }
3004 }
3005
3006 return skb;
3007
3008 out_kfree_skb:
3009 kfree_skb(skb);
3010 out_null:
3011 atomic_long_inc(&dev->tx_dropped);
3012 return NULL;
3013 }
3014
3015 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3016 {
3017 struct sk_buff *next, *head = NULL, *tail;
3018
3019 for (; skb != NULL; skb = next) {
3020 next = skb->next;
3021 skb->next = NULL;
3022
3023 /* in case skb won't be segmented, point to itself */
3024 skb->prev = skb;
3025
3026 skb = validate_xmit_skb(skb, dev);
3027 if (!skb)
3028 continue;
3029
3030 if (!head)
3031 head = skb;
3032 else
3033 tail->next = skb;
3034 /* If skb was segmented, skb->prev points to
3035 * the last segment. If not, it still contains skb.
3036 */
3037 tail = skb->prev;
3038 }
3039 return head;
3040 }
3041 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3042
3043 static void qdisc_pkt_len_init(struct sk_buff *skb)
3044 {
3045 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3046
3047 qdisc_skb_cb(skb)->pkt_len = skb->len;
3048
3049 /* To get a more precise estimate of bytes sent on the wire,
3050 * we add to pkt_len the header size of the additional segments
3051 */
3052 if (shinfo->gso_size) {
3053 unsigned int hdr_len;
3054 u16 gso_segs = shinfo->gso_segs;
3055
3056 /* mac layer + network layer */
3057 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3058
3059 /* + transport layer */
3060 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3061 hdr_len += tcp_hdrlen(skb);
3062 else
3063 hdr_len += sizeof(struct udphdr);
3064
3065 if (shinfo->gso_type & SKB_GSO_DODGY)
3066 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3067 shinfo->gso_size);
3068
3069 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3070 }
3071 }
3072
3073 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3074 struct net_device *dev,
3075 struct netdev_queue *txq)
3076 {
3077 spinlock_t *root_lock = qdisc_lock(q);
3078 struct sk_buff *to_free = NULL;
3079 bool contended;
3080 int rc;
3081
3082 qdisc_calculate_pkt_len(skb, q);
3083 /*
3084 * Heuristic to force contended enqueues to serialize on a
3085 * separate lock before trying to get qdisc main lock.
3086 * This permits qdisc->running owner to get the lock more
3087 * often and dequeue packets faster.
3088 */
3089 contended = qdisc_is_running(q);
3090 if (unlikely(contended))
3091 spin_lock(&q->busylock);
3092
3093 spin_lock(root_lock);
3094 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3095 __qdisc_drop(skb, &to_free);
3096 rc = NET_XMIT_DROP;
3097 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3098 qdisc_run_begin(q)) {
3099 /*
3100 * This is a work-conserving queue; there are no old skbs
3101 * waiting to be sent out; and the qdisc is not running -
3102 * xmit the skb directly.
3103 */
3104
3105 qdisc_bstats_update(q, skb);
3106
3107 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3108 if (unlikely(contended)) {
3109 spin_unlock(&q->busylock);
3110 contended = false;
3111 }
3112 __qdisc_run(q);
3113 } else
3114 qdisc_run_end(q);
3115
3116 rc = NET_XMIT_SUCCESS;
3117 } else {
3118 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3119 if (qdisc_run_begin(q)) {
3120 if (unlikely(contended)) {
3121 spin_unlock(&q->busylock);
3122 contended = false;
3123 }
3124 __qdisc_run(q);
3125 }
3126 }
3127 spin_unlock(root_lock);
3128 if (unlikely(to_free))
3129 kfree_skb_list(to_free);
3130 if (unlikely(contended))
3131 spin_unlock(&q->busylock);
3132 return rc;
3133 }
3134
3135 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3136 static void skb_update_prio(struct sk_buff *skb)
3137 {
3138 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3139
3140 if (!skb->priority && skb->sk && map) {
3141 unsigned int prioidx =
3142 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3143
3144 if (prioidx < map->priomap_len)
3145 skb->priority = map->priomap[prioidx];
3146 }
3147 }
3148 #else
3149 #define skb_update_prio(skb)
3150 #endif
3151
3152 DEFINE_PER_CPU(int, xmit_recursion);
3153 EXPORT_SYMBOL(xmit_recursion);
3154
3155 /**
3156 * dev_loopback_xmit - loop back @skb
3157 * @net: network namespace this loopback is happening in
3158 * @sk: sk needed to be a netfilter okfn
3159 * @skb: buffer to transmit
3160 */
3161 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3162 {
3163 skb_reset_mac_header(skb);
3164 __skb_pull(skb, skb_network_offset(skb));
3165 skb->pkt_type = PACKET_LOOPBACK;
3166 skb->ip_summed = CHECKSUM_UNNECESSARY;
3167 WARN_ON(!skb_dst(skb));
3168 skb_dst_force(skb);
3169 netif_rx_ni(skb);
3170 return 0;
3171 }
3172 EXPORT_SYMBOL(dev_loopback_xmit);
3173
3174 #ifdef CONFIG_NET_EGRESS
3175 static struct sk_buff *
3176 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3177 {
3178 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3179 struct tcf_result cl_res;
3180
3181 if (!cl)
3182 return skb;
3183
3184 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3185 * earlier by the caller.
3186 */
3187 qdisc_bstats_cpu_update(cl->q, skb);
3188
3189 switch (tc_classify(skb, cl, &cl_res, false)) {
3190 case TC_ACT_OK:
3191 case TC_ACT_RECLASSIFY:
3192 skb->tc_index = TC_H_MIN(cl_res.classid);
3193 break;
3194 case TC_ACT_SHOT:
3195 qdisc_qstats_cpu_drop(cl->q);
3196 *ret = NET_XMIT_DROP;
3197 kfree_skb(skb);
3198 return NULL;
3199 case TC_ACT_STOLEN:
3200 case TC_ACT_QUEUED:
3201 *ret = NET_XMIT_SUCCESS;
3202 consume_skb(skb);
3203 return NULL;
3204 case TC_ACT_REDIRECT:
3205 /* No need to push/pop skb's mac_header here on egress! */
3206 skb_do_redirect(skb);
3207 *ret = NET_XMIT_SUCCESS;
3208 return NULL;
3209 default:
3210 break;
3211 }
3212
3213 return skb;
3214 }
3215 #endif /* CONFIG_NET_EGRESS */
3216
3217 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3218 {
3219 #ifdef CONFIG_XPS
3220 struct xps_dev_maps *dev_maps;
3221 struct xps_map *map;
3222 int queue_index = -1;
3223
3224 rcu_read_lock();
3225 dev_maps = rcu_dereference(dev->xps_maps);
3226 if (dev_maps) {
3227 unsigned int tci = skb->sender_cpu - 1;
3228
3229 if (dev->num_tc) {
3230 tci *= dev->num_tc;
3231 tci += netdev_get_prio_tc_map(dev, skb->priority);
3232 }
3233
3234 map = rcu_dereference(dev_maps->cpu_map[tci]);
3235 if (map) {
3236 if (map->len == 1)
3237 queue_index = map->queues[0];
3238 else
3239 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3240 map->len)];
3241 if (unlikely(queue_index >= dev->real_num_tx_queues))
3242 queue_index = -1;
3243 }
3244 }
3245 rcu_read_unlock();
3246
3247 return queue_index;
3248 #else
3249 return -1;
3250 #endif
3251 }
3252
3253 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3254 {
3255 struct sock *sk = skb->sk;
3256 int queue_index = sk_tx_queue_get(sk);
3257
3258 if (queue_index < 0 || skb->ooo_okay ||
3259 queue_index >= dev->real_num_tx_queues) {
3260 int new_index = get_xps_queue(dev, skb);
3261 if (new_index < 0)
3262 new_index = skb_tx_hash(dev, skb);
3263
3264 if (queue_index != new_index && sk &&
3265 sk_fullsock(sk) &&
3266 rcu_access_pointer(sk->sk_dst_cache))
3267 sk_tx_queue_set(sk, new_index);
3268
3269 queue_index = new_index;
3270 }
3271
3272 return queue_index;
3273 }
3274
3275 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3276 struct sk_buff *skb,
3277 void *accel_priv)
3278 {
3279 int queue_index = 0;
3280
3281 #ifdef CONFIG_XPS
3282 u32 sender_cpu = skb->sender_cpu - 1;
3283
3284 if (sender_cpu >= (u32)NR_CPUS)
3285 skb->sender_cpu = raw_smp_processor_id() + 1;
3286 #endif
3287
3288 if (dev->real_num_tx_queues != 1) {
3289 const struct net_device_ops *ops = dev->netdev_ops;
3290 if (ops->ndo_select_queue)
3291 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3292 __netdev_pick_tx);
3293 else
3294 queue_index = __netdev_pick_tx(dev, skb);
3295
3296 if (!accel_priv)
3297 queue_index = netdev_cap_txqueue(dev, queue_index);
3298 }
3299
3300 skb_set_queue_mapping(skb, queue_index);
3301 return netdev_get_tx_queue(dev, queue_index);
3302 }
3303
3304 /**
3305 * __dev_queue_xmit - transmit a buffer
3306 * @skb: buffer to transmit
3307 * @accel_priv: private data used for L2 forwarding offload
3308 *
3309 * Queue a buffer for transmission to a network device. The caller must
3310 * have set the device and priority and built the buffer before calling
3311 * this function. The function can be called from an interrupt.
3312 *
3313 * A negative errno code is returned on a failure. A success does not
3314 * guarantee the frame will be transmitted as it may be dropped due
3315 * to congestion or traffic shaping.
3316 *
3317 * -----------------------------------------------------------------------------------
3318 * I notice this method can also return errors from the queue disciplines,
3319 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3320 * be positive.
3321 *
3322 * Regardless of the return value, the skb is consumed, so it is currently
3323 * difficult to retry a send to this method. (You can bump the ref count
3324 * before sending to hold a reference for retry if you are careful.)
3325 *
3326 * When calling this method, interrupts MUST be enabled. This is because
3327 * the BH enable code must have IRQs enabled so that it will not deadlock.
3328 * --BLG
3329 */
3330 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3331 {
3332 struct net_device *dev = skb->dev;
3333 struct netdev_queue *txq;
3334 struct Qdisc *q;
3335 int rc = -ENOMEM;
3336
3337 skb_reset_mac_header(skb);
3338
3339 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3340 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3341
3342 /* Disable soft irqs for various locks below. Also
3343 * stops preemption for RCU.
3344 */
3345 rcu_read_lock_bh();
3346
3347 skb_update_prio(skb);
3348
3349 qdisc_pkt_len_init(skb);
3350 #ifdef CONFIG_NET_CLS_ACT
3351 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3352 # ifdef CONFIG_NET_EGRESS
3353 if (static_key_false(&egress_needed)) {
3354 skb = sch_handle_egress(skb, &rc, dev);
3355 if (!skb)
3356 goto out;
3357 }
3358 # endif
3359 #endif
3360 /* If device/qdisc don't need skb->dst, release it right now while
3361 * it's hot in this CPU's cache.
3362 */
3363 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3364 skb_dst_drop(skb);
3365 else
3366 skb_dst_force(skb);
3367
3368 txq = netdev_pick_tx(dev, skb, accel_priv);
3369 q = rcu_dereference_bh(txq->qdisc);
3370
3371 trace_net_dev_queue(skb);
3372 if (q->enqueue) {
3373 rc = __dev_xmit_skb(skb, q, dev, txq);
3374 goto out;
3375 }
3376
3377 /* The device has no queue. Common case for software devices:
3378 loopback, all the sorts of tunnels...
3379
3380 Really, it is unlikely that netif_tx_lock protection is necessary
3381 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
3382 counters.)
3383 However, it is possible that they rely on the protection
3384 made by us here.
3385
3386 Check this and take the lock. It is not prone to deadlocks.
3387 Either way, the noqueue qdisc case is even simpler 8)
3388 */
3389 if (dev->flags & IFF_UP) {
3390 int cpu = smp_processor_id(); /* ok because BHs are off */
3391
3392 if (txq->xmit_lock_owner != cpu) {
3393 if (unlikely(__this_cpu_read(xmit_recursion) >
3394 XMIT_RECURSION_LIMIT))
3395 goto recursion_alert;
3396
3397 skb = validate_xmit_skb(skb, dev);
3398 if (!skb)
3399 goto out;
3400
3401 HARD_TX_LOCK(dev, txq, cpu);
3402
3403 if (!netif_xmit_stopped(txq)) {
3404 __this_cpu_inc(xmit_recursion);
3405 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3406 __this_cpu_dec(xmit_recursion);
3407 if (dev_xmit_complete(rc)) {
3408 HARD_TX_UNLOCK(dev, txq);
3409 goto out;
3410 }
3411 }
3412 HARD_TX_UNLOCK(dev, txq);
3413 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3414 dev->name);
3415 } else {
3416 /* Recursion is detected! It is possible,
3417 * unfortunately
3418 */
3419 recursion_alert:
3420 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3421 dev->name);
3422 }
3423 }
3424
3425 rc = -ENETDOWN;
3426 rcu_read_unlock_bh();
3427
3428 atomic_long_inc(&dev->tx_dropped);
3429 kfree_skb_list(skb);
3430 return rc;
3431 out:
3432 rcu_read_unlock_bh();
3433 return rc;
3434 }
3435
3436 int dev_queue_xmit(struct sk_buff *skb)
3437 {
3438 return __dev_queue_xmit(skb, NULL);
3439 }
3440 EXPORT_SYMBOL(dev_queue_xmit);
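/* A minimal sketch, not part of the original file: as the comment above
 * __dev_queue_xmit() notes, the skb is consumed and the return value can be
 * a positive NET_XMIT_* code or a negative errno, so callers commonly fold
 * the result with net_xmit_eval(), which treats NET_XMIT_CN as success.
 */
static int example_send(struct sk_buff *skb)
{
        int rc = dev_queue_xmit(skb);

        return net_xmit_eval(rc);
}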
3441
3442 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3443 {
3444 return __dev_queue_xmit(skb, accel_priv);
3445 }
3446 EXPORT_SYMBOL(dev_queue_xmit_accel);
3447
3448
3449 /*=======================================================================
3450 Receiver routines
3451 =======================================================================*/
3452
3453 int netdev_max_backlog __read_mostly = 1000;
3454 EXPORT_SYMBOL(netdev_max_backlog);
3455
3456 int netdev_tstamp_prequeue __read_mostly = 1;
3457 int netdev_budget __read_mostly = 300;
3458 int weight_p __read_mostly = 64; /* old backlog weight */
3459
3460 /* Called with irq disabled */
3461 static inline void ____napi_schedule(struct softnet_data *sd,
3462 struct napi_struct *napi)
3463 {
3464 list_add_tail(&napi->poll_list, &sd->poll_list);
3465 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3466 }
3467
3468 #ifdef CONFIG_RPS
3469
3470 /* One global table that all flow-based protocols share. */
3471 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3472 EXPORT_SYMBOL(rps_sock_flow_table);
3473 u32 rps_cpu_mask __read_mostly;
3474 EXPORT_SYMBOL(rps_cpu_mask);
3475
3476 struct static_key rps_needed __read_mostly;
3477 EXPORT_SYMBOL(rps_needed);
3478 struct static_key rfs_needed __read_mostly;
3479 EXPORT_SYMBOL(rfs_needed);
3480
3481 static struct rps_dev_flow *
3482 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3483 struct rps_dev_flow *rflow, u16 next_cpu)
3484 {
3485 if (next_cpu < nr_cpu_ids) {
3486 #ifdef CONFIG_RFS_ACCEL
3487 struct netdev_rx_queue *rxqueue;
3488 struct rps_dev_flow_table *flow_table;
3489 struct rps_dev_flow *old_rflow;
3490 u32 flow_id;
3491 u16 rxq_index;
3492 int rc;
3493
3494 /* Should we steer this flow to a different hardware queue? */
3495 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3496 !(dev->features & NETIF_F_NTUPLE))
3497 goto out;
3498 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3499 if (rxq_index == skb_get_rx_queue(skb))
3500 goto out;
3501
3502 rxqueue = dev->_rx + rxq_index;
3503 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3504 if (!flow_table)
3505 goto out;
3506 flow_id = skb_get_hash(skb) & flow_table->mask;
3507 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3508 rxq_index, flow_id);
3509 if (rc < 0)
3510 goto out;
3511 old_rflow = rflow;
3512 rflow = &flow_table->flows[flow_id];
3513 rflow->filter = rc;
3514 if (old_rflow->filter == rflow->filter)
3515 old_rflow->filter = RPS_NO_FILTER;
3516 out:
3517 #endif
3518 rflow->last_qtail =
3519 per_cpu(softnet_data, next_cpu).input_queue_head;
3520 }
3521
3522 rflow->cpu = next_cpu;
3523 return rflow;
3524 }
3525
3526 /*
3527 * get_rps_cpu is called from netif_receive_skb and returns the target
3528 * CPU from the RPS map of the receiving queue for a given skb.
3529 * rcu_read_lock must be held on entry.
3530 */
3531 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3532 struct rps_dev_flow **rflowp)
3533 {
3534 const struct rps_sock_flow_table *sock_flow_table;
3535 struct netdev_rx_queue *rxqueue = dev->_rx;
3536 struct rps_dev_flow_table *flow_table;
3537 struct rps_map *map;
3538 int cpu = -1;
3539 u32 tcpu;
3540 u32 hash;
3541
3542 if (skb_rx_queue_recorded(skb)) {
3543 u16 index = skb_get_rx_queue(skb);
3544
3545 if (unlikely(index >= dev->real_num_rx_queues)) {
3546 WARN_ONCE(dev->real_num_rx_queues > 1,
3547 "%s received packet on queue %u, but number "
3548 "of RX queues is %u\n",
3549 dev->name, index, dev->real_num_rx_queues);
3550 goto done;
3551 }
3552 rxqueue += index;
3553 }
3554
3555 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3556
3557 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3558 map = rcu_dereference(rxqueue->rps_map);
3559 if (!flow_table && !map)
3560 goto done;
3561
3562 skb_reset_network_header(skb);
3563 hash = skb_get_hash(skb);
3564 if (!hash)
3565 goto done;
3566
3567 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3568 if (flow_table && sock_flow_table) {
3569 struct rps_dev_flow *rflow;
3570 u32 next_cpu;
3571 u32 ident;
3572
3573 /* First check into global flow table if there is a match */
3574 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3575 if ((ident ^ hash) & ~rps_cpu_mask)
3576 goto try_rps;
3577
3578 next_cpu = ident & rps_cpu_mask;
3579
3580 /* OK, now we know there is a match,
3581 * we can look at the local (per receive queue) flow table
3582 */
3583 rflow = &flow_table->flows[hash & flow_table->mask];
3584 tcpu = rflow->cpu;
3585
3586 /*
3587 * If the desired CPU (where last recvmsg was done) is
3588 * different from current CPU (one in the rx-queue flow
3589 * table entry), switch if one of the following holds:
3590 * - Current CPU is unset (>= nr_cpu_ids).
3591 * - Current CPU is offline.
3592 * - The current CPU's queue tail has advanced beyond the
3593 * last packet that was enqueued using this table entry.
3594 * This guarantees that all previous packets for the flow
3595 * have been dequeued, thus preserving in order delivery.
3596 */
3597 if (unlikely(tcpu != next_cpu) &&
3598 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3599 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3600 rflow->last_qtail)) >= 0)) {
3601 tcpu = next_cpu;
3602 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3603 }
3604
3605 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3606 *rflowp = rflow;
3607 cpu = tcpu;
3608 goto done;
3609 }
3610 }
3611
3612 try_rps:
3613
3614 if (map) {
3615 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3616 if (cpu_online(tcpu)) {
3617 cpu = tcpu;
3618 goto done;
3619 }
3620 }
3621
3622 done:
3623 return cpu;
3624 }
3625
3626 #ifdef CONFIG_RFS_ACCEL
3627
3628 /**
3629 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3630 * @dev: Device on which the filter was set
3631 * @rxq_index: RX queue index
3632 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3633 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3634 *
3635 * Drivers that implement ndo_rx_flow_steer() should periodically call
3636 * this function for each installed filter and remove the filters for
3637 * which it returns %true.
3638 */
3639 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3640 u32 flow_id, u16 filter_id)
3641 {
3642 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3643 struct rps_dev_flow_table *flow_table;
3644 struct rps_dev_flow *rflow;
3645 bool expire = true;
3646 unsigned int cpu;
3647
3648 rcu_read_lock();
3649 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3650 if (flow_table && flow_id <= flow_table->mask) {
3651 rflow = &flow_table->flows[flow_id];
3652 cpu = ACCESS_ONCE(rflow->cpu);
3653 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3654 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3655 rflow->last_qtail) <
3656 (int)(10 * flow_table->mask)))
3657 expire = false;
3658 }
3659 rcu_read_unlock();
3660 return expire;
3661 }
3662 EXPORT_SYMBOL(rps_may_expire_flow);
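/* A minimal sketch, not part of the original file, of the periodic scan a
 * driver implementing ndo_rx_flow_steer() is expected to run: each installed
 * filter is offered to rps_may_expire_flow() and removed from the hardware
 * when it returns true. The "example_filter" bookkeeping and
 * "example_hw_remove_filter" callback are assumptions.
 */
struct example_filter {
        u32 flow_id;    /* value passed to ndo_rx_flow_steer() */
        u16 filter_id;  /* value returned by ndo_rx_flow_steer() */
        bool in_use;
};

static void example_expire_filters(struct net_device *dev, u16 rxq_index,
                                   struct example_filter *filters, int n,
                                   void (*example_hw_remove_filter)(u16))
{
        int i;

        for (i = 0; i < n; i++) {
                if (!filters[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, rxq_index, filters[i].flow_id,
                                        filters[i].filter_id)) {
                        example_hw_remove_filter(filters[i].filter_id);
                        filters[i].in_use = false;
                }
        }
}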
3663
3664 #endif /* CONFIG_RFS_ACCEL */
3665
3666 /* Called from hardirq (IPI) context */
3667 static void rps_trigger_softirq(void *data)
3668 {
3669 struct softnet_data *sd = data;
3670
3671 ____napi_schedule(sd, &sd->backlog);
3672 sd->received_rps++;
3673 }
3674
3675 #endif /* CONFIG_RPS */
3676
3677 /*
3678 * Check if this softnet_data structure belongs to another CPU.
3679 * If yes, queue it to our IPI list and return 1.
3680 * If no, return 0.
3681 */
3682 static int rps_ipi_queued(struct softnet_data *sd)
3683 {
3684 #ifdef CONFIG_RPS
3685 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3686
3687 if (sd != mysd) {
3688 sd->rps_ipi_next = mysd->rps_ipi_list;
3689 mysd->rps_ipi_list = sd;
3690
3691 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3692 return 1;
3693 }
3694 #endif /* CONFIG_RPS */
3695 return 0;
3696 }
3697
3698 #ifdef CONFIG_NET_FLOW_LIMIT
3699 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3700 #endif
3701
3702 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3703 {
3704 #ifdef CONFIG_NET_FLOW_LIMIT
3705 struct sd_flow_limit *fl;
3706 struct softnet_data *sd;
3707 unsigned int old_flow, new_flow;
3708
3709 if (qlen < (netdev_max_backlog >> 1))
3710 return false;
3711
3712 sd = this_cpu_ptr(&softnet_data);
3713
3714 rcu_read_lock();
3715 fl = rcu_dereference(sd->flow_limit);
3716 if (fl) {
3717 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3718 old_flow = fl->history[fl->history_head];
3719 fl->history[fl->history_head] = new_flow;
3720
3721 fl->history_head++;
3722 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3723
3724 if (likely(fl->buckets[old_flow]))
3725 fl->buckets[old_flow]--;
3726
3727 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3728 fl->count++;
3729 rcu_read_unlock();
3730 return true;
3731 }
3732 }
3733 rcu_read_unlock();
3734 #endif
3735 return false;
3736 }
3737
3738 /*
3739 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3740 * queue (may be a remote CPU queue).
3741 */
3742 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3743 unsigned int *qtail)
3744 {
3745 struct softnet_data *sd;
3746 unsigned long flags;
3747 unsigned int qlen;
3748
3749 sd = &per_cpu(softnet_data, cpu);
3750
3751 local_irq_save(flags);
3752
3753 rps_lock(sd);
3754 if (!netif_running(skb->dev))
3755 goto drop;
3756 qlen = skb_queue_len(&sd->input_pkt_queue);
3757 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3758 if (qlen) {
3759 enqueue:
3760 __skb_queue_tail(&sd->input_pkt_queue, skb);
3761 input_queue_tail_incr_save(sd, qtail);
3762 rps_unlock(sd);
3763 local_irq_restore(flags);
3764 return NET_RX_SUCCESS;
3765 }
3766
3767 /* Schedule NAPI for the backlog device.
3768 * We can use a non-atomic operation since we own the queue lock.
3769 */
3770 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3771 if (!rps_ipi_queued(sd))
3772 ____napi_schedule(sd, &sd->backlog);
3773 }
3774 goto enqueue;
3775 }
3776
3777 drop:
3778 sd->dropped++;
3779 rps_unlock(sd);
3780
3781 local_irq_restore(flags);
3782
3783 atomic_long_inc(&skb->dev->rx_dropped);
3784 kfree_skb(skb);
3785 return NET_RX_DROP;
3786 }
3787
3788 static int netif_rx_internal(struct sk_buff *skb)
3789 {
3790 int ret;
3791
3792 net_timestamp_check(netdev_tstamp_prequeue, skb);
3793
3794 trace_netif_rx(skb);
3795 #ifdef CONFIG_RPS
3796 if (static_key_false(&rps_needed)) {
3797 struct rps_dev_flow voidflow, *rflow = &voidflow;
3798 int cpu;
3799
3800 preempt_disable();
3801 rcu_read_lock();
3802
3803 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3804 if (cpu < 0)
3805 cpu = smp_processor_id();
3806
3807 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3808
3809 rcu_read_unlock();
3810 preempt_enable();
3811 } else
3812 #endif
3813 {
3814 unsigned int qtail;
3815 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3816 put_cpu();
3817 }
3818 return ret;
3819 }
3820
3821 /**
3822 * netif_rx - post buffer to the network code
3823 * @skb: buffer to post
3824 *
3825 * This function receives a packet from a device driver and queues it for
3826 * the upper (protocol) levels to process. It always succeeds. The buffer
3827 * may be dropped during processing for congestion control or by the
3828 * protocol layers.
3829 *
3830 * return values:
3831 * NET_RX_SUCCESS (no congestion)
3832 * NET_RX_DROP (packet was dropped)
3833 *
3834 */
3835
3836 int netif_rx(struct sk_buff *skb)
3837 {
3838 trace_netif_rx_entry(skb);
3839
3840 return netif_rx_internal(skb);
3841 }
3842 EXPORT_SYMBOL(netif_rx);
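/* A minimal sketch, not part of the original file: the classic receive path
 * of a simple non-NAPI driver — copy the frame out of the hardware, set the
 * protocol and hand it to the stack with netif_rx(). The buffer copy and
 * statistics handling are assumptions.
 */
static void example_rx_one(struct net_device *dev, const void *hw_buf,
                           unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb(dev, len);
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }

        memcpy(skb_put(skb, len), hw_buf, len);
        skb->protocol = eth_type_trans(skb, dev);

        netif_rx(skb);                  /* queues to the per-CPU backlog */

        dev->stats.rx_packets++;
        dev->stats.rx_bytes += len;
}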
3843
3844 int netif_rx_ni(struct sk_buff *skb)
3845 {
3846 int err;
3847
3848 trace_netif_rx_ni_entry(skb);
3849
3850 preempt_disable();
3851 err = netif_rx_internal(skb);
3852 if (local_softirq_pending())
3853 do_softirq();
3854 preempt_enable();
3855
3856 return err;
3857 }
3858 EXPORT_SYMBOL(netif_rx_ni);
3859
3860 static __latent_entropy void net_tx_action(struct softirq_action *h)
3861 {
3862 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3863
3864 if (sd->completion_queue) {
3865 struct sk_buff *clist;
3866
3867 local_irq_disable();
3868 clist = sd->completion_queue;
3869 sd->completion_queue = NULL;
3870 local_irq_enable();
3871
3872 while (clist) {
3873 struct sk_buff *skb = clist;
3874 clist = clist->next;
3875
3876 WARN_ON(atomic_read(&skb->users));
3877 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3878 trace_consume_skb(skb);
3879 else
3880 trace_kfree_skb(skb, net_tx_action);
3881
3882 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3883 __kfree_skb(skb);
3884 else
3885 __kfree_skb_defer(skb);
3886 }
3887
3888 __kfree_skb_flush();
3889 }
3890
3891 if (sd->output_queue) {
3892 struct Qdisc *head;
3893
3894 local_irq_disable();
3895 head = sd->output_queue;
3896 sd->output_queue = NULL;
3897 sd->output_queue_tailp = &sd->output_queue;
3898 local_irq_enable();
3899
3900 while (head) {
3901 struct Qdisc *q = head;
3902 spinlock_t *root_lock;
3903
3904 head = head->next_sched;
3905
3906 root_lock = qdisc_lock(q);
3907 spin_lock(root_lock);
3908 /* We need to make sure head->next_sched is read
3909 * before clearing __QDISC_STATE_SCHED
3910 */
3911 smp_mb__before_atomic();
3912 clear_bit(__QDISC_STATE_SCHED, &q->state);
3913 qdisc_run(q);
3914 spin_unlock(root_lock);
3915 }
3916 }
3917 }
3918
3919 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3920 /* This hook is defined here for ATM LANE */
3921 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3922 unsigned char *addr) __read_mostly;
3923 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3924 #endif
3925
3926 static inline struct sk_buff *
3927 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3928 struct net_device *orig_dev)
3929 {
3930 #ifdef CONFIG_NET_CLS_ACT
3931 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3932 struct tcf_result cl_res;
3933
3934 /* If there's at least one ingress present somewhere (so
3935 * we get here via enabled static key), remaining devices
3936 * that are not configured with an ingress qdisc will bail
3937 * out here.
3938 */
3939 if (!cl)
3940 return skb;
3941 if (*pt_prev) {
3942 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3943 *pt_prev = NULL;
3944 }
3945
3946 qdisc_skb_cb(skb)->pkt_len = skb->len;
3947 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3948 qdisc_bstats_cpu_update(cl->q, skb);
3949
3950 switch (tc_classify(skb, cl, &cl_res, false)) {
3951 case TC_ACT_OK:
3952 case TC_ACT_RECLASSIFY:
3953 skb->tc_index = TC_H_MIN(cl_res.classid);
3954 break;
3955 case TC_ACT_SHOT:
3956 qdisc_qstats_cpu_drop(cl->q);
3957 kfree_skb(skb);
3958 return NULL;
3959 case TC_ACT_STOLEN:
3960 case TC_ACT_QUEUED:
3961 consume_skb(skb);
3962 return NULL;
3963 case TC_ACT_REDIRECT:
3964 /* skb_mac_header check was done by cls/act_bpf, so
3965 * we can safely push the L2 header back before
3966 * redirecting to another netdev
3967 */
3968 __skb_push(skb, skb->mac_len);
3969 skb_do_redirect(skb);
3970 return NULL;
3971 default:
3972 break;
3973 }
3974 #endif /* CONFIG_NET_CLS_ACT */
3975 return skb;
3976 }
3977
3978 /**
3979 * netdev_is_rx_handler_busy - check if receive handler is registered
3980 * @dev: device to check
3981 *
3982 * Check if a receive handler is already registered for a given device.
3983 * Return true if there is one.
3984 *
3985 * The caller must hold the rtnl_mutex.
3986 */
3987 bool netdev_is_rx_handler_busy(struct net_device *dev)
3988 {
3989 ASSERT_RTNL();
3990 return dev && rtnl_dereference(dev->rx_handler);
3991 }
3992 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3993
3994 /**
3995 * netdev_rx_handler_register - register receive handler
3996 * @dev: device to register a handler for
3997 * @rx_handler: receive handler to register
3998 * @rx_handler_data: data pointer that is used by rx handler
3999 *
4000 * Register a receive handler for a device. This handler will then be
4001 * called from __netif_receive_skb. A negative errno code is returned
4002 * on a failure.
4003 *
4004 * The caller must hold the rtnl_mutex.
4005 *
4006 * For a general description of rx_handler, see enum rx_handler_result.
4007 */
4008 int netdev_rx_handler_register(struct net_device *dev,
4009 rx_handler_func_t *rx_handler,
4010 void *rx_handler_data)
4011 {
4012 ASSERT_RTNL();
4013
4014 if (dev->rx_handler)
4015 return -EBUSY;
4016
4017 /* Note: rx_handler_data must be set before rx_handler */
4018 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4019 rcu_assign_pointer(dev->rx_handler, rx_handler);
4020
4021 return 0;
4022 }
4023 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4024
4025 /**
4026 * netdev_rx_handler_unregister - unregister receive handler
4027 * @dev: device to unregister a handler from
4028 *
4029 * Unregister a receive handler from a device.
4030 *
4031 * The caller must hold the rtnl_mutex.
4032 */
4033 void netdev_rx_handler_unregister(struct net_device *dev)
4034 {
4035
4036 ASSERT_RTNL();
4037 RCU_INIT_POINTER(dev->rx_handler, NULL);
4038 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4039 * section is guaranteed to see a non-NULL rx_handler_data
4040 * as well.
4041 */
4042 synchronize_net();
4043 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4044 }
4045 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
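/*
 * Usage sketch (hypothetical bridge-like module, not taken from this file):
 * attaching and detaching an rx_handler while following the rules above --
 * RTNL held, at most one handler per device.  example_port_rx() and the
 * port_priv cookie are assumptions; RX_HANDLER_PASS lets the stack continue
 * processing the packet normally.
 */
static rx_handler_result_t example_port_rx(struct sk_buff **pskb)
{
	/* inspect, mangle or steal *pskb here */
	return RX_HANDLER_PASS;
}

static int example_attach_port(struct net_device *port, void *port_priv)
{
	int err = -EBUSY;

	rtnl_lock();
	if (!netdev_is_rx_handler_busy(port))
		err = netdev_rx_handler_register(port, example_port_rx,
						 port_priv);
	rtnl_unlock();

	return err;
}

static void example_detach_port(struct net_device *port)
{
	rtnl_lock();
	netdev_rx_handler_unregister(port);
	rtnl_unlock();
}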
4046
4047 /*
4048 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4049 * the special handling of PFMEMALLOC skbs.
4050 */
4051 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4052 {
4053 switch (skb->protocol) {
4054 case htons(ETH_P_ARP):
4055 case htons(ETH_P_IP):
4056 case htons(ETH_P_IPV6):
4057 case htons(ETH_P_8021Q):
4058 case htons(ETH_P_8021AD):
4059 return true;
4060 default:
4061 return false;
4062 }
4063 }
4064
4065 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4066 int *ret, struct net_device *orig_dev)
4067 {
4068 #ifdef CONFIG_NETFILTER_INGRESS
4069 if (nf_hook_ingress_active(skb)) {
4070 int ingress_retval;
4071
4072 if (*pt_prev) {
4073 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4074 *pt_prev = NULL;
4075 }
4076
4077 rcu_read_lock();
4078 ingress_retval = nf_hook_ingress(skb);
4079 rcu_read_unlock();
4080 return ingress_retval;
4081 }
4082 #endif /* CONFIG_NETFILTER_INGRESS */
4083 return 0;
4084 }
4085
4086 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4087 {
4088 struct packet_type *ptype, *pt_prev;
4089 rx_handler_func_t *rx_handler;
4090 struct net_device *orig_dev;
4091 bool deliver_exact = false;
4092 int ret = NET_RX_DROP;
4093 __be16 type;
4094
4095 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4096
4097 trace_netif_receive_skb(skb);
4098
4099 orig_dev = skb->dev;
4100
4101 skb_reset_network_header(skb);
4102 if (!skb_transport_header_was_set(skb))
4103 skb_reset_transport_header(skb);
4104 skb_reset_mac_len(skb);
4105
4106 pt_prev = NULL;
4107
4108 another_round:
4109 skb->skb_iif = skb->dev->ifindex;
4110
4111 __this_cpu_inc(softnet_data.processed);
4112
4113 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4114 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4115 skb = skb_vlan_untag(skb);
4116 if (unlikely(!skb))
4117 goto out;
4118 }
4119
4120 #ifdef CONFIG_NET_CLS_ACT
4121 if (skb->tc_verd & TC_NCLS) {
4122 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4123 goto ncls;
4124 }
4125 #endif
4126
4127 if (pfmemalloc)
4128 goto skip_taps;
4129
4130 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4131 if (pt_prev)
4132 ret = deliver_skb(skb, pt_prev, orig_dev);
4133 pt_prev = ptype;
4134 }
4135
4136 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4137 if (pt_prev)
4138 ret = deliver_skb(skb, pt_prev, orig_dev);
4139 pt_prev = ptype;
4140 }
4141
4142 skip_taps:
4143 #ifdef CONFIG_NET_INGRESS
4144 if (static_key_false(&ingress_needed)) {
4145 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4146 if (!skb)
4147 goto out;
4148
4149 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4150 goto out;
4151 }
4152 #endif
4153 #ifdef CONFIG_NET_CLS_ACT
4154 skb->tc_verd = 0;
4155 ncls:
4156 #endif
4157 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4158 goto drop;
4159
4160 if (skb_vlan_tag_present(skb)) {
4161 if (pt_prev) {
4162 ret = deliver_skb(skb, pt_prev, orig_dev);
4163 pt_prev = NULL;
4164 }
4165 if (vlan_do_receive(&skb))
4166 goto another_round;
4167 else if (unlikely(!skb))
4168 goto out;
4169 }
4170
4171 rx_handler = rcu_dereference(skb->dev->rx_handler);
4172 if (rx_handler) {
4173 if (pt_prev) {
4174 ret = deliver_skb(skb, pt_prev, orig_dev);
4175 pt_prev = NULL;
4176 }
4177 switch (rx_handler(&skb)) {
4178 case RX_HANDLER_CONSUMED:
4179 ret = NET_RX_SUCCESS;
4180 goto out;
4181 case RX_HANDLER_ANOTHER:
4182 goto another_round;
4183 case RX_HANDLER_EXACT:
4184 deliver_exact = true; /* fall through */
4185 case RX_HANDLER_PASS:
4186 break;
4187 default:
4188 BUG();
4189 }
4190 }
4191
4192 if (unlikely(skb_vlan_tag_present(skb))) {
4193 if (skb_vlan_tag_get_id(skb))
4194 skb->pkt_type = PACKET_OTHERHOST;
4195 /* Note: we might in the future use prio bits
4196 * and set skb->priority like in vlan_do_receive().
4197 * For the time being, just ignore the Priority Code Point.
4198 */
4199 skb->vlan_tci = 0;
4200 }
4201
4202 type = skb->protocol;
4203
4204 /* deliver only exact match when indicated */
4205 if (likely(!deliver_exact)) {
4206 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4207 &ptype_base[ntohs(type) &
4208 PTYPE_HASH_MASK]);
4209 }
4210
4211 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4212 &orig_dev->ptype_specific);
4213
4214 if (unlikely(skb->dev != orig_dev)) {
4215 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4216 &skb->dev->ptype_specific);
4217 }
4218
4219 if (pt_prev) {
4220 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4221 goto drop;
4222 else
4223 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4224 } else {
4225 drop:
4226 if (!deliver_exact)
4227 atomic_long_inc(&skb->dev->rx_dropped);
4228 else
4229 atomic_long_inc(&skb->dev->rx_nohandler);
4230 kfree_skb(skb);
4231 /* Jamal, now you will not be able to escape explaining
4232 * to me how you were going to use this. :-)
4233 */
4234 ret = NET_RX_DROP;
4235 }
4236
4237 out:
4238 return ret;
4239 }
4240
4241 static int __netif_receive_skb(struct sk_buff *skb)
4242 {
4243 int ret;
4244
4245 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4246 unsigned long pflags = current->flags;
4247
4248 /*
4249 * PFMEMALLOC skbs are special, they should
4250 * - be delivered to SOCK_MEMALLOC sockets only
4251 * - stay away from userspace
4252 * - have bounded memory usage
4253 *
4254 * Use PF_MEMALLOC as this saves us from propagating the allocation
4255 * context down to all allocation sites.
4256 */
4257 current->flags |= PF_MEMALLOC;
4258 ret = __netif_receive_skb_core(skb, true);
4259 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4260 } else
4261 ret = __netif_receive_skb_core(skb, false);
4262
4263 return ret;
4264 }
4265
4266 static int netif_receive_skb_internal(struct sk_buff *skb)
4267 {
4268 int ret;
4269
4270 net_timestamp_check(netdev_tstamp_prequeue, skb);
4271
4272 if (skb_defer_rx_timestamp(skb))
4273 return NET_RX_SUCCESS;
4274
4275 rcu_read_lock();
4276
4277 #ifdef CONFIG_RPS
4278 if (static_key_false(&rps_needed)) {
4279 struct rps_dev_flow voidflow, *rflow = &voidflow;
4280 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4281
4282 if (cpu >= 0) {
4283 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4284 rcu_read_unlock();
4285 return ret;
4286 }
4287 }
4288 #endif
4289 ret = __netif_receive_skb(skb);
4290 rcu_read_unlock();
4291 return ret;
4292 }
4293
4294 /**
4295 * netif_receive_skb - process receive buffer from network
4296 * @skb: buffer to process
4297 *
4298 * netif_receive_skb() is the main receive data processing function.
4299 * It always succeeds. The buffer may be dropped during processing
4300 * for congestion control or by the protocol layers.
4301 *
4302 * This function may only be called from softirq context and interrupts
4303 * should be enabled.
4304 *
4305 * Return values (usually ignored):
4306 * NET_RX_SUCCESS: no congestion
4307 * NET_RX_DROP: packet was dropped
4308 */
4309 int netif_receive_skb(struct sk_buff *skb)
4310 {
4311 trace_netif_receive_skb_entry(skb);
4312
4313 return netif_receive_skb_internal(skb);
4314 }
4315 EXPORT_SYMBOL(netif_receive_skb);
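/*
 * Usage sketch (hypothetical driver, not taken from this file): unlike
 * netif_rx(), netif_receive_skb() processes the packet synchronously, so it
 * must be called from softirq context with interrupts enabled -- in practice
 * from a NAPI poll routine, with skb->protocol already set.
 */
static void example_deliver_from_poll(struct sk_buff *skb)
{
	/* called from the driver's napi->poll() callback */
	netif_receive_skb(skb);
}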
4316
4317 DEFINE_PER_CPU(struct work_struct, flush_works);
4318
4319 /* Network device is going away, flush any packets still pending */
4320 static void flush_backlog(struct work_struct *work)
4321 {
4322 struct sk_buff *skb, *tmp;
4323 struct softnet_data *sd;
4324
4325 local_bh_disable();
4326 sd = this_cpu_ptr(&softnet_data);
4327
4328 local_irq_disable();
4329 rps_lock(sd);
4330 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4331 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4332 __skb_unlink(skb, &sd->input_pkt_queue);
4333 kfree_skb(skb);
4334 input_queue_head_incr(sd);
4335 }
4336 }
4337 rps_unlock(sd);
4338 local_irq_enable();
4339
4340 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4341 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4342 __skb_unlink(skb, &sd->process_queue);
4343 kfree_skb(skb);
4344 input_queue_head_incr(sd);
4345 }
4346 }
4347 local_bh_enable();
4348 }
4349
4350 static void flush_all_backlogs(void)
4351 {
4352 unsigned int cpu;
4353
4354 get_online_cpus();
4355
4356 for_each_online_cpu(cpu)
4357 queue_work_on(cpu, system_highpri_wq,
4358 per_cpu_ptr(&flush_works, cpu));
4359
4360 for_each_online_cpu(cpu)
4361 flush_work(per_cpu_ptr(&flush_works, cpu));
4362
4363 put_online_cpus();
4364 }
4365
4366 static int napi_gro_complete(struct sk_buff *skb)
4367 {
4368 struct packet_offload *ptype;
4369 __be16 type = skb->protocol;
4370 struct list_head *head = &offload_base;
4371 int err = -ENOENT;
4372
4373 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4374
4375 if (NAPI_GRO_CB(skb)->count == 1) {
4376 skb_shinfo(skb)->gso_size = 0;
4377 goto out;
4378 }
4379
4380 rcu_read_lock();
4381 list_for_each_entry_rcu(ptype, head, list) {
4382 if (ptype->type != type || !ptype->callbacks.gro_complete)
4383 continue;
4384
4385 err = ptype->callbacks.gro_complete(skb, 0);
4386 break;
4387 }
4388 rcu_read_unlock();
4389
4390 if (err) {
4391 WARN_ON(&ptype->list == head);
4392 kfree_skb(skb);
4393 return NET_RX_SUCCESS;
4394 }
4395
4396 out:
4397 return netif_receive_skb_internal(skb);
4398 }
4399
4400 /* napi->gro_list contains packets ordered by age, with the
4401 * youngest packets at the head of the list.
4402 * Complete skbs in reverse order to reduce latencies.
4403 */
4404 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4405 {
4406 struct sk_buff *skb, *prev = NULL;
4407
4408 /* scan list and build reverse chain */
4409 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4410 skb->prev = prev;
4411 prev = skb;
4412 }
4413
4414 for (skb = prev; skb; skb = prev) {
4415 skb->next = NULL;
4416
4417 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4418 return;
4419
4420 prev = skb->prev;
4421 napi_gro_complete(skb);
4422 napi->gro_count--;
4423 }
4424
4425 napi->gro_list = NULL;
4426 }
4427 EXPORT_SYMBOL(napi_gro_flush);
4428
4429 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4430 {
4431 struct sk_buff *p;
4432 unsigned int maclen = skb->dev->hard_header_len;
4433 u32 hash = skb_get_hash_raw(skb);
4434
4435 for (p = napi->gro_list; p; p = p->next) {
4436 unsigned long diffs;
4437
4438 NAPI_GRO_CB(p)->flush = 0;
4439
4440 if (hash != skb_get_hash_raw(p)) {
4441 NAPI_GRO_CB(p)->same_flow = 0;
4442 continue;
4443 }
4444
4445 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4446 diffs |= p->vlan_tci ^ skb->vlan_tci;
4447 diffs |= skb_metadata_dst_cmp(p, skb);
4448 if (maclen == ETH_HLEN)
4449 diffs |= compare_ether_header(skb_mac_header(p),
4450 skb_mac_header(skb));
4451 else if (!diffs)
4452 diffs = memcmp(skb_mac_header(p),
4453 skb_mac_header(skb),
4454 maclen);
4455 NAPI_GRO_CB(p)->same_flow = !diffs;
4456 }
4457 }
4458
4459 static void skb_gro_reset_offset(struct sk_buff *skb)
4460 {
4461 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4462 const skb_frag_t *frag0 = &pinfo->frags[0];
4463
4464 NAPI_GRO_CB(skb)->data_offset = 0;
4465 NAPI_GRO_CB(skb)->frag0 = NULL;
4466 NAPI_GRO_CB(skb)->frag0_len = 0;
4467
4468 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4469 pinfo->nr_frags &&
4470 !PageHighMem(skb_frag_page(frag0))) {
4471 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4472 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4473 skb_frag_size(frag0),
4474 skb->end - skb->tail);
4475 }
4476 }
4477
4478 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4479 {
4480 struct skb_shared_info *pinfo = skb_shinfo(skb);
4481
4482 BUG_ON(skb->end - skb->tail < grow);
4483
4484 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4485
4486 skb->data_len -= grow;
4487 skb->tail += grow;
4488
4489 pinfo->frags[0].page_offset += grow;
4490 skb_frag_size_sub(&pinfo->frags[0], grow);
4491
4492 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4493 skb_frag_unref(skb, 0);
4494 memmove(pinfo->frags, pinfo->frags + 1,
4495 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4496 }
4497 }
4498
4499 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4500 {
4501 struct sk_buff **pp = NULL;
4502 struct packet_offload *ptype;
4503 __be16 type = skb->protocol;
4504 struct list_head *head = &offload_base;
4505 int same_flow;
4506 enum gro_result ret;
4507 int grow;
4508
4509 if (!(skb->dev->features & NETIF_F_GRO))
4510 goto normal;
4511
4512 if (skb->csum_bad)
4513 goto normal;
4514
4515 gro_list_prepare(napi, skb);
4516
4517 rcu_read_lock();
4518 list_for_each_entry_rcu(ptype, head, list) {
4519 if (ptype->type != type || !ptype->callbacks.gro_receive)
4520 continue;
4521
4522 skb_set_network_header(skb, skb_gro_offset(skb));
4523 skb_reset_mac_len(skb);
4524 NAPI_GRO_CB(skb)->same_flow = 0;
4525 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4526 NAPI_GRO_CB(skb)->free = 0;
4527 NAPI_GRO_CB(skb)->encap_mark = 0;
4528 NAPI_GRO_CB(skb)->recursion_counter = 0;
4529 NAPI_GRO_CB(skb)->is_fou = 0;
4530 NAPI_GRO_CB(skb)->is_atomic = 1;
4531 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4532
4533 /* Setup for GRO checksum validation */
4534 switch (skb->ip_summed) {
4535 case CHECKSUM_COMPLETE:
4536 NAPI_GRO_CB(skb)->csum = skb->csum;
4537 NAPI_GRO_CB(skb)->csum_valid = 1;
4538 NAPI_GRO_CB(skb)->csum_cnt = 0;
4539 break;
4540 case CHECKSUM_UNNECESSARY:
4541 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4542 NAPI_GRO_CB(skb)->csum_valid = 0;
4543 break;
4544 default:
4545 NAPI_GRO_CB(skb)->csum_cnt = 0;
4546 NAPI_GRO_CB(skb)->csum_valid = 0;
4547 }
4548
4549 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4550 break;
4551 }
4552 rcu_read_unlock();
4553
4554 if (&ptype->list == head)
4555 goto normal;
4556
4557 same_flow = NAPI_GRO_CB(skb)->same_flow;
4558 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4559
4560 if (pp) {
4561 struct sk_buff *nskb = *pp;
4562
4563 *pp = nskb->next;
4564 nskb->next = NULL;
4565 napi_gro_complete(nskb);
4566 napi->gro_count--;
4567 }
4568
4569 if (same_flow)
4570 goto ok;
4571
4572 if (NAPI_GRO_CB(skb)->flush)
4573 goto normal;
4574
4575 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4576 struct sk_buff *nskb = napi->gro_list;
4577
4578 /* locate the end of the list to select the 'oldest' flow */
4579 while (nskb->next) {
4580 pp = &nskb->next;
4581 nskb = *pp;
4582 }
4583 *pp = NULL;
4584 nskb->next = NULL;
4585 napi_gro_complete(nskb);
4586 } else {
4587 napi->gro_count++;
4588 }
4589 NAPI_GRO_CB(skb)->count = 1;
4590 NAPI_GRO_CB(skb)->age = jiffies;
4591 NAPI_GRO_CB(skb)->last = skb;
4592 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4593 skb->next = napi->gro_list;
4594 napi->gro_list = skb;
4595 ret = GRO_HELD;
4596
4597 pull:
4598 grow = skb_gro_offset(skb) - skb_headlen(skb);
4599 if (grow > 0)
4600 gro_pull_from_frag0(skb, grow);
4601 ok:
4602 return ret;
4603
4604 normal:
4605 ret = GRO_NORMAL;
4606 goto pull;
4607 }
4608
4609 struct packet_offload *gro_find_receive_by_type(__be16 type)
4610 {
4611 struct list_head *offload_head = &offload_base;
4612 struct packet_offload *ptype;
4613
4614 list_for_each_entry_rcu(ptype, offload_head, list) {
4615 if (ptype->type != type || !ptype->callbacks.gro_receive)
4616 continue;
4617 return ptype;
4618 }
4619 return NULL;
4620 }
4621 EXPORT_SYMBOL(gro_find_receive_by_type);
4622
4623 struct packet_offload *gro_find_complete_by_type(__be16 type)
4624 {
4625 struct list_head *offload_head = &offload_base;
4626 struct packet_offload *ptype;
4627
4628 list_for_each_entry_rcu(ptype, offload_head, list) {
4629 if (ptype->type != type || !ptype->callbacks.gro_complete)
4630 continue;
4631 return ptype;
4632 }
4633 return NULL;
4634 }
4635 EXPORT_SYMBOL(gro_find_complete_by_type);
4636
4637 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4638 {
4639 switch (ret) {
4640 case GRO_NORMAL:
4641 if (netif_receive_skb_internal(skb))
4642 ret = GRO_DROP;
4643 break;
4644
4645 case GRO_DROP:
4646 kfree_skb(skb);
4647 break;
4648
4649 case GRO_MERGED_FREE:
4650 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4651 skb_dst_drop(skb);
4652 kmem_cache_free(skbuff_head_cache, skb);
4653 } else {
4654 __kfree_skb(skb);
4655 }
4656 break;
4657
4658 case GRO_HELD:
4659 case GRO_MERGED:
4660 break;
4661 }
4662
4663 return ret;
4664 }
4665
4666 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4667 {
4668 skb_mark_napi_id(skb, napi);
4669 trace_napi_gro_receive_entry(skb);
4670
4671 skb_gro_reset_offset(skb);
4672
4673 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4674 }
4675 EXPORT_SYMBOL(napi_gro_receive);
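/*
 * Usage sketch (hypothetical driver, not taken from this file): the usual
 * NAPI receive loop feeds completed frames through napi_gro_receive() so
 * that consecutive segments of a flow can be coalesced before reaching the
 * stack.  example_fetch_skb() is an assumed helper that pops one frame off
 * the hardware ring and sets skb->protocol.
 */
static struct sk_buff *example_fetch_skb(struct napi_struct *napi);

static int example_gro_poll(struct napi_struct *napi, int budget)
{
	int done = 0;
	struct sk_buff *skb;

	while (done < budget && (skb = example_fetch_skb(napi)) != NULL) {
		napi_gro_receive(napi, skb);
		done++;
	}
	/* completion handling is shown with napi_complete_done() below */
	return done;
}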
4676
4677 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4678 {
4679 if (unlikely(skb->pfmemalloc)) {
4680 consume_skb(skb);
4681 return;
4682 }
4683 __skb_pull(skb, skb_headlen(skb));
4684 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4685 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4686 skb->vlan_tci = 0;
4687 skb->dev = napi->dev;
4688 skb->skb_iif = 0;
4689 skb->encapsulation = 0;
4690 skb_shinfo(skb)->gso_type = 0;
4691 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4692
4693 napi->skb = skb;
4694 }
4695
4696 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4697 {
4698 struct sk_buff *skb = napi->skb;
4699
4700 if (!skb) {
4701 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4702 if (skb) {
4703 napi->skb = skb;
4704 skb_mark_napi_id(skb, napi);
4705 }
4706 }
4707 return skb;
4708 }
4709 EXPORT_SYMBOL(napi_get_frags);
4710
4711 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4712 struct sk_buff *skb,
4713 gro_result_t ret)
4714 {
4715 switch (ret) {
4716 case GRO_NORMAL:
4717 case GRO_HELD:
4718 __skb_push(skb, ETH_HLEN);
4719 skb->protocol = eth_type_trans(skb, skb->dev);
4720 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4721 ret = GRO_DROP;
4722 break;
4723
4724 case GRO_DROP:
4725 case GRO_MERGED_FREE:
4726 napi_reuse_skb(napi, skb);
4727 break;
4728
4729 case GRO_MERGED:
4730 break;
4731 }
4732
4733 return ret;
4734 }
4735
4736 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4737 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4738 * we copy the Ethernet header into skb->data to have a common layout.
4739 */
4740 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4741 {
4742 struct sk_buff *skb = napi->skb;
4743 const struct ethhdr *eth;
4744 unsigned int hlen = sizeof(*eth);
4745
4746 napi->skb = NULL;
4747
4748 skb_reset_mac_header(skb);
4749 skb_gro_reset_offset(skb);
4750
4751 eth = skb_gro_header_fast(skb, 0);
4752 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4753 eth = skb_gro_header_slow(skb, hlen, 0);
4754 if (unlikely(!eth)) {
4755 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4756 __func__, napi->dev->name);
4757 napi_reuse_skb(napi, skb);
4758 return NULL;
4759 }
4760 } else {
4761 gro_pull_from_frag0(skb, hlen);
4762 NAPI_GRO_CB(skb)->frag0 += hlen;
4763 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4764 }
4765 __skb_pull(skb, hlen);
4766
4767 /*
4768 * This works because the only protocols we care about don't require
4769 * special handling.
4770 * We'll fix it up properly in napi_frags_finish()
4771 */
4772 skb->protocol = eth->h_proto;
4773
4774 return skb;
4775 }
4776
4777 gro_result_t napi_gro_frags(struct napi_struct *napi)
4778 {
4779 struct sk_buff *skb = napi_frags_skb(napi);
4780
4781 if (!skb)
4782 return GRO_DROP;
4783
4784 trace_napi_gro_frags_entry(skb);
4785
4786 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4787 }
4788 EXPORT_SYMBOL(napi_gro_frags);
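/*
 * Usage sketch (hypothetical page-based driver, not taken from this file):
 * drivers that receive into pages can skip copying by attaching the page to
 * the skb from napi_get_frags() and then calling napi_gro_frags(), which
 * pulls the Ethernet header itself and consumes napi->skb.  The page,
 * offset and length are assumed to come from a receive descriptor.
 */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(page);			/* drop: no skb available */
		return;
	}

	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);
	napi_gro_frags(napi);
}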
4789
4790 /* Compute the checksum from gro_offset and return the folded value
4791 * after adding in any pseudo checksum.
4792 */
4793 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4794 {
4795 __wsum wsum;
4796 __sum16 sum;
4797
4798 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4799
4800 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4801 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4802 if (likely(!sum)) {
4803 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4804 !skb->csum_complete_sw)
4805 netdev_rx_csum_fault(skb->dev);
4806 }
4807
4808 NAPI_GRO_CB(skb)->csum = wsum;
4809 NAPI_GRO_CB(skb)->csum_valid = 1;
4810
4811 return sum;
4812 }
4813 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4814
4815 /*
4816 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4817 * Note: called with local irq disabled, but exits with local irq enabled.
4818 */
4819 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4820 {
4821 #ifdef CONFIG_RPS
4822 struct softnet_data *remsd = sd->rps_ipi_list;
4823
4824 if (remsd) {
4825 sd->rps_ipi_list = NULL;
4826
4827 local_irq_enable();
4828
4829 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4830 while (remsd) {
4831 struct softnet_data *next = remsd->rps_ipi_next;
4832
4833 if (cpu_online(remsd->cpu))
4834 smp_call_function_single_async(remsd->cpu,
4835 &remsd->csd);
4836 remsd = next;
4837 }
4838 } else
4839 #endif
4840 local_irq_enable();
4841 }
4842
4843 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4844 {
4845 #ifdef CONFIG_RPS
4846 return sd->rps_ipi_list != NULL;
4847 #else
4848 return false;
4849 #endif
4850 }
4851
4852 static int process_backlog(struct napi_struct *napi, int quota)
4853 {
4854 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4855 bool again = true;
4856 int work = 0;
4857
4858 /* Check if we have pending IPIs; it's better to send them now
4859 * rather than waiting for net_rx_action() to end.
4860 */
4861 if (sd_has_rps_ipi_waiting(sd)) {
4862 local_irq_disable();
4863 net_rps_action_and_irq_enable(sd);
4864 }
4865
4866 napi->weight = weight_p;
4867 while (again) {
4868 struct sk_buff *skb;
4869
4870 while ((skb = __skb_dequeue(&sd->process_queue))) {
4871 rcu_read_lock();
4872 __netif_receive_skb(skb);
4873 rcu_read_unlock();
4874 input_queue_head_incr(sd);
4875 if (++work >= quota)
4876 return work;
4877
4878 }
4879
4880 local_irq_disable();
4881 rps_lock(sd);
4882 if (skb_queue_empty(&sd->input_pkt_queue)) {
4883 /*
4884 * Inline a custom version of __napi_complete().
4885 * Only the current CPU owns and manipulates this napi,
4886 * and NAPI_STATE_SCHED is the only possible flag set
4887 * on the backlog.
4888 * We can use a plain write instead of clear_bit(),
4889 * and we don't need an smp_mb() memory barrier.
4890 */
4891 napi->state = 0;
4892 again = false;
4893 } else {
4894 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4895 &sd->process_queue);
4896 }
4897 rps_unlock(sd);
4898 local_irq_enable();
4899 }
4900
4901 return work;
4902 }
4903
4904 /**
4905 * __napi_schedule - schedule for receive
4906 * @n: entry to schedule
4907 *
4908 * The entry's receive function will be scheduled to run.
4909 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4910 */
4911 void __napi_schedule(struct napi_struct *n)
4912 {
4913 unsigned long flags;
4914
4915 local_irq_save(flags);
4916 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4917 local_irq_restore(flags);
4918 }
4919 EXPORT_SYMBOL(__napi_schedule);
4920
4921 /**
4922 * napi_schedule_prep - check if napi can be scheduled
4923 * @n: napi context
4924 *
4925 * Test if NAPI routine is already running, and if not mark
4926 * it as running. This is used as a condition variable to
4927 * ensure that only one NAPI poll instance runs. We also make
4928 * sure there is no pending NAPI disable.
4929 */
4930 bool napi_schedule_prep(struct napi_struct *n)
4931 {
4932 unsigned long val, new;
4933
4934 do {
4935 val = READ_ONCE(n->state);
4936 if (unlikely(val & NAPIF_STATE_DISABLE))
4937 return false;
4938 new = val | NAPIF_STATE_SCHED;
4939
4940 /* Sets STATE_MISSED bit if STATE_SCHED was already set
4941 * This was suggested by Alexander Duyck, as the compiler
4942 * emits better code than:
4943 * if (val & NAPIF_STATE_SCHED)
4944 * new |= NAPIF_STATE_MISSED;
4945 */
4946 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4947 NAPIF_STATE_MISSED;
4948 } while (cmpxchg(&n->state, val, new) != val);
4949
4950 return !(val & NAPIF_STATE_SCHED);
4951 }
4952 EXPORT_SYMBOL(napi_schedule_prep);
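/*
 * Usage sketch (hypothetical driver, not taken from this file): the common
 * hard-IRQ pattern built on napi_schedule_prep() and __napi_schedule().
 * struct example_priv, example_disable_irqs() and the interrupt wiring are
 * assumptions for illustration; the NAPI calls are the real API.
 */
struct example_priv {
	struct napi_struct napi;
	/* ... rings, registers, etc. ... */
};

static void example_disable_irqs(struct example_priv *priv);

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct example_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		example_disable_irqs(priv);	/* quiet the device */
		/* __napi_schedule_irqoff() is an option when hard irqs
		 * are known to be masked here.
		 */
		__napi_schedule(&priv->napi);
	}
	return IRQ_HANDLED;
}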
4953
4954 /**
4955 * __napi_schedule_irqoff - schedule for receive
4956 * @n: entry to schedule
4957 *
4958 * Variant of __napi_schedule() assuming hard irqs are masked
4959 */
4960 void __napi_schedule_irqoff(struct napi_struct *n)
4961 {
4962 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4963 }
4964 EXPORT_SYMBOL(__napi_schedule_irqoff);
4965
4966 bool __napi_complete(struct napi_struct *n)
4967 {
4968 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4969
4970 /* Some drivers call us directly, instead of calling
4971 * napi_complete_done().
4972 */
4973 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4974 return false;
4975
4976 list_del_init(&n->poll_list);
4977 smp_mb__before_atomic();
4978 clear_bit(NAPI_STATE_SCHED, &n->state);
4979 return true;
4980 }
4981 EXPORT_SYMBOL(__napi_complete);
4982
4983 bool napi_complete_done(struct napi_struct *n, int work_done)
4984 {
4985 unsigned long flags, val, new;
4986
4987 /*
4988 * 1) Don't let napi dequeue from the cpu poll list
4989 * just in case it's running on a different CPU.
4990 * 2) If we are busy polling, do nothing here, we have
4991 * the guarantee we will be called later.
4992 */
4993 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4994 NAPIF_STATE_IN_BUSY_POLL)))
4995 return false;
4996
4997 if (n->gro_list) {
4998 unsigned long timeout = 0;
4999
5000 if (work_done)
5001 timeout = n->dev->gro_flush_timeout;
5002
5003 if (timeout)
5004 hrtimer_start(&n->timer, ns_to_ktime(timeout),
5005 HRTIMER_MODE_REL_PINNED);
5006 else
5007 napi_gro_flush(n, false);
5008 }
5009 if (unlikely(!list_empty(&n->poll_list))) {
5010 /* If n->poll_list is not empty, we need to mask irqs */
5011 local_irq_save(flags);
5012 list_del_init(&n->poll_list);
5013 local_irq_restore(flags);
5014 }
5015
5016 do {
5017 val = READ_ONCE(n->state);
5018
5019 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5020
5021 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5022
5023 /* If STATE_MISSED was set, leave STATE_SCHED set,
5024 * because we will call napi->poll() one more time.
5025 * This C code was suggested by Alexander Duyck to help gcc.
5026 */
5027 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5028 NAPIF_STATE_SCHED;
5029 } while (cmpxchg(&n->state, val, new) != val);
5030
5031 if (unlikely(val & NAPIF_STATE_MISSED)) {
5032 __napi_schedule(n);
5033 return false;
5034 }
5035
5036 return true;
5037 }
5038 EXPORT_SYMBOL(napi_complete_done);
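/*
 * Usage sketch (hypothetical driver, not taken from this file): how a poll
 * routine typically finishes, reusing the example_priv layout sketched near
 * napi_schedule_prep() above.  example_clean_rx() and example_enable_irqs()
 * are assumed helpers; interrupts are only re-armed when napi_complete_done()
 * returns true, since a false return means NAPI stays scheduled (busy
 * polling, or a missed schedule that will be replayed).
 */
static int example_clean_rx(struct example_priv *priv, int budget);
static void example_enable_irqs(struct example_priv *priv);

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv,
						 napi);
	int work_done = example_clean_rx(priv, budget);

	if (work_done < budget &&
	    napi_complete_done(napi, work_done))
		example_enable_irqs(priv);	/* re-arm device interrupts */

	return work_done;
}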
5039
5040 /* must be called under rcu_read_lock(), as we don't take a reference */
5041 static struct napi_struct *napi_by_id(unsigned int napi_id)
5042 {
5043 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5044 struct napi_struct *napi;
5045
5046 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5047 if (napi->napi_id == napi_id)
5048 return napi;
5049
5050 return NULL;
5051 }
5052
5053 #if defined(CONFIG_NET_RX_BUSY_POLL)
5054
5055 #define BUSY_POLL_BUDGET 8
5056
5057 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5058 {
5059 int rc;
5060
5061 /* Busy polling means there is a high chance device driver hard irq
5062 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5063 * set in napi_schedule_prep().
5064 * Since we are about to call napi->poll() once more, we can safely
5065 * clear NAPI_STATE_MISSED.
5066 *
5067 * Note: x86 could use a single "lock and ..." instruction
5068 * to perform these two clear_bit()
5069 */
5070 clear_bit(NAPI_STATE_MISSED, &napi->state);
5071 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5072
5073 local_bh_disable();
5074
5075 /* All we really want here is to re-enable device interrupts.
5076 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5077 */
5078 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5079 netpoll_poll_unlock(have_poll_lock);
5080 if (rc == BUSY_POLL_BUDGET)
5081 __napi_schedule(napi);
5082 local_bh_enable();
5083 if (local_softirq_pending())
5084 do_softirq();
5085 }
5086
5087 bool sk_busy_loop(struct sock *sk, int nonblock)
5088 {
5089 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5090 int (*napi_poll)(struct napi_struct *napi, int budget);
5091 int (*busy_poll)(struct napi_struct *dev);
5092 void *have_poll_lock = NULL;
5093 struct napi_struct *napi;
5094 int rc;
5095
5096 restart:
5097 rc = false;
5098 napi_poll = NULL;
5099
5100 rcu_read_lock();
5101
5102 napi = napi_by_id(sk->sk_napi_id);
5103 if (!napi)
5104 goto out;
5105
5106 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5107 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5108
5109 preempt_disable();
5110 for (;;) {
5111 rc = 0;
5112 local_bh_disable();
5113 if (busy_poll) {
5114 rc = busy_poll(napi);
5115 goto count;
5116 }
5117 if (!napi_poll) {
5118 unsigned long val = READ_ONCE(napi->state);
5119
5120 /* If multiple threads are competing for this napi,
5121 * we avoid dirtying napi->state as much as we can.
5122 */
5123 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5124 NAPIF_STATE_IN_BUSY_POLL))
5125 goto count;
5126 if (cmpxchg(&napi->state, val,
5127 val | NAPIF_STATE_IN_BUSY_POLL |
5128 NAPIF_STATE_SCHED) != val)
5129 goto count;
5130 have_poll_lock = netpoll_poll_lock(napi);
5131 napi_poll = napi->poll;
5132 }
5133 rc = napi_poll(napi, BUSY_POLL_BUDGET);
5134 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5135 count:
5136 if (rc > 0)
5137 __NET_ADD_STATS(sock_net(sk),
5138 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5139 local_bh_enable();
5140
5141 if (rc == LL_FLUSH_FAILED)
5142 break; /* permanent failure */
5143
5144 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5145 busy_loop_timeout(end_time))
5146 break;
5147
5148 if (unlikely(need_resched())) {
5149 if (napi_poll)
5150 busy_poll_stop(napi, have_poll_lock);
5151 preempt_enable();
5152 rcu_read_unlock();
5153 cond_resched();
5154 rc = !skb_queue_empty(&sk->sk_receive_queue);
5155 if (rc || busy_loop_timeout(end_time))
5156 return rc;
5157 goto restart;
5158 }
5159 cpu_relax();
5160 }
5161 if (napi_poll)
5162 busy_poll_stop(napi, have_poll_lock);
5163 preempt_enable();
5164 rc = !skb_queue_empty(&sk->sk_receive_queue);
5165 out:
5166 rcu_read_unlock();
5167 return rc;
5168 }
5169 EXPORT_SYMBOL(sk_busy_loop);
5170
5171 #endif /* CONFIG_NET_RX_BUSY_POLL */
5172
5173 static void napi_hash_add(struct napi_struct *napi)
5174 {
5175 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5176 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5177 return;
5178
5179 spin_lock(&napi_hash_lock);
5180
5181 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5182 do {
5183 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5184 napi_gen_id = NR_CPUS + 1;
5185 } while (napi_by_id(napi_gen_id));
5186 napi->napi_id = napi_gen_id;
5187
5188 hlist_add_head_rcu(&napi->napi_hash_node,
5189 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5190
5191 spin_unlock(&napi_hash_lock);
5192 }
5193
5194 /* Warning: the caller is responsible for making sure an RCU grace period
5195 * is respected before freeing the memory containing @napi
5196 */
5197 bool napi_hash_del(struct napi_struct *napi)
5198 {
5199 bool rcu_sync_needed = false;
5200
5201 spin_lock(&napi_hash_lock);
5202
5203 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5204 rcu_sync_needed = true;
5205 hlist_del_rcu(&napi->napi_hash_node);
5206 }
5207 spin_unlock(&napi_hash_lock);
5208 return rcu_sync_needed;
5209 }
5210 EXPORT_SYMBOL_GPL(napi_hash_del);
5211
5212 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5213 {
5214 struct napi_struct *napi;
5215
5216 napi = container_of(timer, struct napi_struct, timer);
5217
5218 /* Note: we use a relaxed variant of napi_schedule_prep(), not setting
5219 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5220 */
5221 if (napi->gro_list && !napi_disable_pending(napi) &&
5222 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5223 __napi_schedule_irqoff(napi);
5224
5225 return HRTIMER_NORESTART;
5226 }
5227
5228 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5229 int (*poll)(struct napi_struct *, int), int weight)
5230 {
5231 INIT_LIST_HEAD(&napi->poll_list);
5232 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5233 napi->timer.function = napi_watchdog;
5234 napi->gro_count = 0;
5235 napi->gro_list = NULL;
5236 napi->skb = NULL;
5237 napi->poll = poll;
5238 if (weight > NAPI_POLL_WEIGHT)
5239 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5240 weight, dev->name);
5241 napi->weight = weight;
5242 list_add(&napi->dev_list, &dev->napi_list);
5243 napi->dev = dev;
5244 #ifdef CONFIG_NETPOLL
5245 napi->poll_owner = -1;
5246 #endif
5247 set_bit(NAPI_STATE_SCHED, &napi->state);
5248 napi_hash_add(napi);
5249 }
5250 EXPORT_SYMBOL(netif_napi_add);
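/*
 * Usage sketch (hypothetical driver, not taken from this file): NAPI
 * registration normally happens at probe time, before register_netdev(),
 * and is paired with napi_enable()/napi_disable() in ndo_open()/ndo_stop()
 * and netif_napi_del() on teardown.  example_poll() and example_priv are
 * the assumed helpers sketched above.
 */
static void example_setup_napi(struct net_device *dev,
			       struct example_priv *priv)
{
	netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
	/* ndo_open():  napi_enable(&priv->napi);   */
	/* ndo_stop():  napi_disable(&priv->napi);  */
	/* teardown:    netif_napi_del(&priv->napi); */
}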
5251
5252 void napi_disable(struct napi_struct *n)
5253 {
5254 might_sleep();
5255 set_bit(NAPI_STATE_DISABLE, &n->state);
5256
5257 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5258 msleep(1);
5259 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5260 msleep(1);
5261
5262 hrtimer_cancel(&n->timer);
5263
5264 clear_bit(NAPI_STATE_DISABLE, &n->state);
5265 }
5266 EXPORT_SYMBOL(napi_disable);
5267
5268 /* Must be called in process context */
5269 void netif_napi_del(struct napi_struct *napi)
5270 {
5271 might_sleep();
5272 if (napi_hash_del(napi))
5273 synchronize_net();
5274 list_del_init(&napi->dev_list);
5275 napi_free_frags(napi);
5276
5277 kfree_skb_list(napi->gro_list);
5278 napi->gro_list = NULL;
5279 napi->gro_count = 0;
5280 }
5281 EXPORT_SYMBOL(netif_napi_del);
5282
5283 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5284 {
5285 void *have;
5286 int work, weight;
5287
5288 list_del_init(&n->poll_list);
5289
5290 have = netpoll_poll_lock(n);
5291
5292 weight = n->weight;
5293
5294 /* This NAPI_STATE_SCHED test is for avoiding a race
5295 * with netpoll's poll_napi(). Only the entity which
5296 * obtains the lock and sees NAPI_STATE_SCHED set will
5297 * actually make the ->poll() call. Therefore we avoid
5298 * accidentally calling ->poll() when NAPI is not scheduled.
5299 */
5300 work = 0;
5301 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5302 work = n->poll(n, weight);
5303 trace_napi_poll(n, work, weight);
5304 }
5305
5306 WARN_ON_ONCE(work > weight);
5307
5308 if (likely(work < weight))
5309 goto out_unlock;
5310
5311 /* Drivers must not modify the NAPI state if they
5312 * consume the entire weight. In such cases this code
5313 * still "owns" the NAPI instance and therefore can
5314 * move the instance around on the list at-will.
5315 */
5316 if (unlikely(napi_disable_pending(n))) {
5317 napi_complete(n);
5318 goto out_unlock;
5319 }
5320
5321 if (n->gro_list) {
5322 /* flush too old packets
5323 * If HZ < 1000, flush all packets.
5324 */
5325 napi_gro_flush(n, HZ >= 1000);
5326 }
5327
5328 /* Some drivers may have called napi_schedule
5329 * prior to exhausting their budget.
5330 */
5331 if (unlikely(!list_empty(&n->poll_list))) {
5332 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5333 n->dev ? n->dev->name : "backlog");
5334 goto out_unlock;
5335 }
5336
5337 list_add_tail(&n->poll_list, repoll);
5338
5339 out_unlock:
5340 netpoll_poll_unlock(have);
5341
5342 return work;
5343 }
5344
5345 static __latent_entropy void net_rx_action(struct softirq_action *h)
5346 {
5347 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5348 unsigned long time_limit = jiffies + 2;
5349 int budget = netdev_budget;
5350 LIST_HEAD(list);
5351 LIST_HEAD(repoll);
5352
5353 local_irq_disable();
5354 list_splice_init(&sd->poll_list, &list);
5355 local_irq_enable();
5356
5357 for (;;) {
5358 struct napi_struct *n;
5359
5360 if (list_empty(&list)) {
5361 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5362 goto out;
5363 break;
5364 }
5365
5366 n = list_first_entry(&list, struct napi_struct, poll_list);
5367 budget -= napi_poll(n, &repoll);
5368
5369 /* If softirq window is exhausted then punt.
5370 * Allow this to run for 2 jiffies, which allows
5371 * an average latency of 1.5/HZ.
5372 */
5373 if (unlikely(budget <= 0 ||
5374 time_after_eq(jiffies, time_limit))) {
5375 sd->time_squeeze++;
5376 break;
5377 }
5378 }
5379
5380 local_irq_disable();
5381
5382 list_splice_tail_init(&sd->poll_list, &list);
5383 list_splice_tail(&repoll, &list);
5384 list_splice(&list, &sd->poll_list);
5385 if (!list_empty(&sd->poll_list))
5386 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5387
5388 net_rps_action_and_irq_enable(sd);
5389 out:
5390 __kfree_skb_flush();
5391 }
5392
5393 struct netdev_adjacent {
5394 struct net_device *dev;
5395
5396 /* upper master flag, there can only be one master device per list */
5397 bool master;
5398
5399 /* counter for the number of times this device was added to us */
5400 u16 ref_nr;
5401
5402 /* private field for the users */
5403 void *private;
5404
5405 struct list_head list;
5406 struct rcu_head rcu;
5407 };
5408
5409 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5410 struct list_head *adj_list)
5411 {
5412 struct netdev_adjacent *adj;
5413
5414 list_for_each_entry(adj, adj_list, list) {
5415 if (adj->dev == adj_dev)
5416 return adj;
5417 }
5418 return NULL;
5419 }
5420
5421 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5422 {
5423 struct net_device *dev = data;
5424
5425 return upper_dev == dev;
5426 }
5427
5428 /**
5429 * netdev_has_upper_dev - Check if device is linked to an upper device
5430 * @dev: device
5431 * @upper_dev: upper device to check
5432 *
5433 * Find out if a device is linked to the specified upper device and return
5434 * true in case it is. Note that this checks only the immediate upper device,
5435 * not the complete stack of devices. The caller must hold the RTNL lock.
5436 */
5437 bool netdev_has_upper_dev(struct net_device *dev,
5438 struct net_device *upper_dev)
5439 {
5440 ASSERT_RTNL();
5441
5442 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5443 upper_dev);
5444 }
5445 EXPORT_SYMBOL(netdev_has_upper_dev);
5446
5447 /**
5448 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5449 * @dev: device
5450 * @upper_dev: upper device to check
5451 *
5452 * Find out if a device is linked to the specified upper device and return
5453 * true in case it is. Note that this checks the entire upper device chain.
5454 * The caller must hold the RCU read lock.
5455 */
5456
5457 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5458 struct net_device *upper_dev)
5459 {
5460 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5461 upper_dev);
5462 }
5463 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5464
5465 /**
5466 * netdev_has_any_upper_dev - Check if device is linked to some device
5467 * @dev: device
5468 *
5469 * Find out if a device is linked to an upper device and return true in case
5470 * it is. The caller must hold the RTNL lock.
5471 */
5472 static bool netdev_has_any_upper_dev(struct net_device *dev)
5473 {
5474 ASSERT_RTNL();
5475
5476 return !list_empty(&dev->adj_list.upper);
5477 }
5478
5479 /**
5480 * netdev_master_upper_dev_get - Get master upper device
5481 * @dev: device
5482 *
5483 * Find a master upper device and return pointer to it or NULL in case
5484 * it's not there. The caller must hold the RTNL lock.
5485 */
5486 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5487 {
5488 struct netdev_adjacent *upper;
5489
5490 ASSERT_RTNL();
5491
5492 if (list_empty(&dev->adj_list.upper))
5493 return NULL;
5494
5495 upper = list_first_entry(&dev->adj_list.upper,
5496 struct netdev_adjacent, list);
5497 if (likely(upper->master))
5498 return upper->dev;
5499 return NULL;
5500 }
5501 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5502
5503 /**
5504 * netdev_has_any_lower_dev - Check if device is linked to some device
5505 * @dev: device
5506 *
5507 * Find out if a device is linked to a lower device and return true in case
5508 * it is. The caller must hold the RTNL lock.
5509 */
5510 static bool netdev_has_any_lower_dev(struct net_device *dev)
5511 {
5512 ASSERT_RTNL();
5513
5514 return !list_empty(&dev->adj_list.lower);
5515 }
5516
5517 void *netdev_adjacent_get_private(struct list_head *adj_list)
5518 {
5519 struct netdev_adjacent *adj;
5520
5521 adj = list_entry(adj_list, struct netdev_adjacent, list);
5522
5523 return adj->private;
5524 }
5525 EXPORT_SYMBOL(netdev_adjacent_get_private);
5526
5527 /**
5528 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5529 * @dev: device
5530 * @iter: list_head ** of the current position
5531 *
5532 * Gets the next device from the dev's upper list, starting from iter
5533 * position. The caller must hold RCU read lock.
5534 */
5535 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5536 struct list_head **iter)
5537 {
5538 struct netdev_adjacent *upper;
5539
5540 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5541
5542 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5543
5544 if (&upper->list == &dev->adj_list.upper)
5545 return NULL;
5546
5547 *iter = &upper->list;
5548
5549 return upper->dev;
5550 }
5551 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5552
5553 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5554 struct list_head **iter)
5555 {
5556 struct netdev_adjacent *upper;
5557
5558 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5559
5560 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5561
5562 if (&upper->list == &dev->adj_list.upper)
5563 return NULL;
5564
5565 *iter = &upper->list;
5566
5567 return upper->dev;
5568 }
5569
5570 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5571 int (*fn)(struct net_device *dev,
5572 void *data),
5573 void *data)
5574 {
5575 struct net_device *udev;
5576 struct list_head *iter;
5577 int ret;
5578
5579 for (iter = &dev->adj_list.upper,
5580 udev = netdev_next_upper_dev_rcu(dev, &iter);
5581 udev;
5582 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5583 /* first is the upper device itself */
5584 ret = fn(udev, data);
5585 if (ret)
5586 return ret;
5587
5588 /* then look at all of its upper devices */
5589 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5590 if (ret)
5591 return ret;
5592 }
5593
5594 return 0;
5595 }
5596 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
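/*
 * Usage sketch (hypothetical helper, not taken from this file): the walker
 * invokes the callback for every device in the upper chain and stops as soon
 * as the callback returns non-zero.  Here the opaque data pointer carries a
 * counter; the RCU read lock satisfies the locking rule checked above.
 */
static int example_count_one_upper(struct net_device *upper, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;			/* keep walking */
}

static unsigned int example_count_uppers(struct net_device *dev)
{
	unsigned int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, example_count_one_upper, &count);
	rcu_read_unlock();

	return count;
}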
5597
5598 /**
5599 * netdev_lower_get_next_private - Get the next ->private from the
5600 * lower neighbour list
5601 * @dev: device
5602 * @iter: list_head ** of the current position
5603 *
5604 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5605 * list, starting from iter position. The caller must either hold the
5606 * RTNL lock or its own locking that guarantees that the neighbour lower
5607 * list will remain unchanged.
5608 */
5609 void *netdev_lower_get_next_private(struct net_device *dev,
5610 struct list_head **iter)
5611 {
5612 struct netdev_adjacent *lower;
5613
5614 lower = list_entry(*iter, struct netdev_adjacent, list);
5615
5616 if (&lower->list == &dev->adj_list.lower)
5617 return NULL;
5618
5619 *iter = lower->list.next;
5620
5621 return lower->private;
5622 }
5623 EXPORT_SYMBOL(netdev_lower_get_next_private);
5624
5625 /**
5626 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5627 * lower neighbour list, RCU
5628 * variant
5629 * @dev: device
5630 * @iter: list_head ** of the current position
5631 *
5632 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5633 * list, starting from iter position. The caller must hold RCU read lock.
5634 */
5635 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5636 struct list_head **iter)
5637 {
5638 struct netdev_adjacent *lower;
5639
5640 WARN_ON_ONCE(!rcu_read_lock_held());
5641
5642 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5643
5644 if (&lower->list == &dev->adj_list.lower)
5645 return NULL;
5646
5647 *iter = &lower->list;
5648
5649 return lower->private;
5650 }
5651 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5652
5653 /**
5654 * netdev_lower_get_next - Get the next device from the lower neighbour
5655 * list
5656 * @dev: device
5657 * @iter: list_head ** of the current position
5658 *
5659 * Gets the next netdev_adjacent from the dev's lower neighbour
5660 * list, starting from iter position. The caller must hold the RTNL lock or
5661 * its own locking that guarantees that the neighbour lower
5662 * list will remain unchanged.
5663 */
5664 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5665 {
5666 struct netdev_adjacent *lower;
5667
5668 lower = list_entry(*iter, struct netdev_adjacent, list);
5669
5670 if (&lower->list == &dev->adj_list.lower)
5671 return NULL;
5672
5673 *iter = lower->list.next;
5674
5675 return lower->dev;
5676 }
5677 EXPORT_SYMBOL(netdev_lower_get_next);
5678
5679 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5680 struct list_head **iter)
5681 {
5682 struct netdev_adjacent *lower;
5683
5684 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5685
5686 if (&lower->list == &dev->adj_list.lower)
5687 return NULL;
5688
5689 *iter = &lower->list;
5690
5691 return lower->dev;
5692 }
5693
5694 int netdev_walk_all_lower_dev(struct net_device *dev,
5695 int (*fn)(struct net_device *dev,
5696 void *data),
5697 void *data)
5698 {
5699 struct net_device *ldev;
5700 struct list_head *iter;
5701 int ret;
5702
5703 for (iter = &dev->adj_list.lower,
5704 ldev = netdev_next_lower_dev(dev, &iter);
5705 ldev;
5706 ldev = netdev_next_lower_dev(dev, &iter)) {
5707 /* first is the lower device itself */
5708 ret = fn(ldev, data);
5709 if (ret)
5710 return ret;
5711
5712 /* then look at all of its lower devices */
5713 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5714 if (ret)
5715 return ret;
5716 }
5717
5718 return 0;
5719 }
5720 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5721
5722 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5723 struct list_head **iter)
5724 {
5725 struct netdev_adjacent *lower;
5726
5727 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5728 if (&lower->list == &dev->adj_list.lower)
5729 return NULL;
5730
5731 *iter = &lower->list;
5732
5733 return lower->dev;
5734 }
5735
5736 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5737 int (*fn)(struct net_device *dev,
5738 void *data),
5739 void *data)
5740 {
5741 struct net_device *ldev;
5742 struct list_head *iter;
5743 int ret;
5744
5745 for (iter = &dev->adj_list.lower,
5746 ldev = netdev_next_lower_dev_rcu(dev, &iter);
5747 ldev;
5748 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5749 /* first is the lower device itself */
5750 ret = fn(ldev, data);
5751 if (ret)
5752 return ret;
5753
5754 /* then look at all of its lower devices */
5755 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5756 if (ret)
5757 return ret;
5758 }
5759
5760 return 0;
5761 }
5762 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5763
5764 /**
5765 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5766 * lower neighbour list, RCU
5767 * variant
5768 * @dev: device
5769 *
5770 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5771 * list. The caller must hold RCU read lock.
5772 */
5773 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5774 {
5775 struct netdev_adjacent *lower;
5776
5777 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5778 struct netdev_adjacent, list);
5779 if (lower)
5780 return lower->private;
5781 return NULL;
5782 }
5783 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5784
5785 /**
5786 * netdev_master_upper_dev_get_rcu - Get master upper device
5787 * @dev: device
5788 *
5789 * Find a master upper device and return pointer to it or NULL in case
5790 * it's not there. The caller must hold the RCU read lock.
5791 */
5792 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5793 {
5794 struct netdev_adjacent *upper;
5795
5796 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5797 struct netdev_adjacent, list);
5798 if (upper && likely(upper->master))
5799 return upper->dev;
5800 return NULL;
5801 }
5802 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5803
5804 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5805 struct net_device *adj_dev,
5806 struct list_head *dev_list)
5807 {
5808 char linkname[IFNAMSIZ+7];
5809 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5810 "upper_%s" : "lower_%s", adj_dev->name);
5811 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5812 linkname);
5813 }
5814 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5815 char *name,
5816 struct list_head *dev_list)
5817 {
5818 char linkname[IFNAMSIZ+7];
5819 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5820 "upper_%s" : "lower_%s", name);
5821 sysfs_remove_link(&(dev->dev.kobj), linkname);
5822 }
5823
5824 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5825 struct net_device *adj_dev,
5826 struct list_head *dev_list)
5827 {
5828 return (dev_list == &dev->adj_list.upper ||
5829 dev_list == &dev->adj_list.lower) &&
5830 net_eq(dev_net(dev), dev_net(adj_dev));
5831 }
5832
5833 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5834 struct net_device *adj_dev,
5835 struct list_head *dev_list,
5836 void *private, bool master)
5837 {
5838 struct netdev_adjacent *adj;
5839 int ret;
5840
5841 adj = __netdev_find_adj(adj_dev, dev_list);
5842
5843 if (adj) {
5844 adj->ref_nr += 1;
5845 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5846 dev->name, adj_dev->name, adj->ref_nr);
5847
5848 return 0;
5849 }
5850
5851 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5852 if (!adj)
5853 return -ENOMEM;
5854
5855 adj->dev = adj_dev;
5856 adj->master = master;
5857 adj->ref_nr = 1;
5858 adj->private = private;
5859 dev_hold(adj_dev);
5860
5861 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5862 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5863
5864 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5865 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5866 if (ret)
5867 goto free_adj;
5868 }
5869
5870 /* Ensure that master link is always the first item in list. */
5871 if (master) {
5872 ret = sysfs_create_link(&(dev->dev.kobj),
5873 &(adj_dev->dev.kobj), "master");
5874 if (ret)
5875 goto remove_symlinks;
5876
5877 list_add_rcu(&adj->list, dev_list);
5878 } else {
5879 list_add_tail_rcu(&adj->list, dev_list);
5880 }
5881
5882 return 0;
5883
5884 remove_symlinks:
5885 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5886 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5887 free_adj:
5888 kfree(adj);
5889 dev_put(adj_dev);
5890
5891 return ret;
5892 }
5893
5894 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5895 struct net_device *adj_dev,
5896 u16 ref_nr,
5897 struct list_head *dev_list)
5898 {
5899 struct netdev_adjacent *adj;
5900
5901 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5902 dev->name, adj_dev->name, ref_nr);
5903
5904 adj = __netdev_find_adj(adj_dev, dev_list);
5905
5906 if (!adj) {
5907 pr_err("Adjacency does not exist for device %s from %s\n",
5908 dev->name, adj_dev->name);
5909 WARN_ON(1);
5910 return;
5911 }
5912
5913 if (adj->ref_nr > ref_nr) {
5914 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5915 dev->name, adj_dev->name, ref_nr,
5916 adj->ref_nr - ref_nr);
5917 adj->ref_nr -= ref_nr;
5918 return;
5919 }
5920
5921 if (adj->master)
5922 sysfs_remove_link(&(dev->dev.kobj), "master");
5923
5924 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5925 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5926
5927 list_del_rcu(&adj->list);
5928 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5929 adj_dev->name, dev->name, adj_dev->name);
5930 dev_put(adj_dev);
5931 kfree_rcu(adj, rcu);
5932 }
5933
5934 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5935 struct net_device *upper_dev,
5936 struct list_head *up_list,
5937 struct list_head *down_list,
5938 void *private, bool master)
5939 {
5940 int ret;
5941
5942 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5943 private, master);
5944 if (ret)
5945 return ret;
5946
5947 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5948 private, false);
5949 if (ret) {
5950 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5951 return ret;
5952 }
5953
5954 return 0;
5955 }
5956
5957 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5958 struct net_device *upper_dev,
5959 u16 ref_nr,
5960 struct list_head *up_list,
5961 struct list_head *down_list)
5962 {
5963 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5964 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5965 }
5966
5967 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5968 struct net_device *upper_dev,
5969 void *private, bool master)
5970 {
5971 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5972 &dev->adj_list.upper,
5973 &upper_dev->adj_list.lower,
5974 private, master);
5975 }
5976
5977 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5978 struct net_device *upper_dev)
5979 {
5980 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5981 &dev->adj_list.upper,
5982 &upper_dev->adj_list.lower);
5983 }
5984
5985 static int __netdev_upper_dev_link(struct net_device *dev,
5986 struct net_device *upper_dev, bool master,
5987 void *upper_priv, void *upper_info)
5988 {
5989 struct netdev_notifier_changeupper_info changeupper_info;
5990 int ret = 0;
5991
5992 ASSERT_RTNL();
5993
5994 if (dev == upper_dev)
5995 return -EBUSY;
5996
5997 /* To prevent loops, make sure dev is not already an upper device of upper_dev. */
5998 if (netdev_has_upper_dev(upper_dev, dev))
5999 return -EBUSY;
6000
6001 if (netdev_has_upper_dev(dev, upper_dev))
6002 return -EEXIST;
6003
6004 if (master && netdev_master_upper_dev_get(dev))
6005 return -EBUSY;
6006
6007 changeupper_info.upper_dev = upper_dev;
6008 changeupper_info.master = master;
6009 changeupper_info.linking = true;
6010 changeupper_info.upper_info = upper_info;
6011
6012 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6013 &changeupper_info.info);
6014 ret = notifier_to_errno(ret);
6015 if (ret)
6016 return ret;
6017
6018 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6019 master);
6020 if (ret)
6021 return ret;
6022
6023 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6024 &changeupper_info.info);
6025 ret = notifier_to_errno(ret);
6026 if (ret)
6027 goto rollback;
6028
6029 return 0;
6030
6031 rollback:
6032 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6033
6034 return ret;
6035 }
6036
6037 /**
6038 * netdev_upper_dev_link - Add a link to the upper device
6039 * @dev: device
6040 * @upper_dev: new upper device
6041 *
6042 * Adds a link to device which is upper to this one. The caller must hold
6043 * the RTNL lock. On a failure a negative errno code is returned.
6044 * On success the reference counts are adjusted and the function
6045 * returns zero.
6046 */
6047 int netdev_upper_dev_link(struct net_device *dev,
6048 struct net_device *upper_dev)
6049 {
6050 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
6051 }
6052 EXPORT_SYMBOL(netdev_upper_dev_link);
6053
6054 /**
6055 * netdev_master_upper_dev_link - Add a master link to the upper device
6056 * @dev: device
6057 * @upper_dev: new upper device
6058 * @upper_priv: upper device private
6059 * @upper_info: upper info to be passed down via notifier
6060 *
6061 * Adds a link to device which is upper to this one. In this case, only
6062 * one master upper device can be linked, although other non-master devices
6063 * might be linked as well. The caller must hold the RTNL lock.
6064 * On a failure a negative errno code is returned. On success the reference
6065 * counts are adjusted and the function returns zero.
6066 */
6067 int netdev_master_upper_dev_link(struct net_device *dev,
6068 struct net_device *upper_dev,
6069 void *upper_priv, void *upper_info)
6070 {
6071 return __netdev_upper_dev_link(dev, upper_dev, true,
6072 upper_priv, upper_info);
6073 }
6074 EXPORT_SYMBOL(netdev_master_upper_dev_link);
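
/*
 * Illustrative sketch only: a hypothetical bonding-style driver enslaving
 * @slave under @master through the helper documented above. The function
 * name is an assumption for illustration, not an existing kernel symbol;
 * the caller must hold RTNL.
 */
static int __maybe_unused example_enslave(struct net_device *master,
					  struct net_device *slave)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(slave, master, NULL, NULL);
	if (err)	/* e.g. -EBUSY if slave already has a master */
		return err;

	/* Later failure paths would roll this back with:
	 *	netdev_upper_dev_unlink(slave, master);
	 */
	return 0;
}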
6075
6076 /**
6077 * netdev_upper_dev_unlink - Removes a link to upper device
6078 * @dev: device
6079 * @upper_dev: upper device to remove the link to
6080 *
6081 * Removes a link to device which is upper to this one. The caller must hold
6082 * the RTNL lock.
6083 */
6084 void netdev_upper_dev_unlink(struct net_device *dev,
6085 struct net_device *upper_dev)
6086 {
6087 struct netdev_notifier_changeupper_info changeupper_info;
6088 ASSERT_RTNL();
6089
6090 changeupper_info.upper_dev = upper_dev;
6091 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6092 changeupper_info.linking = false;
6093
6094 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6095 &changeupper_info.info);
6096
6097 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6098
6099 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6100 &changeupper_info.info);
6101 }
6102 EXPORT_SYMBOL(netdev_upper_dev_unlink);
6103
6104 /**
6105 * netdev_bonding_info_change - Dispatch event about slave change
6106 * @dev: device
6107 * @bonding_info: info to dispatch
6108 *
6109 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6110 * The caller must hold the RTNL lock.
6111 */
6112 void netdev_bonding_info_change(struct net_device *dev,
6113 struct netdev_bonding_info *bonding_info)
6114 {
6115 struct netdev_notifier_bonding_info info;
6116
6117 memcpy(&info.bonding_info, bonding_info,
6118 sizeof(struct netdev_bonding_info));
6119 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6120 &info.info);
6121 }
6122 EXPORT_SYMBOL(netdev_bonding_info_change);
6123
6124 static void netdev_adjacent_add_links(struct net_device *dev)
6125 {
6126 struct netdev_adjacent *iter;
6127
6128 struct net *net = dev_net(dev);
6129
6130 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6131 if (!net_eq(net, dev_net(iter->dev)))
6132 continue;
6133 netdev_adjacent_sysfs_add(iter->dev, dev,
6134 &iter->dev->adj_list.lower);
6135 netdev_adjacent_sysfs_add(dev, iter->dev,
6136 &dev->adj_list.upper);
6137 }
6138
6139 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6140 if (!net_eq(net, dev_net(iter->dev)))
6141 continue;
6142 netdev_adjacent_sysfs_add(iter->dev, dev,
6143 &iter->dev->adj_list.upper);
6144 netdev_adjacent_sysfs_add(dev, iter->dev,
6145 &dev->adj_list.lower);
6146 }
6147 }
6148
6149 static void netdev_adjacent_del_links(struct net_device *dev)
6150 {
6151 struct netdev_adjacent *iter;
6152
6153 struct net *net = dev_net(dev);
6154
6155 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6156 if (!net_eq(net, dev_net(iter->dev)))
6157 continue;
6158 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6159 &iter->dev->adj_list.lower);
6160 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6161 &dev->adj_list.upper);
6162 }
6163
6164 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6165 if (!net_eq(net, dev_net(iter->dev)))
6166 continue;
6167 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6168 &iter->dev->adj_list.upper);
6169 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6170 &dev->adj_list.lower);
6171 }
6172 }
6173
6174 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6175 {
6176 struct netdev_adjacent *iter;
6177
6178 struct net *net = dev_net(dev);
6179
6180 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6181 if (!net_eq(net, dev_net(iter->dev)))
6182 continue;
6183 netdev_adjacent_sysfs_del(iter->dev, oldname,
6184 &iter->dev->adj_list.lower);
6185 netdev_adjacent_sysfs_add(iter->dev, dev,
6186 &iter->dev->adj_list.lower);
6187 }
6188
6189 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6190 if (!net_eq(net, dev_net(iter->dev)))
6191 continue;
6192 netdev_adjacent_sysfs_del(iter->dev, oldname,
6193 &iter->dev->adj_list.upper);
6194 netdev_adjacent_sysfs_add(iter->dev, dev,
6195 &iter->dev->adj_list.upper);
6196 }
6197 }
6198
6199 void *netdev_lower_dev_get_private(struct net_device *dev,
6200 struct net_device *lower_dev)
6201 {
6202 struct netdev_adjacent *lower;
6203
6204 if (!lower_dev)
6205 return NULL;
6206 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6207 if (!lower)
6208 return NULL;
6209
6210 return lower->private;
6211 }
6212 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6213
6214
6215 int dev_get_nest_level(struct net_device *dev)
6216 {
6217 struct net_device *lower = NULL;
6218 struct list_head *iter;
6219 int max_nest = -1;
6220 int nest;
6221
6222 ASSERT_RTNL();
6223
6224 netdev_for_each_lower_dev(dev, lower, iter) {
6225 nest = dev_get_nest_level(lower);
6226 if (max_nest < nest)
6227 max_nest = nest;
6228 }
6229
6230 return max_nest + 1;
6231 }
6232 EXPORT_SYMBOL(dev_get_nest_level);
6233
6234 /**
6235 * netdev_lower_state_changed - Dispatch event about lower device state change
6236 * @lower_dev: device
6237 * @lower_state_info: state to dispatch
6238 *
6239 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6240 * The caller must hold the RTNL lock.
6241 */
6242 void netdev_lower_state_changed(struct net_device *lower_dev,
6243 void *lower_state_info)
6244 {
6245 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6246
6247 ASSERT_RTNL();
6248 changelowerstate_info.lower_state_info = lower_state_info;
6249 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6250 &changelowerstate_info.info);
6251 }
6252 EXPORT_SYMBOL(netdev_lower_state_changed);
6253
6254 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6255 struct neighbour *n)
6256 {
6257 struct net_device *lower_dev, *stop_dev;
6258 struct list_head *iter;
6259 int err;
6260
6261 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6262 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6263 continue;
6264 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6265 if (err) {
6266 stop_dev = lower_dev;
6267 goto rollback;
6268 }
6269 }
6270 return 0;
6271
6272 rollback:
6273 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6274 if (lower_dev == stop_dev)
6275 break;
6276 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6277 continue;
6278 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6279 }
6280 return err;
6281 }
6282 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6283
6284 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6285 struct neighbour *n)
6286 {
6287 struct net_device *lower_dev;
6288 struct list_head *iter;
6289
6290 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6291 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6292 continue;
6293 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6294 }
6295 }
6296 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6297
6298 static void dev_change_rx_flags(struct net_device *dev, int flags)
6299 {
6300 const struct net_device_ops *ops = dev->netdev_ops;
6301
6302 if (ops->ndo_change_rx_flags)
6303 ops->ndo_change_rx_flags(dev, flags);
6304 }
6305
6306 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6307 {
6308 unsigned int old_flags = dev->flags;
6309 kuid_t uid;
6310 kgid_t gid;
6311
6312 ASSERT_RTNL();
6313
6314 dev->flags |= IFF_PROMISC;
6315 dev->promiscuity += inc;
6316 if (dev->promiscuity == 0) {
6317 /*
6318 * Avoid overflow.
6319 * If inc causes overflow, leave promiscuity untouched and return an error.
6320 */
6321 if (inc < 0)
6322 dev->flags &= ~IFF_PROMISC;
6323 else {
6324 dev->promiscuity -= inc;
6325 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6326 dev->name);
6327 return -EOVERFLOW;
6328 }
6329 }
6330 if (dev->flags != old_flags) {
6331 pr_info("device %s %s promiscuous mode\n",
6332 dev->name,
6333 dev->flags & IFF_PROMISC ? "entered" : "left");
6334 if (audit_enabled) {
6335 current_uid_gid(&uid, &gid);
6336 audit_log(current->audit_context, GFP_ATOMIC,
6337 AUDIT_ANOM_PROMISCUOUS,
6338 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6339 dev->name, (dev->flags & IFF_PROMISC),
6340 (old_flags & IFF_PROMISC),
6341 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6342 from_kuid(&init_user_ns, uid),
6343 from_kgid(&init_user_ns, gid),
6344 audit_get_sessionid(current));
6345 }
6346
6347 dev_change_rx_flags(dev, IFF_PROMISC);
6348 }
6349 if (notify)
6350 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6351 return 0;
6352 }
6353
6354 /**
6355 * dev_set_promiscuity - update promiscuity count on a device
6356 * @dev: device
6357 * @inc: modifier
6358 *
6359 * Add or remove promiscuity from a device. While the count in the device
6360 * remains above zero the interface remains promiscuous. Once it hits zero
6361 * the device reverts to normal filtering operation. A negative inc
6362 * value is used to drop promiscuity on the device.
6363 * Return 0 if successful or a negative errno code on error.
6364 */
6365 int dev_set_promiscuity(struct net_device *dev, int inc)
6366 {
6367 unsigned int old_flags = dev->flags;
6368 int err;
6369
6370 err = __dev_set_promiscuity(dev, inc, true);
6371 if (err < 0)
6372 return err;
6373 if (dev->flags != old_flags)
6374 dev_set_rx_mode(dev);
6375 return err;
6376 }
6377 EXPORT_SYMBOL(dev_set_promiscuity);
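
/*
 * Illustrative sketch only: entering promiscuous mode for the duration of
 * a capture-style session and dropping it again afterwards, so the
 * per-device count stays balanced. The helper name is hypothetical; the
 * caller is assumed to hold RTNL.
 */
static int __maybe_unused example_capture_session(struct net_device *dev)
{
	int err;

	ASSERT_RTNL();
	err = dev_set_promiscuity(dev, 1);	/* count goes 0 -> 1 */
	if (err < 0)
		return err;

	/* ... receive traffic ... */

	dev_set_promiscuity(dev, -1);		/* count back to 0 */
	return 0;
}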
6378
6379 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6380 {
6381 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6382
6383 ASSERT_RTNL();
6384
6385 dev->flags |= IFF_ALLMULTI;
6386 dev->allmulti += inc;
6387 if (dev->allmulti == 0) {
6388 /*
6389 * Avoid overflow.
6390 * If inc causes overflow, leave allmulti untouched and return an error.
6391 */
6392 if (inc < 0)
6393 dev->flags &= ~IFF_ALLMULTI;
6394 else {
6395 dev->allmulti -= inc;
6396 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6397 dev->name);
6398 return -EOVERFLOW;
6399 }
6400 }
6401 if (dev->flags ^ old_flags) {
6402 dev_change_rx_flags(dev, IFF_ALLMULTI);
6403 dev_set_rx_mode(dev);
6404 if (notify)
6405 __dev_notify_flags(dev, old_flags,
6406 dev->gflags ^ old_gflags);
6407 }
6408 return 0;
6409 }
6410
6411 /**
6412 * dev_set_allmulti - update allmulti count on a device
6413 * @dev: device
6414 * @inc: modifier
6415 *
6416 * Add or remove reception of all multicast frames to a device. While the
6417 * count in the device remains above zero the interface remains listening
6418 * to all multicast frames. Once it hits zero the device reverts to normal
6419 * filtering operation. A negative @inc value is used to drop the counter
6420 * when releasing a resource needing all multicasts.
6421 * Return 0 if successful or a negative errno code on error.
6422 */
6423
6424 int dev_set_allmulti(struct net_device *dev, int inc)
6425 {
6426 return __dev_set_allmulti(dev, inc, true);
6427 }
6428 EXPORT_SYMBOL(dev_set_allmulti);
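
/*
 * Illustrative sketch only: taking and later releasing an all-multicast
 * reference with a single hypothetical helper. The name is an assumption;
 * the caller must hold RTNL.
 */
static int __maybe_unused example_need_allmulti(struct net_device *dev,
						bool enable)
{
	ASSERT_RTNL();
	/* A negative increment releases a reference taken earlier;
	 * -EOVERFLOW is returned if the counter would wrap.
	 */
	return dev_set_allmulti(dev, enable ? 1 : -1);
}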
6429
6430 /*
6431 * Upload unicast and multicast address lists to device and
6432 * configure RX filtering. When the device doesn't support unicast
6433 * filtering it is put in promiscuous mode while unicast addresses
6434 * are present.
6435 */
6436 void __dev_set_rx_mode(struct net_device *dev)
6437 {
6438 const struct net_device_ops *ops = dev->netdev_ops;
6439
6440 /* dev_open will call this function so the list will stay sane. */
6441 if (!(dev->flags&IFF_UP))
6442 return;
6443
6444 if (!netif_device_present(dev))
6445 return;
6446
6447 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6448 /* Unicast address changes may only happen under the rtnl,
6449 * therefore calling __dev_set_promiscuity here is safe.
6450 */
6451 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6452 __dev_set_promiscuity(dev, 1, false);
6453 dev->uc_promisc = true;
6454 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6455 __dev_set_promiscuity(dev, -1, false);
6456 dev->uc_promisc = false;
6457 }
6458 }
6459
6460 if (ops->ndo_set_rx_mode)
6461 ops->ndo_set_rx_mode(dev);
6462 }
6463
6464 void dev_set_rx_mode(struct net_device *dev)
6465 {
6466 netif_addr_lock_bh(dev);
6467 __dev_set_rx_mode(dev);
6468 netif_addr_unlock_bh(dev);
6469 }
6470
6471 /**
6472 * dev_get_flags - get flags reported to userspace
6473 * @dev: device
6474 *
6475 * Get the combination of flag bits exported through APIs to userspace.
6476 */
6477 unsigned int dev_get_flags(const struct net_device *dev)
6478 {
6479 unsigned int flags;
6480
6481 flags = (dev->flags & ~(IFF_PROMISC |
6482 IFF_ALLMULTI |
6483 IFF_RUNNING |
6484 IFF_LOWER_UP |
6485 IFF_DORMANT)) |
6486 (dev->gflags & (IFF_PROMISC |
6487 IFF_ALLMULTI));
6488
6489 if (netif_running(dev)) {
6490 if (netif_oper_up(dev))
6491 flags |= IFF_RUNNING;
6492 if (netif_carrier_ok(dev))
6493 flags |= IFF_LOWER_UP;
6494 if (netif_dormant(dev))
6495 flags |= IFF_DORMANT;
6496 }
6497
6498 return flags;
6499 }
6500 EXPORT_SYMBOL(dev_get_flags);
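
/*
 * Illustrative sketch only: combining the userspace-visible flags returned
 * above to decide whether a link is administratively up and operationally
 * usable. The helper name is hypothetical.
 */
static bool __maybe_unused example_link_usable(const struct net_device *dev)
{
	unsigned int flags = dev_get_flags(dev);

	return (flags & IFF_UP) &&		/* administratively up */
	       (flags & IFF_LOWER_UP) &&	/* carrier present */
	       (flags & IFF_RUNNING);		/* operationally up */
}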
6501
6502 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6503 {
6504 unsigned int old_flags = dev->flags;
6505 int ret;
6506
6507 ASSERT_RTNL();
6508
6509 /*
6510 * Set the flags on our device.
6511 */
6512
6513 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6514 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6515 IFF_AUTOMEDIA)) |
6516 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6517 IFF_ALLMULTI));
6518
6519 /*
6520 * Load in the correct multicast list now the flags have changed.
6521 */
6522
6523 if ((old_flags ^ flags) & IFF_MULTICAST)
6524 dev_change_rx_flags(dev, IFF_MULTICAST);
6525
6526 dev_set_rx_mode(dev);
6527
6528 /*
6529 * Have we downed the interface? We handle IFF_UP ourselves
6530 * according to user attempts to set it, rather than blindly
6531 * setting it.
6532 */
6533
6534 ret = 0;
6535 if ((old_flags ^ flags) & IFF_UP)
6536 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6537
6538 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6539 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6540 unsigned int old_flags = dev->flags;
6541
6542 dev->gflags ^= IFF_PROMISC;
6543
6544 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6545 if (dev->flags != old_flags)
6546 dev_set_rx_mode(dev);
6547 }
6548
6549 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6550 is important. Some (broken) drivers set IFF_PROMISC, when
6551 IFF_ALLMULTI is requested not asking us and not reporting.
6552 */
6553 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6554 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6555
6556 dev->gflags ^= IFF_ALLMULTI;
6557 __dev_set_allmulti(dev, inc, false);
6558 }
6559
6560 return ret;
6561 }
6562
6563 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6564 unsigned int gchanges)
6565 {
6566 unsigned int changes = dev->flags ^ old_flags;
6567
6568 if (gchanges)
6569 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6570
6571 if (changes & IFF_UP) {
6572 if (dev->flags & IFF_UP)
6573 call_netdevice_notifiers(NETDEV_UP, dev);
6574 else
6575 call_netdevice_notifiers(NETDEV_DOWN, dev);
6576 }
6577
6578 if (dev->flags & IFF_UP &&
6579 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6580 struct netdev_notifier_change_info change_info;
6581
6582 change_info.flags_changed = changes;
6583 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6584 &change_info.info);
6585 }
6586 }
6587
6588 /**
6589 * dev_change_flags - change device settings
6590 * @dev: device
6591 * @flags: device state flags
6592 *
6593 * Change settings on device based on the given state flags. The flags are
6594 * in the userspace exported format.
6595 */
6596 int dev_change_flags(struct net_device *dev, unsigned int flags)
6597 {
6598 int ret;
6599 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6600
6601 ret = __dev_change_flags(dev, flags);
6602 if (ret < 0)
6603 return ret;
6604
6605 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6606 __dev_notify_flags(dev, old_flags, changes);
6607 return ret;
6608 }
6609 EXPORT_SYMBOL(dev_change_flags);
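
/*
 * Illustrative sketch only: bringing an interface administratively up from
 * a context that does not already hold RTNL. The helper name is
 * hypothetical.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}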
6610
6611 int __dev_set_mtu(struct net_device *dev, int new_mtu)
6612 {
6613 const struct net_device_ops *ops = dev->netdev_ops;
6614
6615 if (ops->ndo_change_mtu)
6616 return ops->ndo_change_mtu(dev, new_mtu);
6617
6618 dev->mtu = new_mtu;
6619 return 0;
6620 }
6621 EXPORT_SYMBOL(__dev_set_mtu);
6622
6623 /**
6624 * dev_set_mtu - Change maximum transfer unit
6625 * @dev: device
6626 * @new_mtu: new transfer unit
6627 *
6628 * Change the maximum transfer size of the network device.
6629 */
6630 int dev_set_mtu(struct net_device *dev, int new_mtu)
6631 {
6632 int err, orig_mtu;
6633
6634 if (new_mtu == dev->mtu)
6635 return 0;
6636
6637 /* MTU must be positive, and in range */
6638 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6639 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6640 dev->name, new_mtu, dev->min_mtu);
6641 return -EINVAL;
6642 }
6643
6644 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6645 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6646 dev->name, new_mtu, dev->max_mtu);
6647 return -EINVAL;
6648 }
6649
6650 if (!netif_device_present(dev))
6651 return -ENODEV;
6652
6653 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6654 err = notifier_to_errno(err);
6655 if (err)
6656 return err;
6657
6658 orig_mtu = dev->mtu;
6659 err = __dev_set_mtu(dev, new_mtu);
6660
6661 if (!err) {
6662 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6663 err = notifier_to_errno(err);
6664 if (err) {
6665 /* setting mtu back and notifying everyone again,
6666 * so that they have a chance to revert changes.
6667 */
6668 __dev_set_mtu(dev, orig_mtu);
6669 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6670 }
6671 }
6672 return err;
6673 }
6674 EXPORT_SYMBOL(dev_set_mtu);
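
/*
 * Illustrative sketch only: requesting a jumbo MTU and reporting a failure
 * (for example when the value falls outside the device's min/max MTU range
 * checked above). The helper name and the 9000-byte value are assumptions.
 */
static int __maybe_unused example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	if (err)
		netdev_err(dev, "failed to set MTU 9000: %d\n", err);
	return err;
}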
6675
6676 /**
6677 * dev_set_group - Change group this device belongs to
6678 * @dev: device
6679 * @new_group: group this device should belong to
6680 */
6681 void dev_set_group(struct net_device *dev, int new_group)
6682 {
6683 dev->group = new_group;
6684 }
6685 EXPORT_SYMBOL(dev_set_group);
6686
6687 /**
6688 * dev_set_mac_address - Change Media Access Control Address
6689 * @dev: device
6690 * @sa: new address
6691 *
6692 * Change the hardware (MAC) address of the device
6693 */
6694 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6695 {
6696 const struct net_device_ops *ops = dev->netdev_ops;
6697 int err;
6698
6699 if (!ops->ndo_set_mac_address)
6700 return -EOPNOTSUPP;
6701 if (sa->sa_family != dev->type)
6702 return -EINVAL;
6703 if (!netif_device_present(dev))
6704 return -ENODEV;
6705 err = ops->ndo_set_mac_address(dev, sa);
6706 if (err)
6707 return err;
6708 dev->addr_assign_type = NET_ADDR_SET;
6709 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6710 add_device_randomness(dev->dev_addr, dev->addr_len);
6711 return 0;
6712 }
6713 EXPORT_SYMBOL(dev_set_mac_address);
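
/*
 * Illustrative sketch only: building the struct sockaddr expected above
 * from a raw hardware address. The helper name is hypothetical; @addr is
 * assumed to be dev->addr_len bytes long and the caller to hold RTNL.
 */
static int __maybe_unused example_set_hw_addr(struct net_device *dev,
					      const unsigned char *addr)
{
	struct sockaddr sa;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;	/* must match, see -EINVAL above */
	memcpy(sa.sa_data, addr, dev->addr_len);
	return dev_set_mac_address(dev, &sa);
}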
6714
6715 /**
6716 * dev_change_carrier - Change device carrier
6717 * @dev: device
6718 * @new_carrier: new value
6719 *
6720 * Change device carrier
6721 */
6722 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6723 {
6724 const struct net_device_ops *ops = dev->netdev_ops;
6725
6726 if (!ops->ndo_change_carrier)
6727 return -EOPNOTSUPP;
6728 if (!netif_device_present(dev))
6729 return -ENODEV;
6730 return ops->ndo_change_carrier(dev, new_carrier);
6731 }
6732 EXPORT_SYMBOL(dev_change_carrier);
6733
6734 /**
6735 * dev_get_phys_port_id - Get device physical port ID
6736 * @dev: device
6737 * @ppid: port ID
6738 *
6739 * Get device physical port ID
6740 */
6741 int dev_get_phys_port_id(struct net_device *dev,
6742 struct netdev_phys_item_id *ppid)
6743 {
6744 const struct net_device_ops *ops = dev->netdev_ops;
6745
6746 if (!ops->ndo_get_phys_port_id)
6747 return -EOPNOTSUPP;
6748 return ops->ndo_get_phys_port_id(dev, ppid);
6749 }
6750 EXPORT_SYMBOL(dev_get_phys_port_id);
6751
6752 /**
6753 * dev_get_phys_port_name - Get device physical port name
6754 * @dev: device
6755 * @name: port name
6756 * @len: limit of bytes to copy to name
6757 *
6758 * Get device physical port name
6759 */
6760 int dev_get_phys_port_name(struct net_device *dev,
6761 char *name, size_t len)
6762 {
6763 const struct net_device_ops *ops = dev->netdev_ops;
6764
6765 if (!ops->ndo_get_phys_port_name)
6766 return -EOPNOTSUPP;
6767 return ops->ndo_get_phys_port_name(dev, name, len);
6768 }
6769 EXPORT_SYMBOL(dev_get_phys_port_name);
6770
6771 /**
6772 * dev_change_proto_down - update protocol port state information
6773 * @dev: device
6774 * @proto_down: new value
6775 *
6776 * This info can be used by switch drivers to set the phys state of the
6777 * port.
6778 */
6779 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6780 {
6781 const struct net_device_ops *ops = dev->netdev_ops;
6782
6783 if (!ops->ndo_change_proto_down)
6784 return -EOPNOTSUPP;
6785 if (!netif_device_present(dev))
6786 return -ENODEV;
6787 return ops->ndo_change_proto_down(dev, proto_down);
6788 }
6789 EXPORT_SYMBOL(dev_change_proto_down);
6790
6791 /**
6792 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6793 * @dev: device
6794 * @fd: new program fd or negative value to clear
6795 * @flags: xdp-related flags
6796 *
6797 * Set or clear a bpf program for a device
6798 */
6799 int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6800 {
6801 const struct net_device_ops *ops = dev->netdev_ops;
6802 struct bpf_prog *prog = NULL;
6803 struct netdev_xdp xdp;
6804 int err;
6805
6806 ASSERT_RTNL();
6807
6808 if (!ops->ndo_xdp)
6809 return -EOPNOTSUPP;
6810 if (fd >= 0) {
6811 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6812 memset(&xdp, 0, sizeof(xdp));
6813 xdp.command = XDP_QUERY_PROG;
6814
6815 err = ops->ndo_xdp(dev, &xdp);
6816 if (err < 0)
6817 return err;
6818 if (xdp.prog_attached)
6819 return -EBUSY;
6820 }
6821
6822 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6823 if (IS_ERR(prog))
6824 return PTR_ERR(prog);
6825 }
6826
6827 memset(&xdp, 0, sizeof(xdp));
6828 xdp.command = XDP_SETUP_PROG;
6829 xdp.prog = prog;
6830
6831 err = ops->ndo_xdp(dev, &xdp);
6832 if (err < 0 && prog)
6833 bpf_prog_put(prog);
6834
6835 return err;
6836 }
6837 EXPORT_SYMBOL(dev_change_xdp_fd);
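
/*
 * Illustrative sketch only: attaching an XDP program by fd without
 * replacing one that is already loaded, using the
 * XDP_FLAGS_UPDATE_IF_NOEXIST path handled above. The helper name is
 * hypothetical.
 */
static int __maybe_unused example_attach_xdp(struct net_device *dev, int prog_fd)
{
	int err;

	rtnl_lock();
	err = dev_change_xdp_fd(dev, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST);
	rtnl_unlock();
	return err;	/* -EBUSY if a program was already attached */
}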
6838
6839 /**
6840 * dev_new_index - allocate an ifindex
6841 * @net: the applicable net namespace
6842 *
6843 * Returns a suitable unique value for a new device interface
6844 * number. The caller must hold the rtnl semaphore or the
6845 * dev_base_lock to be sure it remains unique.
6846 */
6847 static int dev_new_index(struct net *net)
6848 {
6849 int ifindex = net->ifindex;
6850 for (;;) {
6851 if (++ifindex <= 0)
6852 ifindex = 1;
6853 if (!__dev_get_by_index(net, ifindex))
6854 return net->ifindex = ifindex;
6855 }
6856 }
6857
6858 /* Delayed registration/unregistration */
6859 static LIST_HEAD(net_todo_list);
6860 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6861
6862 static void net_set_todo(struct net_device *dev)
6863 {
6864 list_add_tail(&dev->todo_list, &net_todo_list);
6865 dev_net(dev)->dev_unreg_count++;
6866 }
6867
6868 static void rollback_registered_many(struct list_head *head)
6869 {
6870 struct net_device *dev, *tmp;
6871 LIST_HEAD(close_head);
6872
6873 BUG_ON(dev_boot_phase);
6874 ASSERT_RTNL();
6875
6876 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6877 /* Some devices call without registering
6878 * for initialization unwind. Remove those
6879 * devices and proceed with the remaining.
6880 */
6881 if (dev->reg_state == NETREG_UNINITIALIZED) {
6882 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6883 dev->name, dev);
6884
6885 WARN_ON(1);
6886 list_del(&dev->unreg_list);
6887 continue;
6888 }
6889 dev->dismantle = true;
6890 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6891 }
6892
6893 /* If device is running, close it first. */
6894 list_for_each_entry(dev, head, unreg_list)
6895 list_add_tail(&dev->close_list, &close_head);
6896 dev_close_many(&close_head, true);
6897
6898 list_for_each_entry(dev, head, unreg_list) {
6899 /* And unlink it from device chain. */
6900 unlist_netdevice(dev);
6901
6902 dev->reg_state = NETREG_UNREGISTERING;
6903 }
6904 flush_all_backlogs();
6905
6906 synchronize_net();
6907
6908 list_for_each_entry(dev, head, unreg_list) {
6909 struct sk_buff *skb = NULL;
6910
6911 /* Shutdown queueing discipline. */
6912 dev_shutdown(dev);
6913
6914
6915 /* Notify protocols that we are about to destroy
6916 this device. They should clean up all of their state.
6917 */
6918 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6919
6920 if (!dev->rtnl_link_ops ||
6921 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6922 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6923 GFP_KERNEL);
6924
6925 /*
6926 * Flush the unicast and multicast chains
6927 */
6928 dev_uc_flush(dev);
6929 dev_mc_flush(dev);
6930
6931 if (dev->netdev_ops->ndo_uninit)
6932 dev->netdev_ops->ndo_uninit(dev);
6933
6934 if (skb)
6935 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6936
6937 /* Notifier chain MUST detach us all upper devices. */
6938 WARN_ON(netdev_has_any_upper_dev(dev));
6939 WARN_ON(netdev_has_any_lower_dev(dev));
6940
6941 /* Remove entries from kobject tree */
6942 netdev_unregister_kobject(dev);
6943 #ifdef CONFIG_XPS
6944 /* Remove XPS queueing entries */
6945 netif_reset_xps_queues_gt(dev, 0);
6946 #endif
6947 }
6948
6949 synchronize_net();
6950
6951 list_for_each_entry(dev, head, unreg_list)
6952 dev_put(dev);
6953 }
6954
6955 static void rollback_registered(struct net_device *dev)
6956 {
6957 LIST_HEAD(single);
6958
6959 list_add(&dev->unreg_list, &single);
6960 rollback_registered_many(&single);
6961 list_del(&single);
6962 }
6963
6964 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6965 struct net_device *upper, netdev_features_t features)
6966 {
6967 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6968 netdev_features_t feature;
6969 int feature_bit;
6970
6971 for_each_netdev_feature(&upper_disables, feature_bit) {
6972 feature = __NETIF_F_BIT(feature_bit);
6973 if (!(upper->wanted_features & feature)
6974 && (features & feature)) {
6975 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6976 &feature, upper->name);
6977 features &= ~feature;
6978 }
6979 }
6980
6981 return features;
6982 }
6983
6984 static void netdev_sync_lower_features(struct net_device *upper,
6985 struct net_device *lower, netdev_features_t features)
6986 {
6987 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6988 netdev_features_t feature;
6989 int feature_bit;
6990
6991 for_each_netdev_feature(&upper_disables, feature_bit) {
6992 feature = __NETIF_F_BIT(feature_bit);
6993 if (!(features & feature) && (lower->features & feature)) {
6994 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6995 &feature, lower->name);
6996 lower->wanted_features &= ~feature;
6997 netdev_update_features(lower);
6998
6999 if (unlikely(lower->features & feature))
7000 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
7001 &feature, lower->name);
7002 }
7003 }
7004 }
7005
7006 static netdev_features_t netdev_fix_features(struct net_device *dev,
7007 netdev_features_t features)
7008 {
7009 /* Fix illegal checksum combinations */
7010 if ((features & NETIF_F_HW_CSUM) &&
7011 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7012 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7013 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7014 }
7015
7016 /* TSO requires that SG is present as well. */
7017 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7018 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7019 features &= ~NETIF_F_ALL_TSO;
7020 }
7021
7022 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7023 !(features & NETIF_F_IP_CSUM)) {
7024 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7025 features &= ~NETIF_F_TSO;
7026 features &= ~NETIF_F_TSO_ECN;
7027 }
7028
7029 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7030 !(features & NETIF_F_IPV6_CSUM)) {
7031 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7032 features &= ~NETIF_F_TSO6;
7033 }
7034
7035 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7036 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7037 features &= ~NETIF_F_TSO_MANGLEID;
7038
7039 /* TSO ECN requires that TSO is present as well. */
7040 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7041 features &= ~NETIF_F_TSO_ECN;
7042
7043 /* Software GSO depends on SG. */
7044 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7045 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7046 features &= ~NETIF_F_GSO;
7047 }
7048
7049 /* UFO needs SG and checksumming */
7050 if (features & NETIF_F_UFO) {
7051 /* maybe split UFO into V4 and V6? */
7052 if (!(features & NETIF_F_HW_CSUM) &&
7053 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
7054 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
7055 netdev_dbg(dev,
7056 "Dropping NETIF_F_UFO since no checksum offload features.\n");
7057 features &= ~NETIF_F_UFO;
7058 }
7059
7060 if (!(features & NETIF_F_SG)) {
7061 netdev_dbg(dev,
7062 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7063 features &= ~NETIF_F_UFO;
7064 }
7065 }
7066
7067 /* GSO partial features require GSO partial be set */
7068 if ((features & dev->gso_partial_features) &&
7069 !(features & NETIF_F_GSO_PARTIAL)) {
7070 netdev_dbg(dev,
7071 "Dropping partially supported GSO features since no GSO partial.\n");
7072 features &= ~dev->gso_partial_features;
7073 }
7074
7075 #ifdef CONFIG_NET_RX_BUSY_POLL
7076 if (dev->netdev_ops->ndo_busy_poll)
7077 features |= NETIF_F_BUSY_POLL;
7078 else
7079 #endif
7080 features &= ~NETIF_F_BUSY_POLL;
7081
7082 return features;
7083 }
7084
7085 int __netdev_update_features(struct net_device *dev)
7086 {
7087 struct net_device *upper, *lower;
7088 netdev_features_t features;
7089 struct list_head *iter;
7090 int err = -1;
7091
7092 ASSERT_RTNL();
7093
7094 features = netdev_get_wanted_features(dev);
7095
7096 if (dev->netdev_ops->ndo_fix_features)
7097 features = dev->netdev_ops->ndo_fix_features(dev, features);
7098
7099 /* driver might be less strict about feature dependencies */
7100 features = netdev_fix_features(dev, features);
7101
7102 /* some features can't be enabled if they're off on an upper device */
7103 netdev_for_each_upper_dev_rcu(dev, upper, iter)
7104 features = netdev_sync_upper_features(dev, upper, features);
7105
7106 if (dev->features == features)
7107 goto sync_lower;
7108
7109 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7110 &dev->features, &features);
7111
7112 if (dev->netdev_ops->ndo_set_features)
7113 err = dev->netdev_ops->ndo_set_features(dev, features);
7114 else
7115 err = 0;
7116
7117 if (unlikely(err < 0)) {
7118 netdev_err(dev,
7119 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7120 err, &features, &dev->features);
7121 /* return non-0 since some features might have changed and
7122 * it's better to fire a spurious notification than miss it
7123 */
7124 return -1;
7125 }
7126
7127 sync_lower:
7128 /* some features must be disabled on lower devices when disabled
7129 * on an upper device (think: bonding master or bridge)
7130 */
7131 netdev_for_each_lower_dev(dev, lower, iter)
7132 netdev_sync_lower_features(dev, lower, features);
7133
7134 if (!err)
7135 dev->features = features;
7136
7137 return err < 0 ? 0 : 1;
7138 }
7139
7140 /**
7141 * netdev_update_features - recalculate device features
7142 * @dev: the device to check
7143 *
7144 * Recalculate dev->features set and send notifications if it
7145 * has changed. Should be called after driver or hardware dependent
7146 * conditions might have changed that influence the features.
7147 */
7148 void netdev_update_features(struct net_device *dev)
7149 {
7150 if (__netdev_update_features(dev))
7151 netdev_features_change(dev);
7152 }
7153 EXPORT_SYMBOL(netdev_update_features);
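
/*
 * Illustrative sketch only: how a driver typically reacts when a condition
 * that its ->ndo_fix_features() callback depends on has changed (for
 * example after an MTU or firmware change). The core re-runs the feature
 * fixups and sends NETDEV_FEAT_CHANGE only if dev->features really
 * changed. The helper name is hypothetical.
 */
static void __maybe_unused example_conditions_changed(struct net_device *dev)
{
	ASSERT_RTNL();
	netdev_update_features(dev);
}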
7154
7155 /**
7156 * netdev_change_features - recalculate device features
7157 * @dev: the device to check
7158 *
7159 * Recalculate dev->features set and send notifications even
7160 * if they have not changed. Should be called instead of
7161 * netdev_update_features() if also dev->vlan_features might
7162 * have changed to allow the changes to be propagated to stacked
7163 * VLAN devices.
7164 */
7165 void netdev_change_features(struct net_device *dev)
7166 {
7167 __netdev_update_features(dev);
7168 netdev_features_change(dev);
7169 }
7170 EXPORT_SYMBOL(netdev_change_features);
7171
7172 /**
7173 * netif_stacked_transfer_operstate - transfer operstate
7174 * @rootdev: the root or lower level device to transfer state from
7175 * @dev: the device to transfer operstate to
7176 *
7177 * Transfer operational state from root to device. This is normally
7178 * called when a stacking relationship exists between the root
7179 * device and the device (a leaf device).
7180 */
7181 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7182 struct net_device *dev)
7183 {
7184 if (rootdev->operstate == IF_OPER_DORMANT)
7185 netif_dormant_on(dev);
7186 else
7187 netif_dormant_off(dev);
7188
7189 if (netif_carrier_ok(rootdev)) {
7190 if (!netif_carrier_ok(dev))
7191 netif_carrier_on(dev);
7192 } else {
7193 if (netif_carrier_ok(dev))
7194 netif_carrier_off(dev);
7195 }
7196 }
7197 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7198
7199 #ifdef CONFIG_SYSFS
7200 static int netif_alloc_rx_queues(struct net_device *dev)
7201 {
7202 unsigned int i, count = dev->num_rx_queues;
7203 struct netdev_rx_queue *rx;
7204 size_t sz = count * sizeof(*rx);
7205
7206 BUG_ON(count < 1);
7207
7208 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7209 if (!rx) {
7210 rx = vzalloc(sz);
7211 if (!rx)
7212 return -ENOMEM;
7213 }
7214 dev->_rx = rx;
7215
7216 for (i = 0; i < count; i++)
7217 rx[i].dev = dev;
7218 return 0;
7219 }
7220 #endif
7221
7222 static void netdev_init_one_queue(struct net_device *dev,
7223 struct netdev_queue *queue, void *_unused)
7224 {
7225 /* Initialize queue lock */
7226 spin_lock_init(&queue->_xmit_lock);
7227 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7228 queue->xmit_lock_owner = -1;
7229 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7230 queue->dev = dev;
7231 #ifdef CONFIG_BQL
7232 dql_init(&queue->dql, HZ);
7233 #endif
7234 }
7235
7236 static void netif_free_tx_queues(struct net_device *dev)
7237 {
7238 kvfree(dev->_tx);
7239 }
7240
7241 static int netif_alloc_netdev_queues(struct net_device *dev)
7242 {
7243 unsigned int count = dev->num_tx_queues;
7244 struct netdev_queue *tx;
7245 size_t sz = count * sizeof(*tx);
7246
7247 if (count < 1 || count > 0xffff)
7248 return -EINVAL;
7249
7250 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7251 if (!tx) {
7252 tx = vzalloc(sz);
7253 if (!tx)
7254 return -ENOMEM;
7255 }
7256 dev->_tx = tx;
7257
7258 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7259 spin_lock_init(&dev->tx_global_lock);
7260
7261 return 0;
7262 }
7263
7264 void netif_tx_stop_all_queues(struct net_device *dev)
7265 {
7266 unsigned int i;
7267
7268 for (i = 0; i < dev->num_tx_queues; i++) {
7269 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7270 netif_tx_stop_queue(txq);
7271 }
7272 }
7273 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7274
7275 /**
7276 * register_netdevice - register a network device
7277 * @dev: device to register
7278 *
7279 * Take a completed network device structure and add it to the kernel
7280 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7281 * chain. 0 is returned on success. A negative errno code is returned
7282 * on a failure to set up the device, or if the name is a duplicate.
7283 *
7284 * Callers must hold the rtnl semaphore. You may want
7285 * register_netdev() instead of this.
7286 *
7287 * BUGS:
7288 * The locking appears insufficient to guarantee two parallel registers
7289 * will not get the same name.
7290 */
7291
7292 int register_netdevice(struct net_device *dev)
7293 {
7294 int ret;
7295 struct net *net = dev_net(dev);
7296
7297 BUG_ON(dev_boot_phase);
7298 ASSERT_RTNL();
7299
7300 might_sleep();
7301
7302 /* When net_devices are persistent, this will be fatal. */
7303 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7304 BUG_ON(!net);
7305
7306 spin_lock_init(&dev->addr_list_lock);
7307 netdev_set_addr_lockdep_class(dev);
7308
7309 ret = dev_get_valid_name(net, dev, dev->name);
7310 if (ret < 0)
7311 goto out;
7312
7313 /* Init, if this function is available */
7314 if (dev->netdev_ops->ndo_init) {
7315 ret = dev->netdev_ops->ndo_init(dev);
7316 if (ret) {
7317 if (ret > 0)
7318 ret = -EIO;
7319 goto out;
7320 }
7321 }
7322
7323 if (((dev->hw_features | dev->features) &
7324 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7325 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7326 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7327 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7328 ret = -EINVAL;
7329 goto err_uninit;
7330 }
7331
7332 ret = -EBUSY;
7333 if (!dev->ifindex)
7334 dev->ifindex = dev_new_index(net);
7335 else if (__dev_get_by_index(net, dev->ifindex))
7336 goto err_uninit;
7337
7338 /* Transfer changeable features to wanted_features and enable
7339 * software offloads (GSO and GRO).
7340 */
7341 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7342 dev->features |= NETIF_F_SOFT_FEATURES;
7343 dev->wanted_features = dev->features & dev->hw_features;
7344
7345 if (!(dev->flags & IFF_LOOPBACK))
7346 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7347
7348 /* If IPv4 TCP segmentation offload is supported we should also
7349 * allow the device to enable segmenting the frame with the option
7350 * of ignoring a static IP ID value. This doesn't enable the
7351 * feature itself but allows the user to enable it later.
7352 */
7353 if (dev->hw_features & NETIF_F_TSO)
7354 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7355 if (dev->vlan_features & NETIF_F_TSO)
7356 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7357 if (dev->mpls_features & NETIF_F_TSO)
7358 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7359 if (dev->hw_enc_features & NETIF_F_TSO)
7360 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7361
7362 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7363 */
7364 dev->vlan_features |= NETIF_F_HIGHDMA;
7365
7366 /* Make NETIF_F_SG inheritable to tunnel devices.
7367 */
7368 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7369
7370 /* Make NETIF_F_SG inheritable to MPLS.
7371 */
7372 dev->mpls_features |= NETIF_F_SG;
7373
7374 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7375 ret = notifier_to_errno(ret);
7376 if (ret)
7377 goto err_uninit;
7378
7379 ret = netdev_register_kobject(dev);
7380 if (ret)
7381 goto err_uninit;
7382 dev->reg_state = NETREG_REGISTERED;
7383
7384 __netdev_update_features(dev);
7385
7386 /*
7387 * Default initial state at registration is that the
7388 * device is present.
7389 */
7390
7391 set_bit(__LINK_STATE_PRESENT, &dev->state);
7392
7393 linkwatch_init_dev(dev);
7394
7395 dev_init_scheduler(dev);
7396 dev_hold(dev);
7397 list_netdevice(dev);
7398 add_device_randomness(dev->dev_addr, dev->addr_len);
7399
7400 /* If the device has permanent device address, driver should
7401 * set dev_addr and also addr_assign_type should be set to
7402 * NET_ADDR_PERM (default value).
7403 */
7404 if (dev->addr_assign_type == NET_ADDR_PERM)
7405 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7406
7407 /* Notify protocols that a new device appeared. */
7408 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7409 ret = notifier_to_errno(ret);
7410 if (ret) {
7411 rollback_registered(dev);
7412 dev->reg_state = NETREG_UNREGISTERED;
7413 }
7414 /*
7415 * Prevent userspace races by waiting until the network
7416 * device is fully setup before sending notifications.
7417 */
7418 if (!dev->rtnl_link_ops ||
7419 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7420 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7421
7422 out:
7423 return ret;
7424
7425 err_uninit:
7426 if (dev->netdev_ops->ndo_uninit)
7427 dev->netdev_ops->ndo_uninit(dev);
7428 goto out;
7429 }
7430 EXPORT_SYMBOL(register_netdevice);
7431
7432 /**
7433 * init_dummy_netdev - init a dummy network device for NAPI
7434 * @dev: device to init
7435 *
7436 * This takes a network device structure and initializes the minimum
7437 * number of fields so it can be used to schedule NAPI polls without
7438 * registering a full-blown interface. This is to be used by drivers
7439 * that need to tie several hardware interfaces to a single NAPI
7440 * poll scheduler due to HW limitations.
7441 */
7442 int init_dummy_netdev(struct net_device *dev)
7443 {
7444 /* Clear everything. Note we don't initialize spinlocks
7445 * as they aren't supposed to be taken by any of the
7446 * NAPI code and this dummy netdev is supposed to be
7447 * only ever used for NAPI polls
7448 */
7449 memset(dev, 0, sizeof(struct net_device));
7450
7451 /* make sure we BUG if trying to hit standard
7452 * register/unregister code path
7453 */
7454 dev->reg_state = NETREG_DUMMY;
7455
7456 /* NAPI wants this */
7457 INIT_LIST_HEAD(&dev->napi_list);
7458
7459 /* a dummy interface is started by default */
7460 set_bit(__LINK_STATE_PRESENT, &dev->state);
7461 set_bit(__LINK_STATE_START, &dev->state);
7462
7463 /* Note: We don't allocate pcpu_refcnt for dummy devices,
7464 * because users of this 'device' don't need to change
7465 * its refcount.
7466 */
7467
7468 return 0;
7469 }
7470 EXPORT_SYMBOL_GPL(init_dummy_netdev);
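
/*
 * Illustrative sketch only: a hypothetical adapter that funnels several
 * hardware queues into one NAPI context hung off a dummy netdev, as
 * described above. The struct and function names are assumptions.
 */
struct example_adapter {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static void __maybe_unused example_adapter_init(struct example_adapter *ad,
						int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, poll, NAPI_POLL_WEIGHT);
}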
7471
7472
7473 /**
7474 * register_netdev - register a network device
7475 * @dev: device to register
7476 *
7477 * Take a completed network device structure and add it to the kernel
7478 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7479 * chain. 0 is returned on success. A negative errno code is returned
7480 * on a failure to set up the device, or if the name is a duplicate.
7481 *
7482 * This is a wrapper around register_netdevice that takes the rtnl semaphore
7483 * and expands the device name if you passed a format string to
7484 * alloc_netdev.
7485 */
7486 int register_netdev(struct net_device *dev)
7487 {
7488 int err;
7489
7490 rtnl_lock();
7491 err = register_netdevice(dev);
7492 rtnl_unlock();
7493 return err;
7494 }
7495 EXPORT_SYMBOL(register_netdev);
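
/*
 * Illustrative sketch only: the usual allocate/register/error-unwind
 * pattern in a driver probe path. Assumes alloc_etherdev() from
 * <linux/etherdevice.h> is available here; the helper name is
 * hypothetical.
 */
static __maybe_unused struct net_device *example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area in this sketch */
	if (!dev)
		return NULL;

	err = register_netdev(dev);	/* takes and drops RTNL itself */
	if (err) {
		free_netdev(dev);	/* undo the allocation on failure */
		return NULL;
	}
	return dev;
}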
7496
7497 int netdev_refcnt_read(const struct net_device *dev)
7498 {
7499 int i, refcnt = 0;
7500
7501 for_each_possible_cpu(i)
7502 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7503 return refcnt;
7504 }
7505 EXPORT_SYMBOL(netdev_refcnt_read);
7506
7507 /**
7508 * netdev_wait_allrefs - wait until all references are gone.
7509 * @dev: target net_device
7510 *
7511 * This is called when unregistering network devices.
7512 *
7513 * Any protocol or device that holds a reference should register
7514 * for netdevice notification, and cleanup and put back the
7515 * reference if they receive an UNREGISTER event.
7516 * We can get stuck here if buggy protocols don't correctly
7517 * call dev_put.
7518 */
7519 static void netdev_wait_allrefs(struct net_device *dev)
7520 {
7521 unsigned long rebroadcast_time, warning_time;
7522 int refcnt;
7523
7524 linkwatch_forget_dev(dev);
7525
7526 rebroadcast_time = warning_time = jiffies;
7527 refcnt = netdev_refcnt_read(dev);
7528
7529 while (refcnt != 0) {
7530 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7531 rtnl_lock();
7532
7533 /* Rebroadcast unregister notification */
7534 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7535
7536 __rtnl_unlock();
7537 rcu_barrier();
7538 rtnl_lock();
7539
7540 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7541 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7542 &dev->state)) {
7543 /* We must not have linkwatch events
7544 * pending on unregister. If this
7545 * happens, we simply run the queue
7546 * unscheduled, resulting in a noop
7547 * for this device.
7548 */
7549 linkwatch_run_queue();
7550 }
7551
7552 __rtnl_unlock();
7553
7554 rebroadcast_time = jiffies;
7555 }
7556
7557 msleep(250);
7558
7559 refcnt = netdev_refcnt_read(dev);
7560
7561 if (time_after(jiffies, warning_time + 10 * HZ)) {
7562 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7563 dev->name, refcnt);
7564 warning_time = jiffies;
7565 }
7566 }
7567 }
7568
7569 /* The sequence is:
7570 *
7571 * rtnl_lock();
7572 * ...
7573 * register_netdevice(x1);
7574 * register_netdevice(x2);
7575 * ...
7576 * unregister_netdevice(y1);
7577 * unregister_netdevice(y2);
7578 * ...
7579 * rtnl_unlock();
7580 * free_netdev(y1);
7581 * free_netdev(y2);
7582 *
7583 * We are invoked by rtnl_unlock().
7584 * This allows us to deal with problems:
7585 * 1) We can delete sysfs objects which invoke hotplug
7586 * without deadlocking with linkwatch via keventd.
7587 * 2) Since we run with the RTNL semaphore not held, we can sleep
7588 * safely in order to wait for the netdev refcnt to drop to zero.
7589 *
7590 * We must not return until all unregister events added during
7591 * the interval the lock was held have been completed.
7592 */
7593 void netdev_run_todo(void)
7594 {
7595 struct list_head list;
7596
7597 /* Snapshot list, allow later requests */
7598 list_replace_init(&net_todo_list, &list);
7599
7600 __rtnl_unlock();
7601
7602
7603 /* Wait for rcu callbacks to finish before next phase */
7604 if (!list_empty(&list))
7605 rcu_barrier();
7606
7607 while (!list_empty(&list)) {
7608 struct net_device *dev
7609 = list_first_entry(&list, struct net_device, todo_list);
7610 list_del(&dev->todo_list);
7611
7612 rtnl_lock();
7613 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7614 __rtnl_unlock();
7615
7616 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7617 pr_err("network todo '%s' but state %d\n",
7618 dev->name, dev->reg_state);
7619 dump_stack();
7620 continue;
7621 }
7622
7623 dev->reg_state = NETREG_UNREGISTERED;
7624
7625 netdev_wait_allrefs(dev);
7626
7627 /* paranoia */
7628 BUG_ON(netdev_refcnt_read(dev));
7629 BUG_ON(!list_empty(&dev->ptype_all));
7630 BUG_ON(!list_empty(&dev->ptype_specific));
7631 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7632 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7633 WARN_ON(dev->dn_ptr);
7634
7635 if (dev->destructor)
7636 dev->destructor(dev);
7637
7638 /* Report a network device has been unregistered */
7639 rtnl_lock();
7640 dev_net(dev)->dev_unreg_count--;
7641 __rtnl_unlock();
7642 wake_up(&netdev_unregistering_wq);
7643
7644 /* Free network device */
7645 kobject_put(&dev->dev.kobj);
7646 }
7647 }
7648
7649 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7650 * all the same fields in the same order as net_device_stats, with only
7651 * the type differing, but rtnl_link_stats64 may have additional fields
7652 * at the end for newer counters.
7653 */
7654 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7655 const struct net_device_stats *netdev_stats)
7656 {
7657 #if BITS_PER_LONG == 64
7658 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7659 memcpy(stats64, netdev_stats, sizeof(*stats64));
7660 /* zero out counters that only exist in rtnl_link_stats64 */
7661 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7662 sizeof(*stats64) - sizeof(*netdev_stats));
7663 #else
7664 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7665 const unsigned long *src = (const unsigned long *)netdev_stats;
7666 u64 *dst = (u64 *)stats64;
7667
7668 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7669 for (i = 0; i < n; i++)
7670 dst[i] = src[i];
7671 /* zero out counters that only exist in rtnl_link_stats64 */
7672 memset((char *)stats64 + n * sizeof(u64), 0,
7673 sizeof(*stats64) - n * sizeof(u64));
7674 #endif
7675 }
7676 EXPORT_SYMBOL(netdev_stats_to_stats64);
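
/*
 * Illustrative sketch only: how a driver that keeps just the legacy
 * dev->stats counters can still fill a struct rtnl_link_stats64, for
 * example from its ->ndo_get_stats64() callback. The helper name is
 * hypothetical.
 */
static void __maybe_unused example_fill_stats64(struct net_device *dev,
						struct rtnl_link_stats64 *storage)
{
	/* Copies the common prefix and zeroes the 64-bit-only tail fields. */
	netdev_stats_to_stats64(storage, &dev->stats);
}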
7677
7678 /**
7679 * dev_get_stats - get network device statistics
7680 * @dev: device to get statistics from
7681 * @storage: place to store stats
7682 *
7683 * Get network statistics from device. Return @storage.
7684 * The device driver may provide its own method by setting
7685 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7686 * otherwise the internal statistics structure is used.
7687 */
7688 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7689 struct rtnl_link_stats64 *storage)
7690 {
7691 const struct net_device_ops *ops = dev->netdev_ops;
7692
7693 if (ops->ndo_get_stats64) {
7694 memset(storage, 0, sizeof(*storage));
7695 ops->ndo_get_stats64(dev, storage);
7696 } else if (ops->ndo_get_stats) {
7697 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7698 } else {
7699 netdev_stats_to_stats64(storage, &dev->stats);
7700 }
7701 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7702 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7703 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7704 return storage;
7705 }
7706 EXPORT_SYMBOL(dev_get_stats);
7707
7708 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7709 {
7710 struct netdev_queue *queue = dev_ingress_queue(dev);
7711
7712 #ifdef CONFIG_NET_CLS_ACT
7713 if (queue)
7714 return queue;
7715 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7716 if (!queue)
7717 return NULL;
7718 netdev_init_one_queue(dev, queue, NULL);
7719 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7720 queue->qdisc_sleeping = &noop_qdisc;
7721 rcu_assign_pointer(dev->ingress_queue, queue);
7722 #endif
7723 return queue;
7724 }
7725
7726 static const struct ethtool_ops default_ethtool_ops;
7727
7728 void netdev_set_default_ethtool_ops(struct net_device *dev,
7729 const struct ethtool_ops *ops)
7730 {
7731 if (dev->ethtool_ops == &default_ethtool_ops)
7732 dev->ethtool_ops = ops;
7733 }
7734 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7735
7736 void netdev_freemem(struct net_device *dev)
7737 {
7738 char *addr = (char *)dev - dev->padded;
7739
7740 kvfree(addr);
7741 }
7742
7743 /**
7744 * alloc_netdev_mqs - allocate network device
7745 * @sizeof_priv: size of private data to allocate space for
7746 * @name: device name format string
7747 * @name_assign_type: origin of device name
7748 * @setup: callback to initialize device
7749 * @txqs: the number of TX subqueues to allocate
7750 * @rxqs: the number of RX subqueues to allocate
7751 *
7752 * Allocates a struct net_device with private data area for driver use
7753 * and performs basic initialization. Also allocates subqueue structs
7754 * for each queue on the device.
7755 */
7756 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7757 unsigned char name_assign_type,
7758 void (*setup)(struct net_device *),
7759 unsigned int txqs, unsigned int rxqs)
7760 {
7761 struct net_device *dev;
7762 size_t alloc_size;
7763 struct net_device *p;
7764
7765 BUG_ON(strlen(name) >= sizeof(dev->name));
7766
7767 if (txqs < 1) {
7768 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7769 return NULL;
7770 }
7771
7772 #ifdef CONFIG_SYSFS
7773 if (rxqs < 1) {
7774 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7775 return NULL;
7776 }
7777 #endif
7778
7779 alloc_size = sizeof(struct net_device);
7780 if (sizeof_priv) {
7781 /* ensure 32-byte alignment of private area */
7782 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7783 alloc_size += sizeof_priv;
7784 }
7785 /* ensure 32-byte alignment of whole construct */
7786 alloc_size += NETDEV_ALIGN - 1;
7787
7788 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7789 if (!p)
7790 p = vzalloc(alloc_size);
7791 if (!p)
7792 return NULL;
7793
7794 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7795 dev->padded = (char *)dev - (char *)p;
7796
7797 dev->pcpu_refcnt = alloc_percpu(int);
7798 if (!dev->pcpu_refcnt)
7799 goto free_dev;
7800
7801 if (dev_addr_init(dev))
7802 goto free_pcpu;
7803
7804 dev_mc_init(dev);
7805 dev_uc_init(dev);
7806
7807 dev_net_set(dev, &init_net);
7808
7809 dev->gso_max_size = GSO_MAX_SIZE;
7810 dev->gso_max_segs = GSO_MAX_SEGS;
7811
7812 INIT_LIST_HEAD(&dev->napi_list);
7813 INIT_LIST_HEAD(&dev->unreg_list);
7814 INIT_LIST_HEAD(&dev->close_list);
7815 INIT_LIST_HEAD(&dev->link_watch_list);
7816 INIT_LIST_HEAD(&dev->adj_list.upper);
7817 INIT_LIST_HEAD(&dev->adj_list.lower);
7818 INIT_LIST_HEAD(&dev->ptype_all);
7819 INIT_LIST_HEAD(&dev->ptype_specific);
7820 #ifdef CONFIG_NET_SCHED
7821 hash_init(dev->qdisc_hash);
7822 #endif
7823 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7824 setup(dev);
7825
7826 if (!dev->tx_queue_len) {
7827 dev->priv_flags |= IFF_NO_QUEUE;
7828 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7829 }
7830
7831 dev->num_tx_queues = txqs;
7832 dev->real_num_tx_queues = txqs;
7833 if (netif_alloc_netdev_queues(dev))
7834 goto free_all;
7835
7836 #ifdef CONFIG_SYSFS
7837 dev->num_rx_queues = rxqs;
7838 dev->real_num_rx_queues = rxqs;
7839 if (netif_alloc_rx_queues(dev))
7840 goto free_all;
7841 #endif
7842
7843 strcpy(dev->name, name);
7844 dev->name_assign_type = name_assign_type;
7845 dev->group = INIT_NETDEV_GROUP;
7846 if (!dev->ethtool_ops)
7847 dev->ethtool_ops = &default_ethtool_ops;
7848
7849 nf_hook_ingress_init(dev);
7850
7851 return dev;
7852
7853 free_all:
7854 free_netdev(dev);
7855 return NULL;
7856
7857 free_pcpu:
7858 free_percpu(dev->pcpu_refcnt);
7859 free_dev:
7860 netdev_freemem(dev);
7861 return NULL;
7862 }
7863 EXPORT_SYMBOL(alloc_netdev_mqs);
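
/*
 * Illustrative sketch only: allocating a device with eight TX and eight RX
 * queues and no private area. The name template, queue counts and @setup
 * callback are assumptions of the sketch.
 */
static __maybe_unused struct net_device *
example_alloc_mq_dev(void (*setup)(struct net_device *))
{
	return alloc_netdev_mqs(0, "example%d", NET_NAME_UNKNOWN, setup, 8, 8);
}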
7864
7865 /**
7866 * free_netdev - free network device
7867 * @dev: device
7868 *
7869 * This function does the last stage of destroying an allocated device
7870 * interface. The reference to the device object is released.
7871 * If this is the last reference then it will be freed.
7872 * Must be called in process context.
7873 */
7874 void free_netdev(struct net_device *dev)
7875 {
7876 struct napi_struct *p, *n;
7877
7878 might_sleep();
7879 netif_free_tx_queues(dev);
7880 #ifdef CONFIG_SYSFS
7881 kvfree(dev->_rx);
7882 #endif
7883
7884 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7885
7886 /* Flush device addresses */
7887 dev_addr_flush(dev);
7888
7889 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7890 netif_napi_del(p);
7891
7892 free_percpu(dev->pcpu_refcnt);
7893 dev->pcpu_refcnt = NULL;
7894
7895 /* Compatibility with error handling in drivers */
7896 if (dev->reg_state == NETREG_UNINITIALIZED) {
7897 netdev_freemem(dev);
7898 return;
7899 }
7900
7901 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7902 dev->reg_state = NETREG_RELEASED;
7903
7904 /* will free via device release */
7905 put_device(&dev->dev);
7906 }
7907 EXPORT_SYMBOL(free_netdev);
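
/*
 * Sketch of the two places a driver typically calls this from (the
 * surrounding error handling is illustrative only):
 *
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);	/* never registered: memory is freed here */
 *
 *	...
 *
 *	unregister_netdev(dev);		/* normal teardown */
 *	free_netdev(dev);		/* releases the device reference */
 */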
7908
7909 /**
7910 * synchronize_net - Synchronize with packet receive processing
7911 *
7912 * Wait for packets currently being received to be done.
7913 * Does not block later packets from starting.
7914 */
7915 void synchronize_net(void)
7916 {
7917 might_sleep();
7918 if (rtnl_is_locked())
7919 synchronize_rcu_expedited();
7920 else
7921 synchronize_rcu();
7922 }
7923 EXPORT_SYMBOL(synchronize_net);
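
/*
 * A common caller pattern, sketched with a hypothetical RCU-protected
 * pointer ("some_cfg") and old object ("old"): unpublish the pointer,
 * let in-flight receive processing drain, then free the old object.
 *
 *	old = rtnl_dereference(dev->some_cfg);
 *	RCU_INIT_POINTER(dev->some_cfg, NULL);
 *	synchronize_net();
 *	kfree(old);
 */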
7924
7925 /**
7926 * unregister_netdevice_queue - remove device from the kernel
7927 * @dev: device
7928 * @head: list
7929 *
7930 * This function shuts down a device interface and removes it
7931 * from the kernel tables.
7932 * If head is not NULL, the device is queued to be unregistered later.
7933 *
7934 * Callers must hold the rtnl semaphore. You may want
7935 * unregister_netdev() instead of this.
7936 */
7937
7938 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7939 {
7940 ASSERT_RTNL();
7941
7942 if (head) {
7943 list_move_tail(&dev->unreg_list, head);
7944 } else {
7945 rollback_registered(dev);
7946 /* Finish processing unregister after unlock */
7947 net_set_todo(dev);
7948 }
7949 }
7950 EXPORT_SYMBOL(unregister_netdevice_queue);
7951
7952 /**
7953 * unregister_netdevice_many - unregister many devices
7954 * @head: list of devices
7955 *
7956 * Note: As most callers use a stack-allocated list_head,
7957 * we force a list_del() to make sure the stack won't be corrupted later.
7958 */
7959 void unregister_netdevice_many(struct list_head *head)
7960 {
7961 struct net_device *dev;
7962
7963 if (!list_empty(head)) {
7964 rollback_registered_many(head);
7965 list_for_each_entry(dev, head, unreg_list)
7966 net_set_todo(dev);
7967 list_del(head);
7968 }
7969 }
7970 EXPORT_SYMBOL(unregister_netdevice_many);
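
/*
 * Sketch of batched unregistration, assuming the caller already holds the
 * RTNL lock ("devs" and "n" are hypothetical):
 *
 *	LIST_HEAD(kill_list);
 *	int i;
 *
 *	for (i = 0; i < n; i++)
 *		unregister_netdevice_queue(devs[i], &kill_list);
 *	unregister_netdevice_many(&kill_list);
 */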
7971
7972 /**
7973 * unregister_netdev - remove device from the kernel
7974 * @dev: device
7975 *
7976 * This function shuts down a device interface and removes it
7977 * from the kernel tables.
7978 *
7979 * This is just a wrapper for unregister_netdevice that takes
7980 * the rtnl semaphore. In general you want to use this and not
7981 * unregister_netdevice.
7982 */
7983 void unregister_netdev(struct net_device *dev)
7984 {
7985 rtnl_lock();
7986 unregister_netdevice(dev);
7987 rtnl_unlock();
7988 }
7989 EXPORT_SYMBOL(unregister_netdev);
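
/*
 * Sketch of the difference between the two entry points:
 *
 *	rtnl_lock();
 *	unregister_netdevice(dev);	/* caller already holds RTNL */
 *	rtnl_unlock();
 *
 *	unregister_netdev(dev);		/* takes and releases RTNL itself */
 */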
7990
7991 /**
7992 * dev_change_net_namespace - move device to a different network namespace
7993 * @dev: device
7994 * @net: network namespace
7995 * @pat: If not NULL name pattern to try if the current device name
7996 * is already taken in the destination network namespace.
7997 *
7998 * This function shuts down a device interface and moves it
7999 * to a new network namespace. On success 0 is returned, on
8000 * failure a negative errno code is returned.
8001 *
8002 * Callers must hold the rtnl semaphore.
8003 */
8004
8005 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
8006 {
8007 int err;
8008
8009 ASSERT_RTNL();
8010
8011 /* Don't allow namespace local devices to be moved. */
8012 err = -EINVAL;
8013 if (dev->features & NETIF_F_NETNS_LOCAL)
8014 goto out;
8015
8016 /* Ensure the device has been registered */
8017 if (dev->reg_state != NETREG_REGISTERED)
8018 goto out;
8019
8020 /* Get out if there is nothing to do */
8021 err = 0;
8022 if (net_eq(dev_net(dev), net))
8023 goto out;
8024
8025 /* Pick the destination device name, and ensure
8026 * we can use it in the destination network namespace.
8027 */
8028 err = -EEXIST;
8029 if (__dev_get_by_name(net, dev->name)) {
8030 /* We get here if we can't use the current device name */
8031 if (!pat)
8032 goto out;
8033 if (dev_get_valid_name(net, dev, pat) < 0)
8034 goto out;
8035 }
8036
8037 /*
8038 * And now a mini version of register_netdevice() and unregister_netdevice().
8039 */
8040
8041 /* If device is running close it first. */
8042 dev_close(dev);
8043
8044 /* And unlink it from device chain */
8045 err = -ENODEV;
8046 unlist_netdevice(dev);
8047
8048 synchronize_net();
8049
8050 /* Shutdown queueing discipline. */
8051 dev_shutdown(dev);
8052
8053 /* Notify protocols that we are about to destroy
8054 this device. They should clean up all their state.
8055
8056 Note that dev->reg_state stays at NETREG_REGISTERED.
8057 This is wanted because this way 8021q and macvlan know
8058 the device is just moving and can keep their slaves up.
8059 */
8060 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8061 rcu_barrier();
8062 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8063 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
8064
8065 /*
8066 * Flush the unicast and multicast chains
8067 */
8068 dev_uc_flush(dev);
8069 dev_mc_flush(dev);
8070
8071 /* Send a netdev-removed uevent to the old namespace */
8072 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8073 netdev_adjacent_del_links(dev);
8074
8075 /* Actually switch the network namespace */
8076 dev_net_set(dev, net);
8077
8078 /* If there is an ifindex conflict assign a new one */
8079 if (__dev_get_by_index(net, dev->ifindex))
8080 dev->ifindex = dev_new_index(net);
8081
8082 /* Send a netdev-add uevent to the new namespace */
8083 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8084 netdev_adjacent_add_links(dev);
8085
8086 /* Fixup kobjects */
8087 err = device_rename(&dev->dev, dev->name);
8088 WARN_ON(err);
8089
8090 /* Add the device back in the hashes */
8091 list_netdevice(dev);
8092
8093 /* Notify protocols that a new device appeared. */
8094 call_netdevice_notifiers(NETDEV_REGISTER, dev);
8095
8096 /*
8097 * Prevent userspace races by waiting until the network
8098 * device is fully set up before sending notifications.
8099 */
8100 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8101
8102 synchronize_net();
8103 err = 0;
8104 out:
8105 return err;
8106 }
8107 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
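
/*
 * Sketch of a caller, holding RTNL, moving a device into another namespace
 * and falling back to a "dev%d" name if its current name is taken there
 * ("target" is a hypothetical struct net *):
 *
 *	err = dev_change_net_namespace(dev, target, "dev%d");
 *	if (err)
 *		netdev_warn(dev, "failed to change namespace: %d\n", err);
 */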
8108
8109 static int dev_cpu_dead(unsigned int oldcpu)
8110 {
8111 struct sk_buff **list_skb;
8112 struct sk_buff *skb;
8113 unsigned int cpu;
8114 struct softnet_data *sd, *oldsd;
8115
8116 local_irq_disable();
8117 cpu = smp_processor_id();
8118 sd = &per_cpu(softnet_data, cpu);
8119 oldsd = &per_cpu(softnet_data, oldcpu);
8120
8121 /* Find end of our completion_queue. */
8122 list_skb = &sd->completion_queue;
8123 while (*list_skb)
8124 list_skb = &(*list_skb)->next;
8125 /* Append completion queue from offline CPU. */
8126 *list_skb = oldsd->completion_queue;
8127 oldsd->completion_queue = NULL;
8128
8129 /* Append output queue from offline CPU. */
8130 if (oldsd->output_queue) {
8131 *sd->output_queue_tailp = oldsd->output_queue;
8132 sd->output_queue_tailp = oldsd->output_queue_tailp;
8133 oldsd->output_queue = NULL;
8134 oldsd->output_queue_tailp = &oldsd->output_queue;
8135 }
8136 /* Append NAPI poll list from offline CPU, with one exception:
8137 * process_backlog() must be called by the CPU owning the percpu backlog.
8138 * We properly handle process_queue & input_pkt_queue later.
8139 */
8140 while (!list_empty(&oldsd->poll_list)) {
8141 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8142 struct napi_struct,
8143 poll_list);
8144
8145 list_del_init(&napi->poll_list);
8146 if (napi->poll == process_backlog)
8147 napi->state = 0;
8148 else
8149 ____napi_schedule(sd, napi);
8150 }
8151
8152 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8153 local_irq_enable();
8154
8155 /* Process offline CPU's input_pkt_queue */
8156 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8157 netif_rx_ni(skb);
8158 input_queue_head_incr(oldsd);
8159 }
8160 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8161 netif_rx_ni(skb);
8162 input_queue_head_incr(oldsd);
8163 }
8164
8165 return 0;
8166 }
8167
8168 /**
8169 * netdev_increment_features - increment feature set by one
8170 * @all: current feature set
8171 * @one: new feature set
8172 * @mask: mask feature set
8173 *
8174 * Computes a new feature set after adding a device with feature set
8175 * @one to the master device with current feature set @all. Will not
8176 * enable anything that is off in @mask. Returns the new feature set.
8177 */
8178 netdev_features_t netdev_increment_features(netdev_features_t all,
8179 netdev_features_t one, netdev_features_t mask)
8180 {
8181 if (mask & NETIF_F_HW_CSUM)
8182 mask |= NETIF_F_CSUM_MASK;
8183 mask |= NETIF_F_VLAN_CHALLENGED;
8184
8185 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8186 all &= one | ~NETIF_F_ALL_FOR_ALL;
8187
8188 /* If one device supports hw checksumming, set for all. */
8189 if (all & NETIF_F_HW_CSUM)
8190 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8191
8192 return all;
8193 }
8194 EXPORT_SYMBOL(netdev_increment_features);
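
/*
 * Sketch of how a master driver (bonding, team, bridge) might fold the
 * features of its lower devices together; "bond", "slave", "mask" and the
 * list walk are illustrative only:
 *
 *	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(slave, &bond->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 */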
8195
8196 static struct hlist_head * __net_init netdev_create_hash(void)
8197 {
8198 int i;
8199 struct hlist_head *hash;
8200
8201 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8202 if (hash != NULL)
8203 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8204 INIT_HLIST_HEAD(&hash[i]);
8205
8206 return hash;
8207 }
8208
8209 /* Initialize per network namespace state */
8210 static int __net_init netdev_init(struct net *net)
8211 {
8212 if (net != &init_net)
8213 INIT_LIST_HEAD(&net->dev_base_head);
8214
8215 net->dev_name_head = netdev_create_hash();
8216 if (net->dev_name_head == NULL)
8217 goto err_name;
8218
8219 net->dev_index_head = netdev_create_hash();
8220 if (net->dev_index_head == NULL)
8221 goto err_idx;
8222
8223 return 0;
8224
8225 err_idx:
8226 kfree(net->dev_name_head);
8227 err_name:
8228 return -ENOMEM;
8229 }
8230
8231 /**
8232 * netdev_drivername - network driver for the device
8233 * @dev: network device
8234 *
8235 * Determine network driver for device.
8236 */
8237 const char *netdev_drivername(const struct net_device *dev)
8238 {
8239 const struct device_driver *driver;
8240 const struct device *parent;
8241 const char *empty = "";
8242
8243 parent = dev->dev.parent;
8244 if (!parent)
8245 return empty;
8246
8247 driver = parent->driver;
8248 if (driver && driver->name)
8249 return driver->name;
8250 return empty;
8251 }
8252
8253 static void __netdev_printk(const char *level, const struct net_device *dev,
8254 struct va_format *vaf)
8255 {
8256 if (dev && dev->dev.parent) {
8257 dev_printk_emit(level[1] - '0',
8258 dev->dev.parent,
8259 "%s %s %s%s: %pV",
8260 dev_driver_string(dev->dev.parent),
8261 dev_name(dev->dev.parent),
8262 netdev_name(dev), netdev_reg_state(dev),
8263 vaf);
8264 } else if (dev) {
8265 printk("%s%s%s: %pV",
8266 level, netdev_name(dev), netdev_reg_state(dev), vaf);
8267 } else {
8268 printk("%s(NULL net_device): %pV", level, vaf);
8269 }
8270 }
8271
8272 void netdev_printk(const char *level, const struct net_device *dev,
8273 const char *format, ...)
8274 {
8275 struct va_format vaf;
8276 va_list args;
8277
8278 va_start(args, format);
8279
8280 vaf.fmt = format;
8281 vaf.va = &args;
8282
8283 __netdev_printk(level, dev, &vaf);
8284
8285 va_end(args);
8286 }
8287 EXPORT_SYMBOL(netdev_printk);
8288
8289 #define define_netdev_printk_level(func, level) \
8290 void func(const struct net_device *dev, const char *fmt, ...) \
8291 { \
8292 struct va_format vaf; \
8293 va_list args; \
8294 \
8295 va_start(args, fmt); \
8296 \
8297 vaf.fmt = fmt; \
8298 vaf.va = &args; \
8299 \
8300 __netdev_printk(level, dev, &vaf); \
8301 \
8302 va_end(args); \
8303 } \
8304 EXPORT_SYMBOL(func);
8305
8306 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8307 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8308 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8309 define_netdev_printk_level(netdev_err, KERN_ERR);
8310 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8311 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8312 define_netdev_printk_level(netdev_info, KERN_INFO);
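
/*
 * The macro above generates netdev_emerg() through netdev_info(). Drivers
 * use them like printk(), but with the driver and interface names prefixed,
 * e.g. ("status" is a hypothetical variable):
 *
 *	netdev_err(dev, "TX stall, resetting (status %#x)\n", status);
 */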
8313
8314 static void __net_exit netdev_exit(struct net *net)
8315 {
8316 kfree(net->dev_name_head);
8317 kfree(net->dev_index_head);
8318 }
8319
8320 static struct pernet_operations __net_initdata netdev_net_ops = {
8321 .init = netdev_init,
8322 .exit = netdev_exit,
8323 };
8324
8325 static void __net_exit default_device_exit(struct net *net)
8326 {
8327 struct net_device *dev, *aux;
8328 /*
8329 * Push all migratable network devices back to the
8330 * initial network namespace
8331 */
8332 rtnl_lock();
8333 for_each_netdev_safe(net, dev, aux) {
8334 int err;
8335 char fb_name[IFNAMSIZ];
8336
8337 /* Ignore unmoveable devices (i.e. loopback) */
8338 if (dev->features & NETIF_F_NETNS_LOCAL)
8339 continue;
8340
8341 /* Leave virtual devices for the generic cleanup */
8342 if (dev->rtnl_link_ops)
8343 continue;
8344
8345 /* Push remaining network devices to init_net */
8346 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8347 err = dev_change_net_namespace(dev, &init_net, fb_name);
8348 if (err) {
8349 pr_emerg("%s: failed to move %s to init_net: %d\n",
8350 __func__, dev->name, err);
8351 BUG();
8352 }
8353 }
8354 rtnl_unlock();
8355 }
8356
8357 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8358 {
8359 /* Return with the rtnl_lock held when there are no network
8360 * devices unregistering in any network namespace in net_list.
8361 */
8362 struct net *net;
8363 bool unregistering;
8364 DEFINE_WAIT_FUNC(wait, woken_wake_function);
8365
8366 add_wait_queue(&netdev_unregistering_wq, &wait);
8367 for (;;) {
8368 unregistering = false;
8369 rtnl_lock();
8370 list_for_each_entry(net, net_list, exit_list) {
8371 if (net->dev_unreg_count > 0) {
8372 unregistering = true;
8373 break;
8374 }
8375 }
8376 if (!unregistering)
8377 break;
8378 __rtnl_unlock();
8379
8380 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8381 }
8382 remove_wait_queue(&netdev_unregistering_wq, &wait);
8383 }
8384
8385 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8386 {
8387 /* At exit all network devices must be removed from a network
8388 * namespace. Do this in the reverse order of registration.
8389 * Do this across as many network namespaces as possible to
8390 * improve batching efficiency.
8391 */
8392 struct net_device *dev;
8393 struct net *net;
8394 LIST_HEAD(dev_kill_list);
8395
8396 /* To prevent network device cleanup code from dereferencing
8397 * loopback devices or network devices that have been freed,
8398 * wait here for all pending unregistrations to complete
8399 * before unregistering the loopback device and allowing the
8400 * network namespace to be freed.
8401 *
8402 * The netdev todo list containing all network device
8403 * unregistrations that happen in default_device_exit_batch
8404 * will run in the rtnl_unlock() at the end of
8405 * default_device_exit_batch.
8406 */
8407 rtnl_lock_unregistering(net_list);
8408 list_for_each_entry(net, net_list, exit_list) {
8409 for_each_netdev_reverse(net, dev) {
8410 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8411 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8412 else
8413 unregister_netdevice_queue(dev, &dev_kill_list);
8414 }
8415 }
8416 unregister_netdevice_many(&dev_kill_list);
8417 rtnl_unlock();
8418 }
8419
8420 static struct pernet_operations __net_initdata default_device_ops = {
8421 .exit = default_device_exit,
8422 .exit_batch = default_device_exit_batch,
8423 };
8424
8425 /*
8426 * Initialize the DEV module. At boot time this walks the device list and
8427 * unhooks any devices that fail to initialise (normally hardware not
8428 * present) and leaves us with a valid list of present and active devices.
8429 *
8430 */
8431
8432 /*
8433 * This is called single threaded during boot, so no need
8434 * to take the rtnl semaphore.
8435 */
8436 static int __init net_dev_init(void)
8437 {
8438 int i, rc = -ENOMEM;
8439
8440 BUG_ON(!dev_boot_phase);
8441
8442 if (dev_proc_init())
8443 goto out;
8444
8445 if (netdev_kobject_init())
8446 goto out;
8447
8448 INIT_LIST_HEAD(&ptype_all);
8449 for (i = 0; i < PTYPE_HASH_SIZE; i++)
8450 INIT_LIST_HEAD(&ptype_base[i]);
8451
8452 INIT_LIST_HEAD(&offload_base);
8453
8454 if (register_pernet_subsys(&netdev_net_ops))
8455 goto out;
8456
8457 /*
8458 * Initialise the packet receive queues.
8459 */
8460
8461 for_each_possible_cpu(i) {
8462 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8463 struct softnet_data *sd = &per_cpu(softnet_data, i);
8464
8465 INIT_WORK(flush, flush_backlog);
8466
8467 skb_queue_head_init(&sd->input_pkt_queue);
8468 skb_queue_head_init(&sd->process_queue);
8469 INIT_LIST_HEAD(&sd->poll_list);
8470 sd->output_queue_tailp = &sd->output_queue;
8471 #ifdef CONFIG_RPS
8472 sd->csd.func = rps_trigger_softirq;
8473 sd->csd.info = sd;
8474 sd->cpu = i;
8475 #endif
8476
8477 sd->backlog.poll = process_backlog;
8478 sd->backlog.weight = weight_p;
8479 }
8480
8481 dev_boot_phase = 0;
8482
8483 /* The loopback device is special: if any other network device
8484 * is present in a network namespace, the loopback device must
8485 * be present too. Since we now dynamically allocate and free
8486 * the loopback device, ensure this invariant is maintained by
8487 * keeping the loopback device as the first device on the
8488 * list of network devices, so that it is the first device
8489 * that appears and the last network device
8490 * that disappears.
8491 */
8492 if (register_pernet_device(&loopback_net_ops))
8493 goto out;
8494
8495 if (register_pernet_device(&default_device_ops))
8496 goto out;
8497
8498 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8499 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8500
8501 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8502 NULL, dev_cpu_dead);
8503 WARN_ON(rc < 0);
8504 dst_subsys_init();
8505 rc = 0;
8506 out:
8507 return rc;
8508 }
8509
8510 subsys_initcall(net_dev_init);