net/core/dev.c
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/wext.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
135
136 #include "net-sysfs.h"
137
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143
144 /*
145 * The list of packet types we will receive (as opposed to discard)
146 * and the routines to invoke.
147 *
148 * Why 16. Because with 16 the only overlap we get on a hash of the
149 * low nibble of the protocol value is RARP/SNAP/X.25.
150 *
151 * NOTE: That is no longer true with the addition of VLAN tags. Not
152 * sure which should go first, but I bet it won't make much
153 * difference if we are running VLANs. The good news is that
154 * this protocol won't be in the list unless compiled in, so
155 * the average user (w/out VLANs) will not be adversely affected.
156 * --BLG
157 *
158 * 0800 IP
159 * 8100 802.1Q VLAN
160 * 0001 802.3
161 * 0002 AX.25
162 * 0004 802.2
163 * 8035 RARP
164 * 0005 SNAP
165 * 0805 X.25
166 * 0806 ARP
167 * 8137 IPX
168 * 0009 Localtalk
169 * 86DD IPv6
170 */
171
172 #define PTYPE_HASH_SIZE (16)
173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
174
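/*
 * Illustrative note (editorial, not part of the original file): ptype_head()
 * below picks a bucket by masking the low nibble of the host-order protocol
 * value, so for example ETH_P_IP (0x0800) and ETH_P_ARP (0x0806) land in
 * buckets 0x0 and 0x6 respectively, while ETH_P_ALL handlers bypass the
 * table entirely and go on the ptype_all list.
 */
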
175 static DEFINE_SPINLOCK(ptype_lock);
176 static DEFINE_SPINLOCK(offload_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly; /* Taps */
179 static struct list_head offload_base __read_mostly;
180
181 /*
182 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
183 * semaphore.
184 *
185 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
186 *
187 * Writers must hold the rtnl semaphore while they loop through the
188 * dev_base_head list, and hold dev_base_lock for writing when they do the
189 * actual updates. This allows pure readers to access the list even
190 * while a writer is preparing to update it.
191 *
192 * To put it another way, dev_base_lock is held for writing only to
193 * protect against pure readers; the rtnl semaphore provides the
194 * protection against other writers.
195 *
196 * See, for example usages, register_netdevice() and
197 * unregister_netdevice(), which must be called with the rtnl
198 * semaphore held.
199 */
200 DEFINE_RWLOCK(dev_base_lock);
201 EXPORT_SYMBOL(dev_base_lock);
202
203 seqcount_t devnet_rename_seq;
204
205 static inline void dev_base_seq_inc(struct net *net)
206 {
207 while (++net->dev_base_seq == 0);
208 }
209
210 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
211 {
212 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
213
214 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
215 }
216
217 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
218 {
219 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
220 }
221
222 static inline void rps_lock(struct softnet_data *sd)
223 {
224 #ifdef CONFIG_RPS
225 spin_lock(&sd->input_pkt_queue.lock);
226 #endif
227 }
228
229 static inline void rps_unlock(struct softnet_data *sd)
230 {
231 #ifdef CONFIG_RPS
232 spin_unlock(&sd->input_pkt_queue.lock);
233 #endif
234 }
235
236 /* Device list insertion */
237 static int list_netdevice(struct net_device *dev)
238 {
239 struct net *net = dev_net(dev);
240
241 ASSERT_RTNL();
242
243 write_lock_bh(&dev_base_lock);
244 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
245 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
246 hlist_add_head_rcu(&dev->index_hlist,
247 dev_index_hash(net, dev->ifindex));
248 write_unlock_bh(&dev_base_lock);
249
250 dev_base_seq_inc(net);
251
252 return 0;
253 }
254
255 /* Device list removal
256  *      caller must respect an RCU grace period before freeing/reusing dev
257 */
258 static void unlist_netdevice(struct net_device *dev)
259 {
260 ASSERT_RTNL();
261
262 /* Unlink dev from the device chain */
263 write_lock_bh(&dev_base_lock);
264 list_del_rcu(&dev->dev_list);
265 hlist_del_rcu(&dev->name_hlist);
266 hlist_del_rcu(&dev->index_hlist);
267 write_unlock_bh(&dev_base_lock);
268
269 dev_base_seq_inc(dev_net(dev));
270 }
271
272 /*
273 * Our notifier list
274 */
275
276 static RAW_NOTIFIER_HEAD(netdev_chain);
277
278 /*
279 * Device drivers call our routines to queue packets here. We empty the
280 * queue in the local softnet handler.
281 */
282
283 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
284 EXPORT_PER_CPU_SYMBOL(softnet_data);
285
286 #ifdef CONFIG_LOCKDEP
287 /*
288 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
289 * according to dev->type
290 */
291 static const unsigned short netdev_lock_type[] =
292 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
293 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
294 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
295 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
296 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
297 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
298 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
299 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
300 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
301 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
302 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
303 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
304 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
305 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
306 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
307
308 static const char *const netdev_lock_name[] =
309 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
310 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
311 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
312 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
313 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
314 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
315 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
316 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
317 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
318 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
319 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
320 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
321 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
322 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
323 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
324
325 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
326 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
327
328 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
329 {
330 int i;
331
332 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
333 if (netdev_lock_type[i] == dev_type)
334 return i;
335 /* the last key is used by default */
336 return ARRAY_SIZE(netdev_lock_type) - 1;
337 }
338
339 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
340 unsigned short dev_type)
341 {
342 int i;
343
344 i = netdev_lock_pos(dev_type);
345 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
346 netdev_lock_name[i]);
347 }
348
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 int i;
352
353 i = netdev_lock_pos(dev->type);
354 lockdep_set_class_and_name(&dev->addr_list_lock,
355 &netdev_addr_lock_key[i],
356 netdev_lock_name[i]);
357 }
358 #else
359 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
360 unsigned short dev_type)
361 {
362 }
363 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
364 {
365 }
366 #endif
367
368 /*******************************************************************************
369
370 Protocol management and registration routines
371
372 *******************************************************************************/
373
374 /*
375 * Add a protocol ID to the list. Now that the input handler is
376 * smarter we can dispense with all the messy stuff that used to be
377 * here.
378 *
379 * BEWARE!!! Protocol handlers, mangling input packets,
380 * MUST BE last in hash buckets and checking protocol handlers
381 * MUST start from promiscuous ptype_all chain in net_bh.
382 * It is true now, do not change it.
383  *      Explanation follows: if a mangling protocol handler were first
384  *      in the list, it could not tell that the packet is cloned and
385  *      must be copied on write, so it would modify the clone in place
386  *      and subsequent readers would see a broken packet.
387 * --ANK (980803)
388 */
389
390 static inline struct list_head *ptype_head(const struct packet_type *pt)
391 {
392 if (pt->type == htons(ETH_P_ALL))
393 return &ptype_all;
394 else
395 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
396 }
397
398 /**
399 * dev_add_pack - add packet handler
400 * @pt: packet type declaration
401 *
402 * Add a protocol handler to the networking stack. The passed &packet_type
403 * is linked into kernel lists and may not be freed until it has been
404 * removed from the kernel lists.
405 *
406  *      This call does not sleep, therefore it cannot
407  *      guarantee that all CPUs currently in the middle of receiving
408  *      packets will see the new packet type (until the next received packet).
409 */
410
411 void dev_add_pack(struct packet_type *pt)
412 {
413 struct list_head *head = ptype_head(pt);
414
415 spin_lock(&ptype_lock);
416 list_add_rcu(&pt->list, head);
417 spin_unlock(&ptype_lock);
418 }
419 EXPORT_SYMBOL(dev_add_pack);
420
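/*
 * Example (illustrative sketch, editorial addition): a module can tap every
 * received frame by registering an ETH_P_ALL handler.  The names my_rcv and
 * my_ptype are hypothetical; the handler signature matches packet_type->func
 * as invoked by deliver_skb() later in this file.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		(the tap owns this reference)
 *		return 0;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);
 */
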
421 /**
422 * __dev_remove_pack - remove packet handler
423 * @pt: packet type declaration
424 *
425 * Remove a protocol handler that was previously added to the kernel
426 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
427 * from the kernel lists and can be freed or reused once this function
428 * returns.
429 *
430 * The packet type might still be in use by receivers
431  *      and must not be freed until after all the CPUs have gone
432 * through a quiescent state.
433 */
434 void __dev_remove_pack(struct packet_type *pt)
435 {
436 struct list_head *head = ptype_head(pt);
437 struct packet_type *pt1;
438
439 spin_lock(&ptype_lock);
440
441 list_for_each_entry(pt1, head, list) {
442 if (pt == pt1) {
443 list_del_rcu(&pt->list);
444 goto out;
445 }
446 }
447
448 pr_warn("dev_remove_pack: %p not found\n", pt);
449 out:
450 spin_unlock(&ptype_lock);
451 }
452 EXPORT_SYMBOL(__dev_remove_pack);
453
454 /**
455 * dev_remove_pack - remove packet handler
456 * @pt: packet type declaration
457 *
458 * Remove a protocol handler that was previously added to the kernel
459 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
460 * from the kernel lists and can be freed or reused once this function
461 * returns.
462 *
463 * This call sleeps to guarantee that no CPU is looking at the packet
464 * type after return.
465 */
466 void dev_remove_pack(struct packet_type *pt)
467 {
468 __dev_remove_pack(pt);
469
470 synchronize_net();
471 }
472 EXPORT_SYMBOL(dev_remove_pack);
473
474
475 /**
476 * dev_add_offload - register offload handlers
477 * @po: protocol offload declaration
478 *
479 * Add protocol offload handlers to the networking stack. The passed
480 * &proto_offload is linked into kernel lists and may not be freed until
481 * it has been removed from the kernel lists.
482 *
483  *      This call does not sleep, therefore it cannot
484  *      guarantee that all CPUs currently in the middle of receiving
485  *      packets will see the new offload handlers (until the next received packet).
486 */
487 void dev_add_offload(struct packet_offload *po)
488 {
489 struct list_head *head = &offload_base;
490
491 spin_lock(&offload_lock);
492 list_add_rcu(&po->list, head);
493 spin_unlock(&offload_lock);
494 }
495 EXPORT_SYMBOL(dev_add_offload);
496
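/*
 * Illustrative sketch (editorial addition): a protocol with GSO/GRO support
 * registers a struct packet_offload keyed by its ethertype.  Only the .type
 * field is shown; the GSO/GRO callbacks live in the structure's callbacks
 * member, which is assumed here rather than taken from this file.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *	};
 *
 *	dev_add_offload(&my_offload);
 */
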
497 /**
498 * __dev_remove_offload - remove offload handler
499 * @po: packet offload declaration
500 *
501 * Remove a protocol offload handler that was previously added to the
502 * kernel offload handlers by dev_add_offload(). The passed &offload_type
503 * is removed from the kernel lists and can be freed or reused once this
504 * function returns.
505 *
506 * The packet type might still be in use by receivers
507  *      and must not be freed until after all the CPUs have gone
508 * through a quiescent state.
509 */
510 void __dev_remove_offload(struct packet_offload *po)
511 {
512 struct list_head *head = &offload_base;
513 struct packet_offload *po1;
514
515 spin_lock(&offload_lock);
516
517 list_for_each_entry(po1, head, list) {
518 if (po == po1) {
519 list_del_rcu(&po->list);
520 goto out;
521 }
522 }
523
524 pr_warn("dev_remove_offload: %p not found\n", po);
525 out:
526 spin_unlock(&offload_lock);
527 }
528 EXPORT_SYMBOL(__dev_remove_offload);
529
530 /**
531 * dev_remove_offload - remove packet offload handler
532 * @po: packet offload declaration
533 *
534 * Remove a packet offload handler that was previously added to the kernel
535 * offload handlers by dev_add_offload(). The passed &offload_type is
536 * removed from the kernel lists and can be freed or reused once this
537 * function returns.
538 *
539 * This call sleeps to guarantee that no CPU is looking at the packet
540 * type after return.
541 */
542 void dev_remove_offload(struct packet_offload *po)
543 {
544 __dev_remove_offload(po);
545
546 synchronize_net();
547 }
548 EXPORT_SYMBOL(dev_remove_offload);
549
550 /******************************************************************************
551
552 Device Boot-time Settings Routines
553
554 *******************************************************************************/
555
556 /* Boot time configuration table */
557 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
558
559 /**
560 * netdev_boot_setup_add - add new setup entry
561 * @name: name of the device
562 * @map: configured settings for the device
563 *
564 * Adds new setup entry to the dev_boot_setup list. The function
565  *      returns 0 on error and 1 on success. This is a generic routine for
566  *      all netdevices.
567 */
568 static int netdev_boot_setup_add(char *name, struct ifmap *map)
569 {
570 struct netdev_boot_setup *s;
571 int i;
572
573 s = dev_boot_setup;
574 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
575 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
576 memset(s[i].name, 0, sizeof(s[i].name));
577 strlcpy(s[i].name, name, IFNAMSIZ);
578 memcpy(&s[i].map, map, sizeof(s[i].map));
579 break;
580 }
581 }
582
583 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
584 }
585
586 /**
587 * netdev_boot_setup_check - check boot time settings
588 * @dev: the netdevice
589 *
590  *      Check boot time settings for the device.
591  *      Any settings found are applied to the device for use
592  *      later during device probing.
593  *      Returns 0 if no settings are found, 1 if they are.
594 */
595 int netdev_boot_setup_check(struct net_device *dev)
596 {
597 struct netdev_boot_setup *s = dev_boot_setup;
598 int i;
599
600 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
601 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
602 !strcmp(dev->name, s[i].name)) {
603 dev->irq = s[i].map.irq;
604 dev->base_addr = s[i].map.base_addr;
605 dev->mem_start = s[i].map.mem_start;
606 dev->mem_end = s[i].map.mem_end;
607 return 1;
608 }
609 }
610 return 0;
611 }
612 EXPORT_SYMBOL(netdev_boot_setup_check);
613
614
615 /**
616 * netdev_boot_base - get address from boot time settings
617 * @prefix: prefix for network device
618 * @unit: id for network device
619 *
620  *      Check boot time settings for the base address of the device.
621  *      The found base address is returned for use later in
622  *      device probing.
623  *      Returns 0 if no settings are found.
624 */
625 unsigned long netdev_boot_base(const char *prefix, int unit)
626 {
627 const struct netdev_boot_setup *s = dev_boot_setup;
628 char name[IFNAMSIZ];
629 int i;
630
631 sprintf(name, "%s%d", prefix, unit);
632
633 /*
634 * If device already registered then return base of 1
635 * to indicate not to probe for this interface
636 */
637 if (__dev_get_by_name(&init_net, name))
638 return 1;
639
640 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
641 if (!strcmp(name, s[i].name))
642 return s[i].map.base_addr;
643 return 0;
644 }
645
646 /*
647 * Saves at boot time configured settings for any netdevice.
648 */
649 int __init netdev_boot_setup(char *str)
650 {
651 int ints[5];
652 struct ifmap map;
653
654 str = get_options(str, ARRAY_SIZE(ints), ints);
655 if (!str || !*str)
656 return 0;
657
658 /* Save settings */
659 memset(&map, 0, sizeof(map));
660 if (ints[0] > 0)
661 map.irq = ints[1];
662 if (ints[0] > 1)
663 map.base_addr = ints[2];
664 if (ints[0] > 2)
665 map.mem_start = ints[3];
666 if (ints[0] > 3)
667 map.mem_end = ints[4];
668
669 /* Add new entry to the list */
670 return netdev_boot_setup_add(str, &map);
671 }
672
673 __setup("netdev=", netdev_boot_setup);
674
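/*
 * Example (illustrative, derived from the parsing above): a command line
 * entry such as
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * is split by get_options() into irq=9, base_addr=0x300, mem_start=0 and
 * mem_end=0, and the remaining string "eth0" becomes the entry name passed
 * to netdev_boot_setup_add().
 */
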
675 /*******************************************************************************
676
677 Device Interface Subroutines
678
679 *******************************************************************************/
680
681 /**
682 * __dev_get_by_name - find a device by its name
683 * @net: the applicable net namespace
684 * @name: name to find
685 *
686 * Find an interface by name. Must be called under RTNL semaphore
687 * or @dev_base_lock. If the name is found a pointer to the device
688 * is returned. If the name is not found then %NULL is returned. The
689 * reference counters are not incremented so the caller must be
690 * careful with locks.
691 */
692
693 struct net_device *__dev_get_by_name(struct net *net, const char *name)
694 {
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_name_hash(net, name);
698
699 hlist_for_each_entry(dev, p, head, name_hlist)
700 if (!strncmp(dev->name, name, IFNAMSIZ))
701 return dev;
702
703 return NULL;
704 }
705 EXPORT_SYMBOL(__dev_get_by_name);
706
707 /**
708 * dev_get_by_name_rcu - find a device by its name
709 * @net: the applicable net namespace
710 * @name: name to find
711 *
712 * Find an interface by name.
713 * If the name is found a pointer to the device is returned.
714 * If the name is not found then %NULL is returned.
715 * The reference counters are not incremented so the caller must be
716 * careful with locks. The caller must hold RCU lock.
717 */
718
719 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
720 {
721 struct hlist_node *p;
722 struct net_device *dev;
723 struct hlist_head *head = dev_name_hash(net, name);
724
725 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
726 if (!strncmp(dev->name, name, IFNAMSIZ))
727 return dev;
728
729 return NULL;
730 }
731 EXPORT_SYMBOL(dev_get_by_name_rcu);
732
733 /**
734 * dev_get_by_name - find a device by its name
735 * @net: the applicable net namespace
736 * @name: name to find
737 *
738 * Find an interface by name. This can be called from any
739 * context and does its own locking. The returned handle has
740 * the usage count incremented and the caller must use dev_put() to
741 * release it when it is no longer needed. %NULL is returned if no
742 * matching device is found.
743 */
744
745 struct net_device *dev_get_by_name(struct net *net, const char *name)
746 {
747 struct net_device *dev;
748
749 rcu_read_lock();
750 dev = dev_get_by_name_rcu(net, name);
751 if (dev)
752 dev_hold(dev);
753 rcu_read_unlock();
754 return dev;
755 }
756 EXPORT_SYMBOL(dev_get_by_name);
757
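/*
 * Example (illustrative, editorial addition): a typical refcounted lookup
 * from process context, using init_net for brevity:
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */
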
758 /**
759 * __dev_get_by_index - find a device by its ifindex
760 * @net: the applicable net namespace
761 * @ifindex: index of device
762 *
763 * Search for an interface by index. Returns %NULL if the device
764 * is not found or a pointer to the device. The device has not
765 * had its reference counter increased so the caller must be careful
766 * about locking. The caller must hold either the RTNL semaphore
767 * or @dev_base_lock.
768 */
769
770 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
771 {
772 struct hlist_node *p;
773 struct net_device *dev;
774 struct hlist_head *head = dev_index_hash(net, ifindex);
775
776 hlist_for_each_entry(dev, p, head, index_hlist)
777 if (dev->ifindex == ifindex)
778 return dev;
779
780 return NULL;
781 }
782 EXPORT_SYMBOL(__dev_get_by_index);
783
784 /**
785 * dev_get_by_index_rcu - find a device by its ifindex
786 * @net: the applicable net namespace
787 * @ifindex: index of device
788 *
789 * Search for an interface by index. Returns %NULL if the device
790 * is not found or a pointer to the device. The device has not
791 * had its reference counter increased so the caller must be careful
792 * about locking. The caller must hold RCU lock.
793 */
794
795 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
796 {
797 struct hlist_node *p;
798 struct net_device *dev;
799 struct hlist_head *head = dev_index_hash(net, ifindex);
800
801 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
802 if (dev->ifindex == ifindex)
803 return dev;
804
805 return NULL;
806 }
807 EXPORT_SYMBOL(dev_get_by_index_rcu);
808
809
810 /**
811 * dev_get_by_index - find a device by its ifindex
812 * @net: the applicable net namespace
813 * @ifindex: index of device
814 *
815 * Search for an interface by index. Returns NULL if the device
816 * is not found or a pointer to the device. The device returned has
817 * had a reference added and the pointer is safe until the user calls
818 * dev_put to indicate they have finished with it.
819 */
820
821 struct net_device *dev_get_by_index(struct net *net, int ifindex)
822 {
823 struct net_device *dev;
824
825 rcu_read_lock();
826 dev = dev_get_by_index_rcu(net, ifindex);
827 if (dev)
828 dev_hold(dev);
829 rcu_read_unlock();
830 return dev;
831 }
832 EXPORT_SYMBOL(dev_get_by_index);
833
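/*
 * Example (illustrative): when the device is only touched inside the RCU
 * read-side section, no reference needs to be taken at all:
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		... read dev fields, do not sleep or stash the pointer ...
 *	rcu_read_unlock();
 */
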
834 /**
835 * dev_getbyhwaddr_rcu - find a device by its hardware address
836 * @net: the applicable net namespace
837 * @type: media type of device
838 * @ha: hardware address
839 *
840 * Search for an interface by MAC address. Returns NULL if the device
841 * is not found or a pointer to the device.
842 * The caller must hold RCU or RTNL.
843 * The returned device has not had its ref count increased
844 * and the caller must therefore be careful about locking
845 *
846 */
847
848 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
849 const char *ha)
850 {
851 struct net_device *dev;
852
853 for_each_netdev_rcu(net, dev)
854 if (dev->type == type &&
855 !memcmp(dev->dev_addr, ha, dev->addr_len))
856 return dev;
857
858 return NULL;
859 }
860 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
861
862 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
863 {
864 struct net_device *dev;
865
866 ASSERT_RTNL();
867 for_each_netdev(net, dev)
868 if (dev->type == type)
869 return dev;
870
871 return NULL;
872 }
873 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
874
875 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
876 {
877 struct net_device *dev, *ret = NULL;
878
879 rcu_read_lock();
880 for_each_netdev_rcu(net, dev)
881 if (dev->type == type) {
882 dev_hold(dev);
883 ret = dev;
884 break;
885 }
886 rcu_read_unlock();
887 return ret;
888 }
889 EXPORT_SYMBOL(dev_getfirstbyhwtype);
890
891 /**
892 * dev_get_by_flags_rcu - find any device with given flags
893 * @net: the applicable net namespace
894 * @if_flags: IFF_* values
895 * @mask: bitmask of bits in if_flags to check
896 *
897 * Search for any interface with the given flags. Returns NULL if a device
898 * is not found or a pointer to the device. Must be called inside
899 * rcu_read_lock(), and result refcount is unchanged.
900 */
901
902 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
903 unsigned short mask)
904 {
905 struct net_device *dev, *ret;
906
907 ret = NULL;
908 for_each_netdev_rcu(net, dev) {
909 if (((dev->flags ^ if_flags) & mask) == 0) {
910 ret = dev;
911 break;
912 }
913 }
914 return ret;
915 }
916 EXPORT_SYMBOL(dev_get_by_flags_rcu);
917
918 /**
919 * dev_valid_name - check if name is okay for network device
920 * @name: name string
921 *
922  *      Network device names need to be valid file names
923  *      to allow sysfs to work.  We also disallow any kind of
924  *      whitespace.
925 */
926 bool dev_valid_name(const char *name)
927 {
928 if (*name == '\0')
929 return false;
930 if (strlen(name) >= IFNAMSIZ)
931 return false;
932 if (!strcmp(name, ".") || !strcmp(name, ".."))
933 return false;
934
935 while (*name) {
936 if (*name == '/' || isspace(*name))
937 return false;
938 name++;
939 }
940 return true;
941 }
942 EXPORT_SYMBOL(dev_valid_name);
943
944 /**
945 * __dev_alloc_name - allocate a name for a device
946 * @net: network namespace to allocate the device name in
947 * @name: name format string
948 * @buf: scratch buffer and result name string
949 *
950 * Passed a format string - eg "lt%d" it will try and find a suitable
951 * id. It scans list of devices to build up a free map, then chooses
952 * the first empty slot. The caller must hold the dev_base or rtnl lock
953 * while allocating the name and adding the device in order to avoid
954 * duplicates.
955 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
956 * Returns the number of the unit assigned or a negative errno code.
957 */
958
959 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
960 {
961 int i = 0;
962 const char *p;
963 const int max_netdevices = 8*PAGE_SIZE;
964 unsigned long *inuse;
965 struct net_device *d;
966
967 p = strnchr(name, IFNAMSIZ-1, '%');
968 if (p) {
969 /*
970 * Verify the string as this thing may have come from
971 * the user. There must be either one "%d" and no other "%"
972 * characters.
973 */
974 if (p[1] != 'd' || strchr(p + 2, '%'))
975 return -EINVAL;
976
977 /* Use one page as a bit array of possible slots */
978 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
979 if (!inuse)
980 return -ENOMEM;
981
982 for_each_netdev(net, d) {
983 if (!sscanf(d->name, name, &i))
984 continue;
985 if (i < 0 || i >= max_netdevices)
986 continue;
987
988 /* avoid cases where sscanf is not exact inverse of printf */
989 snprintf(buf, IFNAMSIZ, name, i);
990 if (!strncmp(buf, d->name, IFNAMSIZ))
991 set_bit(i, inuse);
992 }
993
994 i = find_first_zero_bit(inuse, max_netdevices);
995 free_page((unsigned long) inuse);
996 }
997
998 if (buf != name)
999 snprintf(buf, IFNAMSIZ, name, i);
1000 if (!__dev_get_by_name(net, buf))
1001 return i;
1002
1003 /* It is possible to run out of possible slots
1004 * when the name is long and there isn't enough space left
1005 * for the digits, or if all bits are used.
1006 */
1007 return -ENFILE;
1008 }
1009
1010 /**
1011 * dev_alloc_name - allocate a name for a device
1012 * @dev: device
1013 * @name: name format string
1014 *
1015 * Passed a format string - eg "lt%d" it will try and find a suitable
1016 * id. It scans list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid
1019 * duplicates.
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024 int dev_alloc_name(struct net_device *dev, const char *name)
1025 {
1026 char buf[IFNAMSIZ];
1027 struct net *net;
1028 int ret;
1029
1030 BUG_ON(!dev_net(dev));
1031 net = dev_net(dev);
1032 ret = __dev_alloc_name(net, name, buf);
1033 if (ret >= 0)
1034 strlcpy(dev->name, buf, IFNAMSIZ);
1035 return ret;
1036 }
1037 EXPORT_SYMBOL(dev_alloc_name);
1038
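/*
 * Example (illustrative): a driver that wants the usual ethN naming simply
 * passes a format string and the first free unit number is picked, e.g.
 * dev->name may become "eth2":
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 */
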
1039 static int dev_alloc_name_ns(struct net *net,
1040 struct net_device *dev,
1041 const char *name)
1042 {
1043 char buf[IFNAMSIZ];
1044 int ret;
1045
1046 ret = __dev_alloc_name(net, name, buf);
1047 if (ret >= 0)
1048 strlcpy(dev->name, buf, IFNAMSIZ);
1049 return ret;
1050 }
1051
1052 static int dev_get_valid_name(struct net *net,
1053 struct net_device *dev,
1054 const char *name)
1055 {
1056 BUG_ON(!net);
1057
1058 if (!dev_valid_name(name))
1059 return -EINVAL;
1060
1061 if (strchr(name, '%'))
1062 return dev_alloc_name_ns(net, dev, name);
1063 else if (__dev_get_by_name(net, name))
1064 return -EEXIST;
1065 else if (dev->name != name)
1066 strlcpy(dev->name, name, IFNAMSIZ);
1067
1068 return 0;
1069 }
1070
1071 /**
1072 * dev_change_name - change name of a device
1073 * @dev: device
1074 * @newname: name (or format string) must be at least IFNAMSIZ
1075 *
1076  *	Change the name of a device. A format string such as "eth%d"
1077  *	can be passed for wildcarding.
1078 */
1079 int dev_change_name(struct net_device *dev, const char *newname)
1080 {
1081 char oldname[IFNAMSIZ];
1082 int err = 0;
1083 int ret;
1084 struct net *net;
1085
1086 ASSERT_RTNL();
1087 BUG_ON(!dev_net(dev));
1088
1089 net = dev_net(dev);
1090 if (dev->flags & IFF_UP)
1091 return -EBUSY;
1092
1093 write_seqcount_begin(&devnet_rename_seq);
1094
1095 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1096 write_seqcount_end(&devnet_rename_seq);
1097 return 0;
1098 }
1099
1100 memcpy(oldname, dev->name, IFNAMSIZ);
1101
1102 err = dev_get_valid_name(net, dev, newname);
1103 if (err < 0) {
1104 write_seqcount_end(&devnet_rename_seq);
1105 return err;
1106 }
1107
1108 rollback:
1109 ret = device_rename(&dev->dev, dev->name);
1110 if (ret) {
1111 memcpy(dev->name, oldname, IFNAMSIZ);
1112 write_seqcount_end(&devnet_rename_seq);
1113 return ret;
1114 }
1115
1116 write_seqcount_end(&devnet_rename_seq);
1117
1118 write_lock_bh(&dev_base_lock);
1119 hlist_del_rcu(&dev->name_hlist);
1120 write_unlock_bh(&dev_base_lock);
1121
1122 synchronize_rcu();
1123
1124 write_lock_bh(&dev_base_lock);
1125 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1126 write_unlock_bh(&dev_base_lock);
1127
1128 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1129 ret = notifier_to_errno(ret);
1130
1131 if (ret) {
1132 /* err >= 0 after dev_alloc_name() or stores the first errno */
1133 if (err >= 0) {
1134 err = ret;
1135 write_seqcount_begin(&devnet_rename_seq);
1136 memcpy(dev->name, oldname, IFNAMSIZ);
1137 goto rollback;
1138 } else {
1139 pr_err("%s: name change rollback failed: %d\n",
1140 dev->name, ret);
1141 }
1142 }
1143
1144 return err;
1145 }
1146
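/*
 * Example (illustrative): renaming must be done under RTNL and only while
 * the device is down, otherwise -EBUSY is returned:
 *
 *	rtnl_lock();
 *	err = dev_change_name(dev, "lan0");
 *	rtnl_unlock();
 */
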
1147 /**
1148 * dev_set_alias - change ifalias of a device
1149 * @dev: device
1150 * @alias: name up to IFALIASZ
1151 * @len: limit of bytes to copy from info
1152 *
1153  *	Set ifalias for a device.
1154 */
1155 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1156 {
1157 char *new_ifalias;
1158
1159 ASSERT_RTNL();
1160
1161 if (len >= IFALIASZ)
1162 return -EINVAL;
1163
1164 if (!len) {
1165 kfree(dev->ifalias);
1166 dev->ifalias = NULL;
1167 return 0;
1168 }
1169
1170 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1171 if (!new_ifalias)
1172 return -ENOMEM;
1173 dev->ifalias = new_ifalias;
1174
1175 strlcpy(dev->ifalias, alias, len+1);
1176 return len;
1177 }
1178
1179
1180 /**
1181 * netdev_features_change - device changes features
1182 * @dev: device to cause notification
1183 *
1184 * Called to indicate a device has changed features.
1185 */
1186 void netdev_features_change(struct net_device *dev)
1187 {
1188 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1189 }
1190 EXPORT_SYMBOL(netdev_features_change);
1191
1192 /**
1193 * netdev_state_change - device changes state
1194 * @dev: device to cause notification
1195 *
1196 * Called to indicate a device has changed state. This function calls
1197 * the notifier chains for netdev_chain and sends a NEWLINK message
1198 * to the routing socket.
1199 */
1200 void netdev_state_change(struct net_device *dev)
1201 {
1202 if (dev->flags & IFF_UP) {
1203 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1204 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1205 }
1206 }
1207 EXPORT_SYMBOL(netdev_state_change);
1208
1209 /**
1210 * netdev_notify_peers - notify network peers about existence of @dev
1211 * @dev: network device
1212 *
1213 * Generate traffic such that interested network peers are aware of
1214 * @dev, such as by generating a gratuitous ARP. This may be used when
1215 * a device wants to inform the rest of the network about some sort of
1216 * reconfiguration such as a failover event or virtual machine
1217 * migration.
1218 */
1219 void netdev_notify_peers(struct net_device *dev)
1220 {
1221 rtnl_lock();
1222 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1223 rtnl_unlock();
1224 }
1225 EXPORT_SYMBOL(netdev_notify_peers);
1226
1227 static int __dev_open(struct net_device *dev)
1228 {
1229 const struct net_device_ops *ops = dev->netdev_ops;
1230 int ret;
1231
1232 ASSERT_RTNL();
1233
1234 if (!netif_device_present(dev))
1235 return -ENODEV;
1236
1237 /* Block netpoll from trying to do any rx path servicing.
1238 * If we don't do this there is a chance ndo_poll_controller
1239 * or ndo_poll may be running while we open the device
1240 */
1241 ret = netpoll_rx_disable(dev);
1242 if (ret)
1243 return ret;
1244
1245 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1246 ret = notifier_to_errno(ret);
1247 if (ret)
1248 return ret;
1249
1250 set_bit(__LINK_STATE_START, &dev->state);
1251
1252 if (ops->ndo_validate_addr)
1253 ret = ops->ndo_validate_addr(dev);
1254
1255 if (!ret && ops->ndo_open)
1256 ret = ops->ndo_open(dev);
1257
1258 netpoll_rx_enable(dev);
1259
1260 if (ret)
1261 clear_bit(__LINK_STATE_START, &dev->state);
1262 else {
1263 dev->flags |= IFF_UP;
1264 net_dmaengine_get();
1265 dev_set_rx_mode(dev);
1266 dev_activate(dev);
1267 add_device_randomness(dev->dev_addr, dev->addr_len);
1268 }
1269
1270 return ret;
1271 }
1272
1273 /**
1274 * dev_open - prepare an interface for use.
1275 * @dev: device to open
1276 *
1277 * Takes a device from down to up state. The device's private open
1278 * function is invoked and then the multicast lists are loaded. Finally
1279 * the device is moved into the up state and a %NETDEV_UP message is
1280 * sent to the netdev notifier chain.
1281 *
1282 * Calling this function on an active interface is a nop. On a failure
1283 * a negative errno code is returned.
1284 */
1285 int dev_open(struct net_device *dev)
1286 {
1287 int ret;
1288
1289 if (dev->flags & IFF_UP)
1290 return 0;
1291
1292 ret = __dev_open(dev);
1293 if (ret < 0)
1294 return ret;
1295
1296 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1297 call_netdevice_notifiers(NETDEV_UP, dev);
1298
1299 return ret;
1300 }
1301 EXPORT_SYMBOL(dev_open);
1302
1303 static int __dev_close_many(struct list_head *head)
1304 {
1305 struct net_device *dev;
1306
1307 ASSERT_RTNL();
1308 might_sleep();
1309
1310 list_for_each_entry(dev, head, unreg_list) {
1311 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1312
1313 clear_bit(__LINK_STATE_START, &dev->state);
1314
1315 /* Synchronize to scheduled poll. We cannot touch poll list, it
1316 * can be even on different cpu. So just clear netif_running().
1317 *
1318  		 * dev->stop() will invoke napi_disable() on all of its
1319 * napi_struct instances on this device.
1320 */
1321 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1322 }
1323
1324 dev_deactivate_many(head);
1325
1326 list_for_each_entry(dev, head, unreg_list) {
1327 const struct net_device_ops *ops = dev->netdev_ops;
1328
1329 /*
1330  		 *	Call the device-specific close. This cannot fail and is
1331  		 *	only done if the device is UP.
1332 *
1333 * We allow it to be called even after a DETACH hot-plug
1334 * event.
1335 */
1336 if (ops->ndo_stop)
1337 ops->ndo_stop(dev);
1338
1339 dev->flags &= ~IFF_UP;
1340 net_dmaengine_put();
1341 }
1342
1343 return 0;
1344 }
1345
1346 static int __dev_close(struct net_device *dev)
1347 {
1348 int retval;
1349 LIST_HEAD(single);
1350
1351 /* Temporarily disable netpoll until the interface is down */
1352 retval = netpoll_rx_disable(dev);
1353 if (retval)
1354 return retval;
1355
1356 list_add(&dev->unreg_list, &single);
1357 retval = __dev_close_many(&single);
1358 list_del(&single);
1359
1360 netpoll_rx_enable(dev);
1361 return retval;
1362 }
1363
1364 static int dev_close_many(struct list_head *head)
1365 {
1366 struct net_device *dev, *tmp;
1367 LIST_HEAD(tmp_list);
1368
1369 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1370 if (!(dev->flags & IFF_UP))
1371 list_move(&dev->unreg_list, &tmp_list);
1372
1373 __dev_close_many(head);
1374
1375 list_for_each_entry(dev, head, unreg_list) {
1376 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1377 call_netdevice_notifiers(NETDEV_DOWN, dev);
1378 }
1379
1380 /* rollback_registered_many needs the complete original list */
1381 list_splice(&tmp_list, head);
1382 return 0;
1383 }
1384
1385 /**
1386 * dev_close - shutdown an interface.
1387 * @dev: device to shutdown
1388 *
1389 * This function moves an active device into down state. A
1390 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1391 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1392 * chain.
1393 */
1394 int dev_close(struct net_device *dev)
1395 {
1396 int ret = 0;
1397 if (dev->flags & IFF_UP) {
1398 LIST_HEAD(single);
1399
1400 /* Block netpoll rx while the interface is going down */
1401 ret = netpoll_rx_disable(dev);
1402 if (ret)
1403 return ret;
1404
1405 list_add(&dev->unreg_list, &single);
1406 dev_close_many(&single);
1407 list_del(&single);
1408
1409 netpoll_rx_enable(dev);
1410 }
1411 return ret;
1412 }
1413 EXPORT_SYMBOL(dev_close);
1414
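/*
 * Example (illustrative): bringing an interface up and back down from
 * kernel code requires RTNL, just as the checks above assert:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	...
 *	dev_close(dev);
 *	rtnl_unlock();
 */
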
1415
1416 /**
1417 * dev_disable_lro - disable Large Receive Offload on a device
1418 * @dev: device
1419 *
1420 * Disable Large Receive Offload (LRO) on a net device. Must be
1421 * called under RTNL. This is needed if received packets may be
1422 * forwarded to another interface.
1423 */
1424 void dev_disable_lro(struct net_device *dev)
1425 {
1426 /*
1427 * If we're trying to disable lro on a vlan device
1428 * use the underlying physical device instead
1429 */
1430 if (is_vlan_dev(dev))
1431 dev = vlan_dev_real_dev(dev);
1432
1433 dev->wanted_features &= ~NETIF_F_LRO;
1434 netdev_update_features(dev);
1435
1436 if (unlikely(dev->features & NETIF_F_LRO))
1437 netdev_WARN(dev, "failed to disable LRO!\n");
1438 }
1439 EXPORT_SYMBOL(dev_disable_lro);
1440
1441
1442 static int dev_boot_phase = 1;
1443
1444 /**
1445 * register_netdevice_notifier - register a network notifier block
1446 * @nb: notifier
1447 *
1448 * Register a notifier to be called when network device events occur.
1449 * The notifier passed is linked into the kernel structures and must
1450 * not be reused until it has been unregistered. A negative errno code
1451 * is returned on a failure.
1452 *
1453  * 	When registered, all registration and up events are replayed
1454  *	to the new notifier to allow the caller to have a race-free
1455  *	view of the network device list.
1456 */
1457
1458 int register_netdevice_notifier(struct notifier_block *nb)
1459 {
1460 struct net_device *dev;
1461 struct net_device *last;
1462 struct net *net;
1463 int err;
1464
1465 rtnl_lock();
1466 err = raw_notifier_chain_register(&netdev_chain, nb);
1467 if (err)
1468 goto unlock;
1469 if (dev_boot_phase)
1470 goto unlock;
1471 for_each_net(net) {
1472 for_each_netdev(net, dev) {
1473 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1474 err = notifier_to_errno(err);
1475 if (err)
1476 goto rollback;
1477
1478 if (!(dev->flags & IFF_UP))
1479 continue;
1480
1481 nb->notifier_call(nb, NETDEV_UP, dev);
1482 }
1483 }
1484
1485 unlock:
1486 rtnl_unlock();
1487 return err;
1488
1489 rollback:
1490 last = dev;
1491 for_each_net(net) {
1492 for_each_netdev(net, dev) {
1493 if (dev == last)
1494 goto outroll;
1495
1496 if (dev->flags & IFF_UP) {
1497 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1498 nb->notifier_call(nb, NETDEV_DOWN, dev);
1499 }
1500 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1501 }
1502 }
1503
1504 outroll:
1505 raw_notifier_chain_unregister(&netdev_chain, nb);
1506 goto unlock;
1507 }
1508 EXPORT_SYMBOL(register_netdevice_notifier);
1509
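/*
 * Example (illustrative sketch; my_netdev_event and my_nb are hypothetical).
 * In this kernel the notifier's ptr argument is the net_device itself, as
 * the replay loop above shows:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			...
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 */
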
1510 /**
1511 * unregister_netdevice_notifier - unregister a network notifier block
1512 * @nb: notifier
1513 *
1514 * Unregister a notifier previously registered by
1515  *	register_netdevice_notifier(). The notifier is unlinked from the
1516 * kernel structures and may then be reused. A negative errno code
1517 * is returned on a failure.
1518 *
1519  * 	After unregistering, unregister and down device events are synthesized
1520  *	for all devices on the device list and sent to the removed notifier,
1521  *	removing the need for special-case cleanup code.
1522 */
1523
1524 int unregister_netdevice_notifier(struct notifier_block *nb)
1525 {
1526 struct net_device *dev;
1527 struct net *net;
1528 int err;
1529
1530 rtnl_lock();
1531 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1532 if (err)
1533 goto unlock;
1534
1535 for_each_net(net) {
1536 for_each_netdev(net, dev) {
1537 if (dev->flags & IFF_UP) {
1538 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1539 nb->notifier_call(nb, NETDEV_DOWN, dev);
1540 }
1541 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1542 }
1543 }
1544 unlock:
1545 rtnl_unlock();
1546 return err;
1547 }
1548 EXPORT_SYMBOL(unregister_netdevice_notifier);
1549
1550 /**
1551 * call_netdevice_notifiers - call all network notifier blocks
1552 * @val: value passed unmodified to notifier function
1553 * @dev: net_device pointer passed unmodified to notifier function
1554 *
1555 * Call all network notifier blocks. Parameters and return value
1556 * are as for raw_notifier_call_chain().
1557 */
1558
1559 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1560 {
1561 ASSERT_RTNL();
1562 return raw_notifier_call_chain(&netdev_chain, val, dev);
1563 }
1564 EXPORT_SYMBOL(call_netdevice_notifiers);
1565
1566 static struct static_key netstamp_needed __read_mostly;
1567 #ifdef HAVE_JUMP_LABEL
1568 /* We are not allowed to call static_key_slow_dec() from irq context
1569 * If net_disable_timestamp() is called from irq context, defer the
1570 * static_key_slow_dec() calls.
1571 */
1572 static atomic_t netstamp_needed_deferred;
1573 #endif
1574
1575 void net_enable_timestamp(void)
1576 {
1577 #ifdef HAVE_JUMP_LABEL
1578 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1579
1580 if (deferred) {
1581 while (--deferred)
1582 static_key_slow_dec(&netstamp_needed);
1583 return;
1584 }
1585 #endif
1586 WARN_ON(in_interrupt());
1587 static_key_slow_inc(&netstamp_needed);
1588 }
1589 EXPORT_SYMBOL(net_enable_timestamp);
1590
1591 void net_disable_timestamp(void)
1592 {
1593 #ifdef HAVE_JUMP_LABEL
1594 if (in_interrupt()) {
1595 atomic_inc(&netstamp_needed_deferred);
1596 return;
1597 }
1598 #endif
1599 static_key_slow_dec(&netstamp_needed);
1600 }
1601 EXPORT_SYMBOL(net_disable_timestamp);
1602
1603 static inline void net_timestamp_set(struct sk_buff *skb)
1604 {
1605 skb->tstamp.tv64 = 0;
1606 if (static_key_false(&netstamp_needed))
1607 __net_timestamp(skb);
1608 }
1609
1610 #define net_timestamp_check(COND, SKB) \
1611 if (static_key_false(&netstamp_needed)) { \
1612 if ((COND) && !(SKB)->tstamp.tv64) \
1613 __net_timestamp(SKB); \
1614 } \
1615
1616 static inline bool is_skb_forwardable(struct net_device *dev,
1617 struct sk_buff *skb)
1618 {
1619 unsigned int len;
1620
1621 if (!(dev->flags & IFF_UP))
1622 return false;
1623
1624 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1625 if (skb->len <= len)
1626 return true;
1627
1628 /* if TSO is enabled, we don't care about the length as the packet
1629 * could be forwarded without being segmented before
1630 */
1631 if (skb_is_gso(skb))
1632 return true;
1633
1634 return false;
1635 }
1636
1637 /**
1638 * dev_forward_skb - loopback an skb to another netif
1639 *
1640 * @dev: destination network device
1641 * @skb: buffer to forward
1642 *
1643 * return values:
1644 * NET_RX_SUCCESS (no congestion)
1645 * NET_RX_DROP (packet was dropped, but freed)
1646 *
1647 * dev_forward_skb can be used for injecting an skb from the
1648 * start_xmit function of one device into the receive queue
1649 * of another device.
1650 *
1651 * The receiving device may be in another namespace, so
1652 * we have to clear all information in the skb that could
1653 * impact namespace isolation.
1654 */
1655 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1656 {
1657 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1658 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1659 atomic_long_inc(&dev->rx_dropped);
1660 kfree_skb(skb);
1661 return NET_RX_DROP;
1662 }
1663 }
1664
1665 skb_orphan(skb);
1666 nf_reset(skb);
1667
1668 if (unlikely(!is_skb_forwardable(dev, skb))) {
1669 atomic_long_inc(&dev->rx_dropped);
1670 kfree_skb(skb);
1671 return NET_RX_DROP;
1672 }
1673 skb->skb_iif = 0;
1674 skb->dev = dev;
1675 skb_dst_drop(skb);
1676 skb->tstamp.tv64 = 0;
1677 skb->pkt_type = PACKET_HOST;
1678 skb->protocol = eth_type_trans(skb, dev);
1679 skb->mark = 0;
1680 secpath_reset(skb);
1681 nf_reset(skb);
1682 return netif_rx(skb);
1683 }
1684 EXPORT_SYMBOL_GPL(dev_forward_skb);
1685
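/*
 * Example (illustrative sketch): a virtual device pair can hand frames to
 * its peer from ndo_start_xmit roughly like this; the peer lookup is a
 * hypothetical placeholder:
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = ...;
 *
 *		dev_forward_skb(peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */
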
1686 static inline int deliver_skb(struct sk_buff *skb,
1687 struct packet_type *pt_prev,
1688 struct net_device *orig_dev)
1689 {
1690 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1691 return -ENOMEM;
1692 atomic_inc(&skb->users);
1693 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1694 }
1695
1696 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1697 {
1698 if (!ptype->af_packet_priv || !skb->sk)
1699 return false;
1700
1701 if (ptype->id_match)
1702 return ptype->id_match(ptype, skb->sk);
1703 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1704 return true;
1705
1706 return false;
1707 }
1708
1709 /*
1710 * Support routine. Sends outgoing frames to any network
1711 * taps currently in use.
1712 */
1713
1714 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1715 {
1716 struct packet_type *ptype;
1717 struct sk_buff *skb2 = NULL;
1718 struct packet_type *pt_prev = NULL;
1719
1720 rcu_read_lock();
1721 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1722 /* Never send packets back to the socket
1723 * they originated from - MvS (miquels@drinkel.ow.org)
1724 */
1725 if ((ptype->dev == dev || !ptype->dev) &&
1726 (!skb_loop_sk(ptype, skb))) {
1727 if (pt_prev) {
1728 deliver_skb(skb2, pt_prev, skb->dev);
1729 pt_prev = ptype;
1730 continue;
1731 }
1732
1733 skb2 = skb_clone(skb, GFP_ATOMIC);
1734 if (!skb2)
1735 break;
1736
1737 net_timestamp_set(skb2);
1738
1739 /* skb->nh should be correctly
1740 set by sender, so that the second statement is
1741 just protection against buggy protocols.
1742 */
1743 skb_reset_mac_header(skb2);
1744
1745 if (skb_network_header(skb2) < skb2->data ||
1746 skb2->network_header > skb2->tail) {
1747 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1748 ntohs(skb2->protocol),
1749 dev->name);
1750 skb_reset_network_header(skb2);
1751 }
1752
1753 skb2->transport_header = skb2->network_header;
1754 skb2->pkt_type = PACKET_OUTGOING;
1755 pt_prev = ptype;
1756 }
1757 }
1758 if (pt_prev)
1759 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1760 rcu_read_unlock();
1761 }
1762
1763 /**
1764 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1765 * @dev: Network device
1766 * @txq: number of queues available
1767 *
1768  *	If real_num_tx_queues is changed the tc mappings may no longer be
1769  *	valid. To resolve this, verify that each tc mapping remains valid
1770  *	and, if not, reset the mapping. With no priorities mapping to this
1771  *	offset/count pair it will no longer be used. In the worst case, if
1772  *	TC0 is invalid nothing can be done, so disable priority mappings. It
1773  *	is expected that drivers will fix this mapping if they can before
1774  *	calling netif_set_real_num_tx_queues.
1775 */
1776 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1777 {
1778 int i;
1779 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1780
1781 /* If TC0 is invalidated disable TC mapping */
1782 if (tc->offset + tc->count > txq) {
1783 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1784 dev->num_tc = 0;
1785 return;
1786 }
1787
1788 /* Invalidated prio to tc mappings set to TC0 */
1789 for (i = 1; i < TC_BITMASK + 1; i++) {
1790 int q = netdev_get_prio_tc_map(dev, i);
1791
1792 tc = &dev->tc_to_txq[q];
1793 if (tc->offset + tc->count > txq) {
1794 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1795 i, q);
1796 netdev_set_prio_tc_map(dev, i, 0);
1797 }
1798 }
1799 }
1800
1801 #ifdef CONFIG_XPS
1802 static DEFINE_MUTEX(xps_map_mutex);
1803 #define xmap_dereference(P) \
1804 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1805
1806 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1807 int cpu, u16 index)
1808 {
1809 struct xps_map *map = NULL;
1810 int pos;
1811
1812 if (dev_maps)
1813 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1814
1815 for (pos = 0; map && pos < map->len; pos++) {
1816 if (map->queues[pos] == index) {
1817 if (map->len > 1) {
1818 map->queues[pos] = map->queues[--map->len];
1819 } else {
1820 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1821 kfree_rcu(map, rcu);
1822 map = NULL;
1823 }
1824 break;
1825 }
1826 }
1827
1828 return map;
1829 }
1830
1831 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1832 {
1833 struct xps_dev_maps *dev_maps;
1834 int cpu, i;
1835 bool active = false;
1836
1837 mutex_lock(&xps_map_mutex);
1838 dev_maps = xmap_dereference(dev->xps_maps);
1839
1840 if (!dev_maps)
1841 goto out_no_maps;
1842
1843 for_each_possible_cpu(cpu) {
1844 for (i = index; i < dev->num_tx_queues; i++) {
1845 if (!remove_xps_queue(dev_maps, cpu, i))
1846 break;
1847 }
1848 if (i == dev->num_tx_queues)
1849 active = true;
1850 }
1851
1852 if (!active) {
1853 RCU_INIT_POINTER(dev->xps_maps, NULL);
1854 kfree_rcu(dev_maps, rcu);
1855 }
1856
1857 for (i = index; i < dev->num_tx_queues; i++)
1858 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1859 NUMA_NO_NODE);
1860
1861 out_no_maps:
1862 mutex_unlock(&xps_map_mutex);
1863 }
1864
1865 static struct xps_map *expand_xps_map(struct xps_map *map,
1866 int cpu, u16 index)
1867 {
1868 struct xps_map *new_map;
1869 int alloc_len = XPS_MIN_MAP_ALLOC;
1870 int i, pos;
1871
1872 for (pos = 0; map && pos < map->len; pos++) {
1873 if (map->queues[pos] != index)
1874 continue;
1875 return map;
1876 }
1877
1878 /* Need to add queue to this CPU's existing map */
1879 if (map) {
1880 if (pos < map->alloc_len)
1881 return map;
1882
1883 alloc_len = map->alloc_len * 2;
1884 }
1885
1886 /* Need to allocate new map to store queue on this CPU's map */
1887 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1888 cpu_to_node(cpu));
1889 if (!new_map)
1890 return NULL;
1891
1892 for (i = 0; i < pos; i++)
1893 new_map->queues[i] = map->queues[i];
1894 new_map->alloc_len = alloc_len;
1895 new_map->len = pos;
1896
1897 return new_map;
1898 }
1899
1900 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1901 {
1902 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1903 struct xps_map *map, *new_map;
1904 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1905 int cpu, numa_node_id = -2;
1906 bool active = false;
1907
1908 mutex_lock(&xps_map_mutex);
1909
1910 dev_maps = xmap_dereference(dev->xps_maps);
1911
1912 /* allocate memory for queue storage */
1913 for_each_online_cpu(cpu) {
1914 if (!cpumask_test_cpu(cpu, mask))
1915 continue;
1916
1917 if (!new_dev_maps)
1918 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1919 if (!new_dev_maps)
1920 return -ENOMEM;
1921
1922 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1923 NULL;
1924
1925 map = expand_xps_map(map, cpu, index);
1926 if (!map)
1927 goto error;
1928
1929 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1930 }
1931
1932 if (!new_dev_maps)
1933 goto out_no_new_maps;
1934
1935 for_each_possible_cpu(cpu) {
1936 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1937 /* add queue to CPU maps */
1938 int pos = 0;
1939
1940 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1941 while ((pos < map->len) && (map->queues[pos] != index))
1942 pos++;
1943
1944 if (pos == map->len)
1945 map->queues[map->len++] = index;
1946 #ifdef CONFIG_NUMA
1947 if (numa_node_id == -2)
1948 numa_node_id = cpu_to_node(cpu);
1949 else if (numa_node_id != cpu_to_node(cpu))
1950 numa_node_id = -1;
1951 #endif
1952 } else if (dev_maps) {
1953 /* fill in the new device map from the old device map */
1954 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1955 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1956 }
1957
1958 }
1959
1960 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1961
1962 /* Cleanup old maps */
1963 if (dev_maps) {
1964 for_each_possible_cpu(cpu) {
1965 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1966 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1967 if (map && map != new_map)
1968 kfree_rcu(map, rcu);
1969 }
1970
1971 kfree_rcu(dev_maps, rcu);
1972 }
1973
1974 dev_maps = new_dev_maps;
1975 active = true;
1976
1977 out_no_new_maps:
1978 /* update Tx queue numa node */
1979 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1980 (numa_node_id >= 0) ? numa_node_id :
1981 NUMA_NO_NODE);
1982
1983 if (!dev_maps)
1984 goto out_no_maps;
1985
1986 /* remove the queue from unused CPUs */
1987 for_each_possible_cpu(cpu) {
1988 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1989 continue;
1990
1991 if (remove_xps_queue(dev_maps, cpu, index))
1992 active = true;
1993 }
1994
1995 /* free map if not active */
1996 if (!active) {
1997 RCU_INIT_POINTER(dev->xps_maps, NULL);
1998 kfree_rcu(dev_maps, rcu);
1999 }
2000
2001 out_no_maps:
2002 mutex_unlock(&xps_map_mutex);
2003
2004 return 0;
2005 error:
2006 /* remove any maps that we added */
2007 for_each_possible_cpu(cpu) {
2008 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2009 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2010 NULL;
2011 if (new_map && new_map != map)
2012 kfree(new_map);
2013 }
2014
2015 mutex_unlock(&xps_map_mutex);
2016
2017 kfree(new_dev_maps);
2018 return -ENOMEM;
2019 }
2020 EXPORT_SYMBOL(netif_set_xps_queue);
2021
2022 #endif
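/*
 * Illustrative sketch, not part of dev.c: how a hypothetical driver might
 * use netif_set_xps_queue() above to pin one of its transmit queues to a
 * single CPU.  "my_dev", the queue index and the CPU choice are assumptions
 * for illustration only.
 */
static int example_set_xps(struct net_device *my_dev, u16 queue_index)
{
	cpumask_var_t mask;
	int err;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(0, mask);	/* arbitrary example: CPU 0 only */
	err = netif_set_xps_queue(my_dev, mask, queue_index);

	free_cpumask_var(mask);
	return err;
}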
2023 /*
2024 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2025 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2026 */
2027 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2028 {
2029 int rc;
2030
2031 if (txq < 1 || txq > dev->num_tx_queues)
2032 return -EINVAL;
2033
2034 if (dev->reg_state == NETREG_REGISTERED ||
2035 dev->reg_state == NETREG_UNREGISTERING) {
2036 ASSERT_RTNL();
2037
2038 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2039 txq);
2040 if (rc)
2041 return rc;
2042
2043 if (dev->num_tc)
2044 netif_setup_tc(dev, txq);
2045
2046 if (txq < dev->real_num_tx_queues) {
2047 qdisc_reset_all_tx_gt(dev, txq);
2048 #ifdef CONFIG_XPS
2049 netif_reset_xps_queues_gt(dev, txq);
2050 #endif
2051 }
2052 }
2053
2054 dev->real_num_tx_queues = txq;
2055 return 0;
2056 }
2057 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2058
2059 #ifdef CONFIG_RPS
2060 /**
2061 * netif_set_real_num_rx_queues - set actual number of RX queues used
2062 * @dev: Network device
2063 * @rxq: Actual number of RX queues
2064 *
2065 * This must be called either with the rtnl_lock held or before
2066 * registration of the net device. Returns 0 on success, or a
2067 * negative error code. If called before registration, it always
2068 * succeeds.
2069 */
2070 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2071 {
2072 int rc;
2073
2074 if (rxq < 1 || rxq > dev->num_rx_queues)
2075 return -EINVAL;
2076
2077 if (dev->reg_state == NETREG_REGISTERED) {
2078 ASSERT_RTNL();
2079
2080 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2081 rxq);
2082 if (rc)
2083 return rc;
2084 }
2085
2086 dev->real_num_rx_queues = rxq;
2087 return 0;
2088 }
2089 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2090 #endif
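/*
 * Illustrative sketch, not part of dev.c: a hypothetical driver resizing
 * its active channel count with the two helpers above.  "my_dev" and
 * "count" are assumptions; once the device is registered, rtnl_lock must
 * be held around both calls.
 */
static int example_set_channels(struct net_device *my_dev, unsigned int count)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(my_dev, count);
	if (!err)
		err = netif_set_real_num_rx_queues(my_dev, count);
	rtnl_unlock();

	return err;
}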
2091
2092 /**
2093 * netif_get_num_default_rss_queues - default number of RSS queues
2094 *
2095 * This routine should set an upper limit on the number of RSS queues
2096 * used by default by multiqueue devices.
2097 */
2098 int netif_get_num_default_rss_queues(void)
2099 {
2100 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2101 }
2102 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
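/*
 * Illustrative sketch, not part of dev.c: bounding a hypothetical device's
 * default channel count by the helper above.  "hw_max_queues" is an
 * assumption standing in for a real hardware limit.
 */
static unsigned int example_default_channels(unsigned int hw_max_queues)
{
	return min_t(unsigned int, hw_max_queues,
		     netif_get_num_default_rss_queues());
}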
2103
2104 static inline void __netif_reschedule(struct Qdisc *q)
2105 {
2106 struct softnet_data *sd;
2107 unsigned long flags;
2108
2109 local_irq_save(flags);
2110 sd = &__get_cpu_var(softnet_data);
2111 q->next_sched = NULL;
2112 *sd->output_queue_tailp = q;
2113 sd->output_queue_tailp = &q->next_sched;
2114 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2115 local_irq_restore(flags);
2116 }
2117
2118 void __netif_schedule(struct Qdisc *q)
2119 {
2120 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2121 __netif_reschedule(q);
2122 }
2123 EXPORT_SYMBOL(__netif_schedule);
2124
2125 void dev_kfree_skb_irq(struct sk_buff *skb)
2126 {
2127 if (atomic_dec_and_test(&skb->users)) {
2128 struct softnet_data *sd;
2129 unsigned long flags;
2130
2131 local_irq_save(flags);
2132 sd = &__get_cpu_var(softnet_data);
2133 skb->next = sd->completion_queue;
2134 sd->completion_queue = skb;
2135 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2136 local_irq_restore(flags);
2137 }
2138 }
2139 EXPORT_SYMBOL(dev_kfree_skb_irq);
2140
2141 void dev_kfree_skb_any(struct sk_buff *skb)
2142 {
2143 if (in_irq() || irqs_disabled())
2144 dev_kfree_skb_irq(skb);
2145 else
2146 dev_kfree_skb(skb);
2147 }
2148 EXPORT_SYMBOL(dev_kfree_skb_any);
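/*
 * Illustrative sketch, not part of dev.c: a hypothetical tx-completion
 * handler that may run in hardirq context, so it frees transmitted skbs
 * with dev_kfree_skb_any() rather than dev_kfree_skb().  The array of
 * completed skbs is an assumption.
 */
static void example_clean_tx(struct sk_buff **completed, int count)
{
	int i;

	for (i = 0; i < count; i++)
		dev_kfree_skb_any(completed[i]);	/* safe in any context */
}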
2149
2150
2151 /**
2152 * netif_device_detach - mark device as removed
2153 * @dev: network device
2154 *
2155 * Mark device as removed from system and therefore no longer available.
2156 */
2157 void netif_device_detach(struct net_device *dev)
2158 {
2159 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2160 netif_running(dev)) {
2161 netif_tx_stop_all_queues(dev);
2162 }
2163 }
2164 EXPORT_SYMBOL(netif_device_detach);
2165
2166 /**
2167 * netif_device_attach - mark device as attached
2168 * @dev: network device
2169 *
2170 * Mark device as attached to the system and restart if needed.
2171 */
2172 void netif_device_attach(struct net_device *dev)
2173 {
2174 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2175 netif_running(dev)) {
2176 netif_tx_wake_all_queues(dev);
2177 __netdev_watchdog_up(dev);
2178 }
2179 }
2180 EXPORT_SYMBOL(netif_device_attach);
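/*
 * Illustrative sketch, not part of dev.c: a hypothetical driver pairing
 * netif_device_detach() and netif_device_attach() around suspend and
 * resume.  The hardware quiesce/reinit steps are assumptions.
 */
static int example_suspend(struct net_device *my_dev)
{
	netif_device_detach(my_dev);	/* stops all tx queues if running */
	/* ...quiesce the hardware here... */
	return 0;
}

static int example_resume(struct net_device *my_dev)
{
	/* ...reinitialise the hardware here... */
	netif_device_attach(my_dev);	/* restarts queues and the watchdog */
	return 0;
}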
2181
2182 static void skb_warn_bad_offload(const struct sk_buff *skb)
2183 {
2184 static const netdev_features_t null_features = 0;
2185 struct net_device *dev = skb->dev;
2186 const char *driver = "";
2187
2188 if (dev && dev->dev.parent)
2189 driver = dev_driver_string(dev->dev.parent);
2190
2191 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2192 "gso_type=%d ip_summed=%d\n",
2193 driver, dev ? &dev->features : &null_features,
2194 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2195 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2196 skb_shinfo(skb)->gso_type, skb->ip_summed);
2197 }
2198
2199 /*
2200 * Invalidate hardware checksum when packet is to be mangled, and
2201 * complete checksum manually on outgoing path.
2202 */
2203 int skb_checksum_help(struct sk_buff *skb)
2204 {
2205 __wsum csum;
2206 int ret = 0, offset;
2207
2208 if (skb->ip_summed == CHECKSUM_COMPLETE)
2209 goto out_set_summed;
2210
2211 if (unlikely(skb_shinfo(skb)->gso_size)) {
2212 skb_warn_bad_offload(skb);
2213 return -EINVAL;
2214 }
2215
2216 /* Before computing a checksum, we should make sure no frag could
2217 * be modified by an external entity: the checksum could be wrong.
2218 */
2219 if (skb_has_shared_frag(skb)) {
2220 ret = __skb_linearize(skb);
2221 if (ret)
2222 goto out;
2223 }
2224
2225 offset = skb_checksum_start_offset(skb);
2226 BUG_ON(offset >= skb_headlen(skb));
2227 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2228
2229 offset += skb->csum_offset;
2230 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2231
2232 if (skb_cloned(skb) &&
2233 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2234 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2235 if (ret)
2236 goto out;
2237 }
2238
2239 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2240 out_set_summed:
2241 skb->ip_summed = CHECKSUM_NONE;
2242 out:
2243 return ret;
2244 }
2245 EXPORT_SYMBOL(skb_checksum_help);
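/*
 * Illustrative sketch, not part of dev.c: a hypothetical transmit path
 * falling back to skb_checksum_help() when a packet still needs its
 * checksum (CHECKSUM_PARTIAL) but the hardware cannot offload it.
 * "my_hw_can_csum" is an assumption.
 */
static int example_tx_csum(struct sk_buff *skb, bool my_hw_can_csum)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL || my_hw_can_csum)
		return 0;

	/* hardware cannot offload this one: complete the checksum now */
	return skb_checksum_help(skb);
}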
2246
2247 /**
2248 * skb_mac_gso_segment - mac layer segmentation handler.
2249 * @skb: buffer to segment
2250 * @features: features for the output path (see dev->features)
2251 */
2252 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2253 netdev_features_t features)
2254 {
2255 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2256 struct packet_offload *ptype;
2257 __be16 type = skb->protocol;
2258
2259 while (type == htons(ETH_P_8021Q)) {
2260 int vlan_depth = ETH_HLEN;
2261 struct vlan_hdr *vh;
2262
2263 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2264 return ERR_PTR(-EINVAL);
2265
2266 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2267 type = vh->h_vlan_encapsulated_proto;
2268 vlan_depth += VLAN_HLEN;
2269 }
2270
2271 __skb_pull(skb, skb->mac_len);
2272
2273 rcu_read_lock();
2274 list_for_each_entry_rcu(ptype, &offload_base, list) {
2275 if (ptype->type == type && ptype->callbacks.gso_segment) {
2276 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2277 int err;
2278
2279 err = ptype->callbacks.gso_send_check(skb);
2280 segs = ERR_PTR(err);
2281 if (err || skb_gso_ok(skb, features))
2282 break;
2283 __skb_push(skb, (skb->data -
2284 skb_network_header(skb)));
2285 }
2286 segs = ptype->callbacks.gso_segment(skb, features);
2287 break;
2288 }
2289 }
2290 rcu_read_unlock();
2291
2292 __skb_push(skb, skb->data - skb_mac_header(skb));
2293
2294 return segs;
2295 }
2296 EXPORT_SYMBOL(skb_mac_gso_segment);
2297
2298
2299 /* openvswitch calls this on the rx path, so we need a different check.
2300 */
2301 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2302 {
2303 if (tx_path)
2304 return skb->ip_summed != CHECKSUM_PARTIAL;
2305 else
2306 return skb->ip_summed == CHECKSUM_NONE;
2307 }
2308
2309 /**
2310 * __skb_gso_segment - Perform segmentation on skb.
2311 * @skb: buffer to segment
2312 * @features: features for the output path (see dev->features)
2313 * @tx_path: whether it is called in TX path
2314 *
2315 * This function segments the given skb and returns a list of segments.
2316 *
2317 * It may return NULL if the skb requires no segmentation. This is
2318 * only possible when GSO is used for verifying header integrity.
2319 */
2320 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2321 netdev_features_t features, bool tx_path)
2322 {
2323 if (unlikely(skb_needs_check(skb, tx_path))) {
2324 int err;
2325
2326 skb_warn_bad_offload(skb);
2327
2328 if (skb_header_cloned(skb) &&
2329 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2330 return ERR_PTR(err);
2331 }
2332
2333 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2334 skb_reset_mac_header(skb);
2335 skb_reset_mac_len(skb);
2336
2337 return skb_mac_gso_segment(skb, features);
2338 }
2339 EXPORT_SYMBOL(__skb_gso_segment);
2340
2341 /* Take action when hardware reception checksum errors are detected. */
2342 #ifdef CONFIG_BUG
2343 void netdev_rx_csum_fault(struct net_device *dev)
2344 {
2345 if (net_ratelimit()) {
2346 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2347 dump_stack();
2348 }
2349 }
2350 EXPORT_SYMBOL(netdev_rx_csum_fault);
2351 #endif
2352
2353 /* Actually, we should eliminate this check as soon as we know that:
2354 * 1. An IOMMU is present and is able to map all of the memory.
2355 * 2. No high memory really exists on this machine.
2356 */
2357
2358 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2359 {
2360 #ifdef CONFIG_HIGHMEM
2361 int i;
2362 if (!(dev->features & NETIF_F_HIGHDMA)) {
2363 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2364 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2365 if (PageHighMem(skb_frag_page(frag)))
2366 return 1;
2367 }
2368 }
2369
2370 if (PCI_DMA_BUS_IS_PHYS) {
2371 struct device *pdev = dev->dev.parent;
2372
2373 if (!pdev)
2374 return 0;
2375 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2376 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2377 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2378 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2379 return 1;
2380 }
2381 }
2382 #endif
2383 return 0;
2384 }
2385
2386 struct dev_gso_cb {
2387 void (*destructor)(struct sk_buff *skb);
2388 };
2389
2390 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2391
2392 static void dev_gso_skb_destructor(struct sk_buff *skb)
2393 {
2394 struct dev_gso_cb *cb;
2395
2396 do {
2397 struct sk_buff *nskb = skb->next;
2398
2399 skb->next = nskb->next;
2400 nskb->next = NULL;
2401 kfree_skb(nskb);
2402 } while (skb->next);
2403
2404 cb = DEV_GSO_CB(skb);
2405 if (cb->destructor)
2406 cb->destructor(skb);
2407 }
2408
2409 /**
2410 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2411 * @skb: buffer to segment
2412 * @features: device features as applicable to this skb
2413 *
2414 * This function segments the given skb and stores the list of segments
2415 * in skb->next.
2416 */
2417 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2418 {
2419 struct sk_buff *segs;
2420
2421 segs = skb_gso_segment(skb, features);
2422
2423 /* Verifying header integrity only. */
2424 if (!segs)
2425 return 0;
2426
2427 if (IS_ERR(segs))
2428 return PTR_ERR(segs);
2429
2430 skb->next = segs;
2431 DEV_GSO_CB(skb)->destructor = skb->destructor;
2432 skb->destructor = dev_gso_skb_destructor;
2433
2434 return 0;
2435 }
2436
2437 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2438 {
2439 return ((features & NETIF_F_GEN_CSUM) ||
2440 ((features & NETIF_F_V4_CSUM) &&
2441 protocol == htons(ETH_P_IP)) ||
2442 ((features & NETIF_F_V6_CSUM) &&
2443 protocol == htons(ETH_P_IPV6)) ||
2444 ((features & NETIF_F_FCOE_CRC) &&
2445 protocol == htons(ETH_P_FCOE)));
2446 }
2447
2448 static netdev_features_t harmonize_features(struct sk_buff *skb,
2449 __be16 protocol, netdev_features_t features)
2450 {
2451 if (skb->ip_summed != CHECKSUM_NONE &&
2452 !can_checksum_protocol(features, protocol)) {
2453 features &= ~NETIF_F_ALL_CSUM;
2454 features &= ~NETIF_F_SG;
2455 } else if (illegal_highdma(skb->dev, skb)) {
2456 features &= ~NETIF_F_SG;
2457 }
2458
2459 return features;
2460 }
2461
2462 netdev_features_t netif_skb_features(struct sk_buff *skb)
2463 {
2464 __be16 protocol = skb->protocol;
2465 netdev_features_t features = skb->dev->features;
2466
2467 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2468 features &= ~NETIF_F_GSO_MASK;
2469
2470 if (protocol == htons(ETH_P_8021Q)) {
2471 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2472 protocol = veh->h_vlan_encapsulated_proto;
2473 } else if (!vlan_tx_tag_present(skb)) {
2474 return harmonize_features(skb, protocol, features);
2475 }
2476
2477 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2478
2479 if (protocol != htons(ETH_P_8021Q)) {
2480 return harmonize_features(skb, protocol, features);
2481 } else {
2482 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2483 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2484 return harmonize_features(skb, protocol, features);
2485 }
2486 }
2487 EXPORT_SYMBOL(netif_skb_features);
2488
2489 /*
2490 * Returns true if either:
2491 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2492 * 2. skb is fragmented and the device does not support SG.
2493 */
2494 static inline int skb_needs_linearize(struct sk_buff *skb,
2495 int features)
2496 {
2497 return skb_is_nonlinear(skb) &&
2498 ((skb_has_frag_list(skb) &&
2499 !(features & NETIF_F_FRAGLIST)) ||
2500 (skb_shinfo(skb)->nr_frags &&
2501 !(features & NETIF_F_SG)));
2502 }
2503
2504 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2505 struct netdev_queue *txq)
2506 {
2507 const struct net_device_ops *ops = dev->netdev_ops;
2508 int rc = NETDEV_TX_OK;
2509 unsigned int skb_len;
2510
2511 if (likely(!skb->next)) {
2512 netdev_features_t features;
2513
2514 /*
2515 * If the device doesn't need skb->dst, release it right now while
2516 * it's hot in this CPU's cache
2517 */
2518 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2519 skb_dst_drop(skb);
2520
2521 features = netif_skb_features(skb);
2522
2523 if (vlan_tx_tag_present(skb) &&
2524 !(features & NETIF_F_HW_VLAN_TX)) {
2525 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2526 if (unlikely(!skb))
2527 goto out;
2528
2529 skb->vlan_tci = 0;
2530 }
2531
2532 /* If this is an encapsulation offload request, verify that we are
2533 * testing hardware encapsulation features instead of the standard
2534 * features for the netdev
2535 */
2536 if (skb->encapsulation)
2537 features &= dev->hw_enc_features;
2538
2539 if (netif_needs_gso(skb, features)) {
2540 if (unlikely(dev_gso_segment(skb, features)))
2541 goto out_kfree_skb;
2542 if (skb->next)
2543 goto gso;
2544 } else {
2545 if (skb_needs_linearize(skb, features) &&
2546 __skb_linearize(skb))
2547 goto out_kfree_skb;
2548
2549 /* If the packet is not checksummed and the device does not
2550 * support checksumming for this protocol, complete
2551 * checksumming here.
2552 */
2553 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2554 if (skb->encapsulation)
2555 skb_set_inner_transport_header(skb,
2556 skb_checksum_start_offset(skb));
2557 else
2558 skb_set_transport_header(skb,
2559 skb_checksum_start_offset(skb));
2560 if (!(features & NETIF_F_ALL_CSUM) &&
2561 skb_checksum_help(skb))
2562 goto out_kfree_skb;
2563 }
2564 }
2565
2566 if (!list_empty(&ptype_all))
2567 dev_queue_xmit_nit(skb, dev);
2568
2569 skb_len = skb->len;
2570 rc = ops->ndo_start_xmit(skb, dev);
2571 trace_net_dev_xmit(skb, rc, dev, skb_len);
2572 if (rc == NETDEV_TX_OK)
2573 txq_trans_update(txq);
2574 return rc;
2575 }
2576
2577 gso:
2578 do {
2579 struct sk_buff *nskb = skb->next;
2580
2581 skb->next = nskb->next;
2582 nskb->next = NULL;
2583
2584 /*
2585 * If the device doesn't need nskb->dst, release it right now while
2586 * it's hot in this CPU's cache
2587 */
2588 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2589 skb_dst_drop(nskb);
2590
2591 if (!list_empty(&ptype_all))
2592 dev_queue_xmit_nit(nskb, dev);
2593
2594 skb_len = nskb->len;
2595 rc = ops->ndo_start_xmit(nskb, dev);
2596 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2597 if (unlikely(rc != NETDEV_TX_OK)) {
2598 if (rc & ~NETDEV_TX_MASK)
2599 goto out_kfree_gso_skb;
2600 nskb->next = skb->next;
2601 skb->next = nskb;
2602 return rc;
2603 }
2604 txq_trans_update(txq);
2605 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2606 return NETDEV_TX_BUSY;
2607 } while (skb->next);
2608
2609 out_kfree_gso_skb:
2610 if (likely(skb->next == NULL))
2611 skb->destructor = DEV_GSO_CB(skb)->destructor;
2612 out_kfree_skb:
2613 kfree_skb(skb);
2614 out:
2615 return rc;
2616 }
2617
2618 static void qdisc_pkt_len_init(struct sk_buff *skb)
2619 {
2620 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2621
2622 qdisc_skb_cb(skb)->pkt_len = skb->len;
2623
2624 /* To get a more precise estimate of the bytes sent on the wire,
2625 * we add the header size of all segments to pkt_len
2626 */
2627 if (shinfo->gso_size) {
2628 unsigned int hdr_len;
2629
2630 /* mac layer + network layer */
2631 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2632
2633 /* + transport layer */
2634 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2635 hdr_len += tcp_hdrlen(skb);
2636 else
2637 hdr_len += sizeof(struct udphdr);
2638 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2639 }
2640 }
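/*
 * Worked example for qdisc_pkt_len_init() above (values assumed, not taken
 * from this file): a TSO skb with gso_size = 1448, gso_segs = 10 and
 * 66 bytes of headers (14 Ethernet + 20 IPv4 + 32 TCP) has
 * skb->len = 66 + 10 * 1448 = 14546, so
 *
 *	pkt_len = 14546 + (10 - 1) * 66 = 15140 = 10 * 1514
 *
 * i.e. exactly the bytes the ten resulting segments will put on the wire.
 */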
2641
2642 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2643 struct net_device *dev,
2644 struct netdev_queue *txq)
2645 {
2646 spinlock_t *root_lock = qdisc_lock(q);
2647 bool contended;
2648 int rc;
2649
2650 qdisc_pkt_len_init(skb);
2651 qdisc_calculate_pkt_len(skb, q);
2652 /*
2653 * Heuristic to force contended enqueues to serialize on a
2654 * separate lock before trying to get the qdisc main lock.
2655 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2656 * and dequeue packets faster.
2657 */
2658 contended = qdisc_is_running(q);
2659 if (unlikely(contended))
2660 spin_lock(&q->busylock);
2661
2662 spin_lock(root_lock);
2663 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2664 kfree_skb(skb);
2665 rc = NET_XMIT_DROP;
2666 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2667 qdisc_run_begin(q)) {
2668 /*
2669 * This is a work-conserving queue; there are no old skbs
2670 * waiting to be sent out; and the qdisc is not running -
2671 * xmit the skb directly.
2672 */
2673 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2674 skb_dst_force(skb);
2675
2676 qdisc_bstats_update(q, skb);
2677
2678 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2679 if (unlikely(contended)) {
2680 spin_unlock(&q->busylock);
2681 contended = false;
2682 }
2683 __qdisc_run(q);
2684 } else
2685 qdisc_run_end(q);
2686
2687 rc = NET_XMIT_SUCCESS;
2688 } else {
2689 skb_dst_force(skb);
2690 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2691 if (qdisc_run_begin(q)) {
2692 if (unlikely(contended)) {
2693 spin_unlock(&q->busylock);
2694 contended = false;
2695 }
2696 __qdisc_run(q);
2697 }
2698 }
2699 spin_unlock(root_lock);
2700 if (unlikely(contended))
2701 spin_unlock(&q->busylock);
2702 return rc;
2703 }
2704
2705 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2706 static void skb_update_prio(struct sk_buff *skb)
2707 {
2708 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2709
2710 if (!skb->priority && skb->sk && map) {
2711 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2712
2713 if (prioidx < map->priomap_len)
2714 skb->priority = map->priomap[prioidx];
2715 }
2716 }
2717 #else
2718 #define skb_update_prio(skb)
2719 #endif
2720
2721 static DEFINE_PER_CPU(int, xmit_recursion);
2722 #define RECURSION_LIMIT 10
2723
2724 /**
2725 * dev_loopback_xmit - loop back @skb
2726 * @skb: buffer to transmit
2727 */
2728 int dev_loopback_xmit(struct sk_buff *skb)
2729 {
2730 skb_reset_mac_header(skb);
2731 __skb_pull(skb, skb_network_offset(skb));
2732 skb->pkt_type = PACKET_LOOPBACK;
2733 skb->ip_summed = CHECKSUM_UNNECESSARY;
2734 WARN_ON(!skb_dst(skb));
2735 skb_dst_force(skb);
2736 netif_rx_ni(skb);
2737 return 0;
2738 }
2739 EXPORT_SYMBOL(dev_loopback_xmit);
2740
2741 /**
2742 * dev_queue_xmit - transmit a buffer
2743 * @skb: buffer to transmit
2744 *
2745 * Queue a buffer for transmission to a network device. The caller must
2746 * have set the device and priority and built the buffer before calling
2747 * this function. The function can be called from an interrupt.
2748 *
2749 * A negative errno code is returned on a failure. A success does not
2750 * guarantee the frame will be transmitted as it may be dropped due
2751 * to congestion or traffic shaping.
2752 *
2753 * -----------------------------------------------------------------------------------
2754 * I notice this method can also return errors from the queue disciplines,
2755 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2756 * be positive.
2757 *
2758 * Regardless of the return value, the skb is consumed, so it is currently
2759 * difficult to retry a send to this method. (You can bump the ref count
2760 * before sending to hold a reference for retry if you are careful.)
2761 *
2762 * When calling this method, interrupts MUST be enabled. This is because
2763 * the BH enable code must have IRQs enabled so that it will not deadlock.
2764 * --BLG
2765 */
2766 int dev_queue_xmit(struct sk_buff *skb)
2767 {
2768 struct net_device *dev = skb->dev;
2769 struct netdev_queue *txq;
2770 struct Qdisc *q;
2771 int rc = -ENOMEM;
2772
2773 skb_reset_mac_header(skb);
2774
2775 /* Disable soft irqs for various locks below. Also
2776 * stops preemption for RCU.
2777 */
2778 rcu_read_lock_bh();
2779
2780 skb_update_prio(skb);
2781
2782 txq = netdev_pick_tx(dev, skb);
2783 q = rcu_dereference_bh(txq->qdisc);
2784
2785 #ifdef CONFIG_NET_CLS_ACT
2786 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2787 #endif
2788 trace_net_dev_queue(skb);
2789 if (q->enqueue) {
2790 rc = __dev_xmit_skb(skb, q, dev, txq);
2791 goto out;
2792 }
2793
2794 /* The device has no queue. Common case for software devices:
2795 loopback, all sorts of tunnels...
2796 
2797 Really, it is unlikely that netif_tx_lock protection is necessary
2798 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
2799 counters.)
2800 However, it is possible that they rely on the protection
2801 provided by us here.
2802 
2803 Check this and take the lock. It is not prone to deadlocks.
2804 Or take the noqueue qdisc; it is even simpler 8)
2805 */
2806 if (dev->flags & IFF_UP) {
2807 int cpu = smp_processor_id(); /* ok because BHs are off */
2808
2809 if (txq->xmit_lock_owner != cpu) {
2810
2811 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2812 goto recursion_alert;
2813
2814 HARD_TX_LOCK(dev, txq, cpu);
2815
2816 if (!netif_xmit_stopped(txq)) {
2817 __this_cpu_inc(xmit_recursion);
2818 rc = dev_hard_start_xmit(skb, dev, txq);
2819 __this_cpu_dec(xmit_recursion);
2820 if (dev_xmit_complete(rc)) {
2821 HARD_TX_UNLOCK(dev, txq);
2822 goto out;
2823 }
2824 }
2825 HARD_TX_UNLOCK(dev, txq);
2826 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2827 dev->name);
2828 } else {
2829 /* Recursion is detected! It is possible,
2830 * unfortunately
2831 */
2832 recursion_alert:
2833 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2834 dev->name);
2835 }
2836 }
2837
2838 rc = -ENETDOWN;
2839 rcu_read_unlock_bh();
2840
2841 kfree_skb(skb);
2842 return rc;
2843 out:
2844 rcu_read_unlock_bh();
2845 return rc;
2846 }
2847 EXPORT_SYMBOL(dev_queue_xmit);
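/*
 * Illustrative sketch, not part of dev.c: how protocol code might hand a
 * fully built frame to dev_queue_xmit().  The payload and link-layer
 * header setup are elided; "my_dev" and "len" are assumptions.  Interrupts
 * must be enabled, and the skb is consumed regardless of the return value.
 */
static int example_send_frame(struct net_device *my_dev, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(my_dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(my_dev));
	skb_put(skb, len);		/* a real sender would fill the payload
					 * and build the link-layer header,
					 * e.g. with dev_hard_header() */
	skb->dev = my_dev;
	skb->protocol = htons(ETH_P_IP);	/* assumed payload type */

	return dev_queue_xmit(skb);	/* may also return NET_XMIT_* codes */
}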
2848
2849
2850 /*=======================================================================
2851 Receiver routines
2852 =======================================================================*/
2853
2854 int netdev_max_backlog __read_mostly = 1000;
2855 EXPORT_SYMBOL(netdev_max_backlog);
2856
2857 int netdev_tstamp_prequeue __read_mostly = 1;
2858 int netdev_budget __read_mostly = 300;
2859 int weight_p __read_mostly = 64; /* old backlog weight */
2860
2861 /* Called with irq disabled */
2862 static inline void ____napi_schedule(struct softnet_data *sd,
2863 struct napi_struct *napi)
2864 {
2865 list_add_tail(&napi->poll_list, &sd->poll_list);
2866 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2867 }
2868
2869 #ifdef CONFIG_RPS
2870
2871 /* One global table that all flow-based protocols share. */
2872 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2873 EXPORT_SYMBOL(rps_sock_flow_table);
2874
2875 struct static_key rps_needed __read_mostly;
2876
2877 static struct rps_dev_flow *
2878 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2879 struct rps_dev_flow *rflow, u16 next_cpu)
2880 {
2881 if (next_cpu != RPS_NO_CPU) {
2882 #ifdef CONFIG_RFS_ACCEL
2883 struct netdev_rx_queue *rxqueue;
2884 struct rps_dev_flow_table *flow_table;
2885 struct rps_dev_flow *old_rflow;
2886 u32 flow_id;
2887 u16 rxq_index;
2888 int rc;
2889
2890 /* Should we steer this flow to a different hardware queue? */
2891 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2892 !(dev->features & NETIF_F_NTUPLE))
2893 goto out;
2894 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2895 if (rxq_index == skb_get_rx_queue(skb))
2896 goto out;
2897
2898 rxqueue = dev->_rx + rxq_index;
2899 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2900 if (!flow_table)
2901 goto out;
2902 flow_id = skb->rxhash & flow_table->mask;
2903 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2904 rxq_index, flow_id);
2905 if (rc < 0)
2906 goto out;
2907 old_rflow = rflow;
2908 rflow = &flow_table->flows[flow_id];
2909 rflow->filter = rc;
2910 if (old_rflow->filter == rflow->filter)
2911 old_rflow->filter = RPS_NO_FILTER;
2912 out:
2913 #endif
2914 rflow->last_qtail =
2915 per_cpu(softnet_data, next_cpu).input_queue_head;
2916 }
2917
2918 rflow->cpu = next_cpu;
2919 return rflow;
2920 }
2921
2922 /*
2923 * get_rps_cpu is called from netif_receive_skb and returns the target
2924 * CPU from the RPS map of the receiving queue for a given skb.
2925 * rcu_read_lock must be held on entry.
2926 */
2927 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2928 struct rps_dev_flow **rflowp)
2929 {
2930 struct netdev_rx_queue *rxqueue;
2931 struct rps_map *map;
2932 struct rps_dev_flow_table *flow_table;
2933 struct rps_sock_flow_table *sock_flow_table;
2934 int cpu = -1;
2935 u16 tcpu;
2936
2937 if (skb_rx_queue_recorded(skb)) {
2938 u16 index = skb_get_rx_queue(skb);
2939 if (unlikely(index >= dev->real_num_rx_queues)) {
2940 WARN_ONCE(dev->real_num_rx_queues > 1,
2941 "%s received packet on queue %u, but number "
2942 "of RX queues is %u\n",
2943 dev->name, index, dev->real_num_rx_queues);
2944 goto done;
2945 }
2946 rxqueue = dev->_rx + index;
2947 } else
2948 rxqueue = dev->_rx;
2949
2950 map = rcu_dereference(rxqueue->rps_map);
2951 if (map) {
2952 if (map->len == 1 &&
2953 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2954 tcpu = map->cpus[0];
2955 if (cpu_online(tcpu))
2956 cpu = tcpu;
2957 goto done;
2958 }
2959 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2960 goto done;
2961 }
2962
2963 skb_reset_network_header(skb);
2964 if (!skb_get_rxhash(skb))
2965 goto done;
2966
2967 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2969 if (flow_table && sock_flow_table) {
2970 u16 next_cpu;
2971 struct rps_dev_flow *rflow;
2972
2973 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2974 tcpu = rflow->cpu;
2975
2976 next_cpu = sock_flow_table->ents[skb->rxhash &
2977 sock_flow_table->mask];
2978
2979 /*
2980 * If the desired CPU (where last recvmsg was done) is
2981 * different from current CPU (one in the rx-queue flow
2982 * table entry), switch if one of the following holds:
2983 * - Current CPU is unset (equal to RPS_NO_CPU).
2984 * - Current CPU is offline.
2985 * - The current CPU's queue tail has advanced beyond the
2986 * last packet that was enqueued using this table entry.
2987 * This guarantees that all previous packets for the flow
2988 * have been dequeued, thus preserving in order delivery.
2989 */
2990 if (unlikely(tcpu != next_cpu) &&
2991 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2992 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2993 rflow->last_qtail)) >= 0)) {
2994 tcpu = next_cpu;
2995 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2996 }
2997
2998 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2999 *rflowp = rflow;
3000 cpu = tcpu;
3001 goto done;
3002 }
3003 }
3004
3005 if (map) {
3006 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3007
3008 if (cpu_online(tcpu)) {
3009 cpu = tcpu;
3010 goto done;
3011 }
3012 }
3013
3014 done:
3015 return cpu;
3016 }
3017
3018 #ifdef CONFIG_RFS_ACCEL
3019
3020 /**
3021 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3022 * @dev: Device on which the filter was set
3023 * @rxq_index: RX queue index
3024 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3025 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3026 *
3027 * Drivers that implement ndo_rx_flow_steer() should periodically call
3028 * this function for each installed filter and remove the filters for
3029 * which it returns %true.
3030 */
3031 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3032 u32 flow_id, u16 filter_id)
3033 {
3034 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3035 struct rps_dev_flow_table *flow_table;
3036 struct rps_dev_flow *rflow;
3037 bool expire = true;
3038 int cpu;
3039
3040 rcu_read_lock();
3041 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3042 if (flow_table && flow_id <= flow_table->mask) {
3043 rflow = &flow_table->flows[flow_id];
3044 cpu = ACCESS_ONCE(rflow->cpu);
3045 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3046 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3047 rflow->last_qtail) <
3048 (int)(10 * flow_table->mask)))
3049 expire = false;
3050 }
3051 rcu_read_unlock();
3052 return expire;
3053 }
3054 EXPORT_SYMBOL(rps_may_expire_flow);
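/*
 * Illustrative sketch, not part of dev.c: a hypothetical driver's periodic
 * scan of its installed ndo_rx_flow_steer() filters, removing the ones
 * rps_may_expire_flow() above says are stale.  The filter table layout and
 * the hardware-removal step are assumptions.
 */
struct example_filter {
	u32	flow_id;
	u16	rxq_index;
	u16	filter_id;
	bool	in_use;
};

static void example_expire_filters(struct net_device *my_dev,
				   struct example_filter *tbl, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(my_dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* ...remove the hardware filter here... */
			tbl[i].in_use = false;
		}
	}
}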
3055
3056 #endif /* CONFIG_RFS_ACCEL */
3057
3058 /* Called from hardirq (IPI) context */
3059 static void rps_trigger_softirq(void *data)
3060 {
3061 struct softnet_data *sd = data;
3062
3063 ____napi_schedule(sd, &sd->backlog);
3064 sd->received_rps++;
3065 }
3066
3067 #endif /* CONFIG_RPS */
3068
3069 /*
3070 * Check if this softnet_data structure belongs to another cpu.
3071 * If yes, queue it to our IPI list and return 1;
3072 * if no, return 0.
3073 */
3074 static int rps_ipi_queued(struct softnet_data *sd)
3075 {
3076 #ifdef CONFIG_RPS
3077 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3078
3079 if (sd != mysd) {
3080 sd->rps_ipi_next = mysd->rps_ipi_list;
3081 mysd->rps_ipi_list = sd;
3082
3083 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3084 return 1;
3085 }
3086 #endif /* CONFIG_RPS */
3087 return 0;
3088 }
3089
3090 /*
3091 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3092 * queue (may be a remote CPU queue).
3093 */
3094 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3095 unsigned int *qtail)
3096 {
3097 struct softnet_data *sd;
3098 unsigned long flags;
3099
3100 sd = &per_cpu(softnet_data, cpu);
3101
3102 local_irq_save(flags);
3103
3104 rps_lock(sd);
3105 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3106 if (skb_queue_len(&sd->input_pkt_queue)) {
3107 enqueue:
3108 __skb_queue_tail(&sd->input_pkt_queue, skb);
3109 input_queue_tail_incr_save(sd, qtail);
3110 rps_unlock(sd);
3111 local_irq_restore(flags);
3112 return NET_RX_SUCCESS;
3113 }
3114
3115 /* Schedule NAPI for the backlog device.
3116 * We can use a non-atomic operation since we own the queue lock.
3117 */
3118 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3119 if (!rps_ipi_queued(sd))
3120 ____napi_schedule(sd, &sd->backlog);
3121 }
3122 goto enqueue;
3123 }
3124
3125 sd->dropped++;
3126 rps_unlock(sd);
3127
3128 local_irq_restore(flags);
3129
3130 atomic_long_inc(&skb->dev->rx_dropped);
3131 kfree_skb(skb);
3132 return NET_RX_DROP;
3133 }
3134
3135 /**
3136 * netif_rx - post buffer to the network code
3137 * @skb: buffer to post
3138 *
3139 * This function receives a packet from a device driver and queues it for
3140 * the upper (protocol) levels to process. It always succeeds. The buffer
3141 * may be dropped during processing for congestion control or by the
3142 * protocol layers.
3143 *
3144 * return values:
3145 * NET_RX_SUCCESS (no congestion)
3146 * NET_RX_DROP (packet was dropped)
3147 *
3148 */
3149
3150 int netif_rx(struct sk_buff *skb)
3151 {
3152 int ret;
3153
3154 /* if netpoll wants it, pretend we never saw it */
3155 if (netpoll_rx(skb))
3156 return NET_RX_DROP;
3157
3158 net_timestamp_check(netdev_tstamp_prequeue, skb);
3159
3160 trace_netif_rx(skb);
3161 #ifdef CONFIG_RPS
3162 if (static_key_false(&rps_needed)) {
3163 struct rps_dev_flow voidflow, *rflow = &voidflow;
3164 int cpu;
3165
3166 preempt_disable();
3167 rcu_read_lock();
3168
3169 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3170 if (cpu < 0)
3171 cpu = smp_processor_id();
3172
3173 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3174
3175 rcu_read_unlock();
3176 preempt_enable();
3177 } else
3178 #endif
3179 {
3180 unsigned int qtail;
3181 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3182 put_cpu();
3183 }
3184 return ret;
3185 }
3186 EXPORT_SYMBOL(netif_rx);
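/*
 * Illustrative sketch, not part of dev.c: a hypothetical non-NAPI driver
 * handing a received frame to netif_rx() from its interrupt handler.
 * "my_dev", "buf" and "len" are assumptions.
 */
static void example_rx_irq(struct net_device *my_dev, const void *buf,
			   unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(my_dev, len);
	if (!skb) {
		my_dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), buf, len);	/* copy out of the hw ring */
	skb->protocol = eth_type_trans(skb, my_dev);

	netif_rx(skb);		/* queues to the per-CPU backlog */
}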
3187
3188 int netif_rx_ni(struct sk_buff *skb)
3189 {
3190 int err;
3191
3192 preempt_disable();
3193 err = netif_rx(skb);
3194 if (local_softirq_pending())
3195 do_softirq();
3196 preempt_enable();
3197
3198 return err;
3199 }
3200 EXPORT_SYMBOL(netif_rx_ni);
3201
3202 static void net_tx_action(struct softirq_action *h)
3203 {
3204 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3205
3206 if (sd->completion_queue) {
3207 struct sk_buff *clist;
3208
3209 local_irq_disable();
3210 clist = sd->completion_queue;
3211 sd->completion_queue = NULL;
3212 local_irq_enable();
3213
3214 while (clist) {
3215 struct sk_buff *skb = clist;
3216 clist = clist->next;
3217
3218 WARN_ON(atomic_read(&skb->users));
3219 trace_kfree_skb(skb, net_tx_action);
3220 __kfree_skb(skb);
3221 }
3222 }
3223
3224 if (sd->output_queue) {
3225 struct Qdisc *head;
3226
3227 local_irq_disable();
3228 head = sd->output_queue;
3229 sd->output_queue = NULL;
3230 sd->output_queue_tailp = &sd->output_queue;
3231 local_irq_enable();
3232
3233 while (head) {
3234 struct Qdisc *q = head;
3235 spinlock_t *root_lock;
3236
3237 head = head->next_sched;
3238
3239 root_lock = qdisc_lock(q);
3240 if (spin_trylock(root_lock)) {
3241 smp_mb__before_clear_bit();
3242 clear_bit(__QDISC_STATE_SCHED,
3243 &q->state);
3244 qdisc_run(q);
3245 spin_unlock(root_lock);
3246 } else {
3247 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3248 &q->state)) {
3249 __netif_reschedule(q);
3250 } else {
3251 smp_mb__before_clear_bit();
3252 clear_bit(__QDISC_STATE_SCHED,
3253 &q->state);
3254 }
3255 }
3256 }
3257 }
3258 }
3259
3260 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3261 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3262 /* This hook is defined here for ATM LANE */
3263 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3264 unsigned char *addr) __read_mostly;
3265 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3266 #endif
3267
3268 #ifdef CONFIG_NET_CLS_ACT
3269 /* TODO: Maybe we should just force sch_ingress to be compiled in
3270 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3271 * instructions (a compare and 2 extra stores) right now if we don't
3272 * have it on but do have CONFIG_NET_CLS_ACT.
3273 * NOTE: This doesn't stop any functionality; if you don't have
3274 * the ingress scheduler, you just can't add policies on ingress.
3275 *
3276 */
3277 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3278 {
3279 struct net_device *dev = skb->dev;
3280 u32 ttl = G_TC_RTTL(skb->tc_verd);
3281 int result = TC_ACT_OK;
3282 struct Qdisc *q;
3283
3284 if (unlikely(MAX_RED_LOOP < ttl++)) {
3285 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3286 skb->skb_iif, dev->ifindex);
3287 return TC_ACT_SHOT;
3288 }
3289
3290 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3291 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3292
3293 q = rxq->qdisc;
3294 if (q != &noop_qdisc) {
3295 spin_lock(qdisc_lock(q));
3296 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3297 result = qdisc_enqueue_root(skb, q);
3298 spin_unlock(qdisc_lock(q));
3299 }
3300
3301 return result;
3302 }
3303
3304 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3305 struct packet_type **pt_prev,
3306 int *ret, struct net_device *orig_dev)
3307 {
3308 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3309
3310 if (!rxq || rxq->qdisc == &noop_qdisc)
3311 goto out;
3312
3313 if (*pt_prev) {
3314 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3315 *pt_prev = NULL;
3316 }
3317
3318 switch (ing_filter(skb, rxq)) {
3319 case TC_ACT_SHOT:
3320 case TC_ACT_STOLEN:
3321 kfree_skb(skb);
3322 return NULL;
3323 }
3324
3325 out:
3326 skb->tc_verd = 0;
3327 return skb;
3328 }
3329 #endif
3330
3331 /**
3332 * netdev_rx_handler_register - register receive handler
3333 * @dev: device to register a handler for
3334 * @rx_handler: receive handler to register
3335 * @rx_handler_data: data pointer that is used by rx handler
3336 *
3337 * Register a receive handler for a device. This handler will then be
3338 * called from __netif_receive_skb. A negative errno code is returned
3339 * on a failure.
3340 *
3341 * The caller must hold the rtnl_mutex.
3342 *
3343 * For a general description of rx_handler, see enum rx_handler_result.
3344 */
3345 int netdev_rx_handler_register(struct net_device *dev,
3346 rx_handler_func_t *rx_handler,
3347 void *rx_handler_data)
3348 {
3349 ASSERT_RTNL();
3350
3351 if (dev->rx_handler)
3352 return -EBUSY;
3353
3354 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3355 rcu_assign_pointer(dev->rx_handler, rx_handler);
3356
3357 return 0;
3358 }
3359 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
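/*
 * Illustrative sketch, not part of dev.c: registering a hypothetical
 * rx_handler the way bridging/bonding-style drivers do.  The handler name
 * and private data are assumptions; see enum rx_handler_result for the
 * possible verdicts.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	/* a real handler would inspect or redirect *pskb here */
	return RX_HANDLER_PASS;		/* let normal delivery continue */
}

static int example_enslave(struct net_device *port_dev, void *my_priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port_dev, example_handle_frame,
					 my_priv);
	rtnl_unlock();

	return err;
}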
3360
3361 /**
3362 * netdev_rx_handler_unregister - unregister receive handler
3363 * @dev: device to unregister a handler from
3364 *
3365 * Unregister a receive handler from a device.
3366 *
3367 * The caller must hold the rtnl_mutex.
3368 */
3369 void netdev_rx_handler_unregister(struct net_device *dev)
3370 {
3371
3372 ASSERT_RTNL();
3373 RCU_INIT_POINTER(dev->rx_handler, NULL);
3374 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3375 }
3376 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3377
3378 /*
3379 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3380 * the special handling of PFMEMALLOC skbs.
3381 */
3382 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3383 {
3384 switch (skb->protocol) {
3385 case __constant_htons(ETH_P_ARP):
3386 case __constant_htons(ETH_P_IP):
3387 case __constant_htons(ETH_P_IPV6):
3388 case __constant_htons(ETH_P_8021Q):
3389 return true;
3390 default:
3391 return false;
3392 }
3393 }
3394
3395 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3396 {
3397 struct packet_type *ptype, *pt_prev;
3398 rx_handler_func_t *rx_handler;
3399 struct net_device *orig_dev;
3400 struct net_device *null_or_dev;
3401 bool deliver_exact = false;
3402 int ret = NET_RX_DROP;
3403 __be16 type;
3404
3405 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3406
3407 trace_netif_receive_skb(skb);
3408
3409 /* if we've gotten here through NAPI, check netpoll */
3410 if (netpoll_receive_skb(skb))
3411 goto out;
3412
3413 orig_dev = skb->dev;
3414
3415 skb_reset_network_header(skb);
3416 if (!skb_transport_header_was_set(skb))
3417 skb_reset_transport_header(skb);
3418 skb_reset_mac_len(skb);
3419
3420 pt_prev = NULL;
3421
3422 rcu_read_lock();
3423
3424 another_round:
3425 skb->skb_iif = skb->dev->ifindex;
3426
3427 __this_cpu_inc(softnet_data.processed);
3428
3429 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3430 skb = vlan_untag(skb);
3431 if (unlikely(!skb))
3432 goto unlock;
3433 }
3434
3435 #ifdef CONFIG_NET_CLS_ACT
3436 if (skb->tc_verd & TC_NCLS) {
3437 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3438 goto ncls;
3439 }
3440 #endif
3441
3442 if (pfmemalloc)
3443 goto skip_taps;
3444
3445 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3446 if (!ptype->dev || ptype->dev == skb->dev) {
3447 if (pt_prev)
3448 ret = deliver_skb(skb, pt_prev, orig_dev);
3449 pt_prev = ptype;
3450 }
3451 }
3452
3453 skip_taps:
3454 #ifdef CONFIG_NET_CLS_ACT
3455 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3456 if (!skb)
3457 goto unlock;
3458 ncls:
3459 #endif
3460
3461 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3462 goto drop;
3463
3464 if (vlan_tx_tag_present(skb)) {
3465 if (pt_prev) {
3466 ret = deliver_skb(skb, pt_prev, orig_dev);
3467 pt_prev = NULL;
3468 }
3469 if (vlan_do_receive(&skb))
3470 goto another_round;
3471 else if (unlikely(!skb))
3472 goto unlock;
3473 }
3474
3475 rx_handler = rcu_dereference(skb->dev->rx_handler);
3476 if (rx_handler) {
3477 if (pt_prev) {
3478 ret = deliver_skb(skb, pt_prev, orig_dev);
3479 pt_prev = NULL;
3480 }
3481 switch (rx_handler(&skb)) {
3482 case RX_HANDLER_CONSUMED:
3483 goto unlock;
3484 case RX_HANDLER_ANOTHER:
3485 goto another_round;
3486 case RX_HANDLER_EXACT:
3487 deliver_exact = true;
3488 case RX_HANDLER_PASS:
3489 break;
3490 default:
3491 BUG();
3492 }
3493 }
3494
3495 if (vlan_tx_nonzero_tag_present(skb))
3496 skb->pkt_type = PACKET_OTHERHOST;
3497
3498 /* deliver only exact match when indicated */
3499 null_or_dev = deliver_exact ? skb->dev : NULL;
3500
3501 type = skb->protocol;
3502 list_for_each_entry_rcu(ptype,
3503 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3504 if (ptype->type == type &&
3505 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3506 ptype->dev == orig_dev)) {
3507 if (pt_prev)
3508 ret = deliver_skb(skb, pt_prev, orig_dev);
3509 pt_prev = ptype;
3510 }
3511 }
3512
3513 if (pt_prev) {
3514 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3515 goto drop;
3516 else
3517 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3518 } else {
3519 drop:
3520 atomic_long_inc(&skb->dev->rx_dropped);
3521 kfree_skb(skb);
3522 /* Jamal, now you will not be able to escape explaining
3523 * to me how you were going to use this. :-)
3524 */
3525 ret = NET_RX_DROP;
3526 }
3527
3528 unlock:
3529 rcu_read_unlock();
3530 out:
3531 return ret;
3532 }
3533
3534 static int __netif_receive_skb(struct sk_buff *skb)
3535 {
3536 int ret;
3537
3538 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3539 unsigned long pflags = current->flags;
3540
3541 /*
3542 * PFMEMALLOC skbs are special, they should
3543 * - be delivered to SOCK_MEMALLOC sockets only
3544 * - stay away from userspace
3545 * - have bounded memory usage
3546 *
3547 * Use PF_MEMALLOC as this saves us from propagating the allocation
3548 * context down to all allocation sites.
3549 */
3550 current->flags |= PF_MEMALLOC;
3551 ret = __netif_receive_skb_core(skb, true);
3552 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3553 } else
3554 ret = __netif_receive_skb_core(skb, false);
3555
3556 return ret;
3557 }
3558
3559 /**
3560 * netif_receive_skb - process receive buffer from network
3561 * @skb: buffer to process
3562 *
3563 * netif_receive_skb() is the main receive data processing function.
3564 * It always succeeds. The buffer may be dropped during processing
3565 * for congestion control or by the protocol layers.
3566 *
3567 * This function may only be called from softirq context and interrupts
3568 * should be enabled.
3569 *
3570 * Return values (usually ignored):
3571 * NET_RX_SUCCESS: no congestion
3572 * NET_RX_DROP: packet was dropped
3573 */
3574 int netif_receive_skb(struct sk_buff *skb)
3575 {
3576 net_timestamp_check(netdev_tstamp_prequeue, skb);
3577
3578 if (skb_defer_rx_timestamp(skb))
3579 return NET_RX_SUCCESS;
3580
3581 #ifdef CONFIG_RPS
3582 if (static_key_false(&rps_needed)) {
3583 struct rps_dev_flow voidflow, *rflow = &voidflow;
3584 int cpu, ret;
3585
3586 rcu_read_lock();
3587
3588 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3589
3590 if (cpu >= 0) {
3591 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3592 rcu_read_unlock();
3593 return ret;
3594 }
3595 rcu_read_unlock();
3596 }
3597 #endif
3598 return __netif_receive_skb(skb);
3599 }
3600 EXPORT_SYMBOL(netif_receive_skb);
3601
3602 /* Network device is going away, flush any packets still pending.
3603 * Called with irqs disabled.
3604 */
3605 static void flush_backlog(void *arg)
3606 {
3607 struct net_device *dev = arg;
3608 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3609 struct sk_buff *skb, *tmp;
3610
3611 rps_lock(sd);
3612 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3613 if (skb->dev == dev) {
3614 __skb_unlink(skb, &sd->input_pkt_queue);
3615 kfree_skb(skb);
3616 input_queue_head_incr(sd);
3617 }
3618 }
3619 rps_unlock(sd);
3620
3621 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3622 if (skb->dev == dev) {
3623 __skb_unlink(skb, &sd->process_queue);
3624 kfree_skb(skb);
3625 input_queue_head_incr(sd);
3626 }
3627 }
3628 }
3629
3630 static int napi_gro_complete(struct sk_buff *skb)
3631 {
3632 struct packet_offload *ptype;
3633 __be16 type = skb->protocol;
3634 struct list_head *head = &offload_base;
3635 int err = -ENOENT;
3636
3637 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3638
3639 if (NAPI_GRO_CB(skb)->count == 1) {
3640 skb_shinfo(skb)->gso_size = 0;
3641 goto out;
3642 }
3643
3644 rcu_read_lock();
3645 list_for_each_entry_rcu(ptype, head, list) {
3646 if (ptype->type != type || !ptype->callbacks.gro_complete)
3647 continue;
3648
3649 err = ptype->callbacks.gro_complete(skb);
3650 break;
3651 }
3652 rcu_read_unlock();
3653
3654 if (err) {
3655 WARN_ON(&ptype->list == head);
3656 kfree_skb(skb);
3657 return NET_RX_SUCCESS;
3658 }
3659
3660 out:
3661 return netif_receive_skb(skb);
3662 }
3663
3664 /* napi->gro_list contains packets ordered by age.
3665 * The youngest packets are at the head of it.
3666 * Complete skbs in reverse order to reduce latencies.
3667 */
3668 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3669 {
3670 struct sk_buff *skb, *prev = NULL;
3671
3672 /* scan list and build reverse chain */
3673 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3674 skb->prev = prev;
3675 prev = skb;
3676 }
3677
3678 for (skb = prev; skb; skb = prev) {
3679 skb->next = NULL;
3680
3681 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3682 return;
3683
3684 prev = skb->prev;
3685 napi_gro_complete(skb);
3686 napi->gro_count--;
3687 }
3688
3689 napi->gro_list = NULL;
3690 }
3691 EXPORT_SYMBOL(napi_gro_flush);
3692
3693 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3694 {
3695 struct sk_buff *p;
3696 unsigned int maclen = skb->dev->hard_header_len;
3697
3698 for (p = napi->gro_list; p; p = p->next) {
3699 unsigned long diffs;
3700
3701 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3702 diffs |= p->vlan_tci ^ skb->vlan_tci;
3703 if (maclen == ETH_HLEN)
3704 diffs |= compare_ether_header(skb_mac_header(p),
3705 skb_gro_mac_header(skb));
3706 else if (!diffs)
3707 diffs = memcmp(skb_mac_header(p),
3708 skb_gro_mac_header(skb),
3709 maclen);
3710 NAPI_GRO_CB(p)->same_flow = !diffs;
3711 NAPI_GRO_CB(p)->flush = 0;
3712 }
3713 }
3714
3715 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3716 {
3717 struct sk_buff **pp = NULL;
3718 struct packet_offload *ptype;
3719 __be16 type = skb->protocol;
3720 struct list_head *head = &offload_base;
3721 int same_flow;
3722 enum gro_result ret;
3723
3724 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3725 goto normal;
3726
3727 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3728 goto normal;
3729
3730 gro_list_prepare(napi, skb);
3731
3732 rcu_read_lock();
3733 list_for_each_entry_rcu(ptype, head, list) {
3734 if (ptype->type != type || !ptype->callbacks.gro_receive)
3735 continue;
3736
3737 skb_set_network_header(skb, skb_gro_offset(skb));
3738 skb_reset_mac_len(skb);
3739 NAPI_GRO_CB(skb)->same_flow = 0;
3740 NAPI_GRO_CB(skb)->flush = 0;
3741 NAPI_GRO_CB(skb)->free = 0;
3742
3743 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3744 break;
3745 }
3746 rcu_read_unlock();
3747
3748 if (&ptype->list == head)
3749 goto normal;
3750
3751 same_flow = NAPI_GRO_CB(skb)->same_flow;
3752 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3753
3754 if (pp) {
3755 struct sk_buff *nskb = *pp;
3756
3757 *pp = nskb->next;
3758 nskb->next = NULL;
3759 napi_gro_complete(nskb);
3760 napi->gro_count--;
3761 }
3762
3763 if (same_flow)
3764 goto ok;
3765
3766 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3767 goto normal;
3768
3769 napi->gro_count++;
3770 NAPI_GRO_CB(skb)->count = 1;
3771 NAPI_GRO_CB(skb)->age = jiffies;
3772 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3773 skb->next = napi->gro_list;
3774 napi->gro_list = skb;
3775 ret = GRO_HELD;
3776
3777 pull:
3778 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3779 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3780
3781 BUG_ON(skb->end - skb->tail < grow);
3782
3783 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3784
3785 skb->tail += grow;
3786 skb->data_len -= grow;
3787
3788 skb_shinfo(skb)->frags[0].page_offset += grow;
3789 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3790
3791 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3792 skb_frag_unref(skb, 0);
3793 memmove(skb_shinfo(skb)->frags,
3794 skb_shinfo(skb)->frags + 1,
3795 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3796 }
3797 }
3798
3799 ok:
3800 return ret;
3801
3802 normal:
3803 ret = GRO_NORMAL;
3804 goto pull;
3805 }
3806
3807
3808 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3809 {
3810 switch (ret) {
3811 case GRO_NORMAL:
3812 if (netif_receive_skb(skb))
3813 ret = GRO_DROP;
3814 break;
3815
3816 case GRO_DROP:
3817 kfree_skb(skb);
3818 break;
3819
3820 case GRO_MERGED_FREE:
3821 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3822 kmem_cache_free(skbuff_head_cache, skb);
3823 else
3824 __kfree_skb(skb);
3825 break;
3826
3827 case GRO_HELD:
3828 case GRO_MERGED:
3829 break;
3830 }
3831
3832 return ret;
3833 }
3834
3835 static void skb_gro_reset_offset(struct sk_buff *skb)
3836 {
3837 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3838 const skb_frag_t *frag0 = &pinfo->frags[0];
3839
3840 NAPI_GRO_CB(skb)->data_offset = 0;
3841 NAPI_GRO_CB(skb)->frag0 = NULL;
3842 NAPI_GRO_CB(skb)->frag0_len = 0;
3843
3844 if (skb->mac_header == skb->tail &&
3845 pinfo->nr_frags &&
3846 !PageHighMem(skb_frag_page(frag0))) {
3847 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3848 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3849 }
3850 }
3851
3852 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3853 {
3854 skb_gro_reset_offset(skb);
3855
3856 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3857 }
3858 EXPORT_SYMBOL(napi_gro_receive);
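/*
 * Illustrative sketch, not part of dev.c: the per-frame receive step of a
 * hypothetical NAPI driver handing each completed frame to
 * napi_gro_receive() so GRO can coalesce it before the stack sees it.
 */
static void example_rx_one(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);	/* GRO_NORMAL falls back to
					 * netif_receive_skb() internally */
}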
3859
3860 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3861 {
3862 __skb_pull(skb, skb_headlen(skb));
3863 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3864 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3865 skb->vlan_tci = 0;
3866 skb->dev = napi->dev;
3867 skb->skb_iif = 0;
3868
3869 napi->skb = skb;
3870 }
3871
3872 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3873 {
3874 struct sk_buff *skb = napi->skb;
3875
3876 if (!skb) {
3877 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3878 if (skb)
3879 napi->skb = skb;
3880 }
3881 return skb;
3882 }
3883 EXPORT_SYMBOL(napi_get_frags);
3884
3885 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3886 gro_result_t ret)
3887 {
3888 switch (ret) {
3889 case GRO_NORMAL:
3890 case GRO_HELD:
3891 skb->protocol = eth_type_trans(skb, skb->dev);
3892
3893 if (ret == GRO_HELD)
3894 skb_gro_pull(skb, -ETH_HLEN);
3895 else if (netif_receive_skb(skb))
3896 ret = GRO_DROP;
3897 break;
3898
3899 case GRO_DROP:
3900 case GRO_MERGED_FREE:
3901 napi_reuse_skb(napi, skb);
3902 break;
3903
3904 case GRO_MERGED:
3905 break;
3906 }
3907
3908 return ret;
3909 }
3910
3911 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3912 {
3913 struct sk_buff *skb = napi->skb;
3914 struct ethhdr *eth;
3915 unsigned int hlen;
3916 unsigned int off;
3917
3918 napi->skb = NULL;
3919
3920 skb_reset_mac_header(skb);
3921 skb_gro_reset_offset(skb);
3922
3923 off = skb_gro_offset(skb);
3924 hlen = off + sizeof(*eth);
3925 eth = skb_gro_header_fast(skb, off);
3926 if (skb_gro_header_hard(skb, hlen)) {
3927 eth = skb_gro_header_slow(skb, hlen, off);
3928 if (unlikely(!eth)) {
3929 napi_reuse_skb(napi, skb);
3930 skb = NULL;
3931 goto out;
3932 }
3933 }
3934
3935 skb_gro_pull(skb, sizeof(*eth));
3936
3937 /*
3938 * This works because the only protocols we care about don't require
3939 * special handling. We'll fix it up properly at the end.
3940 */
3941 skb->protocol = eth->h_proto;
3942
3943 out:
3944 return skb;
3945 }
3946
3947 gro_result_t napi_gro_frags(struct napi_struct *napi)
3948 {
3949 struct sk_buff *skb = napi_frags_skb(napi);
3950
3951 if (!skb)
3952 return GRO_DROP;
3953
3954 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3955 }
3956 EXPORT_SYMBOL(napi_gro_frags);
3957
3958 /*
3959 * net_rps_action sends any pending IPIs for RPS.
3960 * Note: called with local irq disabled, but exits with local irq enabled.
3961 */
3962 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3963 {
3964 #ifdef CONFIG_RPS
3965 struct softnet_data *remsd = sd->rps_ipi_list;
3966
3967 if (remsd) {
3968 sd->rps_ipi_list = NULL;
3969
3970 local_irq_enable();
3971
3972 /* Send pending IPIs to kick RPS processing on remote cpus. */
3973 while (remsd) {
3974 struct softnet_data *next = remsd->rps_ipi_next;
3975
3976 if (cpu_online(remsd->cpu))
3977 __smp_call_function_single(remsd->cpu,
3978 &remsd->csd, 0);
3979 remsd = next;
3980 }
3981 } else
3982 #endif
3983 local_irq_enable();
3984 }
3985
3986 static int process_backlog(struct napi_struct *napi, int quota)
3987 {
3988 int work = 0;
3989 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3990
3991 #ifdef CONFIG_RPS
3992 /* Check if we have pending IPIs; it's better to send them now
3993 * rather than waiting for net_rx_action() to end.
3994 */
3995 if (sd->rps_ipi_list) {
3996 local_irq_disable();
3997 net_rps_action_and_irq_enable(sd);
3998 }
3999 #endif
4000 napi->weight = weight_p;
4001 local_irq_disable();
4002 while (work < quota) {
4003 struct sk_buff *skb;
4004 unsigned int qlen;
4005
4006 while ((skb = __skb_dequeue(&sd->process_queue))) {
4007 local_irq_enable();
4008 __netif_receive_skb(skb);
4009 local_irq_disable();
4010 input_queue_head_incr(sd);
4011 if (++work >= quota) {
4012 local_irq_enable();
4013 return work;
4014 }
4015 }
4016
4017 rps_lock(sd);
4018 qlen = skb_queue_len(&sd->input_pkt_queue);
4019 if (qlen)
4020 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4021 &sd->process_queue);
4022
4023 if (qlen < quota - work) {
4024 /*
4025 * Inline a custom version of __napi_complete().
4026 * Only the current CPU owns and manipulates this NAPI,
4027 * and NAPI_STATE_SCHED is the only possible flag set on the backlog,
4028 * so we can use a plain write instead of clear_bit()
4029 * and we don't need an smp_mb() memory barrier.
4030 */
4031 list_del(&napi->poll_list);
4032 napi->state = 0;
4033
4034 quota = work + qlen;
4035 }
4036 rps_unlock(sd);
4037 }
4038 local_irq_enable();
4039
4040 return work;
4041 }
4042
4043 /**
4044 * __napi_schedule - schedule for receive
4045 * @n: entry to schedule
4046 *
4047 * The entry's receive function will be scheduled to run.
4048 */
4049 void __napi_schedule(struct napi_struct *n)
4050 {
4051 unsigned long flags;
4052
4053 local_irq_save(flags);
4054 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4055 local_irq_restore(flags);
4056 }
4057 EXPORT_SYMBOL(__napi_schedule);
4058
4059 void __napi_complete(struct napi_struct *n)
4060 {
4061 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4062 BUG_ON(n->gro_list);
4063
4064 list_del(&n->poll_list);
4065 smp_mb__before_clear_bit();
4066 clear_bit(NAPI_STATE_SCHED, &n->state);
4067 }
4068 EXPORT_SYMBOL(__napi_complete);
4069
4070 void napi_complete(struct napi_struct *n)
4071 {
4072 unsigned long flags;
4073
4074 /*
4075 * Don't let NAPI dequeue from the CPU poll list
4076 * just in case it's running on a different CPU.
4077 */
4078 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4079 return;
4080
4081 napi_gro_flush(n, false);
4082 local_irq_save(flags);
4083 __napi_complete(n);
4084 local_irq_restore(flags);
4085 }
4086 EXPORT_SYMBOL(napi_complete);
4087
4088 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4089 int (*poll)(struct napi_struct *, int), int weight)
4090 {
4091 INIT_LIST_HEAD(&napi->poll_list);
4092 napi->gro_count = 0;
4093 napi->gro_list = NULL;
4094 napi->skb = NULL;
4095 napi->poll = poll;
4096 napi->weight = weight;
4097 list_add(&napi->dev_list, &dev->napi_list);
4098 napi->dev = dev;
4099 #ifdef CONFIG_NETPOLL
4100 spin_lock_init(&napi->poll_lock);
4101 napi->poll_owner = -1;
4102 #endif
4103 set_bit(NAPI_STATE_SCHED, &napi->state);
4104 }
4105 EXPORT_SYMBOL(netif_napi_add);
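
/*
 * Usage sketch (hypothetical driver code, compiled out): the usual NAPI
 * life cycle.  The driver adds a NAPI context at probe time, schedules it
 * from its RX interrupt with device interrupts masked, and re-enables them
 * from the poll routine once it calls napi_complete() (see example_poll()
 * above).  example_disable_rx_irq() stands in for device-specific
 * interrupt masking.
 */
#if 0
static int example_probe_napi(struct example_adapter *adap)
{
	netif_napi_add(adap->netdev, &adap->napi, example_poll, 64);
	napi_enable(&adap->napi);
	return 0;
}

static irqreturn_t example_rx_irq(int irq, void *data)
{
	struct example_adapter *adap = data;

	if (napi_schedule_prep(&adap->napi)) {
		example_disable_rx_irq(adap);
		__napi_schedule(&adap->napi);
	}
	return IRQ_HANDLED;
}

static void example_remove_napi(struct example_adapter *adap)
{
	napi_disable(&adap->napi);
	netif_napi_del(&adap->napi);
}
#endif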
4106
4107 void netif_napi_del(struct napi_struct *napi)
4108 {
4109 struct sk_buff *skb, *next;
4110
4111 list_del_init(&napi->dev_list);
4112 napi_free_frags(napi);
4113
4114 for (skb = napi->gro_list; skb; skb = next) {
4115 next = skb->next;
4116 skb->next = NULL;
4117 kfree_skb(skb);
4118 }
4119
4120 napi->gro_list = NULL;
4121 napi->gro_count = 0;
4122 }
4123 EXPORT_SYMBOL(netif_napi_del);
4124
4125 static void net_rx_action(struct softirq_action *h)
4126 {
4127 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4128 unsigned long time_limit = jiffies + 2;
4129 int budget = netdev_budget;
4130 void *have;
4131
4132 local_irq_disable();
4133
4134 while (!list_empty(&sd->poll_list)) {
4135 struct napi_struct *n;
4136 int work, weight;
4137
4138 /* If the softirq window is exhausted then punt.
4139 * Allow this to run for 2 jiffies, which allows
4140 * an average latency of 1.5/HZ.
4141 */
4142 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4143 goto softnet_break;
4144
4145 local_irq_enable();
4146
4147 /* Even though interrupts have been re-enabled, this
4148 * access is safe because interrupts can only add new
4149 * entries to the tail of this list, and only ->poll()
4150 * calls can remove this head entry from the list.
4151 */
4152 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4153
4154 have = netpoll_poll_lock(n);
4155
4156 weight = n->weight;
4157
4158 /* This NAPI_STATE_SCHED test is for avoiding a race
4159 * with netpoll's poll_napi(). Only the entity which
4160 * obtains the lock and sees NAPI_STATE_SCHED set will
4161 * actually make the ->poll() call. Therefore we avoid
4162 * accidentally calling ->poll() when NAPI is not scheduled.
4163 */
4164 work = 0;
4165 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4166 work = n->poll(n, weight);
4167 trace_napi_poll(n);
4168 }
4169
4170 WARN_ON_ONCE(work > weight);
4171
4172 budget -= work;
4173
4174 local_irq_disable();
4175
4176 /* Drivers must not modify the NAPI state if they
4177 * consume the entire weight. In such cases this code
4178 * still "owns" the NAPI instance and therefore can
4179 * move the instance around on the list at-will.
4180 */
4181 if (unlikely(work == weight)) {
4182 if (unlikely(napi_disable_pending(n))) {
4183 local_irq_enable();
4184 napi_complete(n);
4185 local_irq_disable();
4186 } else {
4187 if (n->gro_list) {
4188 /* flush too old packets
4189 * If HZ < 1000, flush all packets.
4190 */
4191 local_irq_enable();
4192 napi_gro_flush(n, HZ >= 1000);
4193 local_irq_disable();
4194 }
4195 list_move_tail(&n->poll_list, &sd->poll_list);
4196 }
4197 }
4198
4199 netpoll_poll_unlock(have);
4200 }
4201 out:
4202 net_rps_action_and_irq_enable(sd);
4203
4204 #ifdef CONFIG_NET_DMA
4205 /*
4206 * There may not be any more sk_buffs coming right now, so push
4207 * any pending DMA copies to hardware
4208 */
4209 dma_issue_pending_all();
4210 #endif
4211
4212 return;
4213
4214 softnet_break:
4215 sd->time_squeeze++;
4216 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4217 goto out;
4218 }
4219
4220 #ifdef CONFIG_PROC_FS
4221
4222 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4223
4224 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4225 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4226 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4227
4228 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4229 {
4230 struct net *net = seq_file_net(seq);
4231 struct net_device *dev;
4232 struct hlist_node *p;
4233 struct hlist_head *h;
4234 unsigned int count = 0, offset = get_offset(*pos);
4235
4236 h = &net->dev_name_head[get_bucket(*pos)];
4237 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4238 if (++count == offset)
4239 return dev;
4240 }
4241
4242 return NULL;
4243 }
4244
4245 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4246 {
4247 struct net_device *dev;
4248 unsigned int bucket;
4249
4250 do {
4251 dev = dev_from_same_bucket(seq, pos);
4252 if (dev)
4253 return dev;
4254
4255 bucket = get_bucket(*pos) + 1;
4256 *pos = set_bucket_offset(bucket, 1);
4257 } while (bucket < NETDEV_HASHENTRIES);
4258
4259 return NULL;
4260 }
4261
4262 /*
4263 * This is invoked by the /proc filesystem handler to display a device
4264 * in detail.
4265 */
4266 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4267 __acquires(RCU)
4268 {
4269 rcu_read_lock();
4270 if (!*pos)
4271 return SEQ_START_TOKEN;
4272
4273 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4274 return NULL;
4275
4276 return dev_from_bucket(seq, pos);
4277 }
4278
4279 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4280 {
4281 ++*pos;
4282 return dev_from_bucket(seq, pos);
4283 }
4284
4285 void dev_seq_stop(struct seq_file *seq, void *v)
4286 __releases(RCU)
4287 {
4288 rcu_read_unlock();
4289 }
4290
4291 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4292 {
4293 struct rtnl_link_stats64 temp;
4294 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4295
4296 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4297 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4298 dev->name, stats->rx_bytes, stats->rx_packets,
4299 stats->rx_errors,
4300 stats->rx_dropped + stats->rx_missed_errors,
4301 stats->rx_fifo_errors,
4302 stats->rx_length_errors + stats->rx_over_errors +
4303 stats->rx_crc_errors + stats->rx_frame_errors,
4304 stats->rx_compressed, stats->multicast,
4305 stats->tx_bytes, stats->tx_packets,
4306 stats->tx_errors, stats->tx_dropped,
4307 stats->tx_fifo_errors, stats->collisions,
4308 stats->tx_carrier_errors +
4309 stats->tx_aborted_errors +
4310 stats->tx_window_errors +
4311 stats->tx_heartbeat_errors,
4312 stats->tx_compressed);
4313 }
4314
4315 /*
4316 * Called from the /proc filesystem. This uses the arbitrary-sized
4317 * /proc/net interface to create /proc/net/dev.
4318 */
4319 static int dev_seq_show(struct seq_file *seq, void *v)
4320 {
4321 if (v == SEQ_START_TOKEN)
4322 seq_puts(seq, "Inter-| Receive "
4323 " | Transmit\n"
4324 " face |bytes packets errs drop fifo frame "
4325 "compressed multicast|bytes packets errs "
4326 "drop fifo colls carrier compressed\n");
4327 else
4328 dev_seq_printf_stats(seq, v);
4329 return 0;
4330 }
4331
4332 static struct softnet_data *softnet_get_online(loff_t *pos)
4333 {
4334 struct softnet_data *sd = NULL;
4335
4336 while (*pos < nr_cpu_ids)
4337 if (cpu_online(*pos)) {
4338 sd = &per_cpu(softnet_data, *pos);
4339 break;
4340 } else
4341 ++*pos;
4342 return sd;
4343 }
4344
4345 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4346 {
4347 return softnet_get_online(pos);
4348 }
4349
4350 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4351 {
4352 ++*pos;
4353 return softnet_get_online(pos);
4354 }
4355
4356 static void softnet_seq_stop(struct seq_file *seq, void *v)
4357 {
4358 }
4359
4360 static int softnet_seq_show(struct seq_file *seq, void *v)
4361 {
4362 struct softnet_data *sd = v;
4363
4364 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4365 sd->processed, sd->dropped, sd->time_squeeze, 0,
4366 0, 0, 0, 0, /* was fastroute */
4367 sd->cpu_collision, sd->received_rps);
4368 return 0;
4369 }
4370
4371 static const struct seq_operations dev_seq_ops = {
4372 .start = dev_seq_start,
4373 .next = dev_seq_next,
4374 .stop = dev_seq_stop,
4375 .show = dev_seq_show,
4376 };
4377
4378 static int dev_seq_open(struct inode *inode, struct file *file)
4379 {
4380 return seq_open_net(inode, file, &dev_seq_ops,
4381 sizeof(struct seq_net_private));
4382 }
4383
4384 static const struct file_operations dev_seq_fops = {
4385 .owner = THIS_MODULE,
4386 .open = dev_seq_open,
4387 .read = seq_read,
4388 .llseek = seq_lseek,
4389 .release = seq_release_net,
4390 };
4391
4392 static const struct seq_operations softnet_seq_ops = {
4393 .start = softnet_seq_start,
4394 .next = softnet_seq_next,
4395 .stop = softnet_seq_stop,
4396 .show = softnet_seq_show,
4397 };
4398
4399 static int softnet_seq_open(struct inode *inode, struct file *file)
4400 {
4401 return seq_open(file, &softnet_seq_ops);
4402 }
4403
4404 static const struct file_operations softnet_seq_fops = {
4405 .owner = THIS_MODULE,
4406 .open = softnet_seq_open,
4407 .read = seq_read,
4408 .llseek = seq_lseek,
4409 .release = seq_release,
4410 };
4411
4412 static void *ptype_get_idx(loff_t pos)
4413 {
4414 struct packet_type *pt = NULL;
4415 loff_t i = 0;
4416 int t;
4417
4418 list_for_each_entry_rcu(pt, &ptype_all, list) {
4419 if (i == pos)
4420 return pt;
4421 ++i;
4422 }
4423
4424 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4425 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4426 if (i == pos)
4427 return pt;
4428 ++i;
4429 }
4430 }
4431 return NULL;
4432 }
4433
4434 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4435 __acquires(RCU)
4436 {
4437 rcu_read_lock();
4438 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4439 }
4440
4441 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4442 {
4443 struct packet_type *pt;
4444 struct list_head *nxt;
4445 int hash;
4446
4447 ++*pos;
4448 if (v == SEQ_START_TOKEN)
4449 return ptype_get_idx(0);
4450
4451 pt = v;
4452 nxt = pt->list.next;
4453 if (pt->type == htons(ETH_P_ALL)) {
4454 if (nxt != &ptype_all)
4455 goto found;
4456 hash = 0;
4457 nxt = ptype_base[0].next;
4458 } else
4459 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4460
4461 while (nxt == &ptype_base[hash]) {
4462 if (++hash >= PTYPE_HASH_SIZE)
4463 return NULL;
4464 nxt = ptype_base[hash].next;
4465 }
4466 found:
4467 return list_entry(nxt, struct packet_type, list);
4468 }
4469
4470 static void ptype_seq_stop(struct seq_file *seq, void *v)
4471 __releases(RCU)
4472 {
4473 rcu_read_unlock();
4474 }
4475
4476 static int ptype_seq_show(struct seq_file *seq, void *v)
4477 {
4478 struct packet_type *pt = v;
4479
4480 if (v == SEQ_START_TOKEN)
4481 seq_puts(seq, "Type Device Function\n");
4482 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4483 if (pt->type == htons(ETH_P_ALL))
4484 seq_puts(seq, "ALL ");
4485 else
4486 seq_printf(seq, "%04x", ntohs(pt->type));
4487
4488 seq_printf(seq, " %-8s %pF\n",
4489 pt->dev ? pt->dev->name : "", pt->func);
4490 }
4491
4492 return 0;
4493 }
4494
4495 static const struct seq_operations ptype_seq_ops = {
4496 .start = ptype_seq_start,
4497 .next = ptype_seq_next,
4498 .stop = ptype_seq_stop,
4499 .show = ptype_seq_show,
4500 };
4501
4502 static int ptype_seq_open(struct inode *inode, struct file *file)
4503 {
4504 return seq_open_net(inode, file, &ptype_seq_ops,
4505 sizeof(struct seq_net_private));
4506 }
4507
4508 static const struct file_operations ptype_seq_fops = {
4509 .owner = THIS_MODULE,
4510 .open = ptype_seq_open,
4511 .read = seq_read,
4512 .llseek = seq_lseek,
4513 .release = seq_release_net,
4514 };
4515
4516
4517 static int __net_init dev_proc_net_init(struct net *net)
4518 {
4519 int rc = -ENOMEM;
4520
4521 if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
4522 goto out;
4523 if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
4524 &softnet_seq_fops))
4525 goto out_dev;
4526 if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
4527 goto out_softnet;
4528
4529 if (wext_proc_init(net))
4530 goto out_ptype;
4531 rc = 0;
4532 out:
4533 return rc;
4534 out_ptype:
4535 remove_proc_entry("ptype", net->proc_net);
4536 out_softnet:
4537 remove_proc_entry("softnet_stat", net->proc_net);
4538 out_dev:
4539 remove_proc_entry("dev", net->proc_net);
4540 goto out;
4541 }
4542
4543 static void __net_exit dev_proc_net_exit(struct net *net)
4544 {
4545 wext_proc_exit(net);
4546
4547 remove_proc_entry("ptype", net->proc_net);
4548 remove_proc_entry("softnet_stat", net->proc_net);
4549 remove_proc_entry("dev", net->proc_net);
4550 }
4551
4552 static struct pernet_operations __net_initdata dev_proc_ops = {
4553 .init = dev_proc_net_init,
4554 .exit = dev_proc_net_exit,
4555 };
4556
4557 static int __init dev_proc_init(void)
4558 {
4559 return register_pernet_subsys(&dev_proc_ops);
4560 }
4561 #else
4562 #define dev_proc_init() 0
4563 #endif /* CONFIG_PROC_FS */
4564
4565
4566 struct netdev_upper {
4567 struct net_device *dev;
4568 bool master;
4569 struct list_head list;
4570 struct rcu_head rcu;
4571 struct list_head search_list;
4572 };
4573
4574 static void __append_search_uppers(struct list_head *search_list,
4575 struct net_device *dev)
4576 {
4577 struct netdev_upper *upper;
4578
4579 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4580 /* check if this upper is not already in search list */
4581 if (list_empty(&upper->search_list))
4582 list_add_tail(&upper->search_list, search_list);
4583 }
4584 }
4585
4586 static bool __netdev_search_upper_dev(struct net_device *dev,
4587 struct net_device *upper_dev)
4588 {
4589 LIST_HEAD(search_list);
4590 struct netdev_upper *upper;
4591 struct netdev_upper *tmp;
4592 bool ret = false;
4593
4594 __append_search_uppers(&search_list, dev);
4595 list_for_each_entry(upper, &search_list, search_list) {
4596 if (upper->dev == upper_dev) {
4597 ret = true;
4598 break;
4599 }
4600 __append_search_uppers(&search_list, upper->dev);
4601 }
4602 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4603 INIT_LIST_HEAD(&upper->search_list);
4604 return ret;
4605 }
4606
4607 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4608 struct net_device *upper_dev)
4609 {
4610 struct netdev_upper *upper;
4611
4612 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4613 if (upper->dev == upper_dev)
4614 return upper;
4615 }
4616 return NULL;
4617 }
4618
4619 /**
4620 * netdev_has_upper_dev - Check if device is linked to an upper device
4621 * @dev: device
4622 * @upper_dev: upper device to check
4623 *
4624 * Find out if a device is linked to the specified upper device and return
4625 * true if it is. Note that this checks only the immediate upper device,
4626 * not through a complete stack of devices. The caller must hold the RTNL lock.
4627 */
4628 bool netdev_has_upper_dev(struct net_device *dev,
4629 struct net_device *upper_dev)
4630 {
4631 ASSERT_RTNL();
4632
4633 return __netdev_find_upper(dev, upper_dev);
4634 }
4635 EXPORT_SYMBOL(netdev_has_upper_dev);
4636
4637 /**
4638 * netdev_has_any_upper_dev - Check if device is linked to some device
4639 * @dev: device
4640 *
4641 * Find out if a device is linked to an upper device and return true if
4642 * it is. The caller must hold the RTNL lock.
4643 */
4644 bool netdev_has_any_upper_dev(struct net_device *dev)
4645 {
4646 ASSERT_RTNL();
4647
4648 return !list_empty(&dev->upper_dev_list);
4649 }
4650 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4651
4652 /**
4653 * netdev_master_upper_dev_get - Get master upper device
4654 * @dev: device
4655 *
4656 * Find a master upper device and return a pointer to it, or NULL if
4657 * there is none. The caller must hold the RTNL lock.
4658 */
4659 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4660 {
4661 struct netdev_upper *upper;
4662
4663 ASSERT_RTNL();
4664
4665 if (list_empty(&dev->upper_dev_list))
4666 return NULL;
4667
4668 upper = list_first_entry(&dev->upper_dev_list,
4669 struct netdev_upper, list);
4670 if (likely(upper->master))
4671 return upper->dev;
4672 return NULL;
4673 }
4674 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4675
4676 /**
4677 * netdev_master_upper_dev_get_rcu - Get master upper device
4678 * @dev: device
4679 *
4680 * Find a master upper device and return a pointer to it, or NULL if
4681 * there is none. The caller must hold the RCU read lock.
4682 */
4683 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4684 {
4685 struct netdev_upper *upper;
4686
4687 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4688 struct netdev_upper, list);
4689 if (upper && likely(upper->master))
4690 return upper->dev;
4691 return NULL;
4692 }
4693 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4694
4695 static int __netdev_upper_dev_link(struct net_device *dev,
4696 struct net_device *upper_dev, bool master)
4697 {
4698 struct netdev_upper *upper;
4699
4700 ASSERT_RTNL();
4701
4702 if (dev == upper_dev)
4703 return -EBUSY;
4704
4705 /* To prevent loops, check that dev is not an upper device to upper_dev. */
4706 if (__netdev_search_upper_dev(upper_dev, dev))
4707 return -EBUSY;
4708
4709 if (__netdev_find_upper(dev, upper_dev))
4710 return -EEXIST;
4711
4712 if (master && netdev_master_upper_dev_get(dev))
4713 return -EBUSY;
4714
4715 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4716 if (!upper)
4717 return -ENOMEM;
4718
4719 upper->dev = upper_dev;
4720 upper->master = master;
4721 INIT_LIST_HEAD(&upper->search_list);
4722
4723 /* Ensure that master upper link is always the first item in list. */
4724 if (master)
4725 list_add_rcu(&upper->list, &dev->upper_dev_list);
4726 else
4727 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4728 dev_hold(upper_dev);
4729
4730 return 0;
4731 }
4732
4733 /**
4734 * netdev_upper_dev_link - Add a link to the upper device
4735 * @dev: device
4736 * @upper_dev: new upper device
4737 *
4738 * Adds a link to a device which is upper to this one. The caller must hold
4739 * the RTNL lock. On a failure a negative errno code is returned.
4740 * On success the reference counts are adjusted and the function
4741 * returns zero.
4742 */
4743 int netdev_upper_dev_link(struct net_device *dev,
4744 struct net_device *upper_dev)
4745 {
4746 return __netdev_upper_dev_link(dev, upper_dev, false);
4747 }
4748 EXPORT_SYMBOL(netdev_upper_dev_link);
4749
4750 /**
4751 * netdev_master_upper_dev_link - Add a master link to the upper device
4752 * @dev: device
4753 * @upper_dev: new upper device
4754 *
4755 * Adds a link to a device which is upper to this one. In this case, only
4756 * one master upper device can be linked, although other non-master devices
4757 * might be linked as well. The caller must hold the RTNL lock.
4758 * On a failure a negative errno code is returned. On success the reference
4759 * counts are adjusted and the function returns zero.
4760 */
4761 int netdev_master_upper_dev_link(struct net_device *dev,
4762 struct net_device *upper_dev)
4763 {
4764 return __netdev_upper_dev_link(dev, upper_dev, true);
4765 }
4766 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4767
4768 /**
4769 * netdev_upper_dev_unlink - Removes a link to upper device
4770 * @dev: device
4771 * @upper_dev: upper device to remove the link to
4772 *
4773 * Removes the link to a device which is upper to this one. The caller must hold
4774 * the RTNL lock.
4775 */
4776 void netdev_upper_dev_unlink(struct net_device *dev,
4777 struct net_device *upper_dev)
4778 {
4779 struct netdev_upper *upper;
4780
4781 ASSERT_RTNL();
4782
4783 upper = __netdev_find_upper(dev, upper_dev);
4784 if (!upper)
4785 return;
4786 list_del_rcu(&upper->list);
4787 dev_put(upper_dev);
4788 kfree_rcu(upper, rcu);
4789 }
4790 EXPORT_SYMBOL(netdev_upper_dev_unlink);
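
/*
 * Usage sketch (hypothetical driver code, compiled out): a bonding/team
 * style driver ties a slave to its master with a master upper link and
 * tears it down again on release.  Both calls require the RTNL lock, and
 * __netdev_upper_dev_link() rejects loops and duplicate links for us.
 */
#if 0
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();

	/* the slave's master upper device is @master */
	return netdev_master_upper_dev_link(slave, master);
}

static void example_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();

	if (netdev_master_upper_dev_get(slave) == master)
		netdev_upper_dev_unlink(slave, master);
}
#endif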
4791
4792 static void dev_change_rx_flags(struct net_device *dev, int flags)
4793 {
4794 const struct net_device_ops *ops = dev->netdev_ops;
4795
4796 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4797 ops->ndo_change_rx_flags(dev, flags);
4798 }
4799
4800 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4801 {
4802 unsigned int old_flags = dev->flags;
4803 kuid_t uid;
4804 kgid_t gid;
4805
4806 ASSERT_RTNL();
4807
4808 dev->flags |= IFF_PROMISC;
4809 dev->promiscuity += inc;
4810 if (dev->promiscuity == 0) {
4811 /*
4812 * Avoid overflow.
4813 * If inc causes overflow, leave promiscuity untouched and return an error.
4814 */
4815 if (inc < 0)
4816 dev->flags &= ~IFF_PROMISC;
4817 else {
4818 dev->promiscuity -= inc;
4819 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4820 dev->name);
4821 return -EOVERFLOW;
4822 }
4823 }
4824 if (dev->flags != old_flags) {
4825 pr_info("device %s %s promiscuous mode\n",
4826 dev->name,
4827 dev->flags & IFF_PROMISC ? "entered" : "left");
4828 if (audit_enabled) {
4829 current_uid_gid(&uid, &gid);
4830 audit_log(current->audit_context, GFP_ATOMIC,
4831 AUDIT_ANOM_PROMISCUOUS,
4832 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4833 dev->name, (dev->flags & IFF_PROMISC),
4834 (old_flags & IFF_PROMISC),
4835 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4836 from_kuid(&init_user_ns, uid),
4837 from_kgid(&init_user_ns, gid),
4838 audit_get_sessionid(current));
4839 }
4840
4841 dev_change_rx_flags(dev, IFF_PROMISC);
4842 }
4843 return 0;
4844 }
4845
4846 /**
4847 * dev_set_promiscuity - update promiscuity count on a device
4848 * @dev: device
4849 * @inc: modifier
4850 *
4851 * Add or remove promiscuity from a device. While the count in the device
4852 * remains above zero the interface remains promiscuous. Once it hits zero
4853 * the device reverts to normal filtering operation. A negative @inc
4854 * value is used to drop promiscuity on the device.
4855 * Return 0 if successful or a negative errno code on error.
4856 */
4857 int dev_set_promiscuity(struct net_device *dev, int inc)
4858 {
4859 unsigned int old_flags = dev->flags;
4860 int err;
4861
4862 err = __dev_set_promiscuity(dev, inc);
4863 if (err < 0)
4864 return err;
4865 if (dev->flags != old_flags)
4866 dev_set_rx_mode(dev);
4867 return err;
4868 }
4869 EXPORT_SYMBOL(dev_set_promiscuity);
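
/*
 * Usage sketch (hypothetical code, compiled out): a packet-capture style
 * user of promiscuous mode bumps the counter while capturing and drops it
 * again when done.  dev_set_promiscuity() must be called under RTNL.
 */
#if 0
static int example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* take one promisc reference */
	rtnl_unlock();
	return err;
}

static void example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}
#endif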
4870
4871 /**
4872 * dev_set_allmulti - update allmulti count on a device
4873 * @dev: device
4874 * @inc: modifier
4875 *
4876 * Add or remove reception of all multicast frames on a device. While the
4877 * count in the device remains above zero the interface keeps listening
4878 * to all multicast frames. Once it hits zero the device reverts to normal
4879 * filtering operation. A negative @inc value is used to drop the counter
4880 * when releasing a resource needing all multicasts.
4881 * Return 0 if successful or a negative errno code on error.
4882 */
4883
4884 int dev_set_allmulti(struct net_device *dev, int inc)
4885 {
4886 unsigned int old_flags = dev->flags;
4887
4888 ASSERT_RTNL();
4889
4890 dev->flags |= IFF_ALLMULTI;
4891 dev->allmulti += inc;
4892 if (dev->allmulti == 0) {
4893 /*
4894 * Avoid overflow.
4895 * If inc causes overflow, leave allmulti untouched and return an error.
4896 */
4897 if (inc < 0)
4898 dev->flags &= ~IFF_ALLMULTI;
4899 else {
4900 dev->allmulti -= inc;
4901 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4902 dev->name);
4903 return -EOVERFLOW;
4904 }
4905 }
4906 if (dev->flags ^ old_flags) {
4907 dev_change_rx_flags(dev, IFF_ALLMULTI);
4908 dev_set_rx_mode(dev);
4909 }
4910 return 0;
4911 }
4912 EXPORT_SYMBOL(dev_set_allmulti);
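
/*
 * Usage sketch (hypothetical code, compiled out): a multicast routing
 * component keeps the interface listening to all multicast traffic for
 * as long as it holds an allmulti reference.  Must run under RTNL.
 */
#if 0
static int example_mroute_attach(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_allmulti(dev, 1);
	rtnl_unlock();
	return err;
}

static void example_mroute_detach(struct net_device *dev)
{
	rtnl_lock();
	dev_set_allmulti(dev, -1);
	rtnl_unlock();
}
#endif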
4913
4914 /*
4915 * Upload unicast and multicast address lists to device and
4916 * configure RX filtering. When the device doesn't support unicast
4917 * filtering it is put in promiscuous mode while unicast addresses
4918 * are present.
4919 */
4920 void __dev_set_rx_mode(struct net_device *dev)
4921 {
4922 const struct net_device_ops *ops = dev->netdev_ops;
4923
4924 /* dev_open will call this function so the list will stay sane. */
4925 if (!(dev->flags&IFF_UP))
4926 return;
4927
4928 if (!netif_device_present(dev))
4929 return;
4930
4931 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4932 /* Unicast address changes may only happen under the rtnl,
4933 * therefore calling __dev_set_promiscuity here is safe.
4934 */
4935 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4936 __dev_set_promiscuity(dev, 1);
4937 dev->uc_promisc = true;
4938 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4939 __dev_set_promiscuity(dev, -1);
4940 dev->uc_promisc = false;
4941 }
4942 }
4943
4944 if (ops->ndo_set_rx_mode)
4945 ops->ndo_set_rx_mode(dev);
4946 }
4947
4948 void dev_set_rx_mode(struct net_device *dev)
4949 {
4950 netif_addr_lock_bh(dev);
4951 __dev_set_rx_mode(dev);
4952 netif_addr_unlock_bh(dev);
4953 }
4954
4955 /**
4956 * dev_get_flags - get flags reported to userspace
4957 * @dev: device
4958 *
4959 * Get the combination of flag bits exported through APIs to userspace.
4960 */
4961 unsigned int dev_get_flags(const struct net_device *dev)
4962 {
4963 unsigned int flags;
4964
4965 flags = (dev->flags & ~(IFF_PROMISC |
4966 IFF_ALLMULTI |
4967 IFF_RUNNING |
4968 IFF_LOWER_UP |
4969 IFF_DORMANT)) |
4970 (dev->gflags & (IFF_PROMISC |
4971 IFF_ALLMULTI));
4972
4973 if (netif_running(dev)) {
4974 if (netif_oper_up(dev))
4975 flags |= IFF_RUNNING;
4976 if (netif_carrier_ok(dev))
4977 flags |= IFF_LOWER_UP;
4978 if (netif_dormant(dev))
4979 flags |= IFF_DORMANT;
4980 }
4981
4982 return flags;
4983 }
4984 EXPORT_SYMBOL(dev_get_flags);
4985
4986 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4987 {
4988 unsigned int old_flags = dev->flags;
4989 int ret;
4990
4991 ASSERT_RTNL();
4992
4993 /*
4994 * Set the flags on our device.
4995 */
4996
4997 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4998 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4999 IFF_AUTOMEDIA)) |
5000 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5001 IFF_ALLMULTI));
5002
5003 /*
5004 * Load in the correct multicast list now that the flags have changed.
5005 */
5006
5007 if ((old_flags ^ flags) & IFF_MULTICAST)
5008 dev_change_rx_flags(dev, IFF_MULTICAST);
5009
5010 dev_set_rx_mode(dev);
5011
5012 /*
5013 * Have we downed the interface? We handle IFF_UP ourselves
5014 * according to user attempts to set it, rather than blindly
5015 * setting it.
5016 */
5017
5018 ret = 0;
5019 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
5020 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5021
5022 if (!ret)
5023 dev_set_rx_mode(dev);
5024 }
5025
5026 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5027 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5028
5029 dev->gflags ^= IFF_PROMISC;
5030 dev_set_promiscuity(dev, inc);
5031 }
5032
5033 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5034 is important. Some (broken) drivers set IFF_PROMISC when
5035 IFF_ALLMULTI is requested, without asking us and without reporting it.
5036 */
5037 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5038 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5039
5040 dev->gflags ^= IFF_ALLMULTI;
5041 dev_set_allmulti(dev, inc);
5042 }
5043
5044 return ret;
5045 }
5046
5047 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5048 {
5049 unsigned int changes = dev->flags ^ old_flags;
5050
5051 if (changes & IFF_UP) {
5052 if (dev->flags & IFF_UP)
5053 call_netdevice_notifiers(NETDEV_UP, dev);
5054 else
5055 call_netdevice_notifiers(NETDEV_DOWN, dev);
5056 }
5057
5058 if (dev->flags & IFF_UP &&
5059 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5060 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5061 }
5062
5063 /**
5064 * dev_change_flags - change device settings
5065 * @dev: device
5066 * @flags: device state flags
5067 *
5068 * Change settings on a device based on the given state flags. The flags
5069 * are in the userspace-exported format.
5070 */
5071 int dev_change_flags(struct net_device *dev, unsigned int flags)
5072 {
5073 int ret;
5074 unsigned int changes, old_flags = dev->flags;
5075
5076 ret = __dev_change_flags(dev, flags);
5077 if (ret < 0)
5078 return ret;
5079
5080 changes = old_flags ^ dev->flags;
5081 if (changes)
5082 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5083
5084 __dev_notify_flags(dev, old_flags);
5085 return ret;
5086 }
5087 EXPORT_SYMBOL(dev_change_flags);
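
/*
 * Usage sketch (hypothetical code, compiled out): bringing an interface
 * administratively up the same way SIOCSIFFLAGS does - read the current
 * flags, set IFF_UP and let dev_change_flags() open the device and send
 * the RTM_NEWLINK notification.  RTNL must be held.
 */
#if 0
static int example_bring_up(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (dev)
		err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}
#endif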
5088
5089 /**
5090 * dev_set_mtu - Change maximum transfer unit
5091 * @dev: device
5092 * @new_mtu: new transfer unit
5093 *
5094 * Change the maximum transfer size of the network device.
5095 */
5096 int dev_set_mtu(struct net_device *dev, int new_mtu)
5097 {
5098 const struct net_device_ops *ops = dev->netdev_ops;
5099 int err;
5100
5101 if (new_mtu == dev->mtu)
5102 return 0;
5103
5104 /* MTU must not be negative. */
5105 if (new_mtu < 0)
5106 return -EINVAL;
5107
5108 if (!netif_device_present(dev))
5109 return -ENODEV;
5110
5111 err = 0;
5112 if (ops->ndo_change_mtu)
5113 err = ops->ndo_change_mtu(dev, new_mtu);
5114 else
5115 dev->mtu = new_mtu;
5116
5117 if (!err)
5118 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5119 return err;
5120 }
5121 EXPORT_SYMBOL(dev_set_mtu);
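
/*
 * Usage sketch (hypothetical code, compiled out): switching an interface
 * to jumbo frames.  Whether 9000 is accepted is up to the driver's
 * ndo_change_mtu(); on success NETDEV_CHANGEMTU is broadcast for us.
 * Callers are expected to hold RTNL.
 */
#if 0
static int example_enable_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	return err;
}
#endif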
5122
5123 /**
5124 * dev_set_group - Change group this device belongs to
5125 * @dev: device
5126 * @new_group: group this device should belong to
5127 */
5128 void dev_set_group(struct net_device *dev, int new_group)
5129 {
5130 dev->group = new_group;
5131 }
5132 EXPORT_SYMBOL(dev_set_group);
5133
5134 /**
5135 * dev_set_mac_address - Change Media Access Control Address
5136 * @dev: device
5137 * @sa: new address
5138 *
5139 * Change the hardware (MAC) address of the device
5140 */
5141 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5142 {
5143 const struct net_device_ops *ops = dev->netdev_ops;
5144 int err;
5145
5146 if (!ops->ndo_set_mac_address)
5147 return -EOPNOTSUPP;
5148 if (sa->sa_family != dev->type)
5149 return -EINVAL;
5150 if (!netif_device_present(dev))
5151 return -ENODEV;
5152 err = ops->ndo_set_mac_address(dev, sa);
5153 if (err)
5154 return err;
5155 dev->addr_assign_type = NET_ADDR_SET;
5156 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5157 add_device_randomness(dev->dev_addr, dev->addr_len);
5158 return 0;
5159 }
5160 EXPORT_SYMBOL(dev_set_mac_address);
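
/*
 * Usage sketch (hypothetical code, compiled out): programming a new
 * hardware address.  The sockaddr family must match dev->type (see the
 * check above) and the call must be made under RTNL.
 */
#if 0
static int example_set_mac(struct net_device *dev, const unsigned char *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif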
5161
5162 /**
5163 * dev_change_carrier - Change device carrier
5164 * @dev: device
5165 * @new_carrier: new carrier value
5166 *
5167 * Change the device's carrier state.
5168 */
5169 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5170 {
5171 const struct net_device_ops *ops = dev->netdev_ops;
5172
5173 if (!ops->ndo_change_carrier)
5174 return -EOPNOTSUPP;
5175 if (!netif_device_present(dev))
5176 return -ENODEV;
5177 return ops->ndo_change_carrier(dev, new_carrier);
5178 }
5179 EXPORT_SYMBOL(dev_change_carrier);
5180
5181 /**
5182 * dev_new_index - allocate an ifindex
5183 * @net: the applicable net namespace
5184 *
5185 * Returns a suitable unique value for a new device interface
5186 * number. The caller must hold the rtnl semaphore or the
5187 * dev_base_lock to be sure it remains unique.
5188 */
5189 static int dev_new_index(struct net *net)
5190 {
5191 int ifindex = net->ifindex;
5192 for (;;) {
5193 if (++ifindex <= 0)
5194 ifindex = 1;
5195 if (!__dev_get_by_index(net, ifindex))
5196 return net->ifindex = ifindex;
5197 }
5198 }
5199
5200 /* Delayed registration/unregistration */
5201 static LIST_HEAD(net_todo_list);
5202
5203 static void net_set_todo(struct net_device *dev)
5204 {
5205 list_add_tail(&dev->todo_list, &net_todo_list);
5206 }
5207
5208 static void rollback_registered_many(struct list_head *head)
5209 {
5210 struct net_device *dev, *tmp;
5211
5212 BUG_ON(dev_boot_phase);
5213 ASSERT_RTNL();
5214
5215 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5216 /* Some devices call this without ever having registered,
5217 * as part of initialization unwind. Remove those
5218 * devices and proceed with the remaining ones.
5219 */
5220 if (dev->reg_state == NETREG_UNINITIALIZED) {
5221 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5222 dev->name, dev);
5223
5224 WARN_ON(1);
5225 list_del(&dev->unreg_list);
5226 continue;
5227 }
5228 dev->dismantle = true;
5229 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5230 }
5231
5232 /* If device is running, close it first. */
5233 dev_close_many(head);
5234
5235 list_for_each_entry(dev, head, unreg_list) {
5236 /* And unlink it from device chain. */
5237 unlist_netdevice(dev);
5238
5239 dev->reg_state = NETREG_UNREGISTERING;
5240 }
5241
5242 synchronize_net();
5243
5244 list_for_each_entry(dev, head, unreg_list) {
5245 /* Shutdown queueing discipline. */
5246 dev_shutdown(dev);
5247
5248
5249 /* Notify protocols that we are about to destroy
5250 this device. They should clean up all of their state.
5251 */
5252 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5253
5254 if (!dev->rtnl_link_ops ||
5255 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5256 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5257
5258 /*
5259 * Flush the unicast and multicast chains
5260 */
5261 dev_uc_flush(dev);
5262 dev_mc_flush(dev);
5263
5264 if (dev->netdev_ops->ndo_uninit)
5265 dev->netdev_ops->ndo_uninit(dev);
5266
5267 /* The notifier chain MUST have detached all upper devices from us. */
5268 WARN_ON(netdev_has_any_upper_dev(dev));
5269
5270 /* Remove entries from kobject tree */
5271 netdev_unregister_kobject(dev);
5272 #ifdef CONFIG_XPS
5273 /* Remove XPS queueing entries */
5274 netif_reset_xps_queues_gt(dev, 0);
5275 #endif
5276 }
5277
5278 synchronize_net();
5279
5280 list_for_each_entry(dev, head, unreg_list)
5281 dev_put(dev);
5282 }
5283
5284 static void rollback_registered(struct net_device *dev)
5285 {
5286 LIST_HEAD(single);
5287
5288 list_add(&dev->unreg_list, &single);
5289 rollback_registered_many(&single);
5290 list_del(&single);
5291 }
5292
5293 static netdev_features_t netdev_fix_features(struct net_device *dev,
5294 netdev_features_t features)
5295 {
5296 /* Fix illegal checksum combinations */
5297 if ((features & NETIF_F_HW_CSUM) &&
5298 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5299 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5300 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5301 }
5302
5303 /* Fix illegal SG+CSUM combinations. */
5304 if ((features & NETIF_F_SG) &&
5305 !(features & NETIF_F_ALL_CSUM)) {
5306 netdev_dbg(dev,
5307 "Dropping NETIF_F_SG since no checksum feature.\n");
5308 features &= ~NETIF_F_SG;
5309 }
5310
5311 /* TSO requires that SG is present as well. */
5312 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5313 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5314 features &= ~NETIF_F_ALL_TSO;
5315 }
5316
5317 /* TSO ECN requires that TSO is present as well. */
5318 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5319 features &= ~NETIF_F_TSO_ECN;
5320
5321 /* Software GSO depends on SG. */
5322 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5323 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5324 features &= ~NETIF_F_GSO;
5325 }
5326
5327 /* UFO needs SG and checksumming */
5328 if (features & NETIF_F_UFO) {
5329 /* maybe split UFO into V4 and V6? */
5330 if (!((features & NETIF_F_GEN_CSUM) ||
5331 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5332 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5333 netdev_dbg(dev,
5334 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5335 features &= ~NETIF_F_UFO;
5336 }
5337
5338 if (!(features & NETIF_F_SG)) {
5339 netdev_dbg(dev,
5340 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5341 features &= ~NETIF_F_UFO;
5342 }
5343 }
5344
5345 return features;
5346 }
5347
5348 int __netdev_update_features(struct net_device *dev)
5349 {
5350 netdev_features_t features;
5351 int err = 0;
5352
5353 ASSERT_RTNL();
5354
5355 features = netdev_get_wanted_features(dev);
5356
5357 if (dev->netdev_ops->ndo_fix_features)
5358 features = dev->netdev_ops->ndo_fix_features(dev, features);
5359
5360 /* driver might be less strict about feature dependencies */
5361 features = netdev_fix_features(dev, features);
5362
5363 if (dev->features == features)
5364 return 0;
5365
5366 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5367 &dev->features, &features);
5368
5369 if (dev->netdev_ops->ndo_set_features)
5370 err = dev->netdev_ops->ndo_set_features(dev, features);
5371
5372 if (unlikely(err < 0)) {
5373 netdev_err(dev,
5374 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5375 err, &features, &dev->features);
5376 return -1;
5377 }
5378
5379 if (!err)
5380 dev->features = features;
5381
5382 return 1;
5383 }
5384
5385 /**
5386 * netdev_update_features - recalculate device features
5387 * @dev: the device to check
5388 *
5389 * Recalculate dev->features set and send notifications if it
5390 * has changed. Should be called after driver or hardware dependent
5391 * conditions might have changed that influence the features.
5392 */
5393 void netdev_update_features(struct net_device *dev)
5394 {
5395 if (__netdev_update_features(dev))
5396 netdev_features_change(dev);
5397 }
5398 EXPORT_SYMBOL(netdev_update_features);
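
/*
 * Usage sketch (hypothetical driver code, compiled out): after the driver
 * learns that an offload can no longer be supported (say, RX checksumming
 * after a firmware event) it clears the bit from hw_features and lets the
 * core recompute and renegotiate dev->features.  RTNL must be held.
 * Many drivers instead express such constraints in ndo_fix_features();
 * this is only one possible shape.
 */
#if 0
static void example_disable_rx_csum(struct net_device *dev)
{
	rtnl_lock();
	dev->hw_features &= ~NETIF_F_RXCSUM;
	netdev_update_features(dev);
	rtnl_unlock();
}
#endif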
5399
5400 /**
5401 * netdev_change_features - recalculate device features
5402 * @dev: the device to check
5403 *
5404 * Recalculate dev->features set and send notifications even
5405 * if they have not changed. Should be called instead of
5406 * netdev_update_features() if also dev->vlan_features might
5407 * have changed to allow the changes to be propagated to stacked
5408 * VLAN devices.
5409 */
5410 void netdev_change_features(struct net_device *dev)
5411 {
5412 __netdev_update_features(dev);
5413 netdev_features_change(dev);
5414 }
5415 EXPORT_SYMBOL(netdev_change_features);
5416
5417 /**
5418 * netif_stacked_transfer_operstate - transfer operstate
5419 * @rootdev: the root or lower level device to transfer state from
5420 * @dev: the device to transfer operstate to
5421 *
5422 * Transfer operational state from root to device. This is normally
5423 * called when a stacking relationship exists between the root
5424 * device and the device (a leaf device).
5425 */
5426 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5427 struct net_device *dev)
5428 {
5429 if (rootdev->operstate == IF_OPER_DORMANT)
5430 netif_dormant_on(dev);
5431 else
5432 netif_dormant_off(dev);
5433
5434 if (netif_carrier_ok(rootdev)) {
5435 if (!netif_carrier_ok(dev))
5436 netif_carrier_on(dev);
5437 } else {
5438 if (netif_carrier_ok(dev))
5439 netif_carrier_off(dev);
5440 }
5441 }
5442 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
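
/*
 * Usage sketch (hypothetical code, compiled out): a stacked (VLAN-like)
 * driver mirrors the lower device's operstate whenever the lower device
 * changes.  With this kernel's netdevice notifiers the ptr argument is
 * the net_device itself; example_find_stacked_dev() is a made-up lookup
 * helper.
 */
#if 0
static int example_lower_dev_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;
	struct net_device *stacked;

	if (event != NETDEV_UP && event != NETDEV_CHANGE)
		return NOTIFY_DONE;

	stacked = example_find_stacked_dev(lower);
	if (stacked)
		netif_stacked_transfer_operstate(lower, stacked);
	return NOTIFY_DONE;
}
#endif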
5443
5444 #ifdef CONFIG_RPS
5445 static int netif_alloc_rx_queues(struct net_device *dev)
5446 {
5447 unsigned int i, count = dev->num_rx_queues;
5448 struct netdev_rx_queue *rx;
5449
5450 BUG_ON(count < 1);
5451
5452 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5453 if (!rx)
5454 return -ENOMEM;
5455
5456 dev->_rx = rx;
5457
5458 for (i = 0; i < count; i++)
5459 rx[i].dev = dev;
5460 return 0;
5461 }
5462 #endif
5463
5464 static void netdev_init_one_queue(struct net_device *dev,
5465 struct netdev_queue *queue, void *_unused)
5466 {
5467 /* Initialize queue lock */
5468 spin_lock_init(&queue->_xmit_lock);
5469 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5470 queue->xmit_lock_owner = -1;
5471 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5472 queue->dev = dev;
5473 #ifdef CONFIG_BQL
5474 dql_init(&queue->dql, HZ);
5475 #endif
5476 }
5477
5478 static int netif_alloc_netdev_queues(struct net_device *dev)
5479 {
5480 unsigned int count = dev->num_tx_queues;
5481 struct netdev_queue *tx;
5482
5483 BUG_ON(count < 1);
5484
5485 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5486 if (!tx)
5487 return -ENOMEM;
5488
5489 dev->_tx = tx;
5490
5491 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5492 spin_lock_init(&dev->tx_global_lock);
5493
5494 return 0;
5495 }
5496
5497 /**
5498 * register_netdevice - register a network device
5499 * @dev: device to register
5500 *
5501 * Take a completed network device structure and add it to the kernel
5502 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5503 * chain. 0 is returned on success. A negative errno code is returned
5504 * on a failure to set up the device, or if the name is a duplicate.
5505 *
5506 * Callers must hold the rtnl semaphore. You may want
5507 * register_netdev() instead of this.
5508 *
5509 * BUGS:
5510 * The locking appears insufficient to guarantee two parallel registers
5511 * will not get the same name.
5512 */
5513
5514 int register_netdevice(struct net_device *dev)
5515 {
5516 int ret;
5517 struct net *net = dev_net(dev);
5518
5519 BUG_ON(dev_boot_phase);
5520 ASSERT_RTNL();
5521
5522 might_sleep();
5523
5524 /* When net_devices are persistent, this will be fatal. */
5525 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5526 BUG_ON(!net);
5527
5528 spin_lock_init(&dev->addr_list_lock);
5529 netdev_set_addr_lockdep_class(dev);
5530
5531 dev->iflink = -1;
5532
5533 ret = dev_get_valid_name(net, dev, dev->name);
5534 if (ret < 0)
5535 goto out;
5536
5537 /* Init, if this function is available */
5538 if (dev->netdev_ops->ndo_init) {
5539 ret = dev->netdev_ops->ndo_init(dev);
5540 if (ret) {
5541 if (ret > 0)
5542 ret = -EIO;
5543 goto out;
5544 }
5545 }
5546
5547 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
5548 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5549 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5550 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5551 ret = -EINVAL;
5552 goto err_uninit;
5553 }
5554
5555 ret = -EBUSY;
5556 if (!dev->ifindex)
5557 dev->ifindex = dev_new_index(net);
5558 else if (__dev_get_by_index(net, dev->ifindex))
5559 goto err_uninit;
5560
5561 if (dev->iflink == -1)
5562 dev->iflink = dev->ifindex;
5563
5564 /* Transfer changeable features to wanted_features and enable
5565 * software offloads (GSO and GRO).
5566 */
5567 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5568 dev->features |= NETIF_F_SOFT_FEATURES;
5569 dev->wanted_features = dev->features & dev->hw_features;
5570
5571 /* Turn on no cache copy if HW is doing checksum */
5572 if (!(dev->flags & IFF_LOOPBACK)) {
5573 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5574 if (dev->features & NETIF_F_ALL_CSUM) {
5575 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5576 dev->features |= NETIF_F_NOCACHE_COPY;
5577 }
5578 }
5579
5580 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5581 */
5582 dev->vlan_features |= NETIF_F_HIGHDMA;
5583
5584 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5585 ret = notifier_to_errno(ret);
5586 if (ret)
5587 goto err_uninit;
5588
5589 ret = netdev_register_kobject(dev);
5590 if (ret)
5591 goto err_uninit;
5592 dev->reg_state = NETREG_REGISTERED;
5593
5594 __netdev_update_features(dev);
5595
5596 /*
5597 * Default initial state at registration is that the
5598 * device is present.
5599 */
5600
5601 set_bit(__LINK_STATE_PRESENT, &dev->state);
5602
5603 linkwatch_init_dev(dev);
5604
5605 dev_init_scheduler(dev);
5606 dev_hold(dev);
5607 list_netdevice(dev);
5608 add_device_randomness(dev->dev_addr, dev->addr_len);
5609
5610 /* If the device has a permanent device address, the driver should
5611 * set dev_addr and also set addr_assign_type to
5612 * NET_ADDR_PERM (the default value).
5613 */
5614 if (dev->addr_assign_type == NET_ADDR_PERM)
5615 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5616
5617 /* Notify protocols that a new device appeared. */
5618 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5619 ret = notifier_to_errno(ret);
5620 if (ret) {
5621 rollback_registered(dev);
5622 dev->reg_state = NETREG_UNREGISTERED;
5623 }
5624 /*
5625 * Prevent userspace races by waiting until the network
5626 * device is fully set up before sending notifications.
5627 */
5628 if (!dev->rtnl_link_ops ||
5629 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5630 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5631
5632 out:
5633 return ret;
5634
5635 err_uninit:
5636 if (dev->netdev_ops->ndo_uninit)
5637 dev->netdev_ops->ndo_uninit(dev);
5638 goto out;
5639 }
5640 EXPORT_SYMBOL(register_netdevice);
5641
5642 /**
5643 * init_dummy_netdev - init a dummy network device for NAPI
5644 * @dev: device to init
5645 *
5646 * This takes a network device structure and initializes the minimum
5647 * number of fields so it can be used to schedule NAPI polls without
5648 * registering a full-blown interface. This is to be used by drivers
5649 * that need to tie several hardware interfaces to a single NAPI
5650 * poll scheduler due to HW limitations.
5651 */
5652 int init_dummy_netdev(struct net_device *dev)
5653 {
5654 /* Clear everything. Note we don't initialize spinlocks
5655 * as they aren't supposed to be taken by any of the
5656 * NAPI code and this dummy netdev is supposed to be
5657 * used only for NAPI polls.
5658 */
5659 memset(dev, 0, sizeof(struct net_device));
5660
5661 /* make sure we BUG if trying to hit standard
5662 * register/unregister code path
5663 */
5664 dev->reg_state = NETREG_DUMMY;
5665
5666 /* NAPI wants this */
5667 INIT_LIST_HEAD(&dev->napi_list);
5668
5669 /* a dummy interface is started by default */
5670 set_bit(__LINK_STATE_PRESENT, &dev->state);
5671 set_bit(__LINK_STATE_START, &dev->state);
5672
5673 /* Note: we don't allocate pcpu_refcnt for dummy devices,
5674 * because users of this 'device' don't need to change
5675 * its refcount.
5676 */
5677
5678 return 0;
5679 }
5680 EXPORT_SYMBOL_GPL(init_dummy_netdev);
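
/*
 * Usage sketch (hypothetical driver code, compiled out): hardware with a
 * single interrupt/poll source behind several netdevs can hang its NAPI
 * context off a dummy netdev that is never registered.  "example_hw" and
 * example_hw_poll() are illustrative names.
 */
#if 0
static int example_hw_poll(struct napi_struct *napi, int budget);

struct example_hw {
	struct net_device dummy_dev;	/* NAPI anchor only, never registered */
	struct napi_struct napi;
};

static void example_hw_napi_init(struct example_hw *hw)
{
	init_dummy_netdev(&hw->dummy_dev);
	netif_napi_add(&hw->dummy_dev, &hw->napi, example_hw_poll, 64);
	napi_enable(&hw->napi);
}
#endif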
5681
5682
5683 /**
5684 * register_netdev - register a network device
5685 * @dev: device to register
5686 *
5687 * Take a completed network device structure and add it to the kernel
5688 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5689 * chain. 0 is returned on success. A negative errno code is returned
5690 * on a failure to set up the device, or if the name is a duplicate.
5691 *
5692 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5693 * and expands the device name if you passed a format string to
5694 * alloc_netdev.
5695 */
5696 int register_netdev(struct net_device *dev)
5697 {
5698 int err;
5699
5700 rtnl_lock();
5701 err = register_netdevice(dev);
5702 rtnl_unlock();
5703 return err;
5704 }
5705 EXPORT_SYMBOL(register_netdev);
5706
5707 int netdev_refcnt_read(const struct net_device *dev)
5708 {
5709 int i, refcnt = 0;
5710
5711 for_each_possible_cpu(i)
5712 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5713 return refcnt;
5714 }
5715 EXPORT_SYMBOL(netdev_refcnt_read);
5716
5717 /**
5718 * netdev_wait_allrefs - wait until all references are gone.
5719 * @dev: target net_device
5720 *
5721 * This is called when unregistering network devices.
5722 *
5723 * Any protocol or device that holds a reference should register
5724 * for netdevice notification, and cleanup and put back the
5725 * reference if they receive an UNREGISTER event.
5726 * We can get stuck here if buggy protocols don't correctly
5727 * call dev_put.
5728 */
5729 static void netdev_wait_allrefs(struct net_device *dev)
5730 {
5731 unsigned long rebroadcast_time, warning_time;
5732 int refcnt;
5733
5734 linkwatch_forget_dev(dev);
5735
5736 rebroadcast_time = warning_time = jiffies;
5737 refcnt = netdev_refcnt_read(dev);
5738
5739 while (refcnt != 0) {
5740 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5741 rtnl_lock();
5742
5743 /* Rebroadcast unregister notification */
5744 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5745
5746 __rtnl_unlock();
5747 rcu_barrier();
5748 rtnl_lock();
5749
5750 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5751 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5752 &dev->state)) {
5753 /* We must not have linkwatch events
5754 * pending on unregister. If this
5755 * happens, we simply run the queue
5756 * unscheduled, resulting in a noop
5757 * for this device.
5758 */
5759 linkwatch_run_queue();
5760 }
5761
5762 __rtnl_unlock();
5763
5764 rebroadcast_time = jiffies;
5765 }
5766
5767 msleep(250);
5768
5769 refcnt = netdev_refcnt_read(dev);
5770
5771 if (time_after(jiffies, warning_time + 10 * HZ)) {
5772 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5773 dev->name, refcnt);
5774 warning_time = jiffies;
5775 }
5776 }
5777 }
5778
5779 /* The sequence is:
5780 *
5781 * rtnl_lock();
5782 * ...
5783 * register_netdevice(x1);
5784 * register_netdevice(x2);
5785 * ...
5786 * unregister_netdevice(y1);
5787 * unregister_netdevice(y2);
5788 * ...
5789 * rtnl_unlock();
5790 * free_netdev(y1);
5791 * free_netdev(y2);
5792 *
5793 * We are invoked by rtnl_unlock().
5794 * This allows us to deal with problems:
5795 * 1) We can delete sysfs objects which invoke hotplug
5796 * without deadlocking with linkwatch via keventd.
5797 * 2) Since we run with the RTNL semaphore not held, we can sleep
5798 * safely in order to wait for the netdev refcnt to drop to zero.
5799 *
5800 * We must not return until all unregister events added during
5801 * the interval the lock was held have been completed.
5802 */
5803 void netdev_run_todo(void)
5804 {
5805 struct list_head list;
5806
5807 /* Snapshot list, allow later requests */
5808 list_replace_init(&net_todo_list, &list);
5809
5810 __rtnl_unlock();
5811
5812
5813 /* Wait for rcu callbacks to finish before next phase */
5814 if (!list_empty(&list))
5815 rcu_barrier();
5816
5817 while (!list_empty(&list)) {
5818 struct net_device *dev
5819 = list_first_entry(&list, struct net_device, todo_list);
5820 list_del(&dev->todo_list);
5821
5822 rtnl_lock();
5823 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5824 __rtnl_unlock();
5825
5826 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5827 pr_err("network todo '%s' but state %d\n",
5828 dev->name, dev->reg_state);
5829 dump_stack();
5830 continue;
5831 }
5832
5833 dev->reg_state = NETREG_UNREGISTERED;
5834
5835 on_each_cpu(flush_backlog, dev, 1);
5836
5837 netdev_wait_allrefs(dev);
5838
5839 /* paranoia */
5840 BUG_ON(netdev_refcnt_read(dev));
5841 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5842 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5843 WARN_ON(dev->dn_ptr);
5844
5845 if (dev->destructor)
5846 dev->destructor(dev);
5847
5848 /* Free network device */
5849 kobject_put(&dev->dev.kobj);
5850 }
5851 }
5852
5853 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5854 * fields in the same order, with only the type differing.
5855 */
5856 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5857 const struct net_device_stats *netdev_stats)
5858 {
5859 #if BITS_PER_LONG == 64
5860 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5861 memcpy(stats64, netdev_stats, sizeof(*stats64));
5862 #else
5863 size_t i, n = sizeof(*stats64) / sizeof(u64);
5864 const unsigned long *src = (const unsigned long *)netdev_stats;
5865 u64 *dst = (u64 *)stats64;
5866
5867 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5868 sizeof(*stats64) / sizeof(u64));
5869 for (i = 0; i < n; i++)
5870 dst[i] = src[i];
5871 #endif
5872 }
5873 EXPORT_SYMBOL(netdev_stats_to_stats64);
5874
5875 /**
5876 * dev_get_stats - get network device statistics
5877 * @dev: device to get statistics from
5878 * @storage: place to store stats
5879 *
5880 * Get network statistics from device. Return @storage.
5881 * The device driver may provide its own method by setting
5882 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5883 * otherwise the internal statistics structure is used.
5884 */
5885 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5886 struct rtnl_link_stats64 *storage)
5887 {
5888 const struct net_device_ops *ops = dev->netdev_ops;
5889
5890 if (ops->ndo_get_stats64) {
5891 memset(storage, 0, sizeof(*storage));
5892 ops->ndo_get_stats64(dev, storage);
5893 } else if (ops->ndo_get_stats) {
5894 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5895 } else {
5896 netdev_stats_to_stats64(storage, &dev->stats);
5897 }
5898 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5899 return storage;
5900 }
5901 EXPORT_SYMBOL(dev_get_stats);
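
/*
 * Usage sketch (hypothetical driver code, compiled out): a driver-side
 * ndo_get_stats64() that starts from the software counters in dev->stats
 * and folds in hardware counters.  "example_priv" and its fields are
 * illustrative; netdev_stats_to_stats64() and netdev_priv() are real.
 */
#if 0
struct example_priv {
	u64 hw_rx_bytes;
	u64 hw_rx_packets;
};

static struct rtnl_link_stats64 *
example_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
{
	struct example_priv *priv = netdev_priv(dev);

	/* software counters first ... */
	netdev_stats_to_stats64(storage, &dev->stats);
	/* ... then fold in what the hardware counted for us */
	storage->rx_bytes   += priv->hw_rx_bytes;
	storage->rx_packets += priv->hw_rx_packets;
	return storage;
}
#endif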
5902
5903 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5904 {
5905 struct netdev_queue *queue = dev_ingress_queue(dev);
5906
5907 #ifdef CONFIG_NET_CLS_ACT
5908 if (queue)
5909 return queue;
5910 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5911 if (!queue)
5912 return NULL;
5913 netdev_init_one_queue(dev, queue, NULL);
5914 queue->qdisc = &noop_qdisc;
5915 queue->qdisc_sleeping = &noop_qdisc;
5916 rcu_assign_pointer(dev->ingress_queue, queue);
5917 #endif
5918 return queue;
5919 }
5920
5921 static const struct ethtool_ops default_ethtool_ops;
5922
5923 void netdev_set_default_ethtool_ops(struct net_device *dev,
5924 const struct ethtool_ops *ops)
5925 {
5926 if (dev->ethtool_ops == &default_ethtool_ops)
5927 dev->ethtool_ops = ops;
5928 }
5929 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5930
5931 /**
5932 * alloc_netdev_mqs - allocate network device
5933 * @sizeof_priv: size of private data to allocate space for
5934 * @name: device name format string
5935 * @setup: callback to initialize device
5936 * @txqs: the number of TX subqueues to allocate
5937 * @rxqs: the number of RX subqueues to allocate
5938 *
5939 * Allocates a struct net_device with private data area for driver use
5940 * and performs basic initialization. Also allocates subqueue structs
5941 * for each queue on the device.
5942 */
5943 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5944 void (*setup)(struct net_device *),
5945 unsigned int txqs, unsigned int rxqs)
5946 {
5947 struct net_device *dev;
5948 size_t alloc_size;
5949 struct net_device *p;
5950
5951 BUG_ON(strlen(name) >= sizeof(dev->name));
5952
5953 if (txqs < 1) {
5954 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5955 return NULL;
5956 }
5957
5958 #ifdef CONFIG_RPS
5959 if (rxqs < 1) {
5960 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5961 return NULL;
5962 }
5963 #endif
5964
5965 alloc_size = sizeof(struct net_device);
5966 if (sizeof_priv) {
5967 /* ensure 32-byte alignment of private area */
5968 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5969 alloc_size += sizeof_priv;
5970 }
5971 /* ensure 32-byte alignment of whole construct */
5972 alloc_size += NETDEV_ALIGN - 1;
5973
5974 p = kzalloc(alloc_size, GFP_KERNEL);
5975 if (!p)
5976 return NULL;
5977
5978 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5979 dev->padded = (char *)dev - (char *)p;
5980
5981 dev->pcpu_refcnt = alloc_percpu(int);
5982 if (!dev->pcpu_refcnt)
5983 goto free_p;
5984
5985 if (dev_addr_init(dev))
5986 goto free_pcpu;
5987
5988 dev_mc_init(dev);
5989 dev_uc_init(dev);
5990
5991 dev_net_set(dev, &init_net);
5992
5993 dev->gso_max_size = GSO_MAX_SIZE;
5994 dev->gso_max_segs = GSO_MAX_SEGS;
5995
5996 INIT_LIST_HEAD(&dev->napi_list);
5997 INIT_LIST_HEAD(&dev->unreg_list);
5998 INIT_LIST_HEAD(&dev->link_watch_list);
5999 INIT_LIST_HEAD(&dev->upper_dev_list);
6000 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6001 setup(dev);
6002
6003 dev->num_tx_queues = txqs;
6004 dev->real_num_tx_queues = txqs;
6005 if (netif_alloc_netdev_queues(dev))
6006 goto free_all;
6007
6008 #ifdef CONFIG_RPS
6009 dev->num_rx_queues = rxqs;
6010 dev->real_num_rx_queues = rxqs;
6011 if (netif_alloc_rx_queues(dev))
6012 goto free_all;
6013 #endif
6014
6015 strcpy(dev->name, name);
6016 dev->group = INIT_NETDEV_GROUP;
6017 if (!dev->ethtool_ops)
6018 dev->ethtool_ops = &default_ethtool_ops;
6019 return dev;
6020
6021 free_all:
6022 free_netdev(dev);
6023 return NULL;
6024
6025 free_pcpu:
6026 free_percpu(dev->pcpu_refcnt);
6027 kfree(dev->_tx);
6028 #ifdef CONFIG_RPS
6029 kfree(dev->_rx);
6030 #endif
6031
6032 free_p:
6033 kfree(p);
6034 return NULL;
6035 }
6036 EXPORT_SYMBOL(alloc_netdev_mqs);
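/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * allocating a multiqueue device with alloc_netdev_mqs() and registering it.
 * The private struct, the setup callback and the "ex%d" name template are
 * hypothetical; Ethernet drivers usually reach this function through the
 * alloc_etherdev_mqs() wrapper rather than calling it directly.
 */
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

struct example_priv {
	int dummy;			/* driver-private state */
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);		/* sane Ethernet defaults */
}

static int example_create(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev_mqs(sizeof(struct example_priv), "ex%d",
			       example_setup, 4, 4);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err) {
		free_netdev(dev);	/* safe: the device was never registered */
		return err;
	}
	return 0;
}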
6037
6038 /**
6039 * free_netdev - free network device
6040 * @dev: device
6041 *
6042 * This function does the last stage of destroying an allocated device
6043 * interface. The reference to the device object is released.
6044 * If this is the last reference then it will be freed.
6045 */
6046 void free_netdev(struct net_device *dev)
6047 {
6048 struct napi_struct *p, *n;
6049
6050 release_net(dev_net(dev));
6051
6052 kfree(dev->_tx);
6053 #ifdef CONFIG_RPS
6054 kfree(dev->_rx);
6055 #endif
6056
6057 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6058
6059 /* Flush device addresses */
6060 dev_addr_flush(dev);
6061
6062 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6063 netif_napi_del(p);
6064
6065 free_percpu(dev->pcpu_refcnt);
6066 dev->pcpu_refcnt = NULL;
6067
6068 /* Compatibility with error handling in drivers */
6069 if (dev->reg_state == NETREG_UNINITIALIZED) {
6070 kfree((char *)dev - dev->padded);
6071 return;
6072 }
6073
6074 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6075 dev->reg_state = NETREG_RELEASED;
6076
6077 /* will free via device release */
6078 put_device(&dev->dev);
6079 }
6080 EXPORT_SYMBOL(free_netdev);
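/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * a common way to pair free_netdev() with unregistration is to install it
 * as the device destructor in the setup callback, so the core releases the
 * memory once the last reference is dropped after unregistration. The
 * example_virt_setup() name is hypothetical; virtual drivers in this tree
 * (dummy, for instance) use the same pattern.
 */
static void example_virt_setup(struct net_device *dev)
{
	dev->destructor = free_netdev;	/* freed automatically after unregister */
}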
6081
6082 /**
6083 * synchronize_net - Synchronize with packet receive processing
6084 *
6085 * Wait for packets currently being received to be done.
6086 * Does not block later packets from starting.
6087 */
6088 void synchronize_net(void)
6089 {
6090 might_sleep();
6091 if (rtnl_is_locked())
6092 synchronize_rcu_expedited();
6093 else
6094 synchronize_rcu();
6095 }
6096 EXPORT_SYMBOL(synchronize_net);
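/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * the typical use of synchronize_net() is to wait out in-flight receive
 * paths after unpublishing an RCU-protected handler, before freeing its
 * state. The example_handler pointer and example_state struct are
 * hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_state {
	int counter;
};

static struct example_state __rcu *example_handler;

static void example_teardown(void)
{
	struct example_state *old;

	old = rcu_dereference_protected(example_handler, 1);
	RCU_INIT_POINTER(example_handler, NULL);	/* unpublish */
	synchronize_net();				/* wait for rx-path readers */
	kfree(old);
}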
6097
6098 /**
6099 * unregister_netdevice_queue - remove device from the kernel
6100 * @dev: device
6101 * @head: list
6102 *
6103 * This function shuts down a device interface and removes it
6104 * from the kernel tables.
6105 * If @head is not NULL, the device is queued to be unregistered later.
6106 *
6107 * Callers must hold the rtnl semaphore. You may want
6108 * unregister_netdev() instead of this.
6109 */
6110
6111 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6112 {
6113 ASSERT_RTNL();
6114
6115 if (head) {
6116 list_move_tail(&dev->unreg_list, head);
6117 } else {
6118 rollback_registered(dev);
6119 /* Finish processing unregister after unlock */
6120 net_set_todo(dev);
6121 }
6122 }
6123 EXPORT_SYMBOL(unregister_netdevice_queue);
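/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * the immediate form passes a NULL list head, which is exactly what the
 * unregister_netdevice() wrapper does, so the rollback happens right away
 * under RTNL. The example_destroy() helper is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void example_destroy(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice_queue(dev, NULL);	/* same as unregister_netdevice(dev) */
	rtnl_unlock();
}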
6124
6125 /**
6126 * unregister_netdevice_many - unregister many devices
6127 * @head: list of devices
6128 */
6129 void unregister_netdevice_many(struct list_head *head)
6130 {
6131 struct net_device *dev;
6132
6133 if (!list_empty(head)) {
6134 rollback_registered_many(head);
6135 list_for_each_entry(dev, head, unreg_list)
6136 net_set_todo(dev);
6137 }
6138 }
6139 EXPORT_SYMBOL(unregister_netdevice_many);
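/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * queueing several devices on one list and unregistering them together
 * amortizes the RCU synchronization done in rollback_registered_many().
 * The two device pointers and the helper name are hypothetical.
 */
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void example_destroy_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	rtnl_lock();
	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}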
6140
6141 /**
6142 * unregister_netdev - remove device from the kernel
6143 * @dev: device
6144 *
6145 * This function shuts down a device interface and removes it
6146 * from the kernel tables.
6147 *
6148 * This is just a wrapper for unregister_netdevice that takes
6149 * the rtnl semaphore. In general you want to use this and not
6150 * unregister_netdevice.
6151 */
6152 void unregister_netdev(struct net_device *dev)
6153 {
6154 rtnl_lock();
6155 unregister_netdevice(dev);
6156 rtnl_unlock();
6157 }
6158 EXPORT_SYMBOL(unregister_netdev);
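/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * module teardown for a driver that did not install a destructor.
 * unregister_netdev() takes RTNL itself; free_netdev() then releases the
 * memory. example_dev is a hypothetical module-global.
 */
static struct net_device *example_dev;

static void example_module_exit(void)
{
	unregister_netdev(example_dev);
	free_netdev(example_dev);
}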
6159
6160 /**
6161 * dev_change_net_namespace - move device to a different network namespace
6162 * @dev: device
6163 * @net: network namespace
6164 * @pat: If not NULL name pattern to try if the current device name
6165 * is already taken in the destination network namespace.
6166 *
6167 * This function shuts down a device interface and moves it
6168 * to a new network namespace. On success 0 is returned, on
6169 * a failure a negative errno code is returned.
6170 *
6171 * Callers must hold the rtnl semaphore.
6172 */
6173
6174 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6175 {
6176 int err;
6177
6178 ASSERT_RTNL();
6179
6180 /* Don't allow namespace local devices to be moved. */
6181 err = -EINVAL;
6182 if (dev->features & NETIF_F_NETNS_LOCAL)
6183 goto out;
6184
6185 /* Ensure the device has been registered */
6186 if (dev->reg_state != NETREG_REGISTERED)
6187 goto out;
6188
6189 /* Get out if there is nothing to do */
6190 err = 0;
6191 if (net_eq(dev_net(dev), net))
6192 goto out;
6193
6194 /* Pick the destination device name, and ensure
6195 * we can use it in the destination network namespace.
6196 */
6197 err = -EEXIST;
6198 if (__dev_get_by_name(net, dev->name)) {
6199 /* We get here if we can't use the current device name */
6200 if (!pat)
6201 goto out;
6202 if (dev_get_valid_name(net, dev, pat) < 0)
6203 goto out;
6204 }
6205
6206 /*
6207 * And now a mini version of register_netdevice and unregister_netdevice.
6208 */
6209
6210 /* If device is running close it first. */
6211 dev_close(dev);
6212
6213 /* And unlink it from device chain */
6214 err = -ENODEV;
6215 unlist_netdevice(dev);
6216
6217 synchronize_net();
6218
6219 /* Shutdown queueing discipline. */
6220 dev_shutdown(dev);
6221
6222 /* Notify protocols that we are about to destroy
6223 this device. They should clean up all of their state.
6224
6225 Note that dev->reg_state stays at NETREG_REGISTERED.
6226 This is desired so that 8021q and macvlan know
6227 the device is just moving and can keep their slaves up.
6228 */
6229 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6230 rcu_barrier();
6231 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6232 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6233
6234 /*
6235 * Flush the unicast and multicast chains
6236 */
6237 dev_uc_flush(dev);
6238 dev_mc_flush(dev);
6239
6240 /* Send a netdev-removed uevent to the old namespace */
6241 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6242
6243 /* Actually switch the network namespace */
6244 dev_net_set(dev, net);
6245
6246 /* If there is an ifindex conflict assign a new one */
6247 if (__dev_get_by_index(net, dev->ifindex)) {
6248 int iflink = (dev->iflink == dev->ifindex);
6249 dev->ifindex = dev_new_index(net);
6250 if (iflink)
6251 dev->iflink = dev->ifindex;
6252 }
6253
6254 /* Send a netdev-add uevent to the new namespace */
6255 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6256
6257 /* Fixup kobjects */
6258 err = device_rename(&dev->dev, dev->name);
6259 WARN_ON(err);
6260
6261 /* Add the device back in the hashes */
6262 list_netdevice(dev);
6263
6264 /* Notify protocols that a new device appeared. */
6265 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6266
6267 /*
6268 * Prevent userspace races by waiting until the network
6269 * device is fully set up before sending notifications.
6270 */
6271 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6272
6273 synchronize_net();
6274 err = 0;
6275 out:
6276 return err;
6277 }
6278 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
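/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * moving a device into another namespace under RTNL, falling back to a
 * "dev%d" name template if the current name is already taken there, much
 * like default_device_exit() below. The example_move() wrapper is
 * hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_move(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();

	return err;
}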
6279
6280 static int dev_cpu_callback(struct notifier_block *nfb,
6281 unsigned long action,
6282 void *ocpu)
6283 {
6284 struct sk_buff **list_skb;
6285 struct sk_buff *skb;
6286 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6287 struct softnet_data *sd, *oldsd;
6288
6289 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6290 return NOTIFY_OK;
6291
6292 local_irq_disable();
6293 cpu = smp_processor_id();
6294 sd = &per_cpu(softnet_data, cpu);
6295 oldsd = &per_cpu(softnet_data, oldcpu);
6296
6297 /* Find end of our completion_queue. */
6298 list_skb = &sd->completion_queue;
6299 while (*list_skb)
6300 list_skb = &(*list_skb)->next;
6301 /* Append completion queue from offline CPU. */
6302 *list_skb = oldsd->completion_queue;
6303 oldsd->completion_queue = NULL;
6304
6305 /* Append output queue from offline CPU. */
6306 if (oldsd->output_queue) {
6307 *sd->output_queue_tailp = oldsd->output_queue;
6308 sd->output_queue_tailp = oldsd->output_queue_tailp;
6309 oldsd->output_queue = NULL;
6310 oldsd->output_queue_tailp = &oldsd->output_queue;
6311 }
6312 /* Append NAPI poll list from offline CPU. */
6313 if (!list_empty(&oldsd->poll_list)) {
6314 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6315 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6316 }
6317
6318 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6319 local_irq_enable();
6320
6321 /* Process offline CPU's input_pkt_queue */
6322 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6323 netif_rx(skb);
6324 input_queue_head_incr(oldsd);
6325 }
6326 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6327 netif_rx(skb);
6328 input_queue_head_incr(oldsd);
6329 }
6330
6331 return NOTIFY_OK;
6332 }
6333
6334
6335 /**
6336 * netdev_increment_features - increment feature set by one
6337 * @all: current feature set
6338 * @one: new feature set
6339 * @mask: mask feature set
6340 *
6341 * Computes a new feature set after adding a device with feature set
6342 * @one to the master device with current feature set @all. Will not
6343 * enable anything that is off in @mask. Returns the new feature set.
6344 */
6345 netdev_features_t netdev_increment_features(netdev_features_t all,
6346 netdev_features_t one, netdev_features_t mask)
6347 {
6348 if (mask & NETIF_F_GEN_CSUM)
6349 mask |= NETIF_F_ALL_CSUM;
6350 mask |= NETIF_F_VLAN_CHALLENGED;
6351
6352 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6353 all &= one | ~NETIF_F_ALL_FOR_ALL;
6354
6355 /* If one device supports hw checksumming, set for all. */
6356 if (all & NETIF_F_GEN_CSUM)
6357 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6358
6359 return all;
6360 }
6361 EXPORT_SYMBOL(netdev_increment_features);
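/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * how a master driver in the spirit of bonding or bridging folds its
 * slaves' feature sets together one at a time, starting from the permitted
 * mask. struct example_slave and its list linkage are hypothetical.
 */
#include <linux/list.h>
#include <linux/netdevice.h>

struct example_slave {
	struct net_device *dev;
	struct list_head list;
};

static netdev_features_t example_master_features(struct list_head *slaves,
						 netdev_features_t mask)
{
	struct example_slave *s;
	netdev_features_t features = mask;	/* start from what the master allows */

	list_for_each_entry(s, slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features, mask);

	return features;
}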
6362
6363 static struct hlist_head *netdev_create_hash(void)
6364 {
6365 int i;
6366 struct hlist_head *hash;
6367
6368 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6369 if (hash != NULL)
6370 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6371 INIT_HLIST_HEAD(&hash[i]);
6372
6373 return hash;
6374 }
6375
6376 /* Initialize per network namespace state */
6377 static int __net_init netdev_init(struct net *net)
6378 {
6379 if (net != &init_net)
6380 INIT_LIST_HEAD(&net->dev_base_head);
6381
6382 net->dev_name_head = netdev_create_hash();
6383 if (net->dev_name_head == NULL)
6384 goto err_name;
6385
6386 net->dev_index_head = netdev_create_hash();
6387 if (net->dev_index_head == NULL)
6388 goto err_idx;
6389
6390 return 0;
6391
6392 err_idx:
6393 kfree(net->dev_name_head);
6394 err_name:
6395 return -ENOMEM;
6396 }
6397
6398 /**
6399 * netdev_drivername - network driver for the device
6400 * @dev: network device
6401 *
6402 * Determine network driver for device.
6403 */
6404 const char *netdev_drivername(const struct net_device *dev)
6405 {
6406 const struct device_driver *driver;
6407 const struct device *parent;
6408 const char *empty = "";
6409
6410 parent = dev->dev.parent;
6411 if (!parent)
6412 return empty;
6413
6414 driver = parent->driver;
6415 if (driver && driver->name)
6416 return driver->name;
6417 return empty;
6418 }
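/*
 * Illustrative sketch (editor's addition, not part of the original dev.c):
 * netdev_drivername() is used by core code such as the transmit watchdog to
 * name the offending driver in diagnostics; the message text below is an
 * invented example.
 */
static void example_report_stall(struct net_device *dev)
{
	netdev_err(dev, "transmit stall suspected in driver %s\n",
		   netdev_drivername(dev));
}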
6419
6420 static int __netdev_printk(const char *level, const struct net_device *dev,
6421 struct va_format *vaf)
6422 {
6423 int r;
6424
6425 if (dev && dev->dev.parent) {
6426 r = dev_printk_emit(level[1] - '0',
6427 dev->dev.parent,
6428 "%s %s %s: %pV",
6429 dev_driver_string(dev->dev.parent),
6430 dev_name(dev->dev.parent),
6431 netdev_name(dev), vaf);
6432 } else if (dev) {
6433 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6434 } else {
6435 r = printk("%s(NULL net_device): %pV", level, vaf);
6436 }
6437
6438 return r;
6439 }
6440
6441 int netdev_printk(const char *level, const struct net_device *dev,
6442 const char *format, ...)
6443 {
6444 struct va_format vaf;
6445 va_list args;
6446 int r;
6447
6448 va_start(args, format);
6449
6450 vaf.fmt = format;
6451 vaf.va = &args;
6452
6453 r = __netdev_printk(level, dev, &vaf);
6454
6455 va_end(args);
6456
6457 return r;
6458 }
6459 EXPORT_SYMBOL(netdev_printk);
6460
6461 #define define_netdev_printk_level(func, level) \
6462 int func(const struct net_device *dev, const char *fmt, ...) \
6463 { \
6464 int r; \
6465 struct va_format vaf; \
6466 va_list args; \
6467 \
6468 va_start(args, fmt); \
6469 \
6470 vaf.fmt = fmt; \
6471 vaf.va = &args; \
6472 \
6473 r = __netdev_printk(level, dev, &vaf); \
6474 \
6475 va_end(args); \
6476 \
6477 return r; \
6478 } \
6479 EXPORT_SYMBOL(func);
6480
6481 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6482 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6483 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6484 define_netdev_printk_level(netdev_err, KERN_ERR);
6485 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6486 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6487 define_netdev_printk_level(netdev_info, KERN_INFO);
6488
6489 static void __net_exit netdev_exit(struct net *net)
6490 {
6491 kfree(net->dev_name_head);
6492 kfree(net->dev_index_head);
6493 }
6494
6495 static struct pernet_operations __net_initdata netdev_net_ops = {
6496 .init = netdev_init,
6497 .exit = netdev_exit,
6498 };
6499
6500 static void __net_exit default_device_exit(struct net *net)
6501 {
6502 struct net_device *dev, *aux;
6503 /*
6504 * Push all migratable network devices back to the
6505 * initial network namespace
6506 */
6507 rtnl_lock();
6508 for_each_netdev_safe(net, dev, aux) {
6509 int err;
6510 char fb_name[IFNAMSIZ];
6511
6512 /* Ignore unmoveable devices (i.e. loopback) */
6513 if (dev->features & NETIF_F_NETNS_LOCAL)
6514 continue;
6515
6516 /* Leave virtual devices for the generic cleanup */
6517 if (dev->rtnl_link_ops)
6518 continue;
6519
6520 /* Push remaining network devices to init_net */
6521 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6522 err = dev_change_net_namespace(dev, &init_net, fb_name);
6523 if (err) {
6524 pr_emerg("%s: failed to move %s to init_net: %d\n",
6525 __func__, dev->name, err);
6526 BUG();
6527 }
6528 }
6529 rtnl_unlock();
6530 }
6531
6532 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6533 {
6534 /* At exit all network devices must be removed from a network
6535 * namespace. Do this in the reverse order of registration.
6536 * Do this across as many network namespaces as possible to
6537 * improve batching efficiency.
6538 */
6539 struct net_device *dev;
6540 struct net *net;
6541 LIST_HEAD(dev_kill_list);
6542
6543 rtnl_lock();
6544 list_for_each_entry(net, net_list, exit_list) {
6545 for_each_netdev_reverse(net, dev) {
6546 if (dev->rtnl_link_ops)
6547 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6548 else
6549 unregister_netdevice_queue(dev, &dev_kill_list);
6550 }
6551 }
6552 unregister_netdevice_many(&dev_kill_list);
6553 list_del(&dev_kill_list);
6554 rtnl_unlock();
6555 }
6556
6557 static struct pernet_operations __net_initdata default_device_ops = {
6558 .exit = default_device_exit,
6559 .exit_batch = default_device_exit_batch,
6560 };
6561
6562 /*
6563 * Initialize the DEV module. At boot time this walks the device list,
6564 * unhooks any devices that fail to initialise (normally hardware not
6565 * present), and leaves us with a valid list of present and active devices.
6566 *
6567 */
6568
6569 /*
6570 * This is called single threaded during boot, so no need
6571 * to take the rtnl semaphore.
6572 */
6573 static int __init net_dev_init(void)
6574 {
6575 int i, rc = -ENOMEM;
6576
6577 BUG_ON(!dev_boot_phase);
6578
6579 if (dev_proc_init())
6580 goto out;
6581
6582 if (netdev_kobject_init())
6583 goto out;
6584
6585 INIT_LIST_HEAD(&ptype_all);
6586 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6587 INIT_LIST_HEAD(&ptype_base[i]);
6588
6589 INIT_LIST_HEAD(&offload_base);
6590
6591 if (register_pernet_subsys(&netdev_net_ops))
6592 goto out;
6593
6594 /*
6595 * Initialise the packet receive queues.
6596 */
6597
6598 for_each_possible_cpu(i) {
6599 struct softnet_data *sd = &per_cpu(softnet_data, i);
6600
6601 memset(sd, 0, sizeof(*sd));
6602 skb_queue_head_init(&sd->input_pkt_queue);
6603 skb_queue_head_init(&sd->process_queue);
6604 sd->completion_queue = NULL;
6605 INIT_LIST_HEAD(&sd->poll_list);
6606 sd->output_queue = NULL;
6607 sd->output_queue_tailp = &sd->output_queue;
6608 #ifdef CONFIG_RPS
6609 sd->csd.func = rps_trigger_softirq;
6610 sd->csd.info = sd;
6611 sd->csd.flags = 0;
6612 sd->cpu = i;
6613 #endif
6614
6615 sd->backlog.poll = process_backlog;
6616 sd->backlog.weight = weight_p;
6617 sd->backlog.gro_list = NULL;
6618 sd->backlog.gro_count = 0;
6619 }
6620
6621 dev_boot_phase = 0;
6622
6623 /* The loopback device is special: if any other network
6624 * device is present in a network namespace, the loopback
6625 * device must be present as well. Since we now dynamically
6626 * allocate and free the loopback device, maintain this
6627 * invariant by keeping the loopback device as the first
6628 * device on the list of network devices, ensuring that it
6629 * is the first device to appear and the last network
6630 * device to disappear.
6631 */
6632 if (register_pernet_device(&loopback_net_ops))
6633 goto out;
6634
6635 if (register_pernet_device(&default_device_ops))
6636 goto out;
6637
6638 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6639 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6640
6641 hotcpu_notifier(dev_cpu_callback, 0);
6642 dst_init();
6643 dev_mcast_init();
6644 rc = 0;
6645 out:
6646 return rc;
6647 }
6648
6649 subsys_initcall(net_dev_init);