1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136
137 #include "net-sysfs.h"
138
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145 /*
146 * The list of packet types we will receive (as opposed to discard)
147 * and the routines to invoke.
148 *
149  *      Why 16? Because with 16 the only overlap we get on a hash of the
150 * low nibble of the protocol value is RARP/SNAP/X.25.
151 *
152 * NOTE: That is no longer true with the addition of VLAN tags. Not
153 * sure which should go first, but I bet it won't make much
154 * difference if we are running VLANs. The good news is that
155 * this protocol won't be in the list unless compiled in, so
156 * the average user (w/out VLANs) will not be adversely affected.
157 * --BLG
158 *
159 * 0800 IP
160 * 8100 802.1Q VLAN
161 * 0001 802.3
162 * 0002 AX.25
163 * 0004 802.2
164 * 8035 RARP
165 * 0005 SNAP
166 * 0805 X.25
167 * 0806 ARP
168 * 8137 IPX
169 * 0009 Localtalk
170 * 86DD IPv6
171 */
172
173 #define PTYPE_HASH_SIZE (16)
174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly; /* Taps */
179
180 /*
181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182 * semaphore.
183 *
184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185 *
186 * Writers must hold the rtnl semaphore while they loop through the
187 * dev_base_head list, and hold dev_base_lock for writing when they do the
188 * actual updates. This allows pure readers to access the list even
189 * while a writer is preparing to update it.
190 *
191 * To put it another way, dev_base_lock is held for writing only to
192 * protect against pure readers; the rtnl semaphore provides the
193 * protection against other writers.
194 *
195 * See, for example usages, register_netdevice() and
196 * unregister_netdevice(), which must be called with the rtnl
197 * semaphore held.
198 */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
201
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 struct net *net = dev_net(dev);
231
232 ASSERT_RTNL();
233
234 write_lock_bh(&dev_base_lock);
235 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 hlist_add_head_rcu(&dev->index_hlist,
238 dev_index_hash(net, dev->ifindex));
239 write_unlock_bh(&dev_base_lock);
240 return 0;
241 }
242
243 /* Device list removal
244  * caller must respect an RCU grace period before freeing/reusing dev
245 */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 ASSERT_RTNL();
249
250 /* Unlink dev from the device chain */
251 write_lock_bh(&dev_base_lock);
252 list_del_rcu(&dev->dev_list);
253 hlist_del_rcu(&dev->name_hlist);
254 hlist_del_rcu(&dev->index_hlist);
255 write_unlock_bh(&dev_base_lock);
256 }
257
258 /*
259 * Our notifier list
260 */
261
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263
264 /*
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
267 */
268
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272 #ifdef CONFIG_LOCKDEP
273 /*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
277 static const unsigned short netdev_lock_type[] =
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 ARPHRD_VOID, ARPHRD_NONE};
294
295 static const char *const netdev_lock_name[] =
296 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 "_xmit_VOID", "_xmit_NONE"};
312
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 int i;
319
320 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 if (netdev_lock_type[i] == dev_type)
322 return i;
323 /* the last key is used by default */
324 return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 unsigned short dev_type)
329 {
330 int i;
331
332 i = netdev_lock_pos(dev_type);
333 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 netdev_lock_name[i]);
335 }
336
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 int i;
340
341 i = netdev_lock_pos(dev->type);
342 lockdep_set_class_and_name(&dev->addr_list_lock,
343 &netdev_addr_lock_key[i],
344 netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355
356 /*******************************************************************************
357
358 Protocol management and registration routines
359
360 *******************************************************************************/
361
362 /*
363 * Add a protocol ID to the list. Now that the input handler is
364 * smarter we can dispense with all the messy stuff that used to be
365 * here.
366 *
367 * BEWARE!!! Protocol handlers, mangling input packets,
368 * MUST BE last in hash buckets and checking protocol handlers
369 * MUST start from promiscuous ptype_all chain in net_bh.
370 * It is true now, do not change it.
371  *      Explanation follows: if a protocol handler that mangles packets is
372  *      first on the list, it is not able to sense that the packet is
373  *      cloned and should be copied-on-write, so it will change it and
374  *      subsequent readers will get a broken packet.
375 * --ANK (980803)
376 */
377
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 if (pt->type == htons(ETH_P_ALL))
381 return &ptype_all;
382 else
383 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385
386 /**
387 * dev_add_pack - add packet handler
388 * @pt: packet type declaration
389 *
390 * Add a protocol handler to the networking stack. The passed &packet_type
391 * is linked into kernel lists and may not be freed until it has been
392 * removed from the kernel lists.
393 *
394  *      This call does not sleep, therefore it cannot
395  *      guarantee that all CPUs that are in the middle of receiving packets
396  *      will see the new packet type (until the next received packet).
397 */
398
399 void dev_add_pack(struct packet_type *pt)
400 {
401 struct list_head *head = ptype_head(pt);
402
403 spin_lock(&ptype_lock);
404 list_add_rcu(&pt->list, head);
405 spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
408
409 /**
410 * __dev_remove_pack - remove packet handler
411 * @pt: packet type declaration
412 *
413 * Remove a protocol handler that was previously added to the kernel
414 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
415 * from the kernel lists and can be freed or reused once this function
416 * returns.
417 *
418 * The packet type might still be in use by receivers
419  *      and must not be freed until after all the CPUs have gone
420 * through a quiescent state.
421 */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 struct list_head *head = ptype_head(pt);
425 struct packet_type *pt1;
426
427 spin_lock(&ptype_lock);
428
429 list_for_each_entry(pt1, head, list) {
430 if (pt == pt1) {
431 list_del_rcu(&pt->list);
432 goto out;
433 }
434 }
435
436 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441
442 /**
443 * dev_remove_pack - remove packet handler
444 * @pt: packet type declaration
445 *
446 * Remove a protocol handler that was previously added to the kernel
447 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
448 * from the kernel lists and can be freed or reused once this function
449 * returns.
450 *
451 * This call sleeps to guarantee that no CPU is looking at the packet
452 * type after return.
453 */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 __dev_remove_pack(pt);
457
458 synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
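/*
 * Illustrative usage sketch (not part of the original file): registering a
 * tap on all protocols with dev_add_pack() and tearing it down with
 * dev_remove_pack().  The handler my_tap_rcv() and the structure name
 * my_tap are hypothetical; the field/handler signatures follow the
 * struct packet_type usage seen elsewhere in this file.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		(the handler owns this skb reference)
 *		return 0;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = htons(ETH_P_ALL),
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);		(e.g. from module init)
 *	dev_remove_pack(&my_tap);	(from module exit; sleeps in synchronize_net())
 */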
461
462 /******************************************************************************
463
464 Device Boot-time Settings Routines
465
466 *******************************************************************************/
467
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470
471 /**
472 * netdev_boot_setup_add - add new setup entry
473 * @name: name of the device
474 * @map: configured settings for the device
475 *
476  *      Adds a new setup entry to the dev_boot_setup list.  The function
477  *      returns 0 on error and 1 on success.  This is a generic routine for
478  *      all netdevices.
479 */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 struct netdev_boot_setup *s;
483 int i;
484
485 s = dev_boot_setup;
486 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 memset(s[i].name, 0, sizeof(s[i].name));
489 strlcpy(s[i].name, name, IFNAMSIZ);
490 memcpy(&s[i].map, map, sizeof(s[i].map));
491 break;
492 }
493 }
494
495 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497
498 /**
499 * netdev_boot_setup_check - check boot time settings
500 * @dev: the netdevice
501 *
502 * Check boot time settings for the device.
503  *      Any settings found are applied to the device so they can be used
504  *      later during device probing.
505  *      Returns 0 if no settings are found, 1 if they are.
506 */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 struct netdev_boot_setup *s = dev_boot_setup;
510 int i;
511
512 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 !strcmp(dev->name, s[i].name)) {
515 dev->irq = s[i].map.irq;
516 dev->base_addr = s[i].map.base_addr;
517 dev->mem_start = s[i].map.mem_start;
518 dev->mem_end = s[i].map.mem_end;
519 return 1;
520 }
521 }
522 return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525
526
527 /**
528 * netdev_boot_base - get address from boot time settings
529 * @prefix: prefix for network device
530 * @unit: id for network device
531 *
532  *      Check boot time settings for the base address of the device.
533  *      Any settings found are applied to the device so they can be used
534  *      later during device probing.
535  *      Returns 0 if no settings are found.
536 */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 const struct netdev_boot_setup *s = dev_boot_setup;
540 char name[IFNAMSIZ];
541 int i;
542
543 sprintf(name, "%s%d", prefix, unit);
544
545 /*
546  * If the device is already registered then return a base of 1
547  * to indicate not to probe for this interface
548 */
549 if (__dev_get_by_name(&init_net, name))
550 return 1;
551
552 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 if (!strcmp(name, s[i].name))
554 return s[i].map.base_addr;
555 return 0;
556 }
557
558 /*
559 * Saves at boot time configured settings for any netdevice.
560 */
561 int __init netdev_boot_setup(char *str)
562 {
563 int ints[5];
564 struct ifmap map;
565
566 str = get_options(str, ARRAY_SIZE(ints), ints);
567 if (!str || !*str)
568 return 0;
569
570 /* Save settings */
571 memset(&map, 0, sizeof(map));
572 if (ints[0] > 0)
573 map.irq = ints[1];
574 if (ints[0] > 1)
575 map.base_addr = ints[2];
576 if (ints[0] > 2)
577 map.mem_start = ints[3];
578 if (ints[0] > 3)
579 map.mem_end = ints[4];
580
581 /* Add new entry to the list */
582 return netdev_boot_setup_add(str, &map);
583 }
584
585 __setup("netdev=", netdev_boot_setup);
586
587 /*******************************************************************************
588
589 Device Interface Subroutines
590
591 *******************************************************************************/
592
593 /**
594 * __dev_get_by_name - find a device by its name
595 * @net: the applicable net namespace
596 * @name: name to find
597 *
598 * Find an interface by name. Must be called under RTNL semaphore
599 * or @dev_base_lock. If the name is found a pointer to the device
600 * is returned. If the name is not found then %NULL is returned. The
601 * reference counters are not incremented so the caller must be
602 * careful with locks.
603 */
604
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 struct hlist_node *p;
608 struct net_device *dev;
609 struct hlist_head *head = dev_name_hash(net, name);
610
611 hlist_for_each_entry(dev, p, head, name_hlist)
612 if (!strncmp(dev->name, name, IFNAMSIZ))
613 return dev;
614
615 return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618
619 /**
620 * dev_get_by_name_rcu - find a device by its name
621 * @net: the applicable net namespace
622 * @name: name to find
623 *
624 * Find an interface by name.
625 * If the name is found a pointer to the device is returned.
626 * If the name is not found then %NULL is returned.
627 * The reference counters are not incremented so the caller must be
628 * careful with locks. The caller must hold RCU lock.
629 */
630
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 struct hlist_node *p;
634 struct net_device *dev;
635 struct hlist_head *head = dev_name_hash(net, name);
636
637 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 if (!strncmp(dev->name, name, IFNAMSIZ))
639 return dev;
640
641 return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644
645 /**
646 * dev_get_by_name - find a device by its name
647 * @net: the applicable net namespace
648 * @name: name to find
649 *
650 * Find an interface by name. This can be called from any
651 * context and does its own locking. The returned handle has
652 * the usage count incremented and the caller must use dev_put() to
653 * release it when it is no longer needed. %NULL is returned if no
654 * matching device is found.
655 */
656
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 struct net_device *dev;
660
661 rcu_read_lock();
662 dev = dev_get_by_name_rcu(net, name);
663 if (dev)
664 dev_hold(dev);
665 rcu_read_unlock();
666 return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
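/*
 * Illustrative usage sketch (not part of the original file): looking up a
 * device by name from process context.  The name "eth0" is only an example.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		(... use dev ...)
 *		dev_put(dev);	(drop the reference taken by dev_get_by_name())
 *	}
 *
 * Callers already running under rcu_read_lock() can use dev_get_by_name_rcu()
 * instead and skip the dev_hold()/dev_put() pair, as long as the pointer is
 * not used outside the RCU read-side section.
 */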
669
670 /**
671 * __dev_get_by_index - find a device by its ifindex
672 * @net: the applicable net namespace
673 * @ifindex: index of device
674 *
675 * Search for an interface by index. Returns %NULL if the device
676 * is not found or a pointer to the device. The device has not
677 * had its reference counter increased so the caller must be careful
678 * about locking. The caller must hold either the RTNL semaphore
679 * or @dev_base_lock.
680 */
681
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 struct hlist_node *p;
685 struct net_device *dev;
686 struct hlist_head *head = dev_index_hash(net, ifindex);
687
688 hlist_for_each_entry(dev, p, head, index_hlist)
689 if (dev->ifindex == ifindex)
690 return dev;
691
692 return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695
696 /**
697 * dev_get_by_index_rcu - find a device by its ifindex
698 * @net: the applicable net namespace
699 * @ifindex: index of device
700 *
701 * Search for an interface by index. Returns %NULL if the device
702 * is not found or a pointer to the device. The device has not
703 * had its reference counter increased so the caller must be careful
704 * about locking. The caller must hold RCU lock.
705 */
706
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 struct hlist_node *p;
710 struct net_device *dev;
711 struct hlist_head *head = dev_index_hash(net, ifindex);
712
713 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 if (dev->ifindex == ifindex)
715 return dev;
716
717 return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720
721
722 /**
723 * dev_get_by_index - find a device by its ifindex
724 * @net: the applicable net namespace
725 * @ifindex: index of device
726 *
727 * Search for an interface by index. Returns NULL if the device
728 * is not found or a pointer to the device. The device returned has
729 * had a reference added and the pointer is safe until the user calls
730 * dev_put to indicate they have finished with it.
731 */
732
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 struct net_device *dev;
736
737 rcu_read_lock();
738 dev = dev_get_by_index_rcu(net, ifindex);
739 if (dev)
740 dev_hold(dev);
741 rcu_read_unlock();
742 return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745
746 /**
747 * dev_getbyhwaddr_rcu - find a device by its hardware address
748 * @net: the applicable net namespace
749 * @type: media type of device
750 * @ha: hardware address
751 *
752 * Search for an interface by MAC address. Returns NULL if the device
753 * is not found or a pointer to the device.
754 * The caller must hold RCU or RTNL.
755 * The returned device has not had its ref count increased
756 * and the caller must therefore be careful about locking
757 *
758 */
759
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 const char *ha)
762 {
763 struct net_device *dev;
764
765 for_each_netdev_rcu(net, dev)
766 if (dev->type == type &&
767 !memcmp(dev->dev_addr, ha, dev->addr_len))
768 return dev;
769
770 return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 struct net_device *dev;
777
778 ASSERT_RTNL();
779 for_each_netdev(net, dev)
780 if (dev->type == type)
781 return dev;
782
783 return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 struct net_device *dev, *ret = NULL;
790
791 rcu_read_lock();
792 for_each_netdev_rcu(net, dev)
793 if (dev->type == type) {
794 dev_hold(dev);
795 ret = dev;
796 break;
797 }
798 rcu_read_unlock();
799 return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802
803 /**
804 * dev_get_by_flags_rcu - find any device with given flags
805 * @net: the applicable net namespace
806 * @if_flags: IFF_* values
807 * @mask: bitmask of bits in if_flags to check
808 *
809 * Search for any interface with the given flags. Returns NULL if a device
810 * is not found or a pointer to the device. Must be called inside
811 * rcu_read_lock(), and result refcount is unchanged.
812 */
813
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 unsigned short mask)
816 {
817 struct net_device *dev, *ret;
818
819 ret = NULL;
820 for_each_netdev_rcu(net, dev) {
821 if (((dev->flags ^ if_flags) & mask) == 0) {
822 ret = dev;
823 break;
824 }
825 }
826 return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829
830 /**
831 * dev_valid_name - check if name is okay for network device
832 * @name: name string
833 *
834  *      Network device names need to be valid file names
835 * to allow sysfs to work. We also disallow any kind of
836 * whitespace.
837 */
838 int dev_valid_name(const char *name)
839 {
840 if (*name == '\0')
841 return 0;
842 if (strlen(name) >= IFNAMSIZ)
843 return 0;
844 if (!strcmp(name, ".") || !strcmp(name, ".."))
845 return 0;
846
847 while (*name) {
848 if (*name == '/' || isspace(*name))
849 return 0;
850 name++;
851 }
852 return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
855
856 /**
857 * __dev_alloc_name - allocate a name for a device
858 * @net: network namespace to allocate the device name in
859 * @name: name format string
860 * @buf: scratch buffer and result name string
861 *
862  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
863  *      id. It scans the list of devices to build up a free map, then chooses
864  *      the first empty slot. The caller must hold the dev_base or rtnl lock
865  *      while allocating the name and adding the device in order to avoid
866  *      duplicates.
867  *      Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
868 * Returns the number of the unit assigned or a negative errno code.
869 */
870
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 int i = 0;
874 const char *p;
875 const int max_netdevices = 8*PAGE_SIZE;
876 unsigned long *inuse;
877 struct net_device *d;
878
879 p = strnchr(name, IFNAMSIZ-1, '%');
880 if (p) {
881 /*
882 * Verify the string as this thing may have come from
883  * the user. There must be exactly one "%d" and no other "%"
884  * characters.
885 */
886 if (p[1] != 'd' || strchr(p + 2, '%'))
887 return -EINVAL;
888
889 /* Use one page as a bit array of possible slots */
890 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 if (!inuse)
892 return -ENOMEM;
893
894 for_each_netdev(net, d) {
895 if (!sscanf(d->name, name, &i))
896 continue;
897 if (i < 0 || i >= max_netdevices)
898 continue;
899
900 /* avoid cases where sscanf is not exact inverse of printf */
901 snprintf(buf, IFNAMSIZ, name, i);
902 if (!strncmp(buf, d->name, IFNAMSIZ))
903 set_bit(i, inuse);
904 }
905
906 i = find_first_zero_bit(inuse, max_netdevices);
907 free_page((unsigned long) inuse);
908 }
909
910 if (buf != name)
911 snprintf(buf, IFNAMSIZ, name, i);
912 if (!__dev_get_by_name(net, buf))
913 return i;
914
915 /* It is possible to run out of possible slots
916 * when the name is long and there isn't enough space left
917 * for the digits, or if all bits are used.
918 */
919 return -ENFILE;
920 }
921
922 /**
923 * dev_alloc_name - allocate a name for a device
924 * @dev: device
925 * @name: name format string
926 *
927  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
928  *      id. It scans the list of devices to build up a free map, then chooses
929  *      the first empty slot. The caller must hold the dev_base or rtnl lock
930  *      while allocating the name and adding the device in order to avoid
931  *      duplicates.
932  *      Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
933 * Returns the number of the unit assigned or a negative errno code.
934 */
935
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 char buf[IFNAMSIZ];
939 struct net *net;
940 int ret;
941
942 BUG_ON(!dev_net(dev));
943 net = dev_net(dev);
944 ret = __dev_alloc_name(net, name, buf);
945 if (ret >= 0)
946 strlcpy(dev->name, buf, IFNAMSIZ);
947 return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
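/*
 * Illustrative usage sketch (not part of the original file): a driver that
 * wants an automatically numbered name calls, before registration:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto out;	(hypothetical error path)
 *
 * On success dev->name holds e.g. "eth0" and err is the unit number assigned.
 */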
950
951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
952 {
953 struct net *net;
954
955 BUG_ON(!dev_net(dev));
956 net = dev_net(dev);
957
958 if (!dev_valid_name(name))
959 return -EINVAL;
960
961 if (fmt && strchr(name, '%'))
962 return dev_alloc_name(dev, name);
963 else if (__dev_get_by_name(net, name))
964 return -EEXIST;
965 else if (dev->name != name)
966 strlcpy(dev->name, name, IFNAMSIZ);
967
968 return 0;
969 }
970
971 /**
972 * dev_change_name - change name of a device
973 * @dev: device
974 * @newname: name (or format string) must be at least IFNAMSIZ
975 *
976  *      Change the name of a device; a format string such as "eth%d"
977  *      can be passed for wildcarding.
978 */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 char oldname[IFNAMSIZ];
982 int err = 0;
983 int ret;
984 struct net *net;
985
986 ASSERT_RTNL();
987 BUG_ON(!dev_net(dev));
988
989 net = dev_net(dev);
990 if (dev->flags & IFF_UP)
991 return -EBUSY;
992
993 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 return 0;
995
996 memcpy(oldname, dev->name, IFNAMSIZ);
997
998 err = dev_get_valid_name(dev, newname, 1);
999 if (err < 0)
1000 return err;
1001
1002 rollback:
1003 ret = device_rename(&dev->dev, dev->name);
1004 if (ret) {
1005 memcpy(dev->name, oldname, IFNAMSIZ);
1006 return ret;
1007 }
1008
1009 write_lock_bh(&dev_base_lock);
1010 hlist_del(&dev->name_hlist);
1011 write_unlock_bh(&dev_base_lock);
1012
1013 synchronize_rcu();
1014
1015 write_lock_bh(&dev_base_lock);
1016 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 write_unlock_bh(&dev_base_lock);
1018
1019 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 ret = notifier_to_errno(ret);
1021
1022 if (ret) {
1023 /* err >= 0 after dev_alloc_name() or stores the first errno */
1024 if (err >= 0) {
1025 err = ret;
1026 memcpy(dev->name, oldname, IFNAMSIZ);
1027 goto rollback;
1028 } else {
1029 printk(KERN_ERR
1030 "%s: name change rollback failed: %d.\n",
1031 dev->name, ret);
1032 }
1033 }
1034
1035 return err;
1036 }
1037
1038 /**
1039 * dev_set_alias - change ifalias of a device
1040 * @dev: device
1041 * @alias: name up to IFALIASZ
1042 * @len: limit of bytes to copy from info
1043 *
1044  *      Set ifalias for a device.
1045 */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 ASSERT_RTNL();
1049
1050 if (len >= IFALIASZ)
1051 return -EINVAL;
1052
1053 if (!len) {
1054 if (dev->ifalias) {
1055 kfree(dev->ifalias);
1056 dev->ifalias = NULL;
1057 }
1058 return 0;
1059 }
1060
1061 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 if (!dev->ifalias)
1063 return -ENOMEM;
1064
1065 strlcpy(dev->ifalias, alias, len+1);
1066 return len;
1067 }
1068
1069
1070 /**
1071 * netdev_features_change - device changes features
1072 * @dev: device to cause notification
1073 *
1074 * Called to indicate a device has changed features.
1075 */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081
1082 /**
1083 * netdev_state_change - device changes state
1084 * @dev: device to cause notification
1085 *
1086 * Called to indicate a device has changed state. This function calls
1087 * the notifier chains for netdev_chain and sends a NEWLINK message
1088 * to the routing socket.
1089 */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 if (dev->flags & IFF_UP) {
1093 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 }
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104
1105 /**
1106 * dev_load - load a network module
1107 * @net: the applicable net namespace
1108 * @name: name of interface
1109 *
1110 * If a network interface is not present and the process has suitable
1111  *      privileges, this function loads the module. If module loading is not
1112 * available in this kernel then it becomes a nop.
1113 */
1114
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 struct net_device *dev;
1118 int no_module;
1119
1120 rcu_read_lock();
1121 dev = dev_get_by_name_rcu(net, name);
1122 rcu_read_unlock();
1123
1124 no_module = !dev;
1125 if (no_module && capable(CAP_NET_ADMIN))
1126 no_module = request_module("netdev-%s", name);
1127 if (no_module && capable(CAP_SYS_MODULE)) {
1128 if (!request_module("%s", name))
1129 pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132 }
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135
1136 static int __dev_open(struct net_device *dev)
1137 {
1138 const struct net_device_ops *ops = dev->netdev_ops;
1139 int ret;
1140
1141 ASSERT_RTNL();
1142
1143 if (!netif_device_present(dev))
1144 return -ENODEV;
1145
1146 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147 ret = notifier_to_errno(ret);
1148 if (ret)
1149 return ret;
1150
1151 set_bit(__LINK_STATE_START, &dev->state);
1152
1153 if (ops->ndo_validate_addr)
1154 ret = ops->ndo_validate_addr(dev);
1155
1156 if (!ret && ops->ndo_open)
1157 ret = ops->ndo_open(dev);
1158
1159 if (ret)
1160 clear_bit(__LINK_STATE_START, &dev->state);
1161 else {
1162 dev->flags |= IFF_UP;
1163 net_dmaengine_get();
1164 dev_set_rx_mode(dev);
1165 dev_activate(dev);
1166 }
1167
1168 return ret;
1169 }
1170
1171 /**
1172 * dev_open - prepare an interface for use.
1173 * @dev: device to open
1174 *
1175 * Takes a device from down to up state. The device's private open
1176 * function is invoked and then the multicast lists are loaded. Finally
1177 * the device is moved into the up state and a %NETDEV_UP message is
1178 * sent to the netdev notifier chain.
1179 *
1180 * Calling this function on an active interface is a nop. On a failure
1181 * a negative errno code is returned.
1182 */
1183 int dev_open(struct net_device *dev)
1184 {
1185 int ret;
1186
1187 if (dev->flags & IFF_UP)
1188 return 0;
1189
1190 ret = __dev_open(dev);
1191 if (ret < 0)
1192 return ret;
1193
1194 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195 call_netdevice_notifiers(NETDEV_UP, dev);
1196
1197 return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
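/*
 * Illustrative usage sketch (not part of the original file): dev_open()
 * must be called with the RTNL semaphore held (see ASSERT_RTNL() in
 * __dev_open() above), e.g.:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */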
1200
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203 struct net_device *dev;
1204
1205 ASSERT_RTNL();
1206 might_sleep();
1207
1208 list_for_each_entry(dev, head, unreg_list) {
1209 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210
1211 clear_bit(__LINK_STATE_START, &dev->state);
1212
1213                 /* Synchronize to the scheduled poll. We cannot touch the poll list;
1214                  * it can even be on a different CPU. So just clear netif_running().
1215                  *
1216                  * dev->stop() will invoke napi_disable() on all of its
1217                  * napi_struct instances on this device.
1218 */
1219 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220 }
1221
1222 dev_deactivate_many(head);
1223
1224 list_for_each_entry(dev, head, unreg_list) {
1225 const struct net_device_ops *ops = dev->netdev_ops;
1226
1227 /*
1228                  *      Call the device-specific close. This cannot fail and is
1229                  *      done only if the device is UP.
1230 *
1231 * We allow it to be called even after a DETACH hot-plug
1232 * event.
1233 */
1234 if (ops->ndo_stop)
1235 ops->ndo_stop(dev);
1236
1237 dev->flags &= ~IFF_UP;
1238 net_dmaengine_put();
1239 }
1240
1241 return 0;
1242 }
1243
1244 static int __dev_close(struct net_device *dev)
1245 {
1246 int retval;
1247 LIST_HEAD(single);
1248
1249 list_add(&dev->unreg_list, &single);
1250 retval = __dev_close_many(&single);
1251 list_del(&single);
1252 return retval;
1253 }
1254
1255 static int dev_close_many(struct list_head *head)
1256 {
1257 struct net_device *dev, *tmp;
1258 LIST_HEAD(tmp_list);
1259
1260 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261 if (!(dev->flags & IFF_UP))
1262 list_move(&dev->unreg_list, &tmp_list);
1263
1264 __dev_close_many(head);
1265
1266 list_for_each_entry(dev, head, unreg_list) {
1267 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268 call_netdevice_notifiers(NETDEV_DOWN, dev);
1269 }
1270
1271 /* rollback_registered_many needs the complete original list */
1272 list_splice(&tmp_list, head);
1273 return 0;
1274 }
1275
1276 /**
1277 * dev_close - shutdown an interface.
1278 * @dev: device to shutdown
1279 *
1280 * This function moves an active device into down state. A
1281 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283 * chain.
1284 */
1285 int dev_close(struct net_device *dev)
1286 {
1287 LIST_HEAD(single);
1288
1289 list_add(&dev->unreg_list, &single);
1290 dev_close_many(&single);
1291 list_del(&single);
1292 return 0;
1293 }
1294 EXPORT_SYMBOL(dev_close);
1295
1296
1297 /**
1298 * dev_disable_lro - disable Large Receive Offload on a device
1299 * @dev: device
1300 *
1301 * Disable Large Receive Offload (LRO) on a net device. Must be
1302 * called under RTNL. This is needed if received packets may be
1303 * forwarded to another interface.
1304 */
1305 void dev_disable_lro(struct net_device *dev)
1306 {
1307 u32 flags;
1308
1309 if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1310 flags = dev->ethtool_ops->get_flags(dev);
1311 else
1312 flags = ethtool_op_get_flags(dev);
1313
1314 if (!(flags & ETH_FLAG_LRO))
1315 return;
1316
1317 __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1318 WARN_ON(dev->features & NETIF_F_LRO);
1319 }
1320 EXPORT_SYMBOL(dev_disable_lro);
1321
1322
1323 static int dev_boot_phase = 1;
1324
1325 /**
1326 * register_netdevice_notifier - register a network notifier block
1327 * @nb: notifier
1328 *
1329 * Register a notifier to be called when network device events occur.
1330 * The notifier passed is linked into the kernel structures and must
1331 * not be reused until it has been unregistered. A negative errno code
1332 * is returned on a failure.
1333 *
1334  *      When registered, all registration and up events are replayed
1335  *      to the new notifier to allow it to have a race-free
1336  *      view of the network device list.
1337 */
1338
1339 int register_netdevice_notifier(struct notifier_block *nb)
1340 {
1341 struct net_device *dev;
1342 struct net_device *last;
1343 struct net *net;
1344 int err;
1345
1346 rtnl_lock();
1347 err = raw_notifier_chain_register(&netdev_chain, nb);
1348 if (err)
1349 goto unlock;
1350 if (dev_boot_phase)
1351 goto unlock;
1352 for_each_net(net) {
1353 for_each_netdev(net, dev) {
1354 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1355 err = notifier_to_errno(err);
1356 if (err)
1357 goto rollback;
1358
1359 if (!(dev->flags & IFF_UP))
1360 continue;
1361
1362 nb->notifier_call(nb, NETDEV_UP, dev);
1363 }
1364 }
1365
1366 unlock:
1367 rtnl_unlock();
1368 return err;
1369
1370 rollback:
1371 last = dev;
1372 for_each_net(net) {
1373 for_each_netdev(net, dev) {
1374 if (dev == last)
1375 break;
1376
1377 if (dev->flags & IFF_UP) {
1378 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1379 nb->notifier_call(nb, NETDEV_DOWN, dev);
1380 }
1381 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1382 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1383 }
1384 }
1385
1386 raw_notifier_chain_unregister(&netdev_chain, nb);
1387 goto unlock;
1388 }
1389 EXPORT_SYMBOL(register_netdevice_notifier);
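/*
 * Illustrative usage sketch (not part of the original file): a minimal
 * notifier that logs interfaces coming up.  The names my_netdev_event and
 * my_netdev_nb are hypothetical; the void *ptr argument is the struct
 * net_device itself, as the replay loop above shows.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */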
1390
1391 /**
1392 * unregister_netdevice_notifier - unregister a network notifier block
1393 * @nb: notifier
1394 *
1395 * Unregister a notifier previously registered by
1396  *      register_netdevice_notifier(). The notifier is unlinked from the
1397 * kernel structures and may then be reused. A negative errno code
1398 * is returned on a failure.
1399 */
1400
1401 int unregister_netdevice_notifier(struct notifier_block *nb)
1402 {
1403 int err;
1404
1405 rtnl_lock();
1406 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1407 rtnl_unlock();
1408 return err;
1409 }
1410 EXPORT_SYMBOL(unregister_netdevice_notifier);
1411
1412 /**
1413 * call_netdevice_notifiers - call all network notifier blocks
1414 * @val: value passed unmodified to notifier function
1415 * @dev: net_device pointer passed unmodified to notifier function
1416 *
1417 * Call all network notifier blocks. Parameters and return value
1418 * are as for raw_notifier_call_chain().
1419 */
1420
1421 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1422 {
1423 ASSERT_RTNL();
1424 return raw_notifier_call_chain(&netdev_chain, val, dev);
1425 }
1426 EXPORT_SYMBOL(call_netdevice_notifiers);
1427
1428 /* When > 0 there are consumers of rx skb time stamps */
1429 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1430
1431 void net_enable_timestamp(void)
1432 {
1433 atomic_inc(&netstamp_needed);
1434 }
1435 EXPORT_SYMBOL(net_enable_timestamp);
1436
1437 void net_disable_timestamp(void)
1438 {
1439 atomic_dec(&netstamp_needed);
1440 }
1441 EXPORT_SYMBOL(net_disable_timestamp);
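/*
 * Illustrative note (not part of the original file): a consumer of rx
 * timestamps is expected to pair these calls, e.g.
 *
 *	net_enable_timestamp();		(first timestamp user appears)
 *	...
 *	net_disable_timestamp();	(last timestamp user goes away)
 */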
1442
1443 static inline void net_timestamp_set(struct sk_buff *skb)
1444 {
1445 if (atomic_read(&netstamp_needed))
1446 __net_timestamp(skb);
1447 else
1448 skb->tstamp.tv64 = 0;
1449 }
1450
1451 static inline void net_timestamp_check(struct sk_buff *skb)
1452 {
1453 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1454 __net_timestamp(skb);
1455 }
1456
1457 /**
1458 * dev_forward_skb - loopback an skb to another netif
1459 *
1460 * @dev: destination network device
1461 * @skb: buffer to forward
1462 *
1463 * return values:
1464 * NET_RX_SUCCESS (no congestion)
1465 * NET_RX_DROP (packet was dropped, but freed)
1466 *
1467 * dev_forward_skb can be used for injecting an skb from the
1468 * start_xmit function of one device into the receive queue
1469 * of another device.
1470 *
1471 * The receiving device may be in another namespace, so
1472 * we have to clear all information in the skb that could
1473 * impact namespace isolation.
1474 */
1475 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1476 {
1477 skb_orphan(skb);
1478 nf_reset(skb);
1479
1480 if (unlikely(!(dev->flags & IFF_UP) ||
1481 (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1482 atomic_long_inc(&dev->rx_dropped);
1483 kfree_skb(skb);
1484 return NET_RX_DROP;
1485 }
1486 skb_set_dev(skb, dev);
1487 skb->tstamp.tv64 = 0;
1488 skb->pkt_type = PACKET_HOST;
1489 skb->protocol = eth_type_trans(skb, dev);
1490 return netif_rx(skb);
1491 }
1492 EXPORT_SYMBOL_GPL(dev_forward_skb);
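/*
 * Illustrative usage sketch (not part of the original file): a virtual
 * driver pair (veth-style) can hand a packet from its ndo_start_xmit to
 * its peer's receive path.  struct my_priv and its "peer" field are
 * hypothetical.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);	(hypothetical)
 *
 *		dev_forward_skb(priv->peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */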
1493
1494 static inline int deliver_skb(struct sk_buff *skb,
1495 struct packet_type *pt_prev,
1496 struct net_device *orig_dev)
1497 {
1498 atomic_inc(&skb->users);
1499 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1500 }
1501
1502 /*
1503 * Support routine. Sends outgoing frames to any network
1504 * taps currently in use.
1505 */
1506
1507 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1508 {
1509 struct packet_type *ptype;
1510 struct sk_buff *skb2 = NULL;
1511 struct packet_type *pt_prev = NULL;
1512
1513 rcu_read_lock();
1514 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1515 /* Never send packets back to the socket
1516 * they originated from - MvS (miquels@drinkel.ow.org)
1517 */
1518 if ((ptype->dev == dev || !ptype->dev) &&
1519 (ptype->af_packet_priv == NULL ||
1520 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1521 if (pt_prev) {
1522 deliver_skb(skb2, pt_prev, skb->dev);
1523 pt_prev = ptype;
1524 continue;
1525 }
1526
1527 skb2 = skb_clone(skb, GFP_ATOMIC);
1528 if (!skb2)
1529 break;
1530
1531 net_timestamp_set(skb2);
1532
1533 /* skb->nh should be correctly
1534 set by sender, so that the second statement is
1535 just protection against buggy protocols.
1536 */
1537 skb_reset_mac_header(skb2);
1538
1539 if (skb_network_header(skb2) < skb2->data ||
1540 skb2->network_header > skb2->tail) {
1541 if (net_ratelimit())
1542 printk(KERN_CRIT "protocol %04x is "
1543 "buggy, dev %s\n",
1544 ntohs(skb2->protocol),
1545 dev->name);
1546 skb_reset_network_header(skb2);
1547 }
1548
1549 skb2->transport_header = skb2->network_header;
1550 skb2->pkt_type = PACKET_OUTGOING;
1551 pt_prev = ptype;
1552 }
1553 }
1554 if (pt_prev)
1555 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1556 rcu_read_unlock();
1557 }
1558
1559 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1560 * @dev: Network device
1561 * @txq: number of queues available
1562 *
1563  * If real_num_tx_queues is changed the tc mappings may no longer be
1564  * valid. To resolve this, verify that the tc mapping remains valid and,
1565  * if not, NULL the mapping. With no priorities mapping to this
1566  * offset/count pair it will no longer be used. In the worst case, if TC0
1567  * is invalid, nothing can be done, so disable priority mappings. It is
1568  * expected that drivers will fix this mapping if they can before
1569  * calling netif_set_real_num_tx_queues.
1570 */
1571 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1572 {
1573 int i;
1574 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1575
1576 /* If TC0 is invalidated disable TC mapping */
1577 if (tc->offset + tc->count > txq) {
1578 pr_warning("Number of in use tx queues changed "
1579 "invalidating tc mappings. Priority "
1580 "traffic classification disabled!\n");
1581 dev->num_tc = 0;
1582 return;
1583 }
1584
1585 /* Invalidated prio to tc mappings set to TC0 */
1586 for (i = 1; i < TC_BITMASK + 1; i++) {
1587 int q = netdev_get_prio_tc_map(dev, i);
1588
1589 tc = &dev->tc_to_txq[q];
1590 if (tc->offset + tc->count > txq) {
1591 pr_warning("Number of in use tx queues "
1592 "changed. Priority %i to tc "
1593 "mapping %i is no longer valid "
1594 "setting map to 0\n",
1595 i, q);
1596 netdev_set_prio_tc_map(dev, i, 0);
1597 }
1598 }
1599 }
1600
1601 /*
1602 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1603  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1604 */
1605 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1606 {
1607 int rc;
1608
1609 if (txq < 1 || txq > dev->num_tx_queues)
1610 return -EINVAL;
1611
1612 if (dev->reg_state == NETREG_REGISTERED ||
1613 dev->reg_state == NETREG_UNREGISTERING) {
1614 ASSERT_RTNL();
1615
1616 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1617 txq);
1618 if (rc)
1619 return rc;
1620
1621 if (dev->num_tc)
1622 netif_setup_tc(dev, txq);
1623
1624 if (txq < dev->real_num_tx_queues)
1625 qdisc_reset_all_tx_gt(dev, txq);
1626 }
1627
1628 dev->real_num_tx_queues = txq;
1629 return 0;
1630 }
1631 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1632
1633 #ifdef CONFIG_RPS
1634 /**
1635 * netif_set_real_num_rx_queues - set actual number of RX queues used
1636 * @dev: Network device
1637 * @rxq: Actual number of RX queues
1638 *
1639 * This must be called either with the rtnl_lock held or before
1640 * registration of the net device. Returns 0 on success, or a
1641 * negative error code. If called before registration, it always
1642 * succeeds.
1643 */
1644 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1645 {
1646 int rc;
1647
1648 if (rxq < 1 || rxq > dev->num_rx_queues)
1649 return -EINVAL;
1650
1651 if (dev->reg_state == NETREG_REGISTERED) {
1652 ASSERT_RTNL();
1653
1654 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1655 rxq);
1656 if (rc)
1657 return rc;
1658 }
1659
1660 dev->real_num_rx_queues = rxq;
1661 return 0;
1662 }
1663 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1664 #endif
1665
1666 static inline void __netif_reschedule(struct Qdisc *q)
1667 {
1668 struct softnet_data *sd;
1669 unsigned long flags;
1670
1671 local_irq_save(flags);
1672 sd = &__get_cpu_var(softnet_data);
1673 q->next_sched = NULL;
1674 *sd->output_queue_tailp = q;
1675 sd->output_queue_tailp = &q->next_sched;
1676 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1677 local_irq_restore(flags);
1678 }
1679
1680 void __netif_schedule(struct Qdisc *q)
1681 {
1682 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1683 __netif_reschedule(q);
1684 }
1685 EXPORT_SYMBOL(__netif_schedule);
1686
1687 void dev_kfree_skb_irq(struct sk_buff *skb)
1688 {
1689 if (atomic_dec_and_test(&skb->users)) {
1690 struct softnet_data *sd;
1691 unsigned long flags;
1692
1693 local_irq_save(flags);
1694 sd = &__get_cpu_var(softnet_data);
1695 skb->next = sd->completion_queue;
1696 sd->completion_queue = skb;
1697 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1698 local_irq_restore(flags);
1699 }
1700 }
1701 EXPORT_SYMBOL(dev_kfree_skb_irq);
1702
1703 void dev_kfree_skb_any(struct sk_buff *skb)
1704 {
1705 if (in_irq() || irqs_disabled())
1706 dev_kfree_skb_irq(skb);
1707 else
1708 dev_kfree_skb(skb);
1709 }
1710 EXPORT_SYMBOL(dev_kfree_skb_any);
1711
1712
1713 /**
1714 * netif_device_detach - mark device as removed
1715 * @dev: network device
1716 *
1717 * Mark device as removed from system and therefore no longer available.
1718 */
1719 void netif_device_detach(struct net_device *dev)
1720 {
1721 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1722 netif_running(dev)) {
1723 netif_tx_stop_all_queues(dev);
1724 }
1725 }
1726 EXPORT_SYMBOL(netif_device_detach);
1727
1728 /**
1729 * netif_device_attach - mark device as attached
1730 * @dev: network device
1731 *
1732  *      Mark device as attached to the system and restart if needed.
1733 */
1734 void netif_device_attach(struct net_device *dev)
1735 {
1736 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1737 netif_running(dev)) {
1738 netif_tx_wake_all_queues(dev);
1739 __netdev_watchdog_up(dev);
1740 }
1741 }
1742 EXPORT_SYMBOL(netif_device_attach);
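/*
 * Illustrative usage sketch (not part of the original file): a typical
 * driver suspend/resume pairing of the two helpers above.  The callbacks
 * my_suspend/my_resume and the my_netdev pointer are hypothetical.
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		netif_device_detach(my_netdev);
 *		(... put hardware to sleep ...)
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		(... wake hardware ...)
 *		netif_device_attach(my_netdev);
 *		return 0;
 *	}
 */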
1743
1744 /**
1745  *      skb_set_dev - assign a new device to a buffer
1746 * @skb: buffer for the new device
1747 * @dev: network device
1748 *
1749 * If an skb is owned by a device already, we have to reset
1750  *      all data private to the namespace the device belongs to
1751 * before assigning it a new device.
1752 */
1753 #ifdef CONFIG_NET_NS
1754 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1755 {
1756 skb_dst_drop(skb);
1757 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1758 secpath_reset(skb);
1759 nf_reset(skb);
1760 skb_init_secmark(skb);
1761 skb->mark = 0;
1762 skb->priority = 0;
1763 skb->nf_trace = 0;
1764 skb->ipvs_property = 0;
1765 #ifdef CONFIG_NET_SCHED
1766 skb->tc_index = 0;
1767 #endif
1768 }
1769 skb->dev = dev;
1770 }
1771 EXPORT_SYMBOL(skb_set_dev);
1772 #endif /* CONFIG_NET_NS */
1773
1774 /*
1775 * Invalidate hardware checksum when packet is to be mangled, and
1776 * complete checksum manually on outgoing path.
1777 */
1778 int skb_checksum_help(struct sk_buff *skb)
1779 {
1780 __wsum csum;
1781 int ret = 0, offset;
1782
1783 if (skb->ip_summed == CHECKSUM_COMPLETE)
1784 goto out_set_summed;
1785
1786 if (unlikely(skb_shinfo(skb)->gso_size)) {
1787 /* Let GSO fix up the checksum. */
1788 goto out_set_summed;
1789 }
1790
1791 offset = skb_checksum_start_offset(skb);
1792 BUG_ON(offset >= skb_headlen(skb));
1793 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1794
1795 offset += skb->csum_offset;
1796 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1797
1798 if (skb_cloned(skb) &&
1799 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1800 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1801 if (ret)
1802 goto out;
1803 }
1804
1805 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1806 out_set_summed:
1807 skb->ip_summed = CHECKSUM_NONE;
1808 out:
1809 return ret;
1810 }
1811 EXPORT_SYMBOL(skb_checksum_help);
1812
1813 /**
1814 * skb_gso_segment - Perform segmentation on skb.
1815 * @skb: buffer to segment
1816 * @features: features for the output path (see dev->features)
1817 *
1818 * This function segments the given skb and returns a list of segments.
1819 *
1820 * It may return NULL if the skb requires no segmentation. This is
1821 * only possible when GSO is used for verifying header integrity.
1822 */
1823 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1824 {
1825 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1826 struct packet_type *ptype;
1827 __be16 type = skb->protocol;
1828 int vlan_depth = ETH_HLEN;
1829 int err;
1830
1831 while (type == htons(ETH_P_8021Q)) {
1832 struct vlan_hdr *vh;
1833
1834 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1835 return ERR_PTR(-EINVAL);
1836
1837 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1838 type = vh->h_vlan_encapsulated_proto;
1839 vlan_depth += VLAN_HLEN;
1840 }
1841
1842 skb_reset_mac_header(skb);
1843 skb->mac_len = skb->network_header - skb->mac_header;
1844 __skb_pull(skb, skb->mac_len);
1845
1846 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1847 struct net_device *dev = skb->dev;
1848 struct ethtool_drvinfo info = {};
1849
1850 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1851 dev->ethtool_ops->get_drvinfo(dev, &info);
1852
1853 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1854 info.driver, dev ? dev->features : 0L,
1855 skb->sk ? skb->sk->sk_route_caps : 0L,
1856 skb->len, skb->data_len, skb->ip_summed);
1857
1858 if (skb_header_cloned(skb) &&
1859 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1860 return ERR_PTR(err);
1861 }
1862
1863 rcu_read_lock();
1864 list_for_each_entry_rcu(ptype,
1865 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1866 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1867 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1868 err = ptype->gso_send_check(skb);
1869 segs = ERR_PTR(err);
1870 if (err || skb_gso_ok(skb, features))
1871 break;
1872 __skb_push(skb, (skb->data -
1873 skb_network_header(skb)));
1874 }
1875 segs = ptype->gso_segment(skb, features);
1876 break;
1877 }
1878 }
1879 rcu_read_unlock();
1880
1881 __skb_push(skb, skb->data - skb_mac_header(skb));
1882
1883 return segs;
1884 }
1885 EXPORT_SYMBOL(skb_gso_segment);
1886
1887 /* Take action when hardware reception checksum errors are detected. */
1888 #ifdef CONFIG_BUG
1889 void netdev_rx_csum_fault(struct net_device *dev)
1890 {
1891 if (net_ratelimit()) {
1892 printk(KERN_ERR "%s: hw csum failure.\n",
1893 dev ? dev->name : "<unknown>");
1894 dump_stack();
1895 }
1896 }
1897 EXPORT_SYMBOL(netdev_rx_csum_fault);
1898 #endif
1899
1900 /* Actually, we should eliminate this check as soon as we know that:
1901  * 1. An IOMMU is present and allows mapping all the memory.
1902 * 2. No high memory really exists on this machine.
1903 */
1904
1905 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1906 {
1907 #ifdef CONFIG_HIGHMEM
1908 int i;
1909 if (!(dev->features & NETIF_F_HIGHDMA)) {
1910 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1911 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1912 return 1;
1913 }
1914
1915 if (PCI_DMA_BUS_IS_PHYS) {
1916 struct device *pdev = dev->dev.parent;
1917
1918 if (!pdev)
1919 return 0;
1920 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1921 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1922 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1923 return 1;
1924 }
1925 }
1926 #endif
1927 return 0;
1928 }
1929
1930 struct dev_gso_cb {
1931 void (*destructor)(struct sk_buff *skb);
1932 };
1933
1934 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1935
1936 static void dev_gso_skb_destructor(struct sk_buff *skb)
1937 {
1938 struct dev_gso_cb *cb;
1939
1940 do {
1941 struct sk_buff *nskb = skb->next;
1942
1943 skb->next = nskb->next;
1944 nskb->next = NULL;
1945 kfree_skb(nskb);
1946 } while (skb->next);
1947
1948 cb = DEV_GSO_CB(skb);
1949 if (cb->destructor)
1950 cb->destructor(skb);
1951 }
1952
1953 /**
1954 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1955 * @skb: buffer to segment
1956 * @features: device features as applicable to this skb
1957 *
1958 * This function segments the given skb and stores the list of segments
1959 * in skb->next.
1960 */
1961 static int dev_gso_segment(struct sk_buff *skb, int features)
1962 {
1963 struct sk_buff *segs;
1964
1965 segs = skb_gso_segment(skb, features);
1966
1967 /* Verifying header integrity only. */
1968 if (!segs)
1969 return 0;
1970
1971 if (IS_ERR(segs))
1972 return PTR_ERR(segs);
1973
1974 skb->next = segs;
1975 DEV_GSO_CB(skb)->destructor = skb->destructor;
1976 skb->destructor = dev_gso_skb_destructor;
1977
1978 return 0;
1979 }
1980
1981 /*
1982 * Try to orphan skb early, right before transmission by the device.
1983 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1984  * is needed at the driver level for other reasons, e.g. see net/can/raw.c
1985 */
1986 static inline void skb_orphan_try(struct sk_buff *skb)
1987 {
1988 struct sock *sk = skb->sk;
1989
1990 if (sk && !skb_shinfo(skb)->tx_flags) {
1991 /* skb_tx_hash() won't be able to get sk.
1992 * We copy sk_hash into skb->rxhash
1993 */
1994 if (!skb->rxhash)
1995 skb->rxhash = sk->sk_hash;
1996 skb_orphan(skb);
1997 }
1998 }
1999
2000 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2001 {
2002 return ((features & NETIF_F_GEN_CSUM) ||
2003 ((features & NETIF_F_V4_CSUM) &&
2004 protocol == htons(ETH_P_IP)) ||
2005 ((features & NETIF_F_V6_CSUM) &&
2006 protocol == htons(ETH_P_IPV6)) ||
2007 ((features & NETIF_F_FCOE_CRC) &&
2008 protocol == htons(ETH_P_FCOE)));
2009 }
2010
2011 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2012 {
2013 if (!can_checksum_protocol(features, protocol)) {
2014 features &= ~NETIF_F_ALL_CSUM;
2015 features &= ~NETIF_F_SG;
2016 } else if (illegal_highdma(skb->dev, skb)) {
2017 features &= ~NETIF_F_SG;
2018 }
2019
2020 return features;
2021 }
2022
2023 u32 netif_skb_features(struct sk_buff *skb)
2024 {
2025 __be16 protocol = skb->protocol;
2026 u32 features = skb->dev->features;
2027
2028 if (protocol == htons(ETH_P_8021Q)) {
2029 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2030 protocol = veh->h_vlan_encapsulated_proto;
2031 } else if (!vlan_tx_tag_present(skb)) {
2032 return harmonize_features(skb, protocol, features);
2033 }
2034
2035 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2036
2037 if (protocol != htons(ETH_P_8021Q)) {
2038 return harmonize_features(skb, protocol, features);
2039 } else {
2040 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2041 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2042 return harmonize_features(skb, protocol, features);
2043 }
2044 }
2045 EXPORT_SYMBOL(netif_skb_features);
2046
2047 /*
2048 * Returns true if either:
2049 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2050 * 2. skb is fragmented and the device does not support SG, or if
2051  * at least one of the fragments is in highmem and the device does not
2052 * support DMA from it.
2053 */
2054 static inline int skb_needs_linearize(struct sk_buff *skb,
2055 int features)
2056 {
2057 return skb_is_nonlinear(skb) &&
2058 ((skb_has_frag_list(skb) &&
2059 !(features & NETIF_F_FRAGLIST)) ||
2060 (skb_shinfo(skb)->nr_frags &&
2061 !(features & NETIF_F_SG)));
2062 }
2063
2064 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2065 struct netdev_queue *txq)
2066 {
2067 const struct net_device_ops *ops = dev->netdev_ops;
2068 int rc = NETDEV_TX_OK;
2069
2070 if (likely(!skb->next)) {
2071 u32 features;
2072
2073 /*
2074  * If the device doesn't need skb->dst, release it right now while
2075  * it's hot in this cpu's cache
2076 */
2077 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2078 skb_dst_drop(skb);
2079
2080 if (!list_empty(&ptype_all))
2081 dev_queue_xmit_nit(skb, dev);
2082
2083 skb_orphan_try(skb);
2084
2085 features = netif_skb_features(skb);
2086
2087 if (vlan_tx_tag_present(skb) &&
2088 !(features & NETIF_F_HW_VLAN_TX)) {
2089 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2090 if (unlikely(!skb))
2091 goto out;
2092
2093 skb->vlan_tci = 0;
2094 }
2095
2096 if (netif_needs_gso(skb, features)) {
2097 if (unlikely(dev_gso_segment(skb, features)))
2098 goto out_kfree_skb;
2099 if (skb->next)
2100 goto gso;
2101 } else {
2102 if (skb_needs_linearize(skb, features) &&
2103 __skb_linearize(skb))
2104 goto out_kfree_skb;
2105
2106 /* If packet is not checksummed and device does not
2107 * support checksumming for this protocol, complete
2108 * checksumming here.
2109 */
2110 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2111 skb_set_transport_header(skb,
2112 skb_checksum_start_offset(skb));
2113 if (!(features & NETIF_F_ALL_CSUM) &&
2114 skb_checksum_help(skb))
2115 goto out_kfree_skb;
2116 }
2117 }
2118
2119 rc = ops->ndo_start_xmit(skb, dev);
2120 trace_net_dev_xmit(skb, rc);
2121 if (rc == NETDEV_TX_OK)
2122 txq_trans_update(txq);
2123 return rc;
2124 }
2125
2126 gso:
2127 do {
2128 struct sk_buff *nskb = skb->next;
2129
2130 skb->next = nskb->next;
2131 nskb->next = NULL;
2132
2133 /*
2134  * If the device doesn't need nskb->dst, release it right now while
2135  * it's hot in this cpu's cache
2136 */
2137 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2138 skb_dst_drop(nskb);
2139
2140 rc = ops->ndo_start_xmit(nskb, dev);
2141 trace_net_dev_xmit(nskb, rc);
2142 if (unlikely(rc != NETDEV_TX_OK)) {
2143 if (rc & ~NETDEV_TX_MASK)
2144 goto out_kfree_gso_skb;
2145 nskb->next = skb->next;
2146 skb->next = nskb;
2147 return rc;
2148 }
2149 txq_trans_update(txq);
2150 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2151 return NETDEV_TX_BUSY;
2152 } while (skb->next);
2153
2154 out_kfree_gso_skb:
2155 if (likely(skb->next == NULL))
2156 skb->destructor = DEV_GSO_CB(skb)->destructor;
2157 out_kfree_skb:
2158 kfree_skb(skb);
2159 out:
2160 return rc;
2161 }
2162
2163 static u32 hashrnd __read_mostly;
2164
2165 /*
2166  * Returns a Tx hash based on the given packet descriptor and the number of
2167  * Tx queues to be used as a distribution range.
2168 */
2169 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2170 unsigned int num_tx_queues)
2171 {
2172 u32 hash;
2173 u16 qoffset = 0;
2174 u16 qcount = num_tx_queues;
2175
2176 if (skb_rx_queue_recorded(skb)) {
2177 hash = skb_get_rx_queue(skb);
2178 while (unlikely(hash >= num_tx_queues))
2179 hash -= num_tx_queues;
2180 return hash;
2181 }
2182
2183 if (dev->num_tc) {
2184 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2185 qoffset = dev->tc_to_txq[tc].offset;
2186 qcount = dev->tc_to_txq[tc].count;
2187 }
2188
2189 if (skb->sk && skb->sk->sk_hash)
2190 hash = skb->sk->sk_hash;
2191 else
2192 hash = (__force u16) skb->protocol ^ skb->rxhash;
2193 hash = jhash_1word(hash, hashrnd);
2194
2195 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2196 }
2197 EXPORT_SYMBOL(__skb_tx_hash);
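
/*
 * Example (illustrative, compiled out): the final line above maps a 32-bit
 * hash onto [qoffset, qoffset + qcount) with a multiply and a shift instead
 * of a modulo.  For instance, hash = 0x80000000 with qcount = 8 selects
 * queue qoffset + 4.
 */
#if 0
static u16 example_scale_hash(u32 hash, u16 qcount, u16 qoffset)
{
	/* same arithmetic as the return statement of __skb_tx_hash() */
	return (u16)(((u64)hash * qcount) >> 32) + qoffset;
}
#endif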
2198
2199 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2200 {
2201 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2202 if (net_ratelimit()) {
2203 pr_warning("%s selects TX queue %d, but "
2204 "real number of TX queues is %d\n",
2205 dev->name, queue_index, dev->real_num_tx_queues);
2206 }
2207 return 0;
2208 }
2209 return queue_index;
2210 }
2211
2212 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2213 {
2214 #ifdef CONFIG_XPS
2215 struct xps_dev_maps *dev_maps;
2216 struct xps_map *map;
2217 int queue_index = -1;
2218
2219 rcu_read_lock();
2220 dev_maps = rcu_dereference(dev->xps_maps);
2221 if (dev_maps) {
2222 map = rcu_dereference(
2223 dev_maps->cpu_map[raw_smp_processor_id()]);
2224 if (map) {
2225 if (map->len == 1)
2226 queue_index = map->queues[0];
2227 else {
2228 u32 hash;
2229 if (skb->sk && skb->sk->sk_hash)
2230 hash = skb->sk->sk_hash;
2231 else
2232 hash = (__force u16) skb->protocol ^
2233 skb->rxhash;
2234 hash = jhash_1word(hash, hashrnd);
2235 queue_index = map->queues[
2236 ((u64)hash * map->len) >> 32];
2237 }
2238 if (unlikely(queue_index >= dev->real_num_tx_queues))
2239 queue_index = -1;
2240 }
2241 }
2242 rcu_read_unlock();
2243
2244 return queue_index;
2245 #else
2246 return -1;
2247 #endif
2248 }
2249
2250 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2251 struct sk_buff *skb)
2252 {
2253 int queue_index;
2254 const struct net_device_ops *ops = dev->netdev_ops;
2255
2256 if (dev->real_num_tx_queues == 1)
2257 queue_index = 0;
2258 else if (ops->ndo_select_queue) {
2259 queue_index = ops->ndo_select_queue(dev, skb);
2260 queue_index = dev_cap_txqueue(dev, queue_index);
2261 } else {
2262 struct sock *sk = skb->sk;
2263 queue_index = sk_tx_queue_get(sk);
2264
2265 if (queue_index < 0 || skb->ooo_okay ||
2266 queue_index >= dev->real_num_tx_queues) {
2267 int old_index = queue_index;
2268
2269 queue_index = get_xps_queue(dev, skb);
2270 if (queue_index < 0)
2271 queue_index = skb_tx_hash(dev, skb);
2272
2273 if (queue_index != old_index && sk) {
2274 struct dst_entry *dst =
2275 rcu_dereference_check(sk->sk_dst_cache, 1);
2276
2277 if (dst && skb_dst(skb) == dst)
2278 sk_tx_queue_set(sk, queue_index);
2279 }
2280 }
2281 }
2282
2283 skb_set_queue_mapping(skb, queue_index);
2284 return netdev_get_tx_queue(dev, queue_index);
2285 }
2286
2287 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2288 struct net_device *dev,
2289 struct netdev_queue *txq)
2290 {
2291 spinlock_t *root_lock = qdisc_lock(q);
2292 bool contended;
2293 int rc;
2294
2295 qdisc_skb_cb(skb)->pkt_len = skb->len;
2296 qdisc_calculate_pkt_len(skb, q);
2297 /*
2298 * Heuristic to force contended enqueues to serialize on a
2299  * separate lock before trying to get the qdisc main lock.
2300  * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2301 * and dequeue packets faster.
2302 */
2303 contended = qdisc_is_running(q);
2304 if (unlikely(contended))
2305 spin_lock(&q->busylock);
2306
2307 spin_lock(root_lock);
2308 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2309 kfree_skb(skb);
2310 rc = NET_XMIT_DROP;
2311 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2312 qdisc_run_begin(q)) {
2313 /*
2314 * This is a work-conserving queue; there are no old skbs
2315 * waiting to be sent out; and the qdisc is not running -
2316 * xmit the skb directly.
2317 */
2318 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2319 skb_dst_force(skb);
2320
2321 qdisc_bstats_update(q, skb);
2322
2323 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2324 if (unlikely(contended)) {
2325 spin_unlock(&q->busylock);
2326 contended = false;
2327 }
2328 __qdisc_run(q);
2329 } else
2330 qdisc_run_end(q);
2331
2332 rc = NET_XMIT_SUCCESS;
2333 } else {
2334 skb_dst_force(skb);
2335 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2336 if (qdisc_run_begin(q)) {
2337 if (unlikely(contended)) {
2338 spin_unlock(&q->busylock);
2339 contended = false;
2340 }
2341 __qdisc_run(q);
2342 }
2343 }
2344 spin_unlock(root_lock);
2345 if (unlikely(contended))
2346 spin_unlock(&q->busylock);
2347 return rc;
2348 }
2349
2350 static DEFINE_PER_CPU(int, xmit_recursion);
2351 #define RECURSION_LIMIT 10
2352
2353 /**
2354 * dev_queue_xmit - transmit a buffer
2355 * @skb: buffer to transmit
2356 *
2357 * Queue a buffer for transmission to a network device. The caller must
2358 * have set the device and priority and built the buffer before calling
2359 * this function. The function can be called from an interrupt.
2360 *
2361 * A negative errno code is returned on a failure. A success does not
2362 * guarantee the frame will be transmitted as it may be dropped due
2363 * to congestion or traffic shaping.
2364 *
2365 * -----------------------------------------------------------------------------------
2366 * I notice this method can also return errors from the queue disciplines,
2367 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2368 * be positive.
2369 *
2370 * Regardless of the return value, the skb is consumed, so it is currently
2371 * difficult to retry a send to this method. (You can bump the ref count
2372 * before sending to hold a reference for retry if you are careful.)
2373 *
2374 * When calling this method, interrupts MUST be enabled. This is because
2375 * the BH enable code must have IRQs enabled so that it will not deadlock.
2376 * --BLG
2377 */
2378 int dev_queue_xmit(struct sk_buff *skb)
2379 {
2380 struct net_device *dev = skb->dev;
2381 struct netdev_queue *txq;
2382 struct Qdisc *q;
2383 int rc = -ENOMEM;
2384
2385 /* Disable soft irqs for various locks below. Also
2386 * stops preemption for RCU.
2387 */
2388 rcu_read_lock_bh();
2389
2390 txq = dev_pick_tx(dev, skb);
2391 q = rcu_dereference_bh(txq->qdisc);
2392
2393 #ifdef CONFIG_NET_CLS_ACT
2394 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2395 #endif
2396 trace_net_dev_queue(skb);
2397 if (q->enqueue) {
2398 rc = __dev_xmit_skb(skb, q, dev, txq);
2399 goto out;
2400 }
2401
2402 /* The device has no queue. Common case for software devices:
2403    loopback, all sorts of tunnels...
2404
2405 Really, it is unlikely that netif_tx_lock protection is necessary
2406    here. (e.g. loopback and IP tunnels are clean, ignoring statistics
2407    counters.)
2408    However, it is possible that they rely on the protection
2409    made by us here.
2410
2411    Check this and shoot the lock; it is not prone to deadlocks.
2412    Or shoot the noqueue qdisc, which is even simpler 8)
2413 */
2414 if (dev->flags & IFF_UP) {
2415 int cpu = smp_processor_id(); /* ok because BHs are off */
2416
2417 if (txq->xmit_lock_owner != cpu) {
2418
2419 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2420 goto recursion_alert;
2421
2422 HARD_TX_LOCK(dev, txq, cpu);
2423
2424 if (!netif_tx_queue_stopped(txq)) {
2425 __this_cpu_inc(xmit_recursion);
2426 rc = dev_hard_start_xmit(skb, dev, txq);
2427 __this_cpu_dec(xmit_recursion);
2428 if (dev_xmit_complete(rc)) {
2429 HARD_TX_UNLOCK(dev, txq);
2430 goto out;
2431 }
2432 }
2433 HARD_TX_UNLOCK(dev, txq);
2434 if (net_ratelimit())
2435 printk(KERN_CRIT "Virtual device %s asks to "
2436 "queue packet!\n", dev->name);
2437 } else {
2438 /* Recursion is detected! It is possible,
2439 * unfortunately
2440 */
2441 recursion_alert:
2442 if (net_ratelimit())
2443 printk(KERN_CRIT "Dead loop on virtual device "
2444 "%s, fix it urgently!\n", dev->name);
2445 }
2446 }
2447
2448 rc = -ENETDOWN;
2449 rcu_read_unlock_bh();
2450
2451 kfree_skb(skb);
2452 return rc;
2453 out:
2454 rcu_read_unlock_bh();
2455 return rc;
2456 }
2457 EXPORT_SYMBOL(dev_queue_xmit);
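
/*
 * Example (hypothetical, compiled out): building a minimal broadcast frame
 * and handing it to dev_queue_xmit().  The EtherType 0x88b5 (local
 * experimental range) and the zero payload are arbitrary choices for this
 * sketch.
 */
#if 0
static int my_send_test_frame(struct net_device *dev, unsigned int payload_len)
{
	struct sk_buff *skb;
	struct ethhdr *eth;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + payload_len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memset(skb_put(skb, payload_len), 0, payload_len);

	eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
	memset(eth->h_dest, 0xff, ETH_ALEN);
	memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
	eth->h_proto = htons(0x88b5);

	skb->dev = dev;
	skb->protocol = eth->h_proto;

	/* Consumes the skb; may also return positive NET_XMIT_* codes. */
	return dev_queue_xmit(skb);
}
#endif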
2458
2459
2460 /*=======================================================================
2461 Receiver routines
2462 =======================================================================*/
2463
2464 int netdev_max_backlog __read_mostly = 1000;
2465 int netdev_tstamp_prequeue __read_mostly = 1;
2466 int netdev_budget __read_mostly = 300;
2467 int weight_p __read_mostly = 64; /* old backlog weight */
2468
2469 /* Called with irq disabled */
2470 static inline void ____napi_schedule(struct softnet_data *sd,
2471 struct napi_struct *napi)
2472 {
2473 list_add_tail(&napi->poll_list, &sd->poll_list);
2474 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2475 }
2476
2477 /*
2478 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2479 * and src/dst port numbers. Returns a non-zero hash number on success
2480 * and 0 on failure.
2481 */
2482 __u32 __skb_get_rxhash(struct sk_buff *skb)
2483 {
2484 int nhoff, hash = 0, poff;
2485 struct ipv6hdr *ip6;
2486 struct iphdr *ip;
2487 u8 ip_proto;
2488 u32 addr1, addr2, ihl;
2489 union {
2490 u32 v32;
2491 u16 v16[2];
2492 } ports;
2493
2494 nhoff = skb_network_offset(skb);
2495
2496 switch (skb->protocol) {
2497 case __constant_htons(ETH_P_IP):
2498 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2499 goto done;
2500
2501 ip = (struct iphdr *) (skb->data + nhoff);
2502 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2503 ip_proto = 0;
2504 else
2505 ip_proto = ip->protocol;
2506 addr1 = (__force u32) ip->saddr;
2507 addr2 = (__force u32) ip->daddr;
2508 ihl = ip->ihl;
2509 break;
2510 case __constant_htons(ETH_P_IPV6):
2511 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2512 goto done;
2513
2514 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2515 ip_proto = ip6->nexthdr;
2516 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2517 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2518 ihl = (40 >> 2);
2519 break;
2520 default:
2521 goto done;
2522 }
2523
2524 ports.v32 = 0;
2525 poff = proto_ports_offset(ip_proto);
2526 if (poff >= 0) {
2527 nhoff += ihl * 4 + poff;
2528 if (pskb_may_pull(skb, nhoff + 4)) {
2529 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2530 if (ports.v16[1] < ports.v16[0])
2531 swap(ports.v16[0], ports.v16[1]);
2532 }
2533 }
2534
2535 /* get a consistent hash (same value on both flow directions) */
2536 if (addr2 < addr1)
2537 swap(addr1, addr2);
2538
2539 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2540 if (!hash)
2541 hash = 1;
2542
2543 done:
2544 return hash;
2545 }
2546 EXPORT_SYMBOL(__skb_get_rxhash);
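
/*
 * Example (hypothetical, compiled out): a flow-aware consumer can bucket
 * packets by the cached rxhash; skb_get_rxhash() only falls back to
 * __skb_get_rxhash() above when no hash has been set yet.  MY_NR_BUCKETS is
 * an arbitrary power of two.
 */
#if 0
#define MY_NR_BUCKETS	256

static unsigned int my_flow_bucket(struct sk_buff *skb)
{
	return skb_get_rxhash(skb) & (MY_NR_BUCKETS - 1);
}
#endif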
2547
2548 #ifdef CONFIG_RPS
2549
2550 /* One global table that all flow-based protocols share. */
2551 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2552 EXPORT_SYMBOL(rps_sock_flow_table);
2553
2554 static struct rps_dev_flow *
2555 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2556 struct rps_dev_flow *rflow, u16 next_cpu)
2557 {
2558 u16 tcpu;
2559
2560 tcpu = rflow->cpu = next_cpu;
2561 if (tcpu != RPS_NO_CPU) {
2562 #ifdef CONFIG_RFS_ACCEL
2563 struct netdev_rx_queue *rxqueue;
2564 struct rps_dev_flow_table *flow_table;
2565 struct rps_dev_flow *old_rflow;
2566 u32 flow_id;
2567 u16 rxq_index;
2568 int rc;
2569
2570 /* Should we steer this flow to a different hardware queue? */
2571 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2572 !(dev->features & NETIF_F_NTUPLE))
2573 goto out;
2574 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2575 if (rxq_index == skb_get_rx_queue(skb))
2576 goto out;
2577
2578 rxqueue = dev->_rx + rxq_index;
2579 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2580 if (!flow_table)
2581 goto out;
2582 flow_id = skb->rxhash & flow_table->mask;
2583 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2584 rxq_index, flow_id);
2585 if (rc < 0)
2586 goto out;
2587 old_rflow = rflow;
2588 rflow = &flow_table->flows[flow_id];
2589 rflow->cpu = next_cpu;
2590 rflow->filter = rc;
2591 if (old_rflow->filter == rflow->filter)
2592 old_rflow->filter = RPS_NO_FILTER;
2593 out:
2594 #endif
2595 rflow->last_qtail =
2596 per_cpu(softnet_data, tcpu).input_queue_head;
2597 }
2598
2599 return rflow;
2600 }
2601
2602 /*
2603 * get_rps_cpu is called from netif_receive_skb and returns the target
2604 * CPU from the RPS map of the receiving queue for a given skb.
2605 * rcu_read_lock must be held on entry.
2606 */
2607 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2608 struct rps_dev_flow **rflowp)
2609 {
2610 struct netdev_rx_queue *rxqueue;
2611 struct rps_map *map;
2612 struct rps_dev_flow_table *flow_table;
2613 struct rps_sock_flow_table *sock_flow_table;
2614 int cpu = -1;
2615 u16 tcpu;
2616
2617 if (skb_rx_queue_recorded(skb)) {
2618 u16 index = skb_get_rx_queue(skb);
2619 if (unlikely(index >= dev->real_num_rx_queues)) {
2620 WARN_ONCE(dev->real_num_rx_queues > 1,
2621 "%s received packet on queue %u, but number "
2622 "of RX queues is %u\n",
2623 dev->name, index, dev->real_num_rx_queues);
2624 goto done;
2625 }
2626 rxqueue = dev->_rx + index;
2627 } else
2628 rxqueue = dev->_rx;
2629
2630 map = rcu_dereference(rxqueue->rps_map);
2631 if (map) {
2632 if (map->len == 1 &&
2633 !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2634 tcpu = map->cpus[0];
2635 if (cpu_online(tcpu))
2636 cpu = tcpu;
2637 goto done;
2638 }
2639 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2640 goto done;
2641 }
2642
2643 skb_reset_network_header(skb);
2644 if (!skb_get_rxhash(skb))
2645 goto done;
2646
2647 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2648 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2649 if (flow_table && sock_flow_table) {
2650 u16 next_cpu;
2651 struct rps_dev_flow *rflow;
2652
2653 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2654 tcpu = rflow->cpu;
2655
2656 next_cpu = sock_flow_table->ents[skb->rxhash &
2657 sock_flow_table->mask];
2658
2659 /*
2660 * If the desired CPU (where last recvmsg was done) is
2661 * different from current CPU (one in the rx-queue flow
2662 * table entry), switch if one of the following holds:
2663 * - Current CPU is unset (equal to RPS_NO_CPU).
2664 * - Current CPU is offline.
2665 * - The current CPU's queue tail has advanced beyond the
2666 * last packet that was enqueued using this table entry.
2667 * This guarantees that all previous packets for the flow
2668 * have been dequeued, thus preserving in order delivery.
2669 */
2670 if (unlikely(tcpu != next_cpu) &&
2671 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2672 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2673 rflow->last_qtail)) >= 0))
2674 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2675
2676 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2677 *rflowp = rflow;
2678 cpu = tcpu;
2679 goto done;
2680 }
2681 }
2682
2683 if (map) {
2684 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2685
2686 if (cpu_online(tcpu)) {
2687 cpu = tcpu;
2688 goto done;
2689 }
2690 }
2691
2692 done:
2693 return cpu;
2694 }
2695
2696 #ifdef CONFIG_RFS_ACCEL
2697
2698 /**
2699 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2700 * @dev: Device on which the filter was set
2701 * @rxq_index: RX queue index
2702 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2703 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2704 *
2705 * Drivers that implement ndo_rx_flow_steer() should periodically call
2706 * this function for each installed filter and remove the filters for
2707 * which it returns %true.
2708 */
2709 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2710 u32 flow_id, u16 filter_id)
2711 {
2712 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2713 struct rps_dev_flow_table *flow_table;
2714 struct rps_dev_flow *rflow;
2715 bool expire = true;
2716 int cpu;
2717
2718 rcu_read_lock();
2719 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2720 if (flow_table && flow_id <= flow_table->mask) {
2721 rflow = &flow_table->flows[flow_id];
2722 cpu = ACCESS_ONCE(rflow->cpu);
2723 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2724 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2725 rflow->last_qtail) <
2726 (int)(10 * flow_table->mask)))
2727 expire = false;
2728 }
2729 rcu_read_unlock();
2730 return expire;
2731 }
2732 EXPORT_SYMBOL(rps_may_expire_flow);
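
/*
 * Example (hypothetical driver code, compiled out): a driver implementing
 * ndo_rx_flow_steer() periodically walks its filter table and removes the
 * filters the stack no longer wants.  struct my_priv, struct my_filter and
 * my_hw_remove_filter() are made up for this sketch.
 */
#if 0
#define MY_MAX_FILTERS	128

struct my_filter {
	bool	in_use;
	u16	rxq_index;
	u32	flow_id;
};

struct my_priv {
	struct net_device	*netdev;
	struct my_filter	filters[MY_MAX_FILTERS];
};

static void my_expire_rx_filters(struct my_priv *priv)
{
	int i;

	for (i = 0; i < MY_MAX_FILTERS; i++) {
		struct my_filter *f = &priv->filters[i];

		if (!f->in_use)
			continue;
		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
					f->flow_id, i)) {
			my_hw_remove_filter(priv, i);	/* undo the hw rule */
			f->in_use = false;
		}
	}
}
#endif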
2733
2734 #endif /* CONFIG_RFS_ACCEL */
2735
2736 /* Called from hardirq (IPI) context */
2737 static void rps_trigger_softirq(void *data)
2738 {
2739 struct softnet_data *sd = data;
2740
2741 ____napi_schedule(sd, &sd->backlog);
2742 sd->received_rps++;
2743 }
2744
2745 #endif /* CONFIG_RPS */
2746
2747 /*
2748  * Check if this softnet_data structure belongs to another cpu.
2749  * If so, queue it to our IPI list and return 1;
2750  * if not, return 0.
2751 */
2752 static int rps_ipi_queued(struct softnet_data *sd)
2753 {
2754 #ifdef CONFIG_RPS
2755 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2756
2757 if (sd != mysd) {
2758 sd->rps_ipi_next = mysd->rps_ipi_list;
2759 mysd->rps_ipi_list = sd;
2760
2761 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2762 return 1;
2763 }
2764 #endif /* CONFIG_RPS */
2765 return 0;
2766 }
2767
2768 /*
2769  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2770 * queue (may be a remote CPU queue).
2771 */
2772 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2773 unsigned int *qtail)
2774 {
2775 struct softnet_data *sd;
2776 unsigned long flags;
2777
2778 sd = &per_cpu(softnet_data, cpu);
2779
2780 local_irq_save(flags);
2781
2782 rps_lock(sd);
2783 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2784 if (skb_queue_len(&sd->input_pkt_queue)) {
2785 enqueue:
2786 __skb_queue_tail(&sd->input_pkt_queue, skb);
2787 input_queue_tail_incr_save(sd, qtail);
2788 rps_unlock(sd);
2789 local_irq_restore(flags);
2790 return NET_RX_SUCCESS;
2791 }
2792
2793 /* Schedule NAPI for the backlog device.
2794  * We can use a non-atomic operation since we own the queue lock.
2795 */
2796 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2797 if (!rps_ipi_queued(sd))
2798 ____napi_schedule(sd, &sd->backlog);
2799 }
2800 goto enqueue;
2801 }
2802
2803 sd->dropped++;
2804 rps_unlock(sd);
2805
2806 local_irq_restore(flags);
2807
2808 atomic_long_inc(&skb->dev->rx_dropped);
2809 kfree_skb(skb);
2810 return NET_RX_DROP;
2811 }
2812
2813 /**
2814 * netif_rx - post buffer to the network code
2815 * @skb: buffer to post
2816 *
2817 * This function receives a packet from a device driver and queues it for
2818 * the upper (protocol) levels to process. It always succeeds. The buffer
2819 * may be dropped during processing for congestion control or by the
2820 * protocol layers.
2821 *
2822 * return values:
2823 * NET_RX_SUCCESS (no congestion)
2824 * NET_RX_DROP (packet was dropped)
2825 *
2826 */
2827
2828 int netif_rx(struct sk_buff *skb)
2829 {
2830 int ret;
2831
2832 /* if netpoll wants it, pretend we never saw it */
2833 if (netpoll_rx(skb))
2834 return NET_RX_DROP;
2835
2836 if (netdev_tstamp_prequeue)
2837 net_timestamp_check(skb);
2838
2839 trace_netif_rx(skb);
2840 #ifdef CONFIG_RPS
2841 {
2842 struct rps_dev_flow voidflow, *rflow = &voidflow;
2843 int cpu;
2844
2845 preempt_disable();
2846 rcu_read_lock();
2847
2848 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2849 if (cpu < 0)
2850 cpu = smp_processor_id();
2851
2852 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2853
2854 rcu_read_unlock();
2855 preempt_enable();
2856 }
2857 #else
2858 {
2859 unsigned int qtail;
2860 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2861 put_cpu();
2862 }
2863 #endif
2864 return ret;
2865 }
2866 EXPORT_SYMBOL(netif_rx);
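
/*
 * Example (hypothetical driver code, compiled out): a non-NAPI driver copies
 * one received frame out of the hardware in its interrupt handler and queues
 * it with netif_rx().  The buffer and length are assumed to come from the
 * device.
 */
#if 0
static void my_isr_rx_one(struct net_device *dev, const void *buf,
			  unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */
	netif_rx(skb);

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}
#endif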
2867
2868 int netif_rx_ni(struct sk_buff *skb)
2869 {
2870 int err;
2871
2872 preempt_disable();
2873 err = netif_rx(skb);
2874 if (local_softirq_pending())
2875 do_softirq();
2876 preempt_enable();
2877
2878 return err;
2879 }
2880 EXPORT_SYMBOL(netif_rx_ni);
2881
2882 static void net_tx_action(struct softirq_action *h)
2883 {
2884 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2885
2886 if (sd->completion_queue) {
2887 struct sk_buff *clist;
2888
2889 local_irq_disable();
2890 clist = sd->completion_queue;
2891 sd->completion_queue = NULL;
2892 local_irq_enable();
2893
2894 while (clist) {
2895 struct sk_buff *skb = clist;
2896 clist = clist->next;
2897
2898 WARN_ON(atomic_read(&skb->users));
2899 trace_kfree_skb(skb, net_tx_action);
2900 __kfree_skb(skb);
2901 }
2902 }
2903
2904 if (sd->output_queue) {
2905 struct Qdisc *head;
2906
2907 local_irq_disable();
2908 head = sd->output_queue;
2909 sd->output_queue = NULL;
2910 sd->output_queue_tailp = &sd->output_queue;
2911 local_irq_enable();
2912
2913 while (head) {
2914 struct Qdisc *q = head;
2915 spinlock_t *root_lock;
2916
2917 head = head->next_sched;
2918
2919 root_lock = qdisc_lock(q);
2920 if (spin_trylock(root_lock)) {
2921 smp_mb__before_clear_bit();
2922 clear_bit(__QDISC_STATE_SCHED,
2923 &q->state);
2924 qdisc_run(q);
2925 spin_unlock(root_lock);
2926 } else {
2927 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2928 &q->state)) {
2929 __netif_reschedule(q);
2930 } else {
2931 smp_mb__before_clear_bit();
2932 clear_bit(__QDISC_STATE_SCHED,
2933 &q->state);
2934 }
2935 }
2936 }
2937 }
2938 }
2939
2940 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2941 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2942 /* This hook is defined here for ATM LANE */
2943 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2944 unsigned char *addr) __read_mostly;
2945 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2946 #endif
2947
2948 #ifdef CONFIG_NET_CLS_ACT
2949 /* TODO: Maybe we should just force sch_ingress to be compiled in
2950  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2951  * instructions (a compare and two extra stores) right now if we don't
2952  * have it on but do have CONFIG_NET_CLS_ACT.
2953  * NOTE: This doesn't stop any functionality; if you don't have
2954 * the ingress scheduler, you just can't add policies on ingress.
2955 *
2956 */
2957 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2958 {
2959 struct net_device *dev = skb->dev;
2960 u32 ttl = G_TC_RTTL(skb->tc_verd);
2961 int result = TC_ACT_OK;
2962 struct Qdisc *q;
2963
2964 if (unlikely(MAX_RED_LOOP < ttl++)) {
2965 if (net_ratelimit())
2966 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2967 skb->skb_iif, dev->ifindex);
2968 return TC_ACT_SHOT;
2969 }
2970
2971 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2972 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2973
2974 q = rxq->qdisc;
2975 if (q != &noop_qdisc) {
2976 spin_lock(qdisc_lock(q));
2977 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2978 result = qdisc_enqueue_root(skb, q);
2979 spin_unlock(qdisc_lock(q));
2980 }
2981
2982 return result;
2983 }
2984
2985 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2986 struct packet_type **pt_prev,
2987 int *ret, struct net_device *orig_dev)
2988 {
2989 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2990
2991 if (!rxq || rxq->qdisc == &noop_qdisc)
2992 goto out;
2993
2994 if (*pt_prev) {
2995 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2996 *pt_prev = NULL;
2997 }
2998
2999 switch (ing_filter(skb, rxq)) {
3000 case TC_ACT_SHOT:
3001 case TC_ACT_STOLEN:
3002 kfree_skb(skb);
3003 return NULL;
3004 }
3005
3006 out:
3007 skb->tc_verd = 0;
3008 return skb;
3009 }
3010 #endif
3011
3012 /**
3013 * netdev_rx_handler_register - register receive handler
3014 * @dev: device to register a handler for
3015 * @rx_handler: receive handler to register
3016 * @rx_handler_data: data pointer that is used by rx handler
3017 *
3018  * Register a receive handler for a device. This handler will then be
3019 * called from __netif_receive_skb. A negative errno code is returned
3020 * on a failure.
3021 *
3022 * The caller must hold the rtnl_mutex.
3023 *
3024 * For a general description of rx_handler, see enum rx_handler_result.
3025 */
3026 int netdev_rx_handler_register(struct net_device *dev,
3027 rx_handler_func_t *rx_handler,
3028 void *rx_handler_data)
3029 {
3030 ASSERT_RTNL();
3031
3032 if (dev->rx_handler)
3033 return -EBUSY;
3034
3035 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3036 rcu_assign_pointer(dev->rx_handler, rx_handler);
3037
3038 return 0;
3039 }
3040 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
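
/*
 * Example (hypothetical, compiled out): an rx_handler in the style of
 * bridge/macvlan.  struct my_port is made up; the handler just counts the
 * frame and lets normal processing continue.
 */
#if 0
struct my_port {
	unsigned long	rx_frames;
};

static rx_handler_result_t my_port_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

	port->rx_frames++;
	return RX_HANDLER_PASS;		/* or CONSUMED/ANOTHER/EXACT */
}

static int my_port_attach(struct net_device *dev, struct my_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, my_port_rx_handler, port);
	rtnl_unlock();
	return err;
}
#endif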
3041
3042 /**
3043 * netdev_rx_handler_unregister - unregister receive handler
3044 * @dev: device to unregister a handler from
3045 *
3046  * Unregister a receive handler from a device.
3047 *
3048 * The caller must hold the rtnl_mutex.
3049 */
3050 void netdev_rx_handler_unregister(struct net_device *dev)
3051 {
3052
3053 ASSERT_RTNL();
3054 rcu_assign_pointer(dev->rx_handler, NULL);
3055 rcu_assign_pointer(dev->rx_handler_data, NULL);
3056 }
3057 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3058
3059 static void vlan_on_bond_hook(struct sk_buff *skb)
3060 {
3061 /*
3062 * Make sure ARP frames received on VLAN interfaces stacked on
3063 * bonding interfaces still make their way to any base bonding
3064 * device that may have registered for a specific ptype.
3065 */
3066 if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3067 vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3068 skb->protocol == htons(ETH_P_ARP)) {
3069 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3070
3071 if (!skb2)
3072 return;
3073 skb2->dev = vlan_dev_real_dev(skb->dev);
3074 netif_rx(skb2);
3075 }
3076 }
3077
3078 static int __netif_receive_skb(struct sk_buff *skb)
3079 {
3080 struct packet_type *ptype, *pt_prev;
3081 rx_handler_func_t *rx_handler;
3082 struct net_device *orig_dev;
3083 struct net_device *null_or_dev;
3084 bool deliver_exact = false;
3085 int ret = NET_RX_DROP;
3086 __be16 type;
3087
3088 if (!netdev_tstamp_prequeue)
3089 net_timestamp_check(skb);
3090
3091 trace_netif_receive_skb(skb);
3092
3093 /* if we've gotten here through NAPI, check netpoll */
3094 if (netpoll_receive_skb(skb))
3095 return NET_RX_DROP;
3096
3097 if (!skb->skb_iif)
3098 skb->skb_iif = skb->dev->ifindex;
3099 orig_dev = skb->dev;
3100
3101 skb_reset_network_header(skb);
3102 skb_reset_transport_header(skb);
3103 skb->mac_len = skb->network_header - skb->mac_header;
3104
3105 pt_prev = NULL;
3106
3107 rcu_read_lock();
3108
3109 another_round:
3110
3111 __this_cpu_inc(softnet_data.processed);
3112
3113 #ifdef CONFIG_NET_CLS_ACT
3114 if (skb->tc_verd & TC_NCLS) {
3115 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3116 goto ncls;
3117 }
3118 #endif
3119
3120 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3121 if (!ptype->dev || ptype->dev == skb->dev) {
3122 if (pt_prev)
3123 ret = deliver_skb(skb, pt_prev, orig_dev);
3124 pt_prev = ptype;
3125 }
3126 }
3127
3128 #ifdef CONFIG_NET_CLS_ACT
3129 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3130 if (!skb)
3131 goto out;
3132 ncls:
3133 #endif
3134
3135 rx_handler = rcu_dereference(skb->dev->rx_handler);
3136 if (rx_handler) {
3137 if (pt_prev) {
3138 ret = deliver_skb(skb, pt_prev, orig_dev);
3139 pt_prev = NULL;
3140 }
3141 switch (rx_handler(&skb)) {
3142 case RX_HANDLER_CONSUMED:
3143 goto out;
3144 case RX_HANDLER_ANOTHER:
3145 goto another_round;
3146 case RX_HANDLER_EXACT:
3147 deliver_exact = true;
3148 case RX_HANDLER_PASS:
3149 break;
3150 default:
3151 BUG();
3152 }
3153 }
3154
3155 if (vlan_tx_tag_present(skb)) {
3156 if (pt_prev) {
3157 ret = deliver_skb(skb, pt_prev, orig_dev);
3158 pt_prev = NULL;
3159 }
3160 if (vlan_hwaccel_do_receive(&skb)) {
3161 ret = __netif_receive_skb(skb);
3162 goto out;
3163 } else if (unlikely(!skb))
3164 goto out;
3165 }
3166
3167 vlan_on_bond_hook(skb);
3168
3169 /* deliver only exact match when indicated */
3170 null_or_dev = deliver_exact ? skb->dev : NULL;
3171
3172 type = skb->protocol;
3173 list_for_each_entry_rcu(ptype,
3174 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3175 if (ptype->type == type &&
3176 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3177 ptype->dev == orig_dev)) {
3178 if (pt_prev)
3179 ret = deliver_skb(skb, pt_prev, orig_dev);
3180 pt_prev = ptype;
3181 }
3182 }
3183
3184 if (pt_prev) {
3185 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3186 } else {
3187 atomic_long_inc(&skb->dev->rx_dropped);
3188 kfree_skb(skb);
3189 /* Jamal, now you will not be able to escape explaining
3190  * to me how you were going to use this. :-)
3191 */
3192 ret = NET_RX_DROP;
3193 }
3194
3195 out:
3196 rcu_read_unlock();
3197 return ret;
3198 }
3199
3200 /**
3201 * netif_receive_skb - process receive buffer from network
3202 * @skb: buffer to process
3203 *
3204 * netif_receive_skb() is the main receive data processing function.
3205 * It always succeeds. The buffer may be dropped during processing
3206 * for congestion control or by the protocol layers.
3207 *
3208 * This function may only be called from softirq context and interrupts
3209 * should be enabled.
3210 *
3211 * Return values (usually ignored):
3212 * NET_RX_SUCCESS: no congestion
3213 * NET_RX_DROP: packet was dropped
3214 */
3215 int netif_receive_skb(struct sk_buff *skb)
3216 {
3217 if (netdev_tstamp_prequeue)
3218 net_timestamp_check(skb);
3219
3220 if (skb_defer_rx_timestamp(skb))
3221 return NET_RX_SUCCESS;
3222
3223 #ifdef CONFIG_RPS
3224 {
3225 struct rps_dev_flow voidflow, *rflow = &voidflow;
3226 int cpu, ret;
3227
3228 rcu_read_lock();
3229
3230 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3231
3232 if (cpu >= 0) {
3233 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3234 rcu_read_unlock();
3235 } else {
3236 rcu_read_unlock();
3237 ret = __netif_receive_skb(skb);
3238 }
3239
3240 return ret;
3241 }
3242 #else
3243 return __netif_receive_skb(skb);
3244 #endif
3245 }
3246 EXPORT_SYMBOL(netif_receive_skb);
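
/*
 * Example (hypothetical, compiled out): per-packet delivery from a NAPI
 * ->poll() routine (softirq context), as opposed to netif_rx() from a hard
 * interrupt handler.
 */
#if 0
static void my_deliver(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	netif_receive_skb(skb);
}
#endif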
3247
3248 /* Network device is going away; flush any packets still pending.
3249 * Called with irqs disabled.
3250 */
3251 static void flush_backlog(void *arg)
3252 {
3253 struct net_device *dev = arg;
3254 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3255 struct sk_buff *skb, *tmp;
3256
3257 rps_lock(sd);
3258 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3259 if (skb->dev == dev) {
3260 __skb_unlink(skb, &sd->input_pkt_queue);
3261 kfree_skb(skb);
3262 input_queue_head_incr(sd);
3263 }
3264 }
3265 rps_unlock(sd);
3266
3267 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3268 if (skb->dev == dev) {
3269 __skb_unlink(skb, &sd->process_queue);
3270 kfree_skb(skb);
3271 input_queue_head_incr(sd);
3272 }
3273 }
3274 }
3275
3276 static int napi_gro_complete(struct sk_buff *skb)
3277 {
3278 struct packet_type *ptype;
3279 __be16 type = skb->protocol;
3280 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3281 int err = -ENOENT;
3282
3283 if (NAPI_GRO_CB(skb)->count == 1) {
3284 skb_shinfo(skb)->gso_size = 0;
3285 goto out;
3286 }
3287
3288 rcu_read_lock();
3289 list_for_each_entry_rcu(ptype, head, list) {
3290 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3291 continue;
3292
3293 err = ptype->gro_complete(skb);
3294 break;
3295 }
3296 rcu_read_unlock();
3297
3298 if (err) {
3299 WARN_ON(&ptype->list == head);
3300 kfree_skb(skb);
3301 return NET_RX_SUCCESS;
3302 }
3303
3304 out:
3305 return netif_receive_skb(skb);
3306 }
3307
3308 inline void napi_gro_flush(struct napi_struct *napi)
3309 {
3310 struct sk_buff *skb, *next;
3311
3312 for (skb = napi->gro_list; skb; skb = next) {
3313 next = skb->next;
3314 skb->next = NULL;
3315 napi_gro_complete(skb);
3316 }
3317
3318 napi->gro_count = 0;
3319 napi->gro_list = NULL;
3320 }
3321 EXPORT_SYMBOL(napi_gro_flush);
3322
3323 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3324 {
3325 struct sk_buff **pp = NULL;
3326 struct packet_type *ptype;
3327 __be16 type = skb->protocol;
3328 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3329 int same_flow;
3330 int mac_len;
3331 enum gro_result ret;
3332
3333 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3334 goto normal;
3335
3336 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3337 goto normal;
3338
3339 rcu_read_lock();
3340 list_for_each_entry_rcu(ptype, head, list) {
3341 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3342 continue;
3343
3344 skb_set_network_header(skb, skb_gro_offset(skb));
3345 mac_len = skb->network_header - skb->mac_header;
3346 skb->mac_len = mac_len;
3347 NAPI_GRO_CB(skb)->same_flow = 0;
3348 NAPI_GRO_CB(skb)->flush = 0;
3349 NAPI_GRO_CB(skb)->free = 0;
3350
3351 pp = ptype->gro_receive(&napi->gro_list, skb);
3352 break;
3353 }
3354 rcu_read_unlock();
3355
3356 if (&ptype->list == head)
3357 goto normal;
3358
3359 same_flow = NAPI_GRO_CB(skb)->same_flow;
3360 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3361
3362 if (pp) {
3363 struct sk_buff *nskb = *pp;
3364
3365 *pp = nskb->next;
3366 nskb->next = NULL;
3367 napi_gro_complete(nskb);
3368 napi->gro_count--;
3369 }
3370
3371 if (same_flow)
3372 goto ok;
3373
3374 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3375 goto normal;
3376
3377 napi->gro_count++;
3378 NAPI_GRO_CB(skb)->count = 1;
3379 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3380 skb->next = napi->gro_list;
3381 napi->gro_list = skb;
3382 ret = GRO_HELD;
3383
3384 pull:
3385 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3386 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3387
3388 BUG_ON(skb->end - skb->tail < grow);
3389
3390 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3391
3392 skb->tail += grow;
3393 skb->data_len -= grow;
3394
3395 skb_shinfo(skb)->frags[0].page_offset += grow;
3396 skb_shinfo(skb)->frags[0].size -= grow;
3397
3398 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3399 put_page(skb_shinfo(skb)->frags[0].page);
3400 memmove(skb_shinfo(skb)->frags,
3401 skb_shinfo(skb)->frags + 1,
3402 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3403 }
3404 }
3405
3406 ok:
3407 return ret;
3408
3409 normal:
3410 ret = GRO_NORMAL;
3411 goto pull;
3412 }
3413 EXPORT_SYMBOL(dev_gro_receive);
3414
3415 static inline gro_result_t
3416 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3417 {
3418 struct sk_buff *p;
3419
3420 for (p = napi->gro_list; p; p = p->next) {
3421 unsigned long diffs;
3422
3423 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3424 diffs |= p->vlan_tci ^ skb->vlan_tci;
3425 diffs |= compare_ether_header(skb_mac_header(p),
3426 skb_gro_mac_header(skb));
3427 NAPI_GRO_CB(p)->same_flow = !diffs;
3428 NAPI_GRO_CB(p)->flush = 0;
3429 }
3430
3431 return dev_gro_receive(napi, skb);
3432 }
3433
3434 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3435 {
3436 switch (ret) {
3437 case GRO_NORMAL:
3438 if (netif_receive_skb(skb))
3439 ret = GRO_DROP;
3440 break;
3441
3442 case GRO_DROP:
3443 case GRO_MERGED_FREE:
3444 kfree_skb(skb);
3445 break;
3446
3447 case GRO_HELD:
3448 case GRO_MERGED:
3449 break;
3450 }
3451
3452 return ret;
3453 }
3454 EXPORT_SYMBOL(napi_skb_finish);
3455
3456 void skb_gro_reset_offset(struct sk_buff *skb)
3457 {
3458 NAPI_GRO_CB(skb)->data_offset = 0;
3459 NAPI_GRO_CB(skb)->frag0 = NULL;
3460 NAPI_GRO_CB(skb)->frag0_len = 0;
3461
3462 if (skb->mac_header == skb->tail &&
3463 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3464 NAPI_GRO_CB(skb)->frag0 =
3465 page_address(skb_shinfo(skb)->frags[0].page) +
3466 skb_shinfo(skb)->frags[0].page_offset;
3467 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3468 }
3469 }
3470 EXPORT_SYMBOL(skb_gro_reset_offset);
3471
3472 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3473 {
3474 skb_gro_reset_offset(skb);
3475
3476 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3477 }
3478 EXPORT_SYMBOL(napi_gro_receive);
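
/*
 * Example (hypothetical, compiled out): a GRO-aware driver delivers from its
 * ->poll() routine through napi_gro_receive() instead of
 * netif_receive_skb(), letting the stack coalesce consecutive segments of a
 * flow before they go up the stack.
 */
#if 0
static void my_deliver_gro(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}
#endif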
3479
3480 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3481 {
3482 __skb_pull(skb, skb_headlen(skb));
3483 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3484 skb->vlan_tci = 0;
3485 skb->dev = napi->dev;
3486 skb->skb_iif = 0;
3487
3488 napi->skb = skb;
3489 }
3490
3491 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3492 {
3493 struct sk_buff *skb = napi->skb;
3494
3495 if (!skb) {
3496 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3497 if (skb)
3498 napi->skb = skb;
3499 }
3500 return skb;
3501 }
3502 EXPORT_SYMBOL(napi_get_frags);
3503
3504 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3505 gro_result_t ret)
3506 {
3507 switch (ret) {
3508 case GRO_NORMAL:
3509 case GRO_HELD:
3510 skb->protocol = eth_type_trans(skb, skb->dev);
3511
3512 if (ret == GRO_HELD)
3513 skb_gro_pull(skb, -ETH_HLEN);
3514 else if (netif_receive_skb(skb))
3515 ret = GRO_DROP;
3516 break;
3517
3518 case GRO_DROP:
3519 case GRO_MERGED_FREE:
3520 napi_reuse_skb(napi, skb);
3521 break;
3522
3523 case GRO_MERGED:
3524 break;
3525 }
3526
3527 return ret;
3528 }
3529 EXPORT_SYMBOL(napi_frags_finish);
3530
3531 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3532 {
3533 struct sk_buff *skb = napi->skb;
3534 struct ethhdr *eth;
3535 unsigned int hlen;
3536 unsigned int off;
3537
3538 napi->skb = NULL;
3539
3540 skb_reset_mac_header(skb);
3541 skb_gro_reset_offset(skb);
3542
3543 off = skb_gro_offset(skb);
3544 hlen = off + sizeof(*eth);
3545 eth = skb_gro_header_fast(skb, off);
3546 if (skb_gro_header_hard(skb, hlen)) {
3547 eth = skb_gro_header_slow(skb, hlen, off);
3548 if (unlikely(!eth)) {
3549 napi_reuse_skb(napi, skb);
3550 skb = NULL;
3551 goto out;
3552 }
3553 }
3554
3555 skb_gro_pull(skb, sizeof(*eth));
3556
3557 /*
3558 * This works because the only protocols we care about don't require
3559 * special handling. We'll fix it up properly at the end.
3560 */
3561 skb->protocol = eth->h_proto;
3562
3563 out:
3564 return skb;
3565 }
3566 EXPORT_SYMBOL(napi_frags_skb);
3567
3568 gro_result_t napi_gro_frags(struct napi_struct *napi)
3569 {
3570 struct sk_buff *skb = napi_frags_skb(napi);
3571
3572 if (!skb)
3573 return GRO_DROP;
3574
3575 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3576 }
3577 EXPORT_SYMBOL(napi_gro_frags);
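
/*
 * Example (hypothetical driver code, compiled out): the page-based GRO
 * receive path.  A driver that DMAs directly into pages attaches the
 * fragment to the skb from napi_get_frags() and calls napi_gro_frags();
 * napi_frags_skb() above then pulls the Ethernet header.  page/offset/len
 * are assumed to describe the received data.
 */
#if 0
static void my_rx_page(struct napi_struct *napi, struct page *page,
		       unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(page);		/* drop on allocation failure */
		return;
	}

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += len;

	napi_gro_frags(napi);
}
#endif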
3578
3579 /*
3580  * net_rps_action sends any pending IPIs for RPS.
3581 * Note: called with local irq disabled, but exits with local irq enabled.
3582 */
3583 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3584 {
3585 #ifdef CONFIG_RPS
3586 struct softnet_data *remsd = sd->rps_ipi_list;
3587
3588 if (remsd) {
3589 sd->rps_ipi_list = NULL;
3590
3591 local_irq_enable();
3592
3593 /* Send pending IPI's to kick RPS processing on remote cpus. */
3594 while (remsd) {
3595 struct softnet_data *next = remsd->rps_ipi_next;
3596
3597 if (cpu_online(remsd->cpu))
3598 __smp_call_function_single(remsd->cpu,
3599 &remsd->csd, 0);
3600 remsd = next;
3601 }
3602 } else
3603 #endif
3604 local_irq_enable();
3605 }
3606
3607 static int process_backlog(struct napi_struct *napi, int quota)
3608 {
3609 int work = 0;
3610 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3611
3612 #ifdef CONFIG_RPS
3613 	/* Check if we have pending IPIs; it's better to send them now
3614 	 * than to wait until net_rx_action() ends.
3615 */
3616 if (sd->rps_ipi_list) {
3617 local_irq_disable();
3618 net_rps_action_and_irq_enable(sd);
3619 }
3620 #endif
3621 napi->weight = weight_p;
3622 local_irq_disable();
3623 while (work < quota) {
3624 struct sk_buff *skb;
3625 unsigned int qlen;
3626
3627 while ((skb = __skb_dequeue(&sd->process_queue))) {
3628 local_irq_enable();
3629 __netif_receive_skb(skb);
3630 local_irq_disable();
3631 input_queue_head_incr(sd);
3632 if (++work >= quota) {
3633 local_irq_enable();
3634 return work;
3635 }
3636 }
3637
3638 rps_lock(sd);
3639 qlen = skb_queue_len(&sd->input_pkt_queue);
3640 if (qlen)
3641 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3642 &sd->process_queue);
3643
3644 if (qlen < quota - work) {
3645 /*
3646 * Inline a custom version of __napi_complete().
3647  * Only the current cpu owns and manipulates this napi,
3648  * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3649  * We can use a plain write instead of clear_bit(),
3650  * and we don't need an smp_mb() memory barrier.
3651 */
3652 list_del(&napi->poll_list);
3653 napi->state = 0;
3654
3655 quota = work + qlen;
3656 }
3657 rps_unlock(sd);
3658 }
3659 local_irq_enable();
3660
3661 return work;
3662 }
3663
3664 /**
3665 * __napi_schedule - schedule for receive
3666 * @n: entry to schedule
3667 *
3668 * The entry's receive function will be scheduled to run
3669 */
3670 void __napi_schedule(struct napi_struct *n)
3671 {
3672 unsigned long flags;
3673
3674 local_irq_save(flags);
3675 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3676 local_irq_restore(flags);
3677 }
3678 EXPORT_SYMBOL(__napi_schedule);
3679
3680 void __napi_complete(struct napi_struct *n)
3681 {
3682 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3683 BUG_ON(n->gro_list);
3684
3685 list_del(&n->poll_list);
3686 smp_mb__before_clear_bit();
3687 clear_bit(NAPI_STATE_SCHED, &n->state);
3688 }
3689 EXPORT_SYMBOL(__napi_complete);
3690
3691 void napi_complete(struct napi_struct *n)
3692 {
3693 unsigned long flags;
3694
3695 /*
3696 * don't let napi dequeue from the cpu poll list
3697  * just in case it's running on a different cpu
3698 */
3699 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3700 return;
3701
3702 napi_gro_flush(n);
3703 local_irq_save(flags);
3704 __napi_complete(n);
3705 local_irq_restore(flags);
3706 }
3707 EXPORT_SYMBOL(napi_complete);
3708
3709 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3710 int (*poll)(struct napi_struct *, int), int weight)
3711 {
3712 INIT_LIST_HEAD(&napi->poll_list);
3713 napi->gro_count = 0;
3714 napi->gro_list = NULL;
3715 napi->skb = NULL;
3716 napi->poll = poll;
3717 napi->weight = weight;
3718 list_add(&napi->dev_list, &dev->napi_list);
3719 napi->dev = dev;
3720 #ifdef CONFIG_NETPOLL
3721 spin_lock_init(&napi->poll_lock);
3722 napi->poll_owner = -1;
3723 #endif
3724 set_bit(NAPI_STATE_SCHED, &napi->state);
3725 }
3726 EXPORT_SYMBOL(netif_napi_add);
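
/*
 * Example (hypothetical driver code, compiled out): the usual NAPI life
 * cycle around netif_napi_add().  Everything prefixed my_ is made up, and
 * error handling plus the real hardware access are omitted.
 */
#if 0
struct my_priv {
	struct napi_struct	napi;
	struct net_device	*netdev;
};

static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int work = my_clean_rx_ring(priv, budget);	/* delivers skbs upstream */

	if (work < budget) {
		napi_complete(napi);
		my_enable_rx_irq(priv);			/* re-arm the interrupt */
	}
	return work;
}

static irqreturn_t my_interrupt(int irq, void *data)
{
	struct my_priv *priv = data;

	my_disable_rx_irq(priv);
	napi_schedule(&priv->napi);	/* my_poll() will run from softirq */
	return IRQ_HANDLED;
}

static void my_setup(struct my_priv *priv)
{
	netif_napi_add(priv->netdev, &priv->napi, my_poll, 64);
	/* napi_enable(&priv->napi) is then done from ndo_open */
}
#endif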
3727
3728 void netif_napi_del(struct napi_struct *napi)
3729 {
3730 struct sk_buff *skb, *next;
3731
3732 list_del_init(&napi->dev_list);
3733 napi_free_frags(napi);
3734
3735 for (skb = napi->gro_list; skb; skb = next) {
3736 next = skb->next;
3737 skb->next = NULL;
3738 kfree_skb(skb);
3739 }
3740
3741 napi->gro_list = NULL;
3742 napi->gro_count = 0;
3743 }
3744 EXPORT_SYMBOL(netif_napi_del);
3745
3746 static void net_rx_action(struct softirq_action *h)
3747 {
3748 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3749 unsigned long time_limit = jiffies + 2;
3750 int budget = netdev_budget;
3751 void *have;
3752
3753 local_irq_disable();
3754
3755 while (!list_empty(&sd->poll_list)) {
3756 struct napi_struct *n;
3757 int work, weight;
3758
3759 		/* If the softirq window is exhausted then punt.
3760 		 * Allow this to run for 2 jiffies, which allows
3761 * an average latency of 1.5/HZ.
3762 */
3763 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3764 goto softnet_break;
3765
3766 local_irq_enable();
3767
3768 /* Even though interrupts have been re-enabled, this
3769 * access is safe because interrupts can only add new
3770 * entries to the tail of this list, and only ->poll()
3771 * calls can remove this head entry from the list.
3772 */
3773 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3774
3775 have = netpoll_poll_lock(n);
3776
3777 weight = n->weight;
3778
3779 /* This NAPI_STATE_SCHED test is for avoiding a race
3780 * with netpoll's poll_napi(). Only the entity which
3781 * obtains the lock and sees NAPI_STATE_SCHED set will
3782 * actually make the ->poll() call. Therefore we avoid
3783 * accidentally calling ->poll() when NAPI is not scheduled.
3784 */
3785 work = 0;
3786 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3787 work = n->poll(n, weight);
3788 trace_napi_poll(n);
3789 }
3790
3791 WARN_ON_ONCE(work > weight);
3792
3793 budget -= work;
3794
3795 local_irq_disable();
3796
3797 /* Drivers must not modify the NAPI state if they
3798 * consume the entire weight. In such cases this code
3799 * still "owns" the NAPI instance and therefore can
3800 * move the instance around on the list at-will.
3801 */
3802 if (unlikely(work == weight)) {
3803 if (unlikely(napi_disable_pending(n))) {
3804 local_irq_enable();
3805 napi_complete(n);
3806 local_irq_disable();
3807 } else
3808 list_move_tail(&n->poll_list, &sd->poll_list);
3809 }
3810
3811 netpoll_poll_unlock(have);
3812 }
3813 out:
3814 net_rps_action_and_irq_enable(sd);
3815
3816 #ifdef CONFIG_NET_DMA
3817 /*
3818 * There may not be any more sk_buffs coming right now, so push
3819 * any pending DMA copies to hardware
3820 */
3821 dma_issue_pending_all();
3822 #endif
3823
3824 return;
3825
3826 softnet_break:
3827 sd->time_squeeze++;
3828 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3829 goto out;
3830 }
3831
3832 static gifconf_func_t *gifconf_list[NPROTO];
3833
3834 /**
3835 * register_gifconf - register a SIOCGIF handler
3836 * @family: Address family
3837 * @gifconf: Function handler
3838 *
3839 * Register protocol dependent address dumping routines. The handler
3840 * that is passed must not be freed or reused until it has been replaced
3841 * by another handler.
3842 */
3843 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3844 {
3845 if (family >= NPROTO)
3846 return -EINVAL;
3847 gifconf_list[family] = gifconf;
3848 return 0;
3849 }
3850 EXPORT_SYMBOL(register_gifconf);
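
/*
 * Example (illustrative, compiled out): address families install their
 * SIOCGIFCONF helper at init time.  IPv4, for instance, does roughly the
 * following (inet_gifconf() lives in net/ipv4/devinet.c).
 */
#if 0
	register_gifconf(PF_INET, inet_gifconf);
#endif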
3851
3852
3853 /*
3854 * Map an interface index to its name (SIOCGIFNAME)
3855 */
3856
3857 /*
3858 * We need this ioctl for efficient implementation of the
3859 * if_indextoname() function required by the IPv6 API. Without
3860 * it, we would have to search all the interfaces to find a
3861 * match. --pb
3862 */
3863
3864 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3865 {
3866 struct net_device *dev;
3867 struct ifreq ifr;
3868
3869 /*
3870 * Fetch the caller's info block.
3871 */
3872
3873 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3874 return -EFAULT;
3875
3876 rcu_read_lock();
3877 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3878 if (!dev) {
3879 rcu_read_unlock();
3880 return -ENODEV;
3881 }
3882
3883 strcpy(ifr.ifr_name, dev->name);
3884 rcu_read_unlock();
3885
3886 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3887 return -EFAULT;
3888 return 0;
3889 }
3890
3891 /*
3892 * Perform a SIOCGIFCONF call. This structure will change
3893 * size eventually, and there is nothing I can do about it.
3894 * Thus we will need a 'compatibility mode'.
3895 */
3896
3897 static int dev_ifconf(struct net *net, char __user *arg)
3898 {
3899 struct ifconf ifc;
3900 struct net_device *dev;
3901 char __user *pos;
3902 int len;
3903 int total;
3904 int i;
3905
3906 /*
3907 * Fetch the caller's info block.
3908 */
3909
3910 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3911 return -EFAULT;
3912
3913 pos = ifc.ifc_buf;
3914 len = ifc.ifc_len;
3915
3916 /*
3917 * Loop over the interfaces, and write an info block for each.
3918 */
3919
3920 total = 0;
3921 for_each_netdev(net, dev) {
3922 for (i = 0; i < NPROTO; i++) {
3923 if (gifconf_list[i]) {
3924 int done;
3925 if (!pos)
3926 done = gifconf_list[i](dev, NULL, 0);
3927 else
3928 done = gifconf_list[i](dev, pos + total,
3929 len - total);
3930 if (done < 0)
3931 return -EFAULT;
3932 total += done;
3933 }
3934 }
3935 }
3936
3937 /*
3938 * All done. Write the updated control block back to the caller.
3939 */
3940 ifc.ifc_len = total;
3941
3942 /*
3943 * Both BSD and Solaris return 0 here, so we do too.
3944 */
3945 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3946 }
3947
3948 #ifdef CONFIG_PROC_FS
3949 /*
3950 * This is invoked by the /proc filesystem handler to display a device
3951 * in detail.
3952 */
3953 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3954 __acquires(RCU)
3955 {
3956 struct net *net = seq_file_net(seq);
3957 loff_t off;
3958 struct net_device *dev;
3959
3960 rcu_read_lock();
3961 if (!*pos)
3962 return SEQ_START_TOKEN;
3963
3964 off = 1;
3965 for_each_netdev_rcu(net, dev)
3966 if (off++ == *pos)
3967 return dev;
3968
3969 return NULL;
3970 }
3971
3972 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3973 {
3974 struct net_device *dev = v;
3975
3976 if (v == SEQ_START_TOKEN)
3977 dev = first_net_device_rcu(seq_file_net(seq));
3978 else
3979 dev = next_net_device_rcu(dev);
3980
3981 ++*pos;
3982 return dev;
3983 }
3984
3985 void dev_seq_stop(struct seq_file *seq, void *v)
3986 __releases(RCU)
3987 {
3988 rcu_read_unlock();
3989 }
3990
3991 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3992 {
3993 struct rtnl_link_stats64 temp;
3994 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3995
3996 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3997 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3998 dev->name, stats->rx_bytes, stats->rx_packets,
3999 stats->rx_errors,
4000 stats->rx_dropped + stats->rx_missed_errors,
4001 stats->rx_fifo_errors,
4002 stats->rx_length_errors + stats->rx_over_errors +
4003 stats->rx_crc_errors + stats->rx_frame_errors,
4004 stats->rx_compressed, stats->multicast,
4005 stats->tx_bytes, stats->tx_packets,
4006 stats->tx_errors, stats->tx_dropped,
4007 stats->tx_fifo_errors, stats->collisions,
4008 stats->tx_carrier_errors +
4009 stats->tx_aborted_errors +
4010 stats->tx_window_errors +
4011 stats->tx_heartbeat_errors,
4012 stats->tx_compressed);
4013 }
4014
4015 /*
4016 * Called from the PROCfs module. This now uses the new arbitrary sized
4017 * /proc/net interface to create /proc/net/dev
4018 */
4019 static int dev_seq_show(struct seq_file *seq, void *v)
4020 {
4021 if (v == SEQ_START_TOKEN)
4022 seq_puts(seq, "Inter-| Receive "
4023 " | Transmit\n"
4024 " face |bytes packets errs drop fifo frame "
4025 "compressed multicast|bytes packets errs "
4026 "drop fifo colls carrier compressed\n");
4027 else
4028 dev_seq_printf_stats(seq, v);
4029 return 0;
4030 }
4031
4032 static struct softnet_data *softnet_get_online(loff_t *pos)
4033 {
4034 struct softnet_data *sd = NULL;
4035
4036 while (*pos < nr_cpu_ids)
4037 if (cpu_online(*pos)) {
4038 sd = &per_cpu(softnet_data, *pos);
4039 break;
4040 } else
4041 ++*pos;
4042 return sd;
4043 }
4044
4045 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4046 {
4047 return softnet_get_online(pos);
4048 }
4049
4050 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4051 {
4052 ++*pos;
4053 return softnet_get_online(pos);
4054 }
4055
4056 static void softnet_seq_stop(struct seq_file *seq, void *v)
4057 {
4058 }
4059
4060 static int softnet_seq_show(struct seq_file *seq, void *v)
4061 {
4062 struct softnet_data *sd = v;
4063
4064 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4065 sd->processed, sd->dropped, sd->time_squeeze, 0,
4066 0, 0, 0, 0, /* was fastroute */
4067 sd->cpu_collision, sd->received_rps);
4068 return 0;
4069 }
4070
4071 static const struct seq_operations dev_seq_ops = {
4072 .start = dev_seq_start,
4073 .next = dev_seq_next,
4074 .stop = dev_seq_stop,
4075 .show = dev_seq_show,
4076 };
4077
4078 static int dev_seq_open(struct inode *inode, struct file *file)
4079 {
4080 return seq_open_net(inode, file, &dev_seq_ops,
4081 sizeof(struct seq_net_private));
4082 }
4083
4084 static const struct file_operations dev_seq_fops = {
4085 .owner = THIS_MODULE,
4086 .open = dev_seq_open,
4087 .read = seq_read,
4088 .llseek = seq_lseek,
4089 .release = seq_release_net,
4090 };
4091
4092 static const struct seq_operations softnet_seq_ops = {
4093 .start = softnet_seq_start,
4094 .next = softnet_seq_next,
4095 .stop = softnet_seq_stop,
4096 .show = softnet_seq_show,
4097 };
4098
4099 static int softnet_seq_open(struct inode *inode, struct file *file)
4100 {
4101 return seq_open(file, &softnet_seq_ops);
4102 }
4103
4104 static const struct file_operations softnet_seq_fops = {
4105 .owner = THIS_MODULE,
4106 .open = softnet_seq_open,
4107 .read = seq_read,
4108 .llseek = seq_lseek,
4109 .release = seq_release,
4110 };
4111
4112 static void *ptype_get_idx(loff_t pos)
4113 {
4114 struct packet_type *pt = NULL;
4115 loff_t i = 0;
4116 int t;
4117
4118 list_for_each_entry_rcu(pt, &ptype_all, list) {
4119 if (i == pos)
4120 return pt;
4121 ++i;
4122 }
4123
4124 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4125 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4126 if (i == pos)
4127 return pt;
4128 ++i;
4129 }
4130 }
4131 return NULL;
4132 }
4133
4134 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4135 __acquires(RCU)
4136 {
4137 rcu_read_lock();
4138 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4139 }
4140
4141 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4142 {
4143 struct packet_type *pt;
4144 struct list_head *nxt;
4145 int hash;
4146
4147 ++*pos;
4148 if (v == SEQ_START_TOKEN)
4149 return ptype_get_idx(0);
4150
4151 pt = v;
4152 nxt = pt->list.next;
4153 if (pt->type == htons(ETH_P_ALL)) {
4154 if (nxt != &ptype_all)
4155 goto found;
4156 hash = 0;
4157 nxt = ptype_base[0].next;
4158 } else
4159 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4160
4161 while (nxt == &ptype_base[hash]) {
4162 if (++hash >= PTYPE_HASH_SIZE)
4163 return NULL;
4164 nxt = ptype_base[hash].next;
4165 }
4166 found:
4167 return list_entry(nxt, struct packet_type, list);
4168 }
4169
4170 static void ptype_seq_stop(struct seq_file *seq, void *v)
4171 __releases(RCU)
4172 {
4173 rcu_read_unlock();
4174 }
4175
4176 static int ptype_seq_show(struct seq_file *seq, void *v)
4177 {
4178 struct packet_type *pt = v;
4179
4180 if (v == SEQ_START_TOKEN)
4181 seq_puts(seq, "Type Device Function\n");
4182 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4183 if (pt->type == htons(ETH_P_ALL))
4184 seq_puts(seq, "ALL ");
4185 else
4186 seq_printf(seq, "%04x", ntohs(pt->type));
4187
4188 seq_printf(seq, " %-8s %pF\n",
4189 pt->dev ? pt->dev->name : "", pt->func);
4190 }
4191
4192 return 0;
4193 }
4194
4195 static const struct seq_operations ptype_seq_ops = {
4196 .start = ptype_seq_start,
4197 .next = ptype_seq_next,
4198 .stop = ptype_seq_stop,
4199 .show = ptype_seq_show,
4200 };
4201
4202 static int ptype_seq_open(struct inode *inode, struct file *file)
4203 {
4204 return seq_open_net(inode, file, &ptype_seq_ops,
4205 sizeof(struct seq_net_private));
4206 }
4207
4208 static const struct file_operations ptype_seq_fops = {
4209 .owner = THIS_MODULE,
4210 .open = ptype_seq_open,
4211 .read = seq_read,
4212 .llseek = seq_lseek,
4213 .release = seq_release_net,
4214 };
4215
4216
4217 static int __net_init dev_proc_net_init(struct net *net)
4218 {
4219 int rc = -ENOMEM;
4220
4221 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4222 goto out;
4223 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4224 goto out_dev;
4225 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4226 goto out_softnet;
4227
4228 if (wext_proc_init(net))
4229 goto out_ptype;
4230 rc = 0;
4231 out:
4232 return rc;
4233 out_ptype:
4234 proc_net_remove(net, "ptype");
4235 out_softnet:
4236 proc_net_remove(net, "softnet_stat");
4237 out_dev:
4238 proc_net_remove(net, "dev");
4239 goto out;
4240 }
4241
4242 static void __net_exit dev_proc_net_exit(struct net *net)
4243 {
4244 wext_proc_exit(net);
4245
4246 proc_net_remove(net, "ptype");
4247 proc_net_remove(net, "softnet_stat");
4248 proc_net_remove(net, "dev");
4249 }
4250
4251 static struct pernet_operations __net_initdata dev_proc_ops = {
4252 .init = dev_proc_net_init,
4253 .exit = dev_proc_net_exit,
4254 };
4255
4256 static int __init dev_proc_init(void)
4257 {
4258 return register_pernet_subsys(&dev_proc_ops);
4259 }
4260 #else
4261 #define dev_proc_init() 0
4262 #endif /* CONFIG_PROC_FS */
4263
4264
4265 /**
4266 * netdev_set_master - set up master pointer
4267 * @slave: slave device
4268 * @master: new master device
4269 *
4270 * Changes the master device of the slave. Pass %NULL to break the
4271 * bonding. The caller must hold the RTNL semaphore. On a failure
4272 * a negative errno code is returned. On success the reference counts
4273 * are adjusted and the function returns zero.
4274 */
4275 int netdev_set_master(struct net_device *slave, struct net_device *master)
4276 {
4277 struct net_device *old = slave->master;
4278
4279 ASSERT_RTNL();
4280
4281 if (master) {
4282 if (old)
4283 return -EBUSY;
4284 dev_hold(master);
4285 }
4286
4287 slave->master = master;
4288
4289 if (old) {
4290 synchronize_net();
4291 dev_put(old);
4292 }
4293 return 0;
4294 }
4295 EXPORT_SYMBOL(netdev_set_master);
4296
4297 /**
4298 * netdev_set_bond_master - set up bonding master/slave pair
4299 * @slave: slave device
4300 * @master: new master device
4301 *
4302 * Changes the master device of the slave. Pass %NULL to break the
4303 * bonding. The caller must hold the RTNL semaphore. On a failure
4304 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4305 * to the routing socket and the function returns zero.
4306 */
4307 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4308 {
4309 int err;
4310
4311 ASSERT_RTNL();
4312
4313 err = netdev_set_master(slave, master);
4314 if (err)
4315 return err;
4316 if (master)
4317 slave->flags |= IFF_SLAVE;
4318 else
4319 slave->flags &= ~IFF_SLAVE;
4320
4321 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4322 return 0;
4323 }
4324 EXPORT_SYMBOL(netdev_set_bond_master);
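
/*
 * Example (illustrative sketch): how a bonding-style driver might use
 * netdev_set_bond_master() when enslaving a device. The function name is
 * hypothetical; only the RTNL requirement and the call itself come from
 * the helpers above.
 */
static int __maybe_unused example_enslave(struct net_device *bond_dev,
					  struct net_device *slave_dev)
{
	int err;

	ASSERT_RTNL();		/* netdev_set_bond_master() must run under RTNL */

	err = netdev_set_bond_master(slave_dev, bond_dev);
	if (err)
		return err;

	/* a real driver would set up its own per-slave state here */
	return 0;
}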
4325
4326 static void dev_change_rx_flags(struct net_device *dev, int flags)
4327 {
4328 const struct net_device_ops *ops = dev->netdev_ops;
4329
4330 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4331 ops->ndo_change_rx_flags(dev, flags);
4332 }
4333
4334 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4335 {
4336 unsigned short old_flags = dev->flags;
4337 uid_t uid;
4338 gid_t gid;
4339
4340 ASSERT_RTNL();
4341
4342 dev->flags |= IFF_PROMISC;
4343 dev->promiscuity += inc;
4344 if (dev->promiscuity == 0) {
4345 /*
4346 * Avoid overflow.
4347 * If inc causes overflow, untouch promisc and return error.
4348 */
4349 if (inc < 0)
4350 dev->flags &= ~IFF_PROMISC;
4351 else {
4352 dev->promiscuity -= inc;
4353 printk(KERN_WARNING "%s: promiscuity touches roof, "
4354 "set promiscuity failed, promiscuity feature "
4355 "of device might be broken.\n", dev->name);
4356 return -EOVERFLOW;
4357 }
4358 }
4359 if (dev->flags != old_flags) {
4360 printk(KERN_INFO "device %s %s promiscuous mode\n",
4361 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4362 "left");
4363 if (audit_enabled) {
4364 current_uid_gid(&uid, &gid);
4365 audit_log(current->audit_context, GFP_ATOMIC,
4366 AUDIT_ANOM_PROMISCUOUS,
4367 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4368 dev->name, (dev->flags & IFF_PROMISC),
4369 (old_flags & IFF_PROMISC),
4370 audit_get_loginuid(current),
4371 uid, gid,
4372 audit_get_sessionid(current));
4373 }
4374
4375 dev_change_rx_flags(dev, IFF_PROMISC);
4376 }
4377 return 0;
4378 }
4379
4380 /**
4381 * dev_set_promiscuity - update promiscuity count on a device
4382 * @dev: device
4383 * @inc: modifier
4384 *
4385 * Add or remove promiscuity from a device. While the count in the device
4386 * remains above zero the interface remains promiscuous. Once it hits zero
4387 * the device reverts back to normal filtering operation. A negative inc
4388 * value is used to drop promiscuity on the device.
4389 * Return 0 if successful or a negative errno code on error.
4390 */
4391 int dev_set_promiscuity(struct net_device *dev, int inc)
4392 {
4393 unsigned short old_flags = dev->flags;
4394 int err;
4395
4396 err = __dev_set_promiscuity(dev, inc);
4397 if (err < 0)
4398 return err;
4399 if (dev->flags != old_flags)
4400 dev_set_rx_mode(dev);
4401 return err;
4402 }
4403 EXPORT_SYMBOL(dev_set_promiscuity);
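
/*
 * Example (illustrative sketch): a packet-capture style caller bumping and
 * dropping the promiscuity count. The function names are hypothetical;
 * dev_set_promiscuity() must be called with the RTNL held (see the
 * ASSERT_RTNL() in __dev_set_promiscuity()).
 */
static int __maybe_unused example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* +1 on the promiscuity count */
	rtnl_unlock();
	return err;
}

static void __maybe_unused example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* release our reference */
	rtnl_unlock();
}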
4404
4405 /**
4406 * dev_set_allmulti - update allmulti count on a device
4407 * @dev: device
4408 * @inc: modifier
4409 *
4410 * Add or remove reception of all multicast frames to a device. While the
4411 * count in the device remains above zero the interface keeps listening
4412 * to all multicast frames. Once it hits zero the device reverts to normal
4413 * filtering operation. A negative @inc value is used to drop the counter
4414 * when releasing a resource needing all multicasts.
4415 * Return 0 if successful or a negative errno code on error.
4416 */
4417
4418 int dev_set_allmulti(struct net_device *dev, int inc)
4419 {
4420 unsigned short old_flags = dev->flags;
4421
4422 ASSERT_RTNL();
4423
4424 dev->flags |= IFF_ALLMULTI;
4425 dev->allmulti += inc;
4426 if (dev->allmulti == 0) {
4427 /*
4428 * Avoid overflow.
4429 * If inc causes overflow, untouch allmulti and return error.
4430 */
4431 if (inc < 0)
4432 dev->flags &= ~IFF_ALLMULTI;
4433 else {
4434 dev->allmulti -= inc;
4435 printk(KERN_WARNING "%s: allmulti touches roof, "
4436 "set allmulti failed, allmulti feature of "
4437 "device might be broken.\n", dev->name);
4438 return -EOVERFLOW;
4439 }
4440 }
4441 if (dev->flags ^ old_flags) {
4442 dev_change_rx_flags(dev, IFF_ALLMULTI);
4443 dev_set_rx_mode(dev);
4444 }
4445 return 0;
4446 }
4447 EXPORT_SYMBOL(dev_set_allmulti);
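
/*
 * Example (illustrative sketch): a multicast routing component asking the
 * device to receive all multicast frames while it is active. The function
 * name is hypothetical; as with promiscuity, the count must be balanced
 * and the RTNL must be held.
 */
static int __maybe_unused example_allmulti_set(struct net_device *dev, bool on)
{
	int err;

	rtnl_lock();
	err = dev_set_allmulti(dev, on ? 1 : -1);
	rtnl_unlock();
	return err;
}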
4448
4449 /*
4450 * Upload unicast and multicast address lists to device and
4451 * configure RX filtering. When the device doesn't support unicast
4452 * filtering it is put in promiscuous mode while unicast addresses
4453 * are present.
4454 */
4455 void __dev_set_rx_mode(struct net_device *dev)
4456 {
4457 const struct net_device_ops *ops = dev->netdev_ops;
4458
4459 /* dev_open will call this function so the list will stay sane. */
4460 if (!(dev->flags&IFF_UP))
4461 return;
4462
4463 if (!netif_device_present(dev))
4464 return;
4465
4466 if (ops->ndo_set_rx_mode)
4467 ops->ndo_set_rx_mode(dev);
4468 else {
4469 /* Unicast address changes may only happen under the rtnl,
4470 * therefore calling __dev_set_promiscuity here is safe.
4471 */
4472 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4473 __dev_set_promiscuity(dev, 1);
4474 dev->uc_promisc = 1;
4475 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4476 __dev_set_promiscuity(dev, -1);
4477 dev->uc_promisc = 0;
4478 }
4479
4480 if (ops->ndo_set_multicast_list)
4481 ops->ndo_set_multicast_list(dev);
4482 }
4483 }
4484
4485 void dev_set_rx_mode(struct net_device *dev)
4486 {
4487 netif_addr_lock_bh(dev);
4488 __dev_set_rx_mode(dev);
4489 netif_addr_unlock_bh(dev);
4490 }
4491
4492 /**
4493 * dev_get_flags - get flags reported to userspace
4494 * @dev: device
4495 *
4496 * Get the combination of flag bits exported through APIs to userspace.
4497 */
4498 unsigned dev_get_flags(const struct net_device *dev)
4499 {
4500 unsigned flags;
4501
4502 flags = (dev->flags & ~(IFF_PROMISC |
4503 IFF_ALLMULTI |
4504 IFF_RUNNING |
4505 IFF_LOWER_UP |
4506 IFF_DORMANT)) |
4507 (dev->gflags & (IFF_PROMISC |
4508 IFF_ALLMULTI));
4509
4510 if (netif_running(dev)) {
4511 if (netif_oper_up(dev))
4512 flags |= IFF_RUNNING;
4513 if (netif_carrier_ok(dev))
4514 flags |= IFF_LOWER_UP;
4515 if (netif_dormant(dev))
4516 flags |= IFF_DORMANT;
4517 }
4518
4519 return flags;
4520 }
4521 EXPORT_SYMBOL(dev_get_flags);
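
/*
 * Example (illustrative sketch): a caller testing the userspace-visible
 * flag combination built by dev_get_flags(). The helper name is
 * hypothetical; it assumes the caller already holds a valid reference to
 * @dev.
 */
static bool __maybe_unused example_link_is_usable(const struct net_device *dev)
{
	unsigned flags = dev_get_flags(dev);

	/* administratively up and operationally running */
	return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}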
4522
4523 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4524 {
4525 int old_flags = dev->flags;
4526 int ret;
4527
4528 ASSERT_RTNL();
4529
4530 /*
4531 * Set the flags on our device.
4532 */
4533
4534 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4535 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4536 IFF_AUTOMEDIA)) |
4537 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4538 IFF_ALLMULTI));
4539
4540 /*
4541 * Load in the correct multicast list now the flags have changed.
4542 */
4543
4544 if ((old_flags ^ flags) & IFF_MULTICAST)
4545 dev_change_rx_flags(dev, IFF_MULTICAST);
4546
4547 dev_set_rx_mode(dev);
4548
4549 /*
4550 * Have we downed the interface? We handle IFF_UP ourselves
4551 * according to user attempts to set it, rather than blindly
4552 * setting it.
4553 */
4554
4555 ret = 0;
4556 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4557 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4558
4559 if (!ret)
4560 dev_set_rx_mode(dev);
4561 }
4562
4563 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4564 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4565
4566 dev->gflags ^= IFF_PROMISC;
4567 dev_set_promiscuity(dev, inc);
4568 }
4569
4570 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4571 is important. Some (broken) drivers set IFF_PROMISC when
4572 IFF_ALLMULTI is requested, without asking us and without reporting it.
4573 */
4574 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4575 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4576
4577 dev->gflags ^= IFF_ALLMULTI;
4578 dev_set_allmulti(dev, inc);
4579 }
4580
4581 return ret;
4582 }
4583
4584 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4585 {
4586 unsigned int changes = dev->flags ^ old_flags;
4587
4588 if (changes & IFF_UP) {
4589 if (dev->flags & IFF_UP)
4590 call_netdevice_notifiers(NETDEV_UP, dev);
4591 else
4592 call_netdevice_notifiers(NETDEV_DOWN, dev);
4593 }
4594
4595 if (dev->flags & IFF_UP &&
4596 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4597 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4598 }
4599
4600 /**
4601 * dev_change_flags - change device settings
4602 * @dev: device
4603 * @flags: device state flags
4604 *
4605 * Change settings on a device based on the supplied state flags. The
4606 * flags are in the userspace-exported format.
4607 */
4608 int dev_change_flags(struct net_device *dev, unsigned flags)
4609 {
4610 int ret, changes;
4611 int old_flags = dev->flags;
4612
4613 ret = __dev_change_flags(dev, flags);
4614 if (ret < 0)
4615 return ret;
4616
4617 changes = old_flags ^ dev->flags;
4618 if (changes)
4619 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4620
4621 __dev_notify_flags(dev, old_flags);
4622 return ret;
4623 }
4624 EXPORT_SYMBOL(dev_change_flags);
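
/*
 * Example (illustrative sketch): bringing an interface administratively up
 * the same way the SIOCSIFFLAGS ioctl does, by passing the current flags
 * with IFF_UP added. The function name is hypothetical.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}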
4625
4626 /**
4627 * dev_set_mtu - Change maximum transfer unit
4628 * @dev: device
4629 * @new_mtu: new transfer unit
4630 *
4631 * Change the maximum transfer size of the network device.
4632 */
4633 int dev_set_mtu(struct net_device *dev, int new_mtu)
4634 {
4635 const struct net_device_ops *ops = dev->netdev_ops;
4636 int err;
4637
4638 if (new_mtu == dev->mtu)
4639 return 0;
4640
4641 /* MTU must be positive. */
4642 if (new_mtu < 0)
4643 return -EINVAL;
4644
4645 if (!netif_device_present(dev))
4646 return -ENODEV;
4647
4648 err = 0;
4649 if (ops->ndo_change_mtu)
4650 err = ops->ndo_change_mtu(dev, new_mtu);
4651 else
4652 dev->mtu = new_mtu;
4653
4654 if (!err && dev->flags & IFF_UP)
4655 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4656 return err;
4657 }
4658 EXPORT_SYMBOL(dev_set_mtu);
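
/*
 * Example (illustrative sketch): an administrative caller requesting a
 * jumbo MTU, equivalent to what the SIOCSIFMTU path below ends up doing.
 * The function name and the 9000-byte value are hypothetical.
 */
static int __maybe_unused example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);	/* the driver's ndo_change_mtu may reject this */
	rtnl_unlock();
	return err;
}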
4659
4660 /**
4661 * dev_set_group - Change group this device belongs to
4662 * @dev: device
4663 * @new_group: group this device should belong to
4664 */
4665 void dev_set_group(struct net_device *dev, int new_group)
4666 {
4667 dev->group = new_group;
4668 }
4669 EXPORT_SYMBOL(dev_set_group);
4670
4671 /**
4672 * dev_set_mac_address - Change Media Access Control Address
4673 * @dev: device
4674 * @sa: new address
4675 *
4676 * Change the hardware (MAC) address of the device
4677 */
4678 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4679 {
4680 const struct net_device_ops *ops = dev->netdev_ops;
4681 int err;
4682
4683 if (!ops->ndo_set_mac_address)
4684 return -EOPNOTSUPP;
4685 if (sa->sa_family != dev->type)
4686 return -EINVAL;
4687 if (!netif_device_present(dev))
4688 return -ENODEV;
4689 err = ops->ndo_set_mac_address(dev, sa);
4690 if (!err)
4691 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4692 return err;
4693 }
4694 EXPORT_SYMBOL(dev_set_mac_address);
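
/*
 * Example (illustrative sketch): programming a new hardware address the
 * way the SIOCSIFHWADDR ioctl does. The function name is hypothetical;
 * note that sa_family must match dev->type or dev_set_mac_address()
 * returns -EINVAL.
 */
static int __maybe_unused example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;
	int err;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}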
4695
4696 /*
4697 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4698 */
4699 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4700 {
4701 int err;
4702 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4703
4704 if (!dev)
4705 return -ENODEV;
4706
4707 switch (cmd) {
4708 case SIOCGIFFLAGS: /* Get interface flags */
4709 ifr->ifr_flags = (short) dev_get_flags(dev);
4710 return 0;
4711
4712 case SIOCGIFMETRIC: /* Get the metric on the interface
4713 (currently unused) */
4714 ifr->ifr_metric = 0;
4715 return 0;
4716
4717 case SIOCGIFMTU: /* Get the MTU of a device */
4718 ifr->ifr_mtu = dev->mtu;
4719 return 0;
4720
4721 case SIOCGIFHWADDR:
4722 if (!dev->addr_len)
4723 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4724 else
4725 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4726 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4727 ifr->ifr_hwaddr.sa_family = dev->type;
4728 return 0;
4729
4730 case SIOCGIFSLAVE:
4731 err = -EINVAL;
4732 break;
4733
4734 case SIOCGIFMAP:
4735 ifr->ifr_map.mem_start = dev->mem_start;
4736 ifr->ifr_map.mem_end = dev->mem_end;
4737 ifr->ifr_map.base_addr = dev->base_addr;
4738 ifr->ifr_map.irq = dev->irq;
4739 ifr->ifr_map.dma = dev->dma;
4740 ifr->ifr_map.port = dev->if_port;
4741 return 0;
4742
4743 case SIOCGIFINDEX:
4744 ifr->ifr_ifindex = dev->ifindex;
4745 return 0;
4746
4747 case SIOCGIFTXQLEN:
4748 ifr->ifr_qlen = dev->tx_queue_len;
4749 return 0;
4750
4751 default:
4752 /* dev_ioctl() should ensure this case
4753 * is never reached
4754 */
4755 WARN_ON(1);
4756 err = -EINVAL;
4757 break;
4758
4759 }
4760 return err;
4761 }
4762
4763 /*
4764 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4765 */
4766 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4767 {
4768 int err;
4769 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4770 const struct net_device_ops *ops;
4771
4772 if (!dev)
4773 return -ENODEV;
4774
4775 ops = dev->netdev_ops;
4776
4777 switch (cmd) {
4778 case SIOCSIFFLAGS: /* Set interface flags */
4779 return dev_change_flags(dev, ifr->ifr_flags);
4780
4781 case SIOCSIFMETRIC: /* Set the metric on the interface
4782 (currently unused) */
4783 return -EOPNOTSUPP;
4784
4785 case SIOCSIFMTU: /* Set the MTU of a device */
4786 return dev_set_mtu(dev, ifr->ifr_mtu);
4787
4788 case SIOCSIFHWADDR:
4789 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4790
4791 case SIOCSIFHWBROADCAST:
4792 if (ifr->ifr_hwaddr.sa_family != dev->type)
4793 return -EINVAL;
4794 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4795 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4796 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4797 return 0;
4798
4799 case SIOCSIFMAP:
4800 if (ops->ndo_set_config) {
4801 if (!netif_device_present(dev))
4802 return -ENODEV;
4803 return ops->ndo_set_config(dev, &ifr->ifr_map);
4804 }
4805 return -EOPNOTSUPP;
4806
4807 case SIOCADDMULTI:
4808 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4809 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4810 return -EINVAL;
4811 if (!netif_device_present(dev))
4812 return -ENODEV;
4813 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4814
4815 case SIOCDELMULTI:
4816 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4817 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4818 return -EINVAL;
4819 if (!netif_device_present(dev))
4820 return -ENODEV;
4821 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4822
4823 case SIOCSIFTXQLEN:
4824 if (ifr->ifr_qlen < 0)
4825 return -EINVAL;
4826 dev->tx_queue_len = ifr->ifr_qlen;
4827 return 0;
4828
4829 case SIOCSIFNAME:
4830 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4831 return dev_change_name(dev, ifr->ifr_newname);
4832
4833 /*
4834 * Unknown or private ioctl
4835 */
4836 default:
4837 if ((cmd >= SIOCDEVPRIVATE &&
4838 cmd <= SIOCDEVPRIVATE + 15) ||
4839 cmd == SIOCBONDENSLAVE ||
4840 cmd == SIOCBONDRELEASE ||
4841 cmd == SIOCBONDSETHWADDR ||
4842 cmd == SIOCBONDSLAVEINFOQUERY ||
4843 cmd == SIOCBONDINFOQUERY ||
4844 cmd == SIOCBONDCHANGEACTIVE ||
4845 cmd == SIOCGMIIPHY ||
4846 cmd == SIOCGMIIREG ||
4847 cmd == SIOCSMIIREG ||
4848 cmd == SIOCBRADDIF ||
4849 cmd == SIOCBRDELIF ||
4850 cmd == SIOCSHWTSTAMP ||
4851 cmd == SIOCWANDEV) {
4852 err = -EOPNOTSUPP;
4853 if (ops->ndo_do_ioctl) {
4854 if (netif_device_present(dev))
4855 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4856 else
4857 err = -ENODEV;
4858 }
4859 } else
4860 err = -EINVAL;
4861
4862 }
4863 return err;
4864 }
4865
4866 /*
4867 * This function handles all "interface"-type I/O control requests. The actual
4868 * 'doing' part of this is dev_ifsioc above.
4869 */
4870
4871 /**
4872 * dev_ioctl - network device ioctl
4873 * @net: the applicable net namespace
4874 * @cmd: command to issue
4875 * @arg: pointer to a struct ifreq in user space
4876 *
4877 * Issue ioctl functions to devices. This is normally called by the
4878 * user space syscall interfaces but can sometimes be useful for
4879 * other purposes. The return value is the return from the syscall if
4880 * positive or a negative errno code on error.
4881 */
4882
4883 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4884 {
4885 struct ifreq ifr;
4886 int ret;
4887 char *colon;
4888
4889 /* One special case: SIOCGIFCONF takes an ifconf argument
4890 and requires a shared lock, because it sleeps writing
4891 to user space.
4892 */
4893
4894 if (cmd == SIOCGIFCONF) {
4895 rtnl_lock();
4896 ret = dev_ifconf(net, (char __user *) arg);
4897 rtnl_unlock();
4898 return ret;
4899 }
4900 if (cmd == SIOCGIFNAME)
4901 return dev_ifname(net, (struct ifreq __user *)arg);
4902
4903 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4904 return -EFAULT;
4905
4906 ifr.ifr_name[IFNAMSIZ-1] = 0;
4907
4908 colon = strchr(ifr.ifr_name, ':');
4909 if (colon)
4910 *colon = 0;
4911
4912 /*
4913 * See which interface the caller is talking about.
4914 */
4915
4916 switch (cmd) {
4917 /*
4918 * These ioctl calls:
4919 * - can be done by all.
4920 * - atomic and do not require locking.
4921 * - return a value
4922 */
4923 case SIOCGIFFLAGS:
4924 case SIOCGIFMETRIC:
4925 case SIOCGIFMTU:
4926 case SIOCGIFHWADDR:
4927 case SIOCGIFSLAVE:
4928 case SIOCGIFMAP:
4929 case SIOCGIFINDEX:
4930 case SIOCGIFTXQLEN:
4931 dev_load(net, ifr.ifr_name);
4932 rcu_read_lock();
4933 ret = dev_ifsioc_locked(net, &ifr, cmd);
4934 rcu_read_unlock();
4935 if (!ret) {
4936 if (colon)
4937 *colon = ':';
4938 if (copy_to_user(arg, &ifr,
4939 sizeof(struct ifreq)))
4940 ret = -EFAULT;
4941 }
4942 return ret;
4943
4944 case SIOCETHTOOL:
4945 dev_load(net, ifr.ifr_name);
4946 rtnl_lock();
4947 ret = dev_ethtool(net, &ifr);
4948 rtnl_unlock();
4949 if (!ret) {
4950 if (colon)
4951 *colon = ':';
4952 if (copy_to_user(arg, &ifr,
4953 sizeof(struct ifreq)))
4954 ret = -EFAULT;
4955 }
4956 return ret;
4957
4958 /*
4959 * These ioctl calls:
4960 * - require superuser power.
4961 * - require strict serialization.
4962 * - return a value
4963 */
4964 case SIOCGMIIPHY:
4965 case SIOCGMIIREG:
4966 case SIOCSIFNAME:
4967 if (!capable(CAP_NET_ADMIN))
4968 return -EPERM;
4969 dev_load(net, ifr.ifr_name);
4970 rtnl_lock();
4971 ret = dev_ifsioc(net, &ifr, cmd);
4972 rtnl_unlock();
4973 if (!ret) {
4974 if (colon)
4975 *colon = ':';
4976 if (copy_to_user(arg, &ifr,
4977 sizeof(struct ifreq)))
4978 ret = -EFAULT;
4979 }
4980 return ret;
4981
4982 /*
4983 * These ioctl calls:
4984 * - require superuser power.
4985 * - require strict serialization.
4986 * - do not return a value
4987 */
4988 case SIOCSIFFLAGS:
4989 case SIOCSIFMETRIC:
4990 case SIOCSIFMTU:
4991 case SIOCSIFMAP:
4992 case SIOCSIFHWADDR:
4993 case SIOCSIFSLAVE:
4994 case SIOCADDMULTI:
4995 case SIOCDELMULTI:
4996 case SIOCSIFHWBROADCAST:
4997 case SIOCSIFTXQLEN:
4998 case SIOCSMIIREG:
4999 case SIOCBONDENSLAVE:
5000 case SIOCBONDRELEASE:
5001 case SIOCBONDSETHWADDR:
5002 case SIOCBONDCHANGEACTIVE:
5003 case SIOCBRADDIF:
5004 case SIOCBRDELIF:
5005 case SIOCSHWTSTAMP:
5006 if (!capable(CAP_NET_ADMIN))
5007 return -EPERM;
5008 /* fall through */
5009 case SIOCBONDSLAVEINFOQUERY:
5010 case SIOCBONDINFOQUERY:
5011 dev_load(net, ifr.ifr_name);
5012 rtnl_lock();
5013 ret = dev_ifsioc(net, &ifr, cmd);
5014 rtnl_unlock();
5015 return ret;
5016
5017 case SIOCGIFMEM:
5018 /* Get the per device memory space. We can add this but
5019 * currently do not support it */
5020 case SIOCSIFMEM:
5021 /* Set the per device memory buffer space.
5022 * Not applicable in our case */
5023 case SIOCSIFLINK:
5024 return -EINVAL;
5025
5026 /*
5027 * Unknown or private ioctl.
5028 */
5029 default:
5030 if (cmd == SIOCWANDEV ||
5031 (cmd >= SIOCDEVPRIVATE &&
5032 cmd <= SIOCDEVPRIVATE + 15)) {
5033 dev_load(net, ifr.ifr_name);
5034 rtnl_lock();
5035 ret = dev_ifsioc(net, &ifr, cmd);
5036 rtnl_unlock();
5037 if (!ret && copy_to_user(arg, &ifr,
5038 sizeof(struct ifreq)))
5039 ret = -EFAULT;
5040 return ret;
5041 }
5042 /* Take care of Wireless Extensions */
5043 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5044 return wext_handle_ioctl(net, &ifr, cmd, arg);
5045 return -EINVAL;
5046 }
5047 }
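
/*
 * Example (illustrative sketch): the userspace side of one of the simple
 * "get" ioctls handled above. Querying the MTU of an interface ends up in
 * dev_ioctl() -> dev_ifsioc_locked() with cmd == SIOCGIFMTU. The interface
 * name "eth0" is only an example.
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *
 *	int main(void)
 *	{
 *		struct ifreq ifr;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return 1;
 *
 *		memset(&ifr, 0, sizeof(ifr));
 *		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *		if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *			printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
 *
 *		close(fd);
 *		return 0;
 *	}
 */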
5048
5049
5050 /**
5051 * dev_new_index - allocate an ifindex
5052 * @net: the applicable net namespace
5053 *
5054 * Returns a suitable unique value for a new device interface
5055 * number. The caller must hold the rtnl semaphore or the
5056 * dev_base_lock to be sure it remains unique.
5057 */
5058 static int dev_new_index(struct net *net)
5059 {
5060 static int ifindex;
5061 for (;;) {
5062 if (++ifindex <= 0)
5063 ifindex = 1;
5064 if (!__dev_get_by_index(net, ifindex))
5065 return ifindex;
5066 }
5067 }
5068
5069 /* Delayed registration/unregistration */
5070 static LIST_HEAD(net_todo_list);
5071
5072 static void net_set_todo(struct net_device *dev)
5073 {
5074 list_add_tail(&dev->todo_list, &net_todo_list);
5075 }
5076
5077 static void rollback_registered_many(struct list_head *head)
5078 {
5079 struct net_device *dev, *tmp;
5080
5081 BUG_ON(dev_boot_phase);
5082 ASSERT_RTNL();
5083
5084 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5085 /* Some devices get here without ever having been
5086 * registered, as part of unwinding a failed initialization.
5087 * Remove those devices and proceed with the remaining ones.
5088 */
5089 if (dev->reg_state == NETREG_UNINITIALIZED) {
5090 pr_debug("unregister_netdevice: device %s/%p never "
5091 "was registered\n", dev->name, dev);
5092
5093 WARN_ON(1);
5094 list_del(&dev->unreg_list);
5095 continue;
5096 }
5097
5098 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5099 }
5100
5101 /* If device is running, close it first. */
5102 dev_close_many(head);
5103
5104 list_for_each_entry(dev, head, unreg_list) {
5105 /* And unlink it from device chain. */
5106 unlist_netdevice(dev);
5107
5108 dev->reg_state = NETREG_UNREGISTERING;
5109 }
5110
5111 synchronize_net();
5112
5113 list_for_each_entry(dev, head, unreg_list) {
5114 /* Shutdown queueing discipline. */
5115 dev_shutdown(dev);
5116
5117
5118 /* Notify protocols that we are about to destroy
5119 this device. They should clean up all of their state.
5120 */
5121 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5122
5123 if (!dev->rtnl_link_ops ||
5124 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5125 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5126
5127 /*
5128 * Flush the unicast and multicast chains
5129 */
5130 dev_uc_flush(dev);
5131 dev_mc_flush(dev);
5132
5133 if (dev->netdev_ops->ndo_uninit)
5134 dev->netdev_ops->ndo_uninit(dev);
5135
5136 /* Notifier chain MUST detach us from master device. */
5137 WARN_ON(dev->master);
5138
5139 /* Remove entries from kobject tree */
5140 netdev_unregister_kobject(dev);
5141 }
5142
5143 /* Process any work delayed until the end of the batch */
5144 dev = list_first_entry(head, struct net_device, unreg_list);
5145 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5146
5147 rcu_barrier();
5148
5149 list_for_each_entry(dev, head, unreg_list)
5150 dev_put(dev);
5151 }
5152
5153 static void rollback_registered(struct net_device *dev)
5154 {
5155 LIST_HEAD(single);
5156
5157 list_add(&dev->unreg_list, &single);
5158 rollback_registered_many(&single);
5159 list_del(&single);
5160 }
5161
5162 u32 netdev_fix_features(struct net_device *dev, u32 features)
5163 {
5164 /* Fix illegal checksum combinations */
5165 if ((features & NETIF_F_HW_CSUM) &&
5166 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5167 netdev_info(dev, "mixed HW and IP checksum settings.\n");
5168 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5169 }
5170
5171 if ((features & NETIF_F_NO_CSUM) &&
5172 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5173 netdev_info(dev, "mixed no checksumming and other settings.\n");
5174 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5175 }
5176
5177 /* Fix illegal SG+CSUM combinations. */
5178 if ((features & NETIF_F_SG) &&
5179 !(features & NETIF_F_ALL_CSUM)) {
5180 netdev_info(dev,
5181 "Dropping NETIF_F_SG since no checksum feature.\n");
5182 features &= ~NETIF_F_SG;
5183 }
5184
5185 /* TSO requires that SG is present as well. */
5186 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5187 netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5188 features &= ~NETIF_F_TSO;
5189 }
5190
5191 /* Software GSO depends on SG. */
5192 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5193 netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5194 features &= ~NETIF_F_GSO;
5195 }
5196
5197 /* UFO needs SG and checksumming */
5198 if (features & NETIF_F_UFO) {
5199 /* maybe split UFO into V4 and V6? */
5200 if (!((features & NETIF_F_GEN_CSUM) ||
5201 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5202 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5203 netdev_info(dev,
5204 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5205 features &= ~NETIF_F_UFO;
5206 }
5207
5208 if (!(features & NETIF_F_SG)) {
5209 netdev_info(dev,
5210 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5211 features &= ~NETIF_F_UFO;
5212 }
5213 }
5214
5215 return features;
5216 }
5217 EXPORT_SYMBOL(netdev_fix_features);
5218
5219 void netdev_update_features(struct net_device *dev)
5220 {
5221 u32 features;
5222 int err = 0;
5223
5224 features = netdev_get_wanted_features(dev);
5225
5226 if (dev->netdev_ops->ndo_fix_features)
5227 features = dev->netdev_ops->ndo_fix_features(dev, features);
5228
5229 /* driver might be less strict about feature dependencies */
5230 features = netdev_fix_features(dev, features);
5231
5232 if (dev->features == features)
5233 return;
5234
5235 netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5236 dev->features, features);
5237
5238 if (dev->netdev_ops->ndo_set_features)
5239 err = dev->netdev_ops->ndo_set_features(dev, features);
5240
5241 if (!err)
5242 dev->features = features;
5243 else if (err < 0)
5244 netdev_err(dev,
5245 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5246 err, features, dev->features);
5247 }
5248 EXPORT_SYMBOL(netdev_update_features);
5249
5250 /**
5251 * netif_stacked_transfer_operstate - transfer operstate
5252 * @rootdev: the root or lower level device to transfer state from
5253 * @dev: the device to transfer operstate to
5254 *
5255 * Transfer operational state from root to device. This is normally
5256 * called when a stacking relationship exists between the root
5257 * device and the device (a leaf device).
5258 */
5259 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5260 struct net_device *dev)
5261 {
5262 if (rootdev->operstate == IF_OPER_DORMANT)
5263 netif_dormant_on(dev);
5264 else
5265 netif_dormant_off(dev);
5266
5267 if (netif_carrier_ok(rootdev)) {
5268 if (!netif_carrier_ok(dev))
5269 netif_carrier_on(dev);
5270 } else {
5271 if (netif_carrier_ok(dev))
5272 netif_carrier_off(dev);
5273 }
5274 }
5275 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
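
/*
 * Example (illustrative sketch): a stacked driver (vlan/bond style) would
 * typically call this from its NETDEV_CHANGE notifier so the upper device
 * mirrors the carrier and dormant state of the device below it. The
 * function name is hypothetical and the lower/upper pairing is
 * driver-specific.
 */
static void __maybe_unused example_propagate_lower_state(struct net_device *lower,
							 struct net_device *upper)
{
	netif_stacked_transfer_operstate(lower, upper);
	/* the carrier and dormant bits of @upper now follow @lower */
}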
5276
5277 #ifdef CONFIG_RPS
5278 static int netif_alloc_rx_queues(struct net_device *dev)
5279 {
5280 unsigned int i, count = dev->num_rx_queues;
5281 struct netdev_rx_queue *rx;
5282
5283 BUG_ON(count < 1);
5284
5285 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5286 if (!rx) {
5287 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5288 return -ENOMEM;
5289 }
5290 dev->_rx = rx;
5291
5292 for (i = 0; i < count; i++)
5293 rx[i].dev = dev;
5294 return 0;
5295 }
5296 #endif
5297
5298 static void netdev_init_one_queue(struct net_device *dev,
5299 struct netdev_queue *queue, void *_unused)
5300 {
5301 /* Initialize queue lock */
5302 spin_lock_init(&queue->_xmit_lock);
5303 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5304 queue->xmit_lock_owner = -1;
5305 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5306 queue->dev = dev;
5307 }
5308
5309 static int netif_alloc_netdev_queues(struct net_device *dev)
5310 {
5311 unsigned int count = dev->num_tx_queues;
5312 struct netdev_queue *tx;
5313
5314 BUG_ON(count < 1);
5315
5316 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5317 if (!tx) {
5318 pr_err("netdev: Unable to allocate %u tx queues.\n",
5319 count);
5320 return -ENOMEM;
5321 }
5322 dev->_tx = tx;
5323
5324 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5325 spin_lock_init(&dev->tx_global_lock);
5326
5327 return 0;
5328 }
5329
5330 /**
5331 * register_netdevice - register a network device
5332 * @dev: device to register
5333 *
5334 * Take a completed network device structure and add it to the kernel
5335 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5336 * chain. 0 is returned on success. A negative errno code is returned
5337 * on a failure to set up the device, or if the name is a duplicate.
5338 *
5339 * Callers must hold the rtnl semaphore. You may want
5340 * register_netdev() instead of this.
5341 *
5342 * BUGS:
5343 * The locking appears insufficient to guarantee two parallel registers
5344 * will not get the same name.
5345 */
5346
5347 int register_netdevice(struct net_device *dev)
5348 {
5349 int ret;
5350 struct net *net = dev_net(dev);
5351
5352 BUG_ON(dev_boot_phase);
5353 ASSERT_RTNL();
5354
5355 might_sleep();
5356
5357 /* When net_devices are persistent, this will be fatal. */
5358 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5359 BUG_ON(!net);
5360
5361 spin_lock_init(&dev->addr_list_lock);
5362 netdev_set_addr_lockdep_class(dev);
5363
5364 dev->iflink = -1;
5365
5366 /* Init, if this function is available */
5367 if (dev->netdev_ops->ndo_init) {
5368 ret = dev->netdev_ops->ndo_init(dev);
5369 if (ret) {
5370 if (ret > 0)
5371 ret = -EIO;
5372 goto out;
5373 }
5374 }
5375
5376 ret = dev_get_valid_name(dev, dev->name, 0);
5377 if (ret)
5378 goto err_uninit;
5379
5380 dev->ifindex = dev_new_index(net);
5381 if (dev->iflink == -1)
5382 dev->iflink = dev->ifindex;
5383
5384 /* Transfer changeable features to wanted_features and enable
5385 * software offloads (GSO and GRO).
5386 */
5387 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5388 dev->features |= NETIF_F_SOFT_FEATURES;
5389 dev->wanted_features = dev->features & dev->hw_features;
5390
5391 /* Avoid warning from netdev_fix_features() for GSO without SG */
5392 if (!(dev->wanted_features & NETIF_F_SG)) {
5393 dev->wanted_features &= ~NETIF_F_GSO;
5394 dev->features &= ~NETIF_F_GSO;
5395 }
5396
5397 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5398 * vlan_dev_init() will do the dev->features check, so these features
5399 * are enabled only if supported by the underlying device.
5400 */
5401 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5402
5403 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5404 ret = notifier_to_errno(ret);
5405 if (ret)
5406 goto err_uninit;
5407
5408 ret = netdev_register_kobject(dev);
5409 if (ret)
5410 goto err_uninit;
5411 dev->reg_state = NETREG_REGISTERED;
5412
5413 netdev_update_features(dev);
5414
5415 /*
5416 * Default initial state at registry is that the
5417 * device is present.
5418 */
5419
5420 set_bit(__LINK_STATE_PRESENT, &dev->state);
5421
5422 dev_init_scheduler(dev);
5423 dev_hold(dev);
5424 list_netdevice(dev);
5425
5426 /* Notify protocols that a new device appeared. */
5427 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5428 ret = notifier_to_errno(ret);
5429 if (ret) {
5430 rollback_registered(dev);
5431 dev->reg_state = NETREG_UNREGISTERED;
5432 }
5433 /*
5434 * Prevent userspace races by waiting until the network
5435 * device is fully set up before sending notifications.
5436 */
5437 if (!dev->rtnl_link_ops ||
5438 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5439 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5440
5441 out:
5442 return ret;
5443
5444 err_uninit:
5445 if (dev->netdev_ops->ndo_uninit)
5446 dev->netdev_ops->ndo_uninit(dev);
5447 goto out;
5448 }
5449 EXPORT_SYMBOL(register_netdevice);
5450
5451 /**
5452 * init_dummy_netdev - init a dummy network device for NAPI
5453 * @dev: device to init
5454 *
5455 * This takes a network device structure and initializes the minimum
5456 * number of fields so it can be used to schedule NAPI polls without
5457 * registering a full blown interface. This is to be used by drivers
5458 * that need to tie several hardware interfaces to a single NAPI
5459 * poll scheduler due to HW limitations.
5460 */
5461 int init_dummy_netdev(struct net_device *dev)
5462 {
5463 /* Clear everything. Note we don't initialize spinlocks
5464 * as they aren't supposed to be taken by any of the
5465 * NAPI code and this dummy netdev is supposed to be
5466 * only ever used for NAPI polls
5467 */
5468 memset(dev, 0, sizeof(struct net_device));
5469
5470 /* make sure we BUG if trying to hit standard
5471 * register/unregister code path
5472 */
5473 dev->reg_state = NETREG_DUMMY;
5474
5475 /* NAPI wants this */
5476 INIT_LIST_HEAD(&dev->napi_list);
5477
5478 /* a dummy interface is started by default */
5479 set_bit(__LINK_STATE_PRESENT, &dev->state);
5480 set_bit(__LINK_STATE_START, &dev->state);
5481
5482 /* Note: We don't allocate pcpu_refcnt for dummy devices,
5483 * because users of this 'device' don't need to change
5484 * its refcount.
5485 */
5486
5487 return 0;
5488 }
5489 EXPORT_SYMBOL_GPL(init_dummy_netdev);
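
/*
 * Example (illustrative sketch): a driver that needs NAPI contexts which
 * are not tied to a registered interface can hang them off a dummy netdev.
 * All "example_" names are hypothetical; only init_dummy_netdev(),
 * netif_napi_add(), napi_enable() and napi_complete() are real interfaces.
 */
struct example_hw {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	/* a real driver would process up to @budget packets here */
	napi_complete(napi);
	return 0;
}

static void __maybe_unused example_hw_init_napi(struct example_hw *hw)
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, example_poll, 64);
	napi_enable(&hw->napi);
}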
5490
5491
5492 /**
5493 * register_netdev - register a network device
5494 * @dev: device to register
5495 *
5496 * Take a completed network device structure and add it to the kernel
5497 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5498 * chain. 0 is returned on success. A negative errno code is returned
5499 * on a failure to set up the device, or if the name is a duplicate.
5500 *
5501 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5502 * and expands the device name if you passed a format string to
5503 * alloc_netdev.
5504 */
5505 int register_netdev(struct net_device *dev)
5506 {
5507 int err;
5508
5509 rtnl_lock();
5510
5511 /*
5512 * If the name is a format string the caller wants us to do a
5513 * name allocation.
5514 */
5515 if (strchr(dev->name, '%')) {
5516 err = dev_alloc_name(dev, dev->name);
5517 if (err < 0)
5518 goto out;
5519 }
5520
5521 err = register_netdevice(dev);
5522 out:
5523 rtnl_unlock();
5524 return err;
5525 }
5526 EXPORT_SYMBOL(register_netdev);
5527
5528 int netdev_refcnt_read(const struct net_device *dev)
5529 {
5530 int i, refcnt = 0;
5531
5532 for_each_possible_cpu(i)
5533 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5534 return refcnt;
5535 }
5536 EXPORT_SYMBOL(netdev_refcnt_read);
5537
5538 /*
5539 * netdev_wait_allrefs - wait until all references are gone.
5540 *
5541 * This is called when unregistering network devices.
5542 *
5543 * Any protocol or device that holds a reference should register
5544 * for netdevice notifications, and clean up and release the
5545 * reference when it receives an UNREGISTER event.
5546 * We can get stuck here if buggy protocols don't correctly
5547 * call dev_put.
5548 */
5549 static void netdev_wait_allrefs(struct net_device *dev)
5550 {
5551 unsigned long rebroadcast_time, warning_time;
5552 int refcnt;
5553
5554 linkwatch_forget_dev(dev);
5555
5556 rebroadcast_time = warning_time = jiffies;
5557 refcnt = netdev_refcnt_read(dev);
5558
5559 while (refcnt != 0) {
5560 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5561 rtnl_lock();
5562
5563 /* Rebroadcast unregister notification */
5564 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5565 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5566 * should have already handled it the first time */
5567
5568 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5569 &dev->state)) {
5570 /* We must not have linkwatch events
5571 * pending on unregister. If this
5572 * happens, we simply run the queue
5573 * unscheduled, resulting in a noop
5574 * for this device.
5575 */
5576 linkwatch_run_queue();
5577 }
5578
5579 __rtnl_unlock();
5580
5581 rebroadcast_time = jiffies;
5582 }
5583
5584 msleep(250);
5585
5586 refcnt = netdev_refcnt_read(dev);
5587
5588 if (time_after(jiffies, warning_time + 10 * HZ)) {
5589 printk(KERN_EMERG "unregister_netdevice: "
5590 "waiting for %s to become free. Usage "
5591 "count = %d\n",
5592 dev->name, refcnt);
5593 warning_time = jiffies;
5594 }
5595 }
5596 }
5597
5598 /* The sequence is:
5599 *
5600 * rtnl_lock();
5601 * ...
5602 * register_netdevice(x1);
5603 * register_netdevice(x2);
5604 * ...
5605 * unregister_netdevice(y1);
5606 * unregister_netdevice(y2);
5607 * ...
5608 * rtnl_unlock();
5609 * free_netdev(y1);
5610 * free_netdev(y2);
5611 *
5612 * We are invoked by rtnl_unlock().
5613 * This allows us to deal with problems:
5614 * 1) We can delete sysfs objects which invoke hotplug
5615 * without deadlocking with linkwatch via keventd.
5616 * 2) Since we run with the RTNL semaphore not held, we can sleep
5617 * safely in order to wait for the netdev refcnt to drop to zero.
5618 *
5619 * We must not return until all unregister events added during
5620 * the interval the lock was held have been completed.
5621 */
5622 void netdev_run_todo(void)
5623 {
5624 struct list_head list;
5625
5626 /* Snapshot list, allow later requests */
5627 list_replace_init(&net_todo_list, &list);
5628
5629 __rtnl_unlock();
5630
5631 while (!list_empty(&list)) {
5632 struct net_device *dev
5633 = list_first_entry(&list, struct net_device, todo_list);
5634 list_del(&dev->todo_list);
5635
5636 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5637 printk(KERN_ERR "network todo '%s' but state %d\n",
5638 dev->name, dev->reg_state);
5639 dump_stack();
5640 continue;
5641 }
5642
5643 dev->reg_state = NETREG_UNREGISTERED;
5644
5645 on_each_cpu(flush_backlog, dev, 1);
5646
5647 netdev_wait_allrefs(dev);
5648
5649 /* paranoia */
5650 BUG_ON(netdev_refcnt_read(dev));
5651 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5652 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5653 WARN_ON(dev->dn_ptr);
5654
5655 if (dev->destructor)
5656 dev->destructor(dev);
5657
5658 /* Free network device */
5659 kobject_put(&dev->dev.kobj);
5660 }
5661 }
5662
5663 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5664 * fields in the same order, with only the type differing.
5665 */
5666 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5667 const struct net_device_stats *netdev_stats)
5668 {
5669 #if BITS_PER_LONG == 64
5670 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5671 memcpy(stats64, netdev_stats, sizeof(*stats64));
5672 #else
5673 size_t i, n = sizeof(*stats64) / sizeof(u64);
5674 const unsigned long *src = (const unsigned long *)netdev_stats;
5675 u64 *dst = (u64 *)stats64;
5676
5677 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5678 sizeof(*stats64) / sizeof(u64));
5679 for (i = 0; i < n; i++)
5680 dst[i] = src[i];
5681 #endif
5682 }
5683
5684 /**
5685 * dev_get_stats - get network device statistics
5686 * @dev: device to get statistics from
5687 * @storage: place to store stats
5688 *
5689 * Get network statistics from device. Return @storage.
5690 * The device driver may provide its own method by setting
5691 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5692 * otherwise the internal statistics structure is used.
5693 */
5694 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5695 struct rtnl_link_stats64 *storage)
5696 {
5697 const struct net_device_ops *ops = dev->netdev_ops;
5698
5699 if (ops->ndo_get_stats64) {
5700 memset(storage, 0, sizeof(*storage));
5701 ops->ndo_get_stats64(dev, storage);
5702 } else if (ops->ndo_get_stats) {
5703 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5704 } else {
5705 netdev_stats_to_stats64(storage, &dev->stats);
5706 }
5707 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5708 return storage;
5709 }
5710 EXPORT_SYMBOL(dev_get_stats);
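
/*
 * Example (illustrative sketch): a caller snapshotting device counters via
 * dev_get_stats(). The function name is hypothetical; dev_get_stats()
 * fills and returns the storage passed in, so an on-stack struct is
 * sufficient.
 */
static void __maybe_unused example_log_rx_counters(struct net_device *dev)
{
	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);

	pr_info("%s: %llu packets / %llu bytes received, %llu dropped\n",
		dev->name,
		(unsigned long long)stats->rx_packets,
		(unsigned long long)stats->rx_bytes,
		(unsigned long long)stats->rx_dropped);
}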
5711
5712 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5713 {
5714 struct netdev_queue *queue = dev_ingress_queue(dev);
5715
5716 #ifdef CONFIG_NET_CLS_ACT
5717 if (queue)
5718 return queue;
5719 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5720 if (!queue)
5721 return NULL;
5722 netdev_init_one_queue(dev, queue, NULL);
5723 queue->qdisc = &noop_qdisc;
5724 queue->qdisc_sleeping = &noop_qdisc;
5725 rcu_assign_pointer(dev->ingress_queue, queue);
5726 #endif
5727 return queue;
5728 }
5729
5730 /**
5731 * alloc_netdev_mqs - allocate network device
5732 * @sizeof_priv: size of private data to allocate space for
5733 * @name: device name format string
5734 * @setup: callback to initialize device
5735 * @txqs: the number of TX subqueues to allocate
5736 * @rxqs: the number of RX subqueues to allocate
5737 *
5738 * Allocates a struct net_device with private data area for driver use
5739 * and performs basic initialization. Also allocates subqueue structs
5740 * for each queue on the device.
5741 */
5742 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5743 void (*setup)(struct net_device *),
5744 unsigned int txqs, unsigned int rxqs)
5745 {
5746 struct net_device *dev;
5747 size_t alloc_size;
5748 struct net_device *p;
5749
5750 BUG_ON(strlen(name) >= sizeof(dev->name));
5751
5752 if (txqs < 1) {
5753 pr_err("alloc_netdev: Unable to allocate device "
5754 "with zero queues.\n");
5755 return NULL;
5756 }
5757
5758 #ifdef CONFIG_RPS
5759 if (rxqs < 1) {
5760 pr_err("alloc_netdev: Unable to allocate device "
5761 "with zero RX queues.\n");
5762 return NULL;
5763 }
5764 #endif
5765
5766 alloc_size = sizeof(struct net_device);
5767 if (sizeof_priv) {
5768 /* ensure 32-byte alignment of private area */
5769 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5770 alloc_size += sizeof_priv;
5771 }
5772 /* ensure 32-byte alignment of whole construct */
5773 alloc_size += NETDEV_ALIGN - 1;
5774
5775 p = kzalloc(alloc_size, GFP_KERNEL);
5776 if (!p) {
5777 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5778 return NULL;
5779 }
5780
5781 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5782 dev->padded = (char *)dev - (char *)p;
5783
5784 dev->pcpu_refcnt = alloc_percpu(int);
5785 if (!dev->pcpu_refcnt)
5786 goto free_p;
5787
5788 if (dev_addr_init(dev))
5789 goto free_pcpu;
5790
5791 dev_mc_init(dev);
5792 dev_uc_init(dev);
5793
5794 dev_net_set(dev, &init_net);
5795
5796 dev->gso_max_size = GSO_MAX_SIZE;
5797
5798 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5799 dev->ethtool_ntuple_list.count = 0;
5800 INIT_LIST_HEAD(&dev->napi_list);
5801 INIT_LIST_HEAD(&dev->unreg_list);
5802 INIT_LIST_HEAD(&dev->link_watch_list);
5803 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5804 setup(dev);
5805
5806 dev->num_tx_queues = txqs;
5807 dev->real_num_tx_queues = txqs;
5808 if (netif_alloc_netdev_queues(dev))
5809 goto free_all;
5810
5811 #ifdef CONFIG_RPS
5812 dev->num_rx_queues = rxqs;
5813 dev->real_num_rx_queues = rxqs;
5814 if (netif_alloc_rx_queues(dev))
5815 goto free_all;
5816 #endif
5817
5818 strcpy(dev->name, name);
5819 dev->group = INIT_NETDEV_GROUP;
5820 return dev;
5821
5822 free_all:
5823 free_netdev(dev);
5824 return NULL;
5825
5826 free_pcpu:
5827 free_percpu(dev->pcpu_refcnt);
5828 kfree(dev->_tx);
5829 #ifdef CONFIG_RPS
5830 kfree(dev->_rx);
5831 #endif
5832
5833 free_p:
5834 kfree(p);
5835 return NULL;
5836 }
5837 EXPORT_SYMBOL(alloc_netdev_mqs);
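
/*
 * Example (illustrative sketch): the usual allocate/register sequence for
 * a multiqueue Ethernet-style device. Everything prefixed "example_" is
 * hypothetical; a real driver would provide a full net_device_ops and do
 * its hardware setup in the setup callback or in ndo_init.
 */
static netdev_tx_t example_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* a real driver hands the skb to hardware; this stub just drops it */
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_start_xmit	= example_start_xmit,
};

static void example_netdev_setup(struct net_device *dev)
{
	ether_setup(dev);	/* Ethernet defaults, from <linux/etherdevice.h> */
	dev->netdev_ops = &example_netdev_ops;
}

static __maybe_unused struct net_device *example_create_netdev(void)
{
	struct net_device *dev;
	int err;

	/* four TX and four RX queues; "eth%d" lets the core pick the name */
	dev = alloc_netdev_mqs(0, "eth%d", example_netdev_setup, 4, 4);
	if (!dev)
		return NULL;

	err = register_netdev(dev);	/* takes the RTNL internally */
	if (err) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}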
5838
5839 /**
5840 * free_netdev - free network device
5841 * @dev: device
5842 *
5843 * This function does the last stage of destroying an allocated device
5844 * interface. The reference to the device object is released.
5845 * If this is the last reference then it will be freed.
5846 */
5847 void free_netdev(struct net_device *dev)
5848 {
5849 struct napi_struct *p, *n;
5850
5851 release_net(dev_net(dev));
5852
5853 kfree(dev->_tx);
5854 #ifdef CONFIG_RPS
5855 kfree(dev->_rx);
5856 #endif
5857
5858 kfree(rcu_dereference_raw(dev->ingress_queue));
5859
5860 /* Flush device addresses */
5861 dev_addr_flush(dev);
5862
5863 /* Clear ethtool n-tuple list */
5864 ethtool_ntuple_flush(dev);
5865
5866 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5867 netif_napi_del(p);
5868
5869 free_percpu(dev->pcpu_refcnt);
5870 dev->pcpu_refcnt = NULL;
5871
5872 /* Compatibility with error handling in drivers */
5873 if (dev->reg_state == NETREG_UNINITIALIZED) {
5874 kfree((char *)dev - dev->padded);
5875 return;
5876 }
5877
5878 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5879 dev->reg_state = NETREG_RELEASED;
5880
5881 /* will free via device release */
5882 put_device(&dev->dev);
5883 }
5884 EXPORT_SYMBOL(free_netdev);
5885
5886 /**
5887 * synchronize_net - Synchronize with packet receive processing
5888 *
5889 * Wait for packets currently being received to be done.
5890 * Does not block later packets from starting.
5891 */
5892 void synchronize_net(void)
5893 {
5894 might_sleep();
5895 synchronize_rcu();
5896 }
5897 EXPORT_SYMBOL(synchronize_net);
5898
5899 /**
5900 * unregister_netdevice_queue - remove device from the kernel
5901 * @dev: device
5902 * @head: list
5903 *
5904 * This function shuts down a device interface and removes it
5905 * from the kernel tables.
5906 * If @head is not NULL, the device is queued to be unregistered later.
5907 *
5908 * Callers must hold the rtnl semaphore. You may want
5909 * unregister_netdev() instead of this.
5910 */
5911
5912 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5913 {
5914 ASSERT_RTNL();
5915
5916 if (head) {
5917 list_move_tail(&dev->unreg_list, head);
5918 } else {
5919 rollback_registered(dev);
5920 /* Finish processing unregister after unlock */
5921 net_set_todo(dev);
5922 }
5923 }
5924 EXPORT_SYMBOL(unregister_netdevice_queue);
5925
5926 /**
5927 * unregister_netdevice_many - unregister many devices
5928 * @head: list of devices
5929 */
5930 void unregister_netdevice_many(struct list_head *head)
5931 {
5932 struct net_device *dev;
5933
5934 if (!list_empty(head)) {
5935 rollback_registered_many(head);
5936 list_for_each_entry(dev, head, unreg_list)
5937 net_set_todo(dev);
5938 }
5939 }
5940 EXPORT_SYMBOL(unregister_netdevice_many);
5941
5942 /**
5943 * unregister_netdev - remove device from the kernel
5944 * @dev: device
5945 *
5946 * This function shuts down a device interface and removes it
5947 * from the kernel tables.
5948 *
5949 * This is just a wrapper for unregister_netdevice that takes
5950 * the rtnl semaphore. In general you want to use this and not
5951 * unregister_netdevice.
5952 */
5953 void unregister_netdev(struct net_device *dev)
5954 {
5955 rtnl_lock();
5956 unregister_netdevice(dev);
5957 rtnl_unlock();
5958 }
5959 EXPORT_SYMBOL(unregister_netdev);
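
/*
 * Example (illustrative sketch): the teardown counterpart of the
 * allocate/register sequence shown after alloc_netdev_mqs() above. The
 * function name is hypothetical.
 */
static void __maybe_unused example_destroy_netdev(struct net_device *dev)
{
	unregister_netdev(dev);	/* takes the RTNL, waits for references to drop */
	free_netdev(dev);	/* releases the alloc_netdev_mqs() allocation */
}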
5960
5961 /**
5962 * dev_change_net_namespace - move device to a different network namespace
5963 * @dev: device
5964 * @net: network namespace
5965 * @pat: If not NULL name pattern to try if the current device name
5966 * is already taken in the destination network namespace.
5967 *
5968 * This function shuts down a device interface and moves it
5969 * to a new network namespace. On success 0 is returned, on
5970 * a failure a negative errno code is returned.
5971 *
5972 * Callers must hold the rtnl semaphore.
5973 */
5974
5975 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5976 {
5977 int err;
5978
5979 ASSERT_RTNL();
5980
5981 /* Don't allow namespace local devices to be moved. */
5982 err = -EINVAL;
5983 if (dev->features & NETIF_F_NETNS_LOCAL)
5984 goto out;
5985
5986 /* Ensure the device has been registered */
5987 err = -EINVAL;
5988 if (dev->reg_state != NETREG_REGISTERED)
5989 goto out;
5990
5991 /* Get out if there is nothing to do */
5992 err = 0;
5993 if (net_eq(dev_net(dev), net))
5994 goto out;
5995
5996 /* Pick the destination device name, and ensure
5997 * we can use it in the destination network namespace.
5998 */
5999 err = -EEXIST;
6000 if (__dev_get_by_name(net, dev->name)) {
6001 /* We get here if we can't use the current device name */
6002 if (!pat)
6003 goto out;
6004 if (dev_get_valid_name(dev, pat, 1))
6005 goto out;
6006 }
6007
6008 /*
6009 * And now a mini version of register_netdevice and unregister_netdevice.
6010 */
6011
6012 /* If device is running close it first. */
6013 dev_close(dev);
6014
6015 /* And unlink it from device chain */
6016 err = -ENODEV;
6017 unlist_netdevice(dev);
6018
6019 synchronize_net();
6020
6021 /* Shutdown queueing discipline. */
6022 dev_shutdown(dev);
6023
6024 /* Notify protocols that we are about to destroy
6025 this device. They should clean up all of their state.
6026
6027 Note that dev->reg_state stays at NETREG_REGISTERED.
6028 This is intentional, so that 8021q and macvlan know
6029 the device is just moving and can keep their slaves up.
6030 */
6031 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6032 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6033
6034 /*
6035 * Flush the unicast and multicast chains
6036 */
6037 dev_uc_flush(dev);
6038 dev_mc_flush(dev);
6039
6040 /* Actually switch the network namespace */
6041 dev_net_set(dev, net);
6042
6043 /* If there is an ifindex conflict assign a new one */
6044 if (__dev_get_by_index(net, dev->ifindex)) {
6045 int iflink = (dev->iflink == dev->ifindex);
6046 dev->ifindex = dev_new_index(net);
6047 if (iflink)
6048 dev->iflink = dev->ifindex;
6049 }
6050
6051 /* Fixup kobjects */
6052 err = device_rename(&dev->dev, dev->name);
6053 WARN_ON(err);
6054
6055 /* Add the device back in the hashes */
6056 list_netdevice(dev);
6057
6058 /* Notify protocols that a new device appeared. */
6059 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6060
6061 /*
6062 * Prevent userspace races by waiting until the network
6063 * device is fully set up before sending notifications.
6064 */
6065 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6066
6067 synchronize_net();
6068 err = 0;
6069 out:
6070 return err;
6071 }
6072 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
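/*
 * Illustrative sketch (not part of this file): moving a device into
 * another namespace under the rtnl lock. The helper name
 * example_move_to_ns and the "eth%d" fallback pattern are assumptions,
 * not taken from any in-tree caller.
 *
 *	static int example_move_to_ns(struct net_device *dev, struct net *net)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_change_net_namespace(dev, net, "eth%d");
 *		rtnl_unlock();
 *		return err;
 *	}
 */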
6073
6074 static int dev_cpu_callback(struct notifier_block *nfb,
6075 unsigned long action,
6076 void *ocpu)
6077 {
6078 struct sk_buff **list_skb;
6079 struct sk_buff *skb;
6080 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6081 struct softnet_data *sd, *oldsd;
6082
6083 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6084 return NOTIFY_OK;
6085
6086 local_irq_disable();
6087 cpu = smp_processor_id();
6088 sd = &per_cpu(softnet_data, cpu);
6089 oldsd = &per_cpu(softnet_data, oldcpu);
6090
6091 /* Find end of our completion_queue. */
6092 list_skb = &sd->completion_queue;
6093 while (*list_skb)
6094 list_skb = &(*list_skb)->next;
6095 /* Append completion queue from offline CPU. */
6096 *list_skb = oldsd->completion_queue;
6097 oldsd->completion_queue = NULL;
6098
6099 /* Append output queue from offline CPU. */
6100 if (oldsd->output_queue) {
6101 *sd->output_queue_tailp = oldsd->output_queue;
6102 sd->output_queue_tailp = oldsd->output_queue_tailp;
6103 oldsd->output_queue = NULL;
6104 oldsd->output_queue_tailp = &oldsd->output_queue;
6105 }
6106
6107 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6108 local_irq_enable();
6109
6110 /* Process offline CPU's input_pkt_queue */
6111 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6112 netif_rx(skb);
6113 input_queue_head_incr(oldsd);
6114 }
6115 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6116 netif_rx(skb);
6117 input_queue_head_incr(oldsd);
6118 }
6119
6120 return NOTIFY_OK;
6121 }
6122
6123
6124 /**
6125 * netdev_increment_features - increment feature set by one
6126 * @all: current feature set
6127 * @one: new feature set
6128 * @mask: mask feature set
6129 *
6130 * Computes a new feature set after adding a device with feature set
6131 * @one to the master device with current feature set @all. Will not
6132 * enable anything that is off in @mask. Returns the new feature set.
6133 */
6134 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6135 {
6136 /* If device needs checksumming, downgrade to it. */
6137 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6138 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6139 else if (mask & NETIF_F_ALL_CSUM) {
6140 /* If one device supports v4/v6 checksumming, set for all. */
6141 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6142 !(all & NETIF_F_GEN_CSUM)) {
6143 all &= ~NETIF_F_ALL_CSUM;
6144 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6145 }
6146
6147 /* If one device supports hw checksumming, set for all. */
6148 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6149 all &= ~NETIF_F_ALL_CSUM;
6150 all |= NETIF_F_HW_CSUM;
6151 }
6152 }
6153
6154 one |= NETIF_F_ALL_CSUM;
6155
6156 one |= all & NETIF_F_ONE_FOR_ALL;
6157 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6158 all |= one & mask & NETIF_F_ONE_FOR_ALL;
6159
6160 return all;
6161 }
6162 EXPORT_SYMBOL(netdev_increment_features);
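/*
 * Illustrative sketch (not part of this file): roughly how an aggregating
 * driver such as bonding or bridging folds each slave's features into the
 * master. The master, slave and slave_list names are placeholders.
 *
 *	u32 features = master->features & ~NETIF_F_ONE_FOR_ALL;
 *	struct slave *slave;
 *
 *	list_for_each_entry(slave, &slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     NETIF_F_ONE_FOR_ALL);
 *	master->features = features;
 */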
6163
6164 static struct hlist_head *netdev_create_hash(void)
6165 {
6166 int i;
6167 struct hlist_head *hash;
6168
6169 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6170 if (hash != NULL)
6171 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6172 INIT_HLIST_HEAD(&hash[i]);
6173
6174 return hash;
6175 }
6176
6177 /* Initialize per network namespace state */
6178 static int __net_init netdev_init(struct net *net)
6179 {
6180 INIT_LIST_HEAD(&net->dev_base_head);
6181
6182 net->dev_name_head = netdev_create_hash();
6183 if (net->dev_name_head == NULL)
6184 goto err_name;
6185
6186 net->dev_index_head = netdev_create_hash();
6187 if (net->dev_index_head == NULL)
6188 goto err_idx;
6189
6190 return 0;
6191
6192 err_idx:
6193 kfree(net->dev_name_head);
6194 err_name:
6195 return -ENOMEM;
6196 }
6197
6198 /**
6199 * netdev_drivername - network driver for the device
6200 * @dev: network device
6201 * @buffer: buffer for resulting name
6202 * @len: size of buffer
6203 *
6204 * Determine network driver for device.
6205 */
6206 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6207 {
6208 const struct device_driver *driver;
6209 const struct device *parent;
6210
6211 if (len <= 0 || !buffer)
6212 return buffer;
6213 buffer[0] = 0;
6214
6215 parent = dev->dev.parent;
6216
6217 if (!parent)
6218 return buffer;
6219
6220 driver = parent->driver;
6221 if (driver && driver->name)
6222 strlcpy(buffer, driver->name, len);
6223 return buffer;
6224 }
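/*
 * Illustrative sketch (not part of this file): reporting which driver is
 * behind a device, as a watchdog-style timeout path might do. The buffer
 * size of 64 is an arbitrary choice for the example.
 *
 *	char drivername[64];
 *
 *	printk(KERN_WARNING "%s (%s): transmit queue timed out\n",
 *	       dev->name, netdev_drivername(dev, drivername, 64));
 */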
6225
6226 static int __netdev_printk(const char *level, const struct net_device *dev,
6227 struct va_format *vaf)
6228 {
6229 int r;
6230
6231 if (dev && dev->dev.parent)
6232 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6233 netdev_name(dev), vaf);
6234 else if (dev)
6235 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6236 else
6237 r = printk("%s(NULL net_device): %pV", level, vaf);
6238
6239 return r;
6240 }
6241
6242 int netdev_printk(const char *level, const struct net_device *dev,
6243 const char *format, ...)
6244 {
6245 struct va_format vaf;
6246 va_list args;
6247 int r;
6248
6249 va_start(args, format);
6250
6251 vaf.fmt = format;
6252 vaf.va = &args;
6253
6254 r = __netdev_printk(level, dev, &vaf);
6255 va_end(args);
6256
6257 return r;
6258 }
6259 EXPORT_SYMBOL(netdev_printk);
6260
6261 #define define_netdev_printk_level(func, level) \
6262 int func(const struct net_device *dev, const char *fmt, ...) \
6263 { \
6264 int r; \
6265 struct va_format vaf; \
6266 va_list args; \
6267 \
6268 va_start(args, fmt); \
6269 \
6270 vaf.fmt = fmt; \
6271 vaf.va = &args; \
6272 \
6273 r = __netdev_printk(level, dev, &vaf); \
6274 va_end(args); \
6275 \
6276 return r; \
6277 } \
6278 EXPORT_SYMBOL(func);
6279
6280 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6281 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6282 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6283 define_netdev_printk_level(netdev_err, KERN_ERR);
6284 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6285 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6286 define_netdev_printk_level(netdev_info, KERN_INFO);
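/*
 * Illustrative sketch (not part of this file): drivers use these wrappers
 * instead of raw printk() so that messages are prefixed with the driver
 * and device names. The message text and the speed/full_duplex variables
 * below are made up.
 *
 *	netdev_err(dev, "failed to allocate rx ring\n");
 *	netdev_info(dev, "link up, %d Mbps, %s duplex\n",
 *		    speed, full_duplex ? "full" : "half");
 */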
6287
6288 static void __net_exit netdev_exit(struct net *net)
6289 {
6290 kfree(net->dev_name_head);
6291 kfree(net->dev_index_head);
6292 }
6293
6294 static struct pernet_operations __net_initdata netdev_net_ops = {
6295 .init = netdev_init,
6296 .exit = netdev_exit,
6297 };
6298
6299 static void __net_exit default_device_exit(struct net *net)
6300 {
6301 struct net_device *dev, *aux;
6302 /*
6303 * Push all migratable network devices back to the
6304 * initial network namespace
6305 */
6306 rtnl_lock();
6307 for_each_netdev_safe(net, dev, aux) {
6308 int err;
6309 char fb_name[IFNAMSIZ];
6310
6311 /* Ignore unmovable devices (e.g. loopback) */
6312 if (dev->features & NETIF_F_NETNS_LOCAL)
6313 continue;
6314
6315 /* Leave virtual devices for the generic cleanup */
6316 if (dev->rtnl_link_ops)
6317 continue;
6318
6319 /* Push remaining network devices to init_net */
6320 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6321 err = dev_change_net_namespace(dev, &init_net, fb_name);
6322 if (err) {
6323 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6324 __func__, dev->name, err);
6325 BUG();
6326 }
6327 }
6328 rtnl_unlock();
6329 }
6330
6331 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6332 {
6333 /* At exit, all network devices must be removed from a network
6334 * namespace. Do this in the reverse order of registration.
6335 * Do this across as many network namespaces as possible to
6336 * improve batching efficiency.
6337 */
6338 struct net_device *dev;
6339 struct net *net;
6340 LIST_HEAD(dev_kill_list);
6341
6342 rtnl_lock();
6343 list_for_each_entry(net, net_list, exit_list) {
6344 for_each_netdev_reverse(net, dev) {
6345 if (dev->rtnl_link_ops)
6346 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6347 else
6348 unregister_netdevice_queue(dev, &dev_kill_list);
6349 }
6350 }
6351 unregister_netdevice_many(&dev_kill_list);
6352 list_del(&dev_kill_list);
6353 rtnl_unlock();
6354 }
6355
6356 static struct pernet_operations __net_initdata default_device_ops = {
6357 .exit = default_device_exit,
6358 .exit_batch = default_device_exit_batch,
6359 };
6360
6361 /*
6362 * Initialize the DEV module. At boot time this walks the device list and
6363 * unhooks any devices that fail to initialise (normally hardware not
6364 * present) and leaves us with a valid list of present and active devices.
6365 *
6366 */
6367
6368 /*
6369 * This is called single threaded during boot, so no need
6370 * to take the rtnl semaphore.
6371 */
6372 static int __init net_dev_init(void)
6373 {
6374 int i, rc = -ENOMEM;
6375
6376 BUG_ON(!dev_boot_phase);
6377
6378 if (dev_proc_init())
6379 goto out;
6380
6381 if (netdev_kobject_init())
6382 goto out;
6383
6384 INIT_LIST_HEAD(&ptype_all);
6385 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6386 INIT_LIST_HEAD(&ptype_base[i]);
6387
6388 if (register_pernet_subsys(&netdev_net_ops))
6389 goto out;
6390
6391 /*
6392 * Initialise the packet receive queues.
6393 */
6394
6395 for_each_possible_cpu(i) {
6396 struct softnet_data *sd = &per_cpu(softnet_data, i);
6397
6398 memset(sd, 0, sizeof(*sd));
6399 skb_queue_head_init(&sd->input_pkt_queue);
6400 skb_queue_head_init(&sd->process_queue);
6401 sd->completion_queue = NULL;
6402 INIT_LIST_HEAD(&sd->poll_list);
6403 sd->output_queue = NULL;
6404 sd->output_queue_tailp = &sd->output_queue;
6405 #ifdef CONFIG_RPS
6406 sd->csd.func = rps_trigger_softirq;
6407 sd->csd.info = sd;
6408 sd->csd.flags = 0;
6409 sd->cpu = i;
6410 #endif
6411
6412 sd->backlog.poll = process_backlog;
6413 sd->backlog.weight = weight_p;
6414 sd->backlog.gro_list = NULL;
6415 sd->backlog.gro_count = 0;
6416 }
6417
6418 dev_boot_phase = 0;
6419
6420 /* The loopback device is special: if any other network device
6421 * is present in a network namespace, the loopback device must
6422 * be present too. Since we now dynamically allocate and free the
6423 * loopback device, ensure this invariant is maintained by
6424 * keeping the loopback device as the first device on the
6425 * list of network devices, so that the loopback device
6426 * is the first device that appears and the last network device
6427 * that disappears.
6428 */
6429 if (register_pernet_device(&loopback_net_ops))
6430 goto out;
6431
6432 if (register_pernet_device(&default_device_ops))
6433 goto out;
6434
6435 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6436 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6437
6438 hotcpu_notifier(dev_cpu_callback, 0);
6439 dst_init();
6440 dev_mcast_init();
6441 rc = 0;
6442 out:
6443 return rc;
6444 }
6445
6446 subsys_initcall(net_dev_init);
6447
6448 static int __init initialize_hashrnd(void)
6449 {
6450 get_random_bytes(&hashrnd, sizeof(hashrnd));
6451 return 0;
6452 }
6453
6454 late_initcall_sync(initialize_hashrnd);
6455