net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
  141 /* Instead of increasing this, you should create a hash table. */
  142 #define MAX_GRO_SKBS 8
  143
  144 /* This should be increased if a protocol with a bigger head is added. */
  145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
  146
  147 static DEFINE_SPINLOCK(ptype_lock);     /* held by dev_add_pack()/__dev_remove_pack() while they update ptype_base[] or ptype_all */
  148 static DEFINE_SPINLOCK(offload_lock);   /* held by dev_add_offload()/__dev_remove_offload() while they update offload_base */
  149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;     /* handlers bucketed by ntohs(type) & PTYPE_HASH_MASK, see ptype_head() */
  150 struct list_head ptype_all __read_mostly;       /* Taps: handlers whose type is ETH_P_ALL */
  151 static struct list_head offload_base __read_mostly;     /* list of registered struct packet_offload */
  152
  153 static int netif_rx_internal(struct sk_buff *skb);      /* forward declaration, defined later in this file */
  154 static int call_netdevice_notifiers_info(unsigned long val,     /* forward declaration, defined later in this file */
  155                                          struct net_device *dev,
  156                                          struct netdev_notifier_info *info);
 157
  158 /*
  159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  160  * semaphore.
  161  *
  162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
  163  *
  164  * Writers must hold the rtnl semaphore while they loop through the
  165  * dev_base_head list, and hold dev_base_lock for writing when they do the
  166  * actual updates.  This allows pure readers to access the list even
  167  * while a writer is preparing to update it.
  168  *
  169  * To put it another way, dev_base_lock is held for writing only to
  170  * protect against pure readers; the rtnl semaphore provides the
  171  * protection against other writers.
  172  *
  173  * For example usages, see register_netdevice() and
  174  * unregister_netdevice(), which must be called with the rtnl
  175  * semaphore held; list_netdevice() and unlist_netdevice() follow this pattern.
  176  */
  177 DEFINE_RWLOCK(dev_base_lock);
  178 EXPORT_SYMBOL(dev_base_lock);
 179
  180 /* protects napi_hash addition/deletion and napi_gen_id */
  181 static DEFINE_SPINLOCK(napi_hash_lock);
  182
  183 static unsigned int napi_gen_id;        /* updated under napi_hash_lock */
  184 static DEFINE_HASHTABLE(napi_hash, 8);  /* entries added/removed under napi_hash_lock */
  185
  186 static seqcount_t devnet_rename_seq;    /* sampled by netdev_get_name(), which retries its name copy if this changed meanwhile */
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
  253 /*
  254  *      Our notifier list (head of the netdevice notifier chain)
  255  */
  256
  257 static RAW_NOTIFIER_HEAD(netdev_chain); /* NOTE(review): raw chain - presumably serialized by RTNL in its users; confirm */
 258
  259 /*
  260  *      Device drivers call our routines to queue packets here. We empty the
  261  *      queue in the local softnet handler. One instance is defined per CPU.
  262  */
  263
  264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
  268 /*
  269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  270  * according to dev->type.  The two tables below are parallel: the index that
  271  * netdev_lock_pos() finds in netdev_lock_type[] also selects the name and the keys. */
  272 static const unsigned short netdev_lock_type[] =
  273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};  /* last entry is also the default for unlisted types */
  288
  289 static const char *const netdev_lock_name[] =
  290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};       /* keep in the same order as netdev_lock_type[] */
  305
  306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];        /* one key per table entry */
  307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];        /* one key per table entry */
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
  355 /*
  356  *      Add a protocol ID to the list. Now that the input handler is
  357  *      smarter we can dispense with all the messy stuff that used to be
  358  *      here.
  359  *
  360  *      BEWARE!!! Protocol handlers that mangle input packets
  361  *      MUST BE last in their hash buckets, and protocol handler lookup
  362  *      MUST start from the promiscuous ptype_all chain in net_bh.
  363  *      This is true now; do not change it.
  364  *      Explanation: if a protocol handler that mangles packets is
  365  *      first on the list, it cannot tell that the packet
  366  *      is cloned and should be copied on write, so it will
  367  *      change it and subsequent readers will get a broken packet.
  368  *                                                      --ANK (980803)
  369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return &ptype_all;
 375         else
 376                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 377 }
 378
 379 /**
 380  *      dev_add_pack - add packet handler
 381  *      @pt: packet type declaration
 382  *
 383  *      Add a protocol handler to the networking stack. The passed &packet_type
 384  *      is linked into kernel lists and may not be freed until it has been
 385  *      removed from the kernel lists.
 386  *
 387  *      This call does not sleep therefore it can not
 388  *      guarantee all CPU's that are in middle of receiving packets
 389  *      will see the new packet type (until the next received packet).
 390  */
 391
 392 void dev_add_pack(struct packet_type *pt)
 393 {
 394         struct list_head *head = ptype_head(pt);
 395
 396         spin_lock(&ptype_lock);
 397         list_add_rcu(&pt->list, head);
 398         spin_unlock(&ptype_lock);
 399 }
 400 EXPORT_SYMBOL(dev_add_pack);
 401
 402 /**
 403  *      __dev_remove_pack        - remove packet handler
 404  *      @pt: packet type declaration
 405  *
 406  *      Remove a protocol handler that was previously added to the kernel
 407  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 408  *      from the kernel lists and can be freed or reused once this function
 409  *      returns.
 410  *
 411  *      The packet type might still be in use by receivers
 412  *      and must not be freed until after all the CPU's have gone
 413  *      through a quiescent state.
 414  */
 415 void __dev_remove_pack(struct packet_type *pt)
 416 {
 417         struct list_head *head = ptype_head(pt);
 418         struct packet_type *pt1;
 419
 420         spin_lock(&ptype_lock);
 421
 422         list_for_each_entry(pt1, head, list) {
 423                 if (pt == pt1) {
 424                         list_del_rcu(&pt->list);
 425                         goto out;
 426                 }
 427         }
 428
 429         pr_warn("dev_remove_pack: %p not found\n", pt);
 430 out:
 431         spin_unlock(&ptype_lock);
 432 }
 433 EXPORT_SYMBOL(__dev_remove_pack);
 434
  435 /**
  436  *      dev_remove_pack  - remove packet handler
  437  *      @pt: packet type declaration
  438  *
  439  *      Remove a protocol handler that was previously added to the kernel
  440  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
  441  *      from the kernel lists and can be freed or reused once this function
  442  *      returns.
  443  *
  444  *      This call sleeps (in synchronize_net()) to guarantee that no CPU is
  445  *      looking at the packet type after return.
  446  */
  447 void dev_remove_pack(struct packet_type *pt)
  448 {
  449         __dev_remove_pack(pt);          /* unlink @pt from its handler list */
  450
  451         synchronize_net();              /* wait so that no CPU is still looking at @pt */
  452 }
  453 EXPORT_SYMBOL(dev_remove_pack);
 454
 455
 456 /**
 457  *      dev_add_offload - register offload handlers
 458  *      @po: protocol offload declaration
 459  *
 460  *      Add protocol offload handlers to the networking stack. The passed
 461  *      &proto_offload is linked into kernel lists and may not be freed until
 462  *      it has been removed from the kernel lists.
 463  *
 464  *      This call does not sleep therefore it can not
 465  *      guarantee all CPU's that are in middle of receiving packets
 466  *      will see the new offload handlers (until the next received packet).
 467  */
 468 void dev_add_offload(struct packet_offload *po)
 469 {
 470         struct list_head *head = &offload_base;
 471
 472         spin_lock(&offload_lock);
 473         list_add_rcu(&po->list, head);
 474         spin_unlock(&offload_lock);
 475 }
 476 EXPORT_SYMBOL(dev_add_offload);
 477
 478 /**
 479  *      __dev_remove_offload     - remove offload handler
 480  *      @po: packet offload declaration
 481  *
 482  *      Remove a protocol offload handler that was previously added to the
 483  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 484  *      is removed from the kernel lists and can be freed or reused once this
 485  *      function returns.
 486  *
 487  *      The packet type might still be in use by receivers
 488  *      and must not be freed until after all the CPU's have gone
 489  *      through a quiescent state.
 490  */
 491 static void __dev_remove_offload(struct packet_offload *po)
 492 {
 493         struct list_head *head = &offload_base;
 494         struct packet_offload *po1;
 495
 496         spin_lock(&offload_lock);
 497
 498         list_for_each_entry(po1, head, list) {
 499                 if (po == po1) {
 500                         list_del_rcu(&po->list);
 501                         goto out;
 502                 }
 503         }
 504
 505         pr_warn("dev_remove_offload: %p not found\n", po);
 506 out:
 507         spin_unlock(&offload_lock);
 508 }
 509
  510 /**
  511  *      dev_remove_offload       - remove packet offload handler
  512  *      @po: packet offload declaration
  513  *
  514  *      Remove a packet offload handler that was previously added to the kernel
  515  *      offload handlers by dev_add_offload(). The passed &packet_offload is
  516  *      removed from the kernel lists and can be freed or reused once this
  517  *      function returns.
  518  *
  519  *      This call sleeps (in synchronize_net()) to guarantee that no CPU is
  520  *      looking at the offload handler after return.
  521  */
  522 void dev_remove_offload(struct packet_offload *po)
  523 {
  524         __dev_remove_offload(po);       /* unlink @po from offload_base */
  525
  526         synchronize_net();              /* wait so that no CPU is still looking at @po */
  527 }
  528 EXPORT_SYMBOL(dev_remove_offload);
 529
 530 /******************************************************************************
 531
 532                       Device Boot-time Settings Routines
 533
 534 *******************************************************************************/
 535
  536 /* Boot time configuration table; a slot is unused while its name starts with '\0' or ' ' */
  537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 538
 539 /**
 540  *      netdev_boot_setup_add   - add new setup entry
 541  *      @name: name of the device
 542  *      @map: configured settings for the device
 543  *
 544  *      Adds new setup entry to the dev_boot_setup list.  The function
 545  *      returns 0 on error and 1 on success.  This is a generic routine to
 546  *      all netdevices.
 547  */
 548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 549 {
 550         struct netdev_boot_setup *s;
 551         int i;
 552
 553         s = dev_boot_setup;
 554         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 555                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 556                         memset(s[i].name, 0, sizeof(s[i].name));
 557                         strlcpy(s[i].name, name, IFNAMSIZ);
 558                         memcpy(&s[i].map, map, sizeof(s[i].map));
 559                         break;
 560                 }
 561         }
 562
 563         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 564 }
 565
 566 /**
 567  *      netdev_boot_setup_check - check boot time settings
 568  *      @dev: the netdevice
 569  *
 570  *      Check boot time settings for the device.
 571  *      The found settings are set for the device to be used
 572  *      later in the device probing.
 573  *      Returns 0 if no settings found, 1 if they are.
 574  */
 575 int netdev_boot_setup_check(struct net_device *dev)
 576 {
 577         struct netdev_boot_setup *s = dev_boot_setup;
 578         int i;
 579
 580         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 581                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 582                     !strcmp(dev->name, s[i].name)) {
 583                         dev->irq        = s[i].map.irq;
 584                         dev->base_addr  = s[i].map.base_addr;
 585                         dev->mem_start  = s[i].map.mem_start;
 586                         dev->mem_end    = s[i].map.mem_end;
 587                         return 1;
 588                 }
 589         }
 590         return 0;
 591 }
 592 EXPORT_SYMBOL(netdev_boot_setup_check);
 593
 594
 595 /**
 596  *      netdev_boot_base        - get address from boot time settings
 597  *      @prefix: prefix for network device
 598  *      @unit: id for network device
 599  *
 600  *      Check boot time settings for the base address of device.
 601  *      The found settings are set for the device to be used
 602  *      later in the device probing.
 603  *      Returns 0 if no settings found.
 604  */
 605 unsigned long netdev_boot_base(const char *prefix, int unit)
 606 {
 607         const struct netdev_boot_setup *s = dev_boot_setup;
 608         char name[IFNAMSIZ];
 609         int i;
 610
 611         sprintf(name, "%s%d", prefix, unit);
 612
 613         /*
 614          * If device already registered then return base of 1
 615          * to indicate not to probe for this interface
 616          */
 617         if (__dev_get_by_name(&init_net, name))
 618                 return 1;
 619
 620         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 621                 if (!strcmp(name, s[i].name))
 622                         return s[i].map.base_addr;
 623         return 0;
 624 }
 625
 626 /*
 627  * Saves at boot time configured settings for any netdevice.
 628  */
 629 int __init netdev_boot_setup(char *str)
 630 {
 631         int ints[5];
 632         struct ifmap map;
 633
 634         str = get_options(str, ARRAY_SIZE(ints), ints);
 635         if (!str || !*str)
 636                 return 0;
 637
 638         /* Save settings */
 639         memset(&map, 0, sizeof(map));
 640         if (ints[0] > 0)
 641                 map.irq = ints[1];
 642         if (ints[0] > 1)
 643                 map.base_addr = ints[2];
 644         if (ints[0] > 2)
 645                 map.mem_start = ints[3];
 646         if (ints[0] > 3)
 647                 map.mem_end = ints[4];
 648
 649         /* Add new entry to the list */
 650         return netdev_boot_setup_add(str, &map);
 651 }
 652
 653 __setup("netdev=", netdev_boot_setup);
 654
 655 /*******************************************************************************
 656
 657                             Device Interface Subroutines
 658
 659 *******************************************************************************/
 660
 661 /**
 662  *      __dev_get_by_name       - find a device by its name
 663  *      @net: the applicable net namespace
 664  *      @name: name to find
 665  *
 666  *      Find an interface by name. Must be called under RTNL semaphore
 667  *      or @dev_base_lock. If the name is found a pointer to the device
 668  *      is returned. If the name is not found then %NULL is returned. The
 669  *      reference counters are not incremented so the caller must be
 670  *      careful with locks.
 671  */
 672
 673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 674 {
 675         struct net_device *dev;
 676         struct hlist_head *head = dev_name_hash(net, name);
 677
 678         hlist_for_each_entry(dev, head, name_hlist)
 679                 if (!strncmp(dev->name, name, IFNAMSIZ))
 680                         return dev;
 681
 682         return NULL;
 683 }
 684 EXPORT_SYMBOL(__dev_get_by_name);
 685
 686 /**
 687  *      dev_get_by_name_rcu     - find a device by its name
 688  *      @net: the applicable net namespace
 689  *      @name: name to find
 690  *
 691  *      Find an interface by name.
 692  *      If the name is found a pointer to the device is returned.
 693  *      If the name is not found then %NULL is returned.
 694  *      The reference counters are not incremented so the caller must be
 695  *      careful with locks. The caller must hold RCU lock.
 696  */
 697
 698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 699 {
 700         struct net_device *dev;
 701         struct hlist_head *head = dev_name_hash(net, name);
 702
 703         hlist_for_each_entry_rcu(dev, head, name_hlist)
 704                 if (!strncmp(dev->name, name, IFNAMSIZ))
 705                         return dev;
 706
 707         return NULL;
 708 }
 709 EXPORT_SYMBOL(dev_get_by_name_rcu);
 710
 711 /**
 712  *      dev_get_by_name         - find a device by its name
 713  *      @net: the applicable net namespace
 714  *      @name: name to find
 715  *
 716  *      Find an interface by name. This can be called from any
 717  *      context and does its own locking. The returned handle has
 718  *      the usage count incremented and the caller must use dev_put() to
 719  *      release it when it is no longer needed. %NULL is returned if no
 720  *      matching device is found.
 721  */
 722
 723 struct net_device *dev_get_by_name(struct net *net, const char *name)
 724 {
 725         struct net_device *dev;
 726
 727         rcu_read_lock();
 728         dev = dev_get_by_name_rcu(net, name);
 729         if (dev)
 730                 dev_hold(dev);
 731         rcu_read_unlock();
 732         return dev;
 733 }
 734 EXPORT_SYMBOL(dev_get_by_name);
 735
 736 /**
 737  *      __dev_get_by_index - find a device by its ifindex
 738  *      @net: the applicable net namespace
 739  *      @ifindex: index of device
 740  *
 741  *      Search for an interface by index. Returns %NULL if the device
 742  *      is not found or a pointer to the device. The device has not
 743  *      had its reference counter increased so the caller must be careful
 744  *      about locking. The caller must hold either the RTNL semaphore
 745  *      or @dev_base_lock.
 746  */
 747
 748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 749 {
 750         struct net_device *dev;
 751         struct hlist_head *head = dev_index_hash(net, ifindex);
 752
 753         hlist_for_each_entry(dev, head, index_hlist)
 754                 if (dev->ifindex == ifindex)
 755                         return dev;
 756
 757         return NULL;
 758 }
 759 EXPORT_SYMBOL(__dev_get_by_index);
 760
 761 /**
 762  *      dev_get_by_index_rcu - find a device by its ifindex
 763  *      @net: the applicable net namespace
 764  *      @ifindex: index of device
 765  *
 766  *      Search for an interface by index. Returns %NULL if the device
 767  *      is not found or a pointer to the device. The device has not
 768  *      had its reference counter increased so the caller must be careful
 769  *      about locking. The caller must hold RCU lock.
 770  */
 771
 772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 773 {
 774         struct net_device *dev;
 775         struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777         hlist_for_each_entry_rcu(dev, head, index_hlist)
 778                 if (dev->ifindex == ifindex)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(dev_get_by_index_rcu);
 784
 785
 786 /**
 787  *      dev_get_by_index - find a device by its ifindex
 788  *      @net: the applicable net namespace
 789  *      @ifindex: index of device
 790  *
 791  *      Search for an interface by index. Returns NULL if the device
 792  *      is not found or a pointer to the device. The device returned has
 793  *      had a reference added and the pointer is safe until the user calls
 794  *      dev_put to indicate they have finished with it.
 795  */
 796
 797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 798 {
 799         struct net_device *dev;
 800
 801         rcu_read_lock();
 802         dev = dev_get_by_index_rcu(net, ifindex);
 803         if (dev)
 804                 dev_hold(dev);
 805         rcu_read_unlock();
 806         return dev;
 807 }
 808 EXPORT_SYMBOL(dev_get_by_index);
 809
 810 /**
 811  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 812  *      @net: network namespace
 813  *      @name: a pointer to the buffer where the name will be stored.
 814  *      @ifindex: the ifindex of the interface to get the name from.
 815  *
 816  *      The use of raw_seqcount_begin() and cond_resched() before
 817  *      retrying is required as we want to give the writers a chance
 818  *      to complete when CONFIG_PREEMPT is not set.
 819  */
 820 int netdev_get_name(struct net *net, char *name, int ifindex)
 821 {
 822         struct net_device *dev;
 823         unsigned int seq;
 824
 825 retry:
 826         seq = raw_seqcount_begin(&devnet_rename_seq);
 827         rcu_read_lock();
 828         dev = dev_get_by_index_rcu(net, ifindex);
 829         if (!dev) {
 830                 rcu_read_unlock();
 831                 return -ENODEV;
 832         }
 833
 834         strcpy(name, dev->name);
 835         rcu_read_unlock();
 836         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 837                 cond_resched();
 838                 goto retry;
 839         }
 840
 841         return 0;
 842 }
 843
 844 /**
 845  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 846  *      @net: the applicable net namespace
 847  *      @type: media type of device
 848  *      @ha: hardware address
 849  *
 850  *      Search for an interface by MAC address. Returns NULL if the device
 851  *      is not found or a pointer to the device.
 852  *      The caller must hold RCU or RTNL.
 853  *      The returned device has not had its ref count increased
 854  *      and the caller must therefore be careful about locking
 855  *
 856  */
 857
 858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 859                                        const char *ha)
 860 {
 861         struct net_device *dev;
 862
 863         for_each_netdev_rcu(net, dev)
 864                 if (dev->type == type &&
 865                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 866                         return dev;
 867
 868         return NULL;
 869 }
 870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 871
 872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 873 {
 874         struct net_device *dev;
 875
 876         ASSERT_RTNL();
 877         for_each_netdev(net, dev)
 878                 if (dev->type == type)
 879                         return dev;
 880
 881         return NULL;
 882 }
 883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 884
 885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 886 {
 887         struct net_device *dev, *ret = NULL;
 888
 889         rcu_read_lock();
 890         for_each_netdev_rcu(net, dev)
 891                 if (dev->type == type) {
 892                         dev_hold(dev);
 893                         ret = dev;
 894                         break;
 895                 }
 896         rcu_read_unlock();
 897         return ret;
 898 }
 899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 900
 901 /**
 902  *      __dev_get_by_flags - find any device with given flags
 903  *      @net: the applicable net namespace
 904  *      @if_flags: IFF_* values
 905  *      @mask: bitmask of bits in if_flags to check
 906  *
 907  *      Search for any interface with the given flags. Returns NULL if a device
 908  *      is not found or a pointer to the device. Must be called inside
 909  *      rtnl_lock(), and result refcount is unchanged.
 910  */
 911
 912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 913                                       unsigned short mask)
 914 {
 915         struct net_device *dev, *ret;
 916
 917         ASSERT_RTNL();
 918
 919         ret = NULL;
 920         for_each_netdev(net, dev) {
 921                 if (((dev->flags ^ if_flags) & mask) == 0) {
 922                         ret = dev;
 923                         break;
 924                 }
 925         }
 926         return ret;
 927 }
 928 EXPORT_SYMBOL(__dev_get_by_flags);
 929
 930 /**
 931  *      dev_valid_name - check if name is okay for network device
 932  *      @name: name string
 933  *
 934  *      Network device names need to be valid file names to
 935  *      to allow sysfs to work.  We also disallow any kind of
 936  *      whitespace.
 937  */
 938 bool dev_valid_name(const char *name)
 939 {
 940         if (*name == '\0')
 941                 return false;
 942         if (strlen(name) >= IFNAMSIZ)
 943                 return false;
 944         if (!strcmp(name, ".") || !strcmp(name, ".."))
 945                 return false;
 946
 947         while (*name) {
 948                 if (*name == '/' || isspace(*name))
 949                         return false;
 950                 name++;
 951         }
 952         return true;
 953 }
 954 EXPORT_SYMBOL(dev_valid_name);
 955
 956 /**
 957  *      __dev_alloc_name - allocate a name for a device
 958  *      @net: network namespace to allocate the device name in
 959  *      @name: name format string
 960  *      @buf:  scratch buffer and result name string
 961  *
 962  *      Passed a format string - eg "lt%d" it will try and find a suitable
 963  *      id. It scans list of devices to build up a free map, then chooses
 964  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 965  *      while allocating the name and adding the device in order to avoid
 966  *      duplicates.
 967  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 968  *      Returns the number of the unit assigned or a negative errno code.
 969  */
 970
 971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 972 {
 973         int i = 0;
 974         const char *p;
 975         const int max_netdevices = 8*PAGE_SIZE;
 976         unsigned long *inuse;
 977         struct net_device *d;
 978
 979         p = strnchr(name, IFNAMSIZ-1, '%');
 980         if (p) {
 981                 /*
 982                  * Verify the string as this thing may have come from
 983                  * the user.  There must be either one "%d" and no other "%"
 984                  * characters.
 985                  */
 986                 if (p[1] != 'd' || strchr(p + 2, '%'))
 987                         return -EINVAL;
 988
 989                 /* Use one page as a bit array of possible slots */
 990                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 991                 if (!inuse)
 992                         return -ENOMEM;
 993
 994                 for_each_netdev(net, d) {
 995                         if (!sscanf(d->name, name, &i))
 996                                 continue;
 997                         if (i < 0 || i >= max_netdevices)
 998                                 continue;
 999
1000                         /*  avoid cases where sscanf is not exact inverse of printf */
1001                         snprintf(buf, IFNAMSIZ, name, i);
1002                         if (!strncmp(buf, d->name, IFNAMSIZ))
1003                                 set_bit(i, inuse);
1004                 }
1005
1006                 i = find_first_zero_bit(inuse, max_netdevices);
1007                 free_page((unsigned long) inuse);
1008         }
1009
1010         if (buf != name)
1011                 snprintf(buf, IFNAMSIZ, name, i);
1012         if (!__dev_get_by_name(net, buf))
1013                 return i;
1014
1015         /* It is possible to run out of possible slots
1016          * when the name is long and there isn't enough space left
1017          * for the digits, or if all bits are used.
1018          */
1019         return -ENFILE;
1020 }
1021
1022 /**
1023  *      dev_alloc_name - allocate a name for a device
1024  *      @dev: device
1025  *      @name: name format string
1026  *
1027  *      Passed a format string - eg "lt%d" it will try and find a suitable
1028  *      id. It scans list of devices to build up a free map, then chooses
1029  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *      while allocating the name and adding the device in order to avoid
1031  *      duplicates.
1032  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *      Returns the number of the unit assigned or a negative errno code.
1034  */
1035
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038         char buf[IFNAMSIZ];
1039         struct net *net;
1040         int ret;
1041
1042         BUG_ON(!dev_net(dev));
1043         net = dev_net(dev);
1044         ret = __dev_alloc_name(net, name, buf);
1045         if (ret >= 0)
1046                 strlcpy(dev->name, buf, IFNAMSIZ);
1047         return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
1050
1051 static int dev_alloc_name_ns(struct net *net,
1052                              struct net_device *dev,
1053                              const char *name)
1054 {
1055         char buf[IFNAMSIZ];
1056         int ret;
1057
1058         ret = __dev_alloc_name(net, name, buf);
1059         if (ret >= 0)
1060                 strlcpy(dev->name, buf, IFNAMSIZ);
1061         return ret;
1062 }
1063
1064 static int dev_get_valid_name(struct net *net,
1065                               struct net_device *dev,
1066                               const char *name)
1067 {
1068         BUG_ON(!net);
1069
1070         if (!dev_valid_name(name))
1071                 return -EINVAL;
1072
1073         if (strchr(name, '%'))
1074                 return dev_alloc_name_ns(net, dev, name);
1075         else if (__dev_get_by_name(net, name))
1076                 return -EEXIST;
1077         else if (dev->name != name)
1078                 strlcpy(dev->name, name, IFNAMSIZ);
1079
1080         return 0;
1081 }
1082
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string); the buffer must be at least
 *		  IFNAMSIZ bytes, since IFNAMSIZ bytes are copied from it
 *		  on the notifier-failure rollback path
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 *
 *	Must be called with RTNL held (asserted). Returns -EBUSY if the
 *	device is up, 0 if @newname equals the current name, and a negative
 *	errno if validation, device_rename() or a NETDEV_CHANGENAME
 *	notifier fails. On success the value returned is the non-negative
 *	result of dev_get_valid_name().
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	/* Renaming is refused while the interface is up. */
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/*
	 * dev->name is modified inside a devnet_rename_seq write section;
	 * netdev_get_name() retries its copy when it overlaps one.
	 */
	write_seqcount_begin(&devnet_rename_seq);

	/* Same name: nothing to do. */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Keep the current name so it can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* Validates newname and, on success, writes it into dev->name. */
	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

	/*
	 * Entered a second time (via goto below) with dev->name and oldname
	 * swapped when a notifier rejected the rename.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		/* Put back the saved name and assign type, then give up. */
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/*
	 * Re-hash under the new name: unlink from the name hash, wait for
	 * an RCU grace period, then insert into the bucket for dev->name.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/*
			 * First notifier failure: record it in err, swap the
			 * names and assign types, and redo the rename steps in
			 * the reverse direction.
			 */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* The rollback itself was vetoed; only report it. */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1171
1172 /**
1173  *      dev_set_alias - change ifalias of a device
1174  *      @dev: device
1175  *      @alias: name up to IFALIASZ
1176  *      @len: limit of bytes to copy from info
1177  *
1178  *      Set ifalias for a device,
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182         char *new_ifalias;
1183
1184         ASSERT_RTNL();
1185
1186         if (len >= IFALIASZ)
1187                 return -EINVAL;
1188
1189         if (!len) {
1190                 kfree(dev->ifalias);
1191                 dev->ifalias = NULL;
1192                 return 0;
1193         }
1194
1195         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196         if (!new_ifalias)
1197                 return -ENOMEM;
1198         dev->ifalias = new_ifalias;
1199
1200         strlcpy(dev->ifalias, alias, len+1);
1201         return len;
1202 }
1203
1204
1205 /**
1206  *      netdev_features_change - device changes features
1207  *      @dev: device to cause notification
1208  *
1209  *      Called to indicate a device has changed features.
1210  */
1211 void netdev_features_change(struct net_device *dev)
1212 {
1213         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214 }
1215 EXPORT_SYMBOL(netdev_features_change);
1216
1217 /**
1218  *      netdev_state_change - device changes state
1219  *      @dev: device to cause notification
1220  *
1221  *      Called to indicate a device has changed state. This function calls
1222  *      the notifier chains for netdev_chain and sends a NEWLINK message
1223  *      to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227         if (dev->flags & IFF_UP) {
1228                 struct netdev_notifier_change_info change_info;
1229
1230                 change_info.flags_changed = 0;
1231                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232                                               &change_info.info);
1233                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234         }
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237
1238 /**
1239  *      netdev_notify_peers - notify network peers about existence of @dev
1240  *      @dev: network device
1241  *
1242  * Generate traffic such that interested network peers are aware of
1243  * @dev, such as by generating a gratuitous ARP. This may be used when
1244  * a device wants to inform the rest of the network about some sort of
1245  * reconfiguration such as a failover event or virtual machine
1246  * migration.
1247  */
1248 void netdev_notify_peers(struct net_device *dev)
1249 {
1250         rtnl_lock();
1251         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252         rtnl_unlock();
1253 }
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255
1256 static int __dev_open(struct net_device *dev)
1257 {
1258         const struct net_device_ops *ops = dev->netdev_ops;
1259         int ret;
1260
1261         ASSERT_RTNL();
1262
1263         if (!netif_device_present(dev))
1264                 return -ENODEV;
1265
1266         /* Block netpoll from trying to do any rx path servicing.
1267          * If we don't do this there is a chance ndo_poll_controller
1268          * or ndo_poll may be running while we open the device
1269          */
1270         netpoll_poll_disable(dev);
1271
1272         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273         ret = notifier_to_errno(ret);
1274         if (ret)
1275                 return ret;
1276
1277         set_bit(__LINK_STATE_START, &dev->state);
1278
1279         if (ops->ndo_validate_addr)
1280                 ret = ops->ndo_validate_addr(dev);
1281
1282         if (!ret && ops->ndo_open)
1283                 ret = ops->ndo_open(dev);
1284
1285         netpoll_poll_enable(dev);
1286
1287         if (ret)
1288                 clear_bit(__LINK_STATE_START, &dev->state);
1289         else {
1290                 dev->flags |= IFF_UP;
1291                 dev_set_rx_mode(dev);
1292                 dev_activate(dev);
1293                 add_device_randomness(dev->dev_addr, dev->addr_len);
1294         }
1295
1296         return ret;
1297 }
1298
1299 /**
1300  *      dev_open        - prepare an interface for use.
1301  *      @dev:   device to open
1302  *
1303  *      Takes a device from down to up state. The device's private open
1304  *      function is invoked and then the multicast lists are loaded. Finally
1305  *      the device is moved into the up state and a %NETDEV_UP message is
1306  *      sent to the netdev notifier chain.
1307  *
1308  *      Calling this function on an active interface is a nop. On a failure
1309  *      a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313         int ret;
1314
1315         if (dev->flags & IFF_UP)
1316                 return 0;
1317
1318         ret = __dev_open(dev);
1319         if (ret < 0)
1320                 return ret;
1321
1322         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323         call_netdevice_notifiers(NETDEV_UP, dev);
1324
1325         return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331         struct net_device *dev;
1332
1333         ASSERT_RTNL();
1334         might_sleep();
1335
1336         list_for_each_entry(dev, head, close_list) {
1337                 /* Temporarily disable netpoll until the interface is down */
1338                 netpoll_poll_disable(dev);
1339
1340                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341
1342                 clear_bit(__LINK_STATE_START, &dev->state);
1343
1344                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1345                  * can be even on different cpu. So just clear netif_running().
1346                  *
1347                  * dev->stop() will invoke napi_disable() on all of it's
1348                  * napi_struct instances on this device.
1349                  */
1350                 smp_mb__after_atomic(); /* Commit netif_running(). */
1351         }
1352
1353         dev_deactivate_many(head);
1354
1355         list_for_each_entry(dev, head, close_list) {
1356                 const struct net_device_ops *ops = dev->netdev_ops;
1357
1358                 /*
1359                  *      Call the device specific close. This cannot fail.
1360                  *      Only if device is UP
1361                  *
1362                  *      We allow it to be called even after a DETACH hot-plug
1363                  *      event.
1364                  */
1365                 if (ops->ndo_stop)
1366                         ops->ndo_stop(dev);
1367
1368                 dev->flags &= ~IFF_UP;
1369                 netpoll_poll_enable(dev);
1370         }
1371
1372         return 0;
1373 }
1374
1375 static int __dev_close(struct net_device *dev)
1376 {
1377         int retval;
1378         LIST_HEAD(single);
1379
1380         list_add(&dev->close_list, &single);
1381         retval = __dev_close_many(&single);
1382         list_del(&single);
1383
1384         return retval;
1385 }
1386
1387 static int dev_close_many(struct list_head *head)
1388 {
1389         struct net_device *dev, *tmp;
1390
1391         /* Remove the devices that don't need to be closed */
1392         list_for_each_entry_safe(dev, tmp, head, close_list)
1393                 if (!(dev->flags & IFF_UP))
1394                         list_del_init(&dev->close_list);
1395
1396         __dev_close_many(head);
1397
1398         list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                 list_del_init(&dev->close_list);
1402         }
1403
1404         return 0;
1405 }
1406
1407 /**
1408  *      dev_close - shutdown an interface.
1409  *      @dev: device to shutdown
1410  *
1411  *      This function moves an active device into down state. A
1412  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *      chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418         if (dev->flags & IFF_UP) {
1419                 LIST_HEAD(single);
1420
1421                 list_add(&dev->close_list, &single);
1422                 dev_close_many(&single);
1423                 list_del(&single);
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428
1429
1430 /**
1431  *      dev_disable_lro - disable Large Receive Offload on a device
1432  *      @dev: device
1433  *
1434  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *      called under RTNL.  This is needed if received packets may be
1436  *      forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440         /*
1441          * If we're trying to disable lro on a vlan device
1442          * use the underlying physical device instead
1443          */
1444         if (is_vlan_dev(dev))
1445                 dev = vlan_dev_real_dev(dev);
1446
1447         /* the same for macvlan devices */
1448         if (netif_is_macvlan(dev))
1449                 dev = macvlan_dev_real_dev(dev);
1450
1451         dev->wanted_features &= ~NETIF_F_LRO;
1452         netdev_update_features(dev);
1453
1454         if (unlikely(dev->features & NETIF_F_LRO))
1455                 netdev_WARN(dev, "failed to disable LRO!\n");
1456 }
1457 EXPORT_SYMBOL(dev_disable_lro);
1458
1459 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1460                                    struct net_device *dev)
1461 {
1462         struct netdev_notifier_info info;
1463
1464         netdev_notifier_info_init(&info, dev);
1465         return nb->notifier_call(nb, val, &info);
1466 }
1467
/*
 * Starts out as 1.  While it is non-zero, register_netdevice_notifier()
 * adds the notifier to the chain but does not replay REGISTER/UP events
 * for existing devices.
 * NOTE(review): presumably cleared once boot-time init is done; the
 * place that clears it is not visible in this part of the file.
 */
static int dev_boot_phase = 1;
1469
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 *
 *	If the notifier rejects a replayed %NETDEV_REGISTER, the devices
 *	already replayed are sent the matching down/unregister events and
 *	the notifier is removed from the chain again.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* No replay of existing devices while dev_boot_phase is set. */
	if (dev_boot_phase)
		goto unlock;
	/* Replay REGISTER (and UP for running devices) to @nb only. */
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/*
	 * @last is the device whose REGISTER was rejected.  Walk the
	 * devices again in the same order and undo the replay for each one
	 * that comes before it.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	/* Drop @nb from the chain; err still holds the notifier's errno. */
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1536
1537 /**
1538  *      unregister_netdevice_notifier - unregister a network notifier block
1539  *      @nb: notifier
1540  *
1541  *      Unregister a notifier previously registered by
1542  *      register_netdevice_notifier(). The notifier is unlinked into the
1543  *      kernel structures and may then be reused. A negative errno code
1544  *      is returned on a failure.
1545  *
1546  *      After unregistering unregister and down device events are synthesized
1547  *      for all devices on the device list to the removed notifier to remove
1548  *      the need for special case cleanup code.
1549  */
1550
1551 int unregister_netdevice_notifier(struct notifier_block *nb)
1552 {
1553         struct net_device *dev;
1554         struct net *net;
1555         int err;
1556
1557         rtnl_lock();
1558         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1559         if (err)
1560                 goto unlock;
1561
1562         for_each_net(net) {
1563                 for_each_netdev(net, dev) {
1564                         if (dev->flags & IFF_UP) {
1565                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1566                                                         dev);
1567                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1568                         }
1569                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1570                 }
1571         }
1572 unlock:
1573         rtnl_unlock();
1574         return err;
1575 }
1576 EXPORT_SYMBOL(unregister_netdevice_notifier);
1577
1578 /**
1579  *      call_netdevice_notifiers_info - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *      @info: notifier information data
1583  *
1584  *      Call all network notifier blocks.  Parameters and return value
1585  *      are as for raw_notifier_call_chain().
1586  */
1587
1588 static int call_netdevice_notifiers_info(unsigned long val,
1589                                          struct net_device *dev,
1590                                          struct netdev_notifier_info *info)
1591 {
1592         ASSERT_RTNL();
1593         netdev_notifier_info_init(info, dev);
1594         return raw_notifier_call_chain(&netdev_chain, val, info);
1595 }
1596
1597 /**
1598  *      call_netdevice_notifiers - call all network notifier blocks
1599  *      @val: value passed unmodified to notifier function
1600  *      @dev: net_device pointer passed unmodified to notifier function
1601  *
1602  *      Call all network notifier blocks.  Parameters and return value
1603  *      are as for raw_notifier_call_chain().
1604  */
1605
1606 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1607 {
1608         struct netdev_notifier_info info;
1609
1610         return call_netdevice_notifiers_info(val, dev, &info);
1611 }
1612 EXPORT_SYMBOL(call_netdevice_notifiers);
1613
/*
 * Static key tested by net_timestamp_set() and net_timestamp_check();
 * raised by net_enable_timestamp() and dropped by
 * net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counts the deferred calls; net_enable_timestamp() drains it.
 */
static atomic_t netstamp_needed_deferred;
#endif
1622
1623 void net_enable_timestamp(void)
1624 {
1625 #ifdef HAVE_JUMP_LABEL
1626         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1627
1628         if (deferred) {
1629                 while (--deferred)
1630                         static_key_slow_dec(&netstamp_needed);
1631                 return;
1632         }
1633 #endif
1634         static_key_slow_inc(&netstamp_needed);
1635 }
1636 EXPORT_SYMBOL(net_enable_timestamp);
1637
1638 void net_disable_timestamp(void)
1639 {
1640 #ifdef HAVE_JUMP_LABEL
1641         if (in_interrupt()) {
1642                 atomic_inc(&netstamp_needed_deferred);
1643                 return;
1644         }
1645 #endif
1646         static_key_slow_dec(&netstamp_needed);
1647 }
1648 EXPORT_SYMBOL(net_disable_timestamp);
1649
1650 static inline void net_timestamp_set(struct sk_buff *skb)
1651 {
1652         skb->tstamp.tv64 = 0;
1653         if (static_key_false(&netstamp_needed))
1654                 __net_timestamp(skb);
1655 }
1656
/*
 * Stamp SKB with __net_timestamp() when the netstamp_needed key is
 * enabled, COND is true and the skb carries no timestamp yet
 * (tstamp.tv64 is zero).
 *
 * Note: this expands to a bare if statement rather than a
 * do { } while (0) block, so avoid using it as the unbraced body of an
 * if/else.
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\
1662
1663 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1664 {
1665         unsigned int len;
1666
1667         if (!(dev->flags & IFF_UP))
1668                 return false;
1669
1670         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1671         if (skb->len <= len)
1672                 return true;
1673
1674         /* if TSO is enabled, we don't care about the length as the packet
1675          * could be forwarded without being segmented before
1676          */
1677         if (skb_is_gso(skb))
1678                 return true;
1679
1680         return false;
1681 }
1682 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1683
1684 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1685 {
1686         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1687                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1688                         atomic_long_inc(&dev->rx_dropped);
1689                         kfree_skb(skb);
1690                         return NET_RX_DROP;
1691                 }
1692         }
1693
1694         if (unlikely(!is_skb_forwardable(dev, skb))) {
1695                 atomic_long_inc(&dev->rx_dropped);
1696                 kfree_skb(skb);
1697                 return NET_RX_DROP;
1698         }
1699
1700         skb_scrub_packet(skb, true);
1701         skb->protocol = eth_type_trans(skb, dev);
1702
1703         return 0;
1704 }
1705 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1706
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation. That preparation is done by
 * __dev_forward_skb(); when it succeeds the skb is handed to
 * netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1730
1731 static inline int deliver_skb(struct sk_buff *skb,
1732                               struct packet_type *pt_prev,
1733                               struct net_device *orig_dev)
1734 {
1735         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1736                 return -ENOMEM;
1737         atomic_inc(&skb->users);
1738         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1739 }
1740
1741 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1742 {
1743         if (!ptype->af_packet_priv || !skb->sk)
1744                 return false;
1745
1746         if (ptype->id_match)
1747                 return ptype->id_match(ptype, skb->sk);
1748         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1749                 return true;
1750
1751         return false;
1752 }
1753
1754 /*
1755  *      Support routine. Sends outgoing frames to any network
1756  *      taps currently in use.
1757  */
1758
1759 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1760 {
1761         struct packet_type *ptype;
1762         struct sk_buff *skb2 = NULL;
1763         struct packet_type *pt_prev = NULL;
1764
1765         rcu_read_lock();
1766         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1767                 /* Never send packets back to the socket
1768                  * they originated from - MvS (miquels@drinkel.ow.org)
1769                  */
1770                 if ((ptype->dev == dev || !ptype->dev) &&
1771                     (!skb_loop_sk(ptype, skb))) {
1772                         if (pt_prev) {
1773                                 deliver_skb(skb2, pt_prev, skb->dev);
1774                                 pt_prev = ptype;
1775                                 continue;
1776                         }
1777
1778                         skb2 = skb_clone(skb, GFP_ATOMIC);
1779                         if (!skb2)
1780                                 break;
1781
1782                         net_timestamp_set(skb2);
1783
1784                         /* skb->nh should be correctly
1785                            set by sender, so that the second statement is
1786                            just protection against buggy protocols.
1787                          */
1788                         skb_reset_mac_header(skb2);
1789
1790                         if (skb_network_header(skb2) < skb2->data ||
1791                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1792                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1793                                                      ntohs(skb2->protocol),
1794                                                      dev->name);
1795                                 skb_reset_network_header(skb2);
1796                         }
1797
1798                         skb2->transport_header = skb2->network_header;
1799                         skb2->pkt_type = PACKET_OUTGOING;
1800                         pt_prev = ptype;
1801                 }
1802         }
1803         if (pt_prev)
1804                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1805         rcu_read_unlock();
1806 }
1807
1808 /**
1809  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1810  * @dev: Network device
1811  * @txq: number of queues available
1812  *
1813  * If real_num_tx_queues is changed the tc mappings may no longer be
1814  * valid. To resolve this verify the tc mapping remains valid and if
1815  * not NULL the mapping. With no priorities mapping to this
1816  * offset/count pair it will no longer be used. In the worst case TC0
1817  * is invalid nothing can be done so disable priority mappings. If is
1818  * expected that drivers will fix this mapping if they can before
1819  * calling netif_set_real_num_tx_queues.
1820  */
1821 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1822 {
1823         int i;
1824         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1825
1826         /* If TC0 is invalidated disable TC mapping */
1827         if (tc->offset + tc->count > txq) {
1828                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1829                 dev->num_tc = 0;
1830                 return;
1831         }
1832
1833         /* Invalidated prio to tc mappings set to TC0 */
1834         for (i = 1; i < TC_BITMASK + 1; i++) {
1835                 int q = netdev_get_prio_tc_map(dev, i);
1836
1837                 tc = &dev->tc_to_txq[q];
1838                 if (tc->offset + tc->count > txq) {
1839                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1840                                 i, q);
1841                         netdev_set_prio_tc_map(dev, i, 0);
1842                 }
1843         }
1844 }
1845
1846 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/*
 * Update-side dereference of an RCU-protected XPS pointer; lockdep is told
 * that xps_map_mutex must be held by the caller.
 */
#define xmap_dereference(P)             \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1850
1851 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1852                                         int cpu, u16 index)
1853 {
1854         struct xps_map *map = NULL;
1855         int pos;
1856
1857         if (dev_maps)
1858                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1859
1860         for (pos = 0; map && pos < map->len; pos++) {
1861                 if (map->queues[pos] == index) {
1862                         if (map->len > 1) {
1863                                 map->queues[pos] = map->queues[--map->len];
1864                         } else {
1865                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1866                                 kfree_rcu(map, rcu);
1867                                 map = NULL;
1868                         }
1869                         break;
1870                 }
1871         }
1872
1873         return map;
1874 }
1875
1876 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1877 {
1878         struct xps_dev_maps *dev_maps;
1879         int cpu, i;
1880         bool active = false;
1881
1882         mutex_lock(&xps_map_mutex);
1883         dev_maps = xmap_dereference(dev->xps_maps);
1884
1885         if (!dev_maps)
1886                 goto out_no_maps;
1887
1888         for_each_possible_cpu(cpu) {
1889                 for (i = index; i < dev->num_tx_queues; i++) {
1890                         if (!remove_xps_queue(dev_maps, cpu, i))
1891                                 break;
1892                 }
1893                 if (i == dev->num_tx_queues)
1894                         active = true;
1895         }
1896
1897         if (!active) {
1898                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1899                 kfree_rcu(dev_maps, rcu);
1900         }
1901
1902         for (i = index; i < dev->num_tx_queues; i++)
1903                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1904                                              NUMA_NO_NODE);
1905
1906 out_no_maps:
1907         mutex_unlock(&xps_map_mutex);
1908 }
1909
1910 static struct xps_map *expand_xps_map(struct xps_map *map,
1911                                       int cpu, u16 index)
1912 {
1913         struct xps_map *new_map;
1914         int alloc_len = XPS_MIN_MAP_ALLOC;
1915         int i, pos;
1916
1917         for (pos = 0; map && pos < map->len; pos++) {
1918                 if (map->queues[pos] != index)
1919                         continue;
1920                 return map;
1921         }
1922
1923         /* Need to add queue to this CPU's existing map */
1924         if (map) {
1925                 if (pos < map->alloc_len)
1926                         return map;
1927
1928                 alloc_len = map->alloc_len * 2;
1929         }
1930
1931         /* Need to allocate new map to store queue on this CPU's map */
1932         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1933                                cpu_to_node(cpu));
1934         if (!new_map)
1935                 return NULL;
1936
1937         for (i = 0; i < pos; i++)
1938                 new_map->queues[i] = map->queues[i];
1939         new_map->alloc_len = alloc_len;
1940         new_map->len = pos;
1941
1942         return new_map;
1943 }
1944
/*
 * netif_set_xps_queue - map Tx queue @index to the CPUs in @mask
 * @dev: network device
 * @mask: CPUs that should use the queue; only online CPUs are considered
 * @index: Tx queue index
 *
 * Runs under xps_map_mutex.  A new xps_dev_maps is built (growing the
 * per-cpu maps through expand_xps_map() where needed), published with
 * rcu_assign_pointer(), and the replaced maps are released via kfree_rcu().
 * The queue is then removed from the maps of CPUs that are not both in
 * @mask and online, and the device map is dropped if nothing is left.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
        struct xps_map *map, *new_map;
        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
        /* -2: no CPU seen yet, -1: selected CPUs span several nodes */
        int cpu, numa_node_id = -2;
        bool active = false;

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps);

        /* allocate memory for queue storage */
        for_each_online_cpu(cpu) {
                if (!cpumask_test_cpu(cpu, mask))
                        continue;

                /* allocated lazily, on the first online CPU in @mask */
                if (!new_dev_maps)
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
                        mutex_unlock(&xps_map_mutex);
                        return -ENOMEM;
                }

                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;

                /* returns the old map when it has room, else a bigger copy */
                map = expand_xps_map(map, cpu, index);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
        }

        /* no online CPU in @mask: only removal from existing maps is left */
        if (!new_dev_maps)
                goto out_no_new_maps;

        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
                        /* add queue to CPU maps */
                        int pos = 0;

                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        /* append only if the queue is not listed already */
                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (numa_node_id == -2)
                                numa_node_id = cpu_to_node(cpu);
                        else if (numa_node_id != cpu_to_node(cpu))
                                numa_node_id = -1;
#endif
                } else if (dev_maps) {
                        /* fill in the new device map from the old device map */
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
                }

        }

        /* publish the new map */
        rcu_assign_pointer(dev->xps_maps, new_dev_maps);

        /* Cleanup old maps */
        if (dev_maps) {
                for_each_possible_cpu(cpu) {
                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        /* free only per-cpu maps that were replaced */
                        if (map && map != new_map)
                                kfree_rcu(map, rcu);
                }

                kfree_rcu(dev_maps, rcu);
        }

        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        /* update Tx queue numa node */
        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                     (numa_node_id >= 0) ? numa_node_id :
                                     NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes queue from unused CPUs */
        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
                        continue;

                if (remove_xps_queue(dev_maps, cpu, index))
                        active = true;
        }

        /* free map if not active */
        if (!active) {
                RCU_INIT_POINTER(dev->xps_maps, NULL);
                kfree_rcu(dev_maps, rcu);
        }

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for_each_possible_cpu(cpu) {
                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;
                /* maps shared with the old device map must stay allocated */
                if (new_map && new_map != map)
                        kfree(new_map);
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2069
2070 #endif
2071 /*
2072  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2073  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2074  */
2075 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2076 {
2077         int rc;
2078
2079         if (txq < 1 || txq > dev->num_tx_queues)
2080                 return -EINVAL;
2081
2082         if (dev->reg_state == NETREG_REGISTERED ||
2083             dev->reg_state == NETREG_UNREGISTERING) {
2084                 ASSERT_RTNL();
2085
2086                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2087                                                   txq);
2088                 if (rc)
2089                         return rc;
2090
2091                 if (dev->num_tc)
2092                         netif_setup_tc(dev, txq);
2093
2094                 if (txq < dev->real_num_tx_queues) {
2095                         qdisc_reset_all_tx_gt(dev, txq);
2096 #ifdef CONFIG_XPS
2097                         netif_reset_xps_queues_gt(dev, txq);
2098 #endif
2099                 }
2100         }
2101
2102         dev->real_num_tx_queues = txq;
2103         return 0;
2104 }
2105 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2106
2107 #ifdef CONFIG_SYSFS
2108 /**
2109  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2110  *      @dev: Network device
2111  *      @rxq: Actual number of RX queues
2112  *
2113  *      This must be called either with the rtnl_lock held or before
2114  *      registration of the net device.  Returns 0 on success, or a
2115  *      negative error code.  If called before registration, it always
2116  *      succeeds.
2117  */
2118 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2119 {
2120         int rc;
2121
2122         if (rxq < 1 || rxq > dev->num_rx_queues)
2123                 return -EINVAL;
2124
2125         if (dev->reg_state == NETREG_REGISTERED) {
2126                 ASSERT_RTNL();
2127
2128                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2129                                                   rxq);
2130                 if (rc)
2131                         return rc;
2132         }
2133
2134         dev->real_num_rx_queues = rxq;
2135         return 0;
2136 }
2137 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2138 #endif
2139
2140 /**
2141  * netif_get_num_default_rss_queues - default number of RSS queues
2142  *
2143  * This routine should set an upper limit on the number of RSS queues
2144  * used by default by multiqueue devices.
2145  */
2146 int netif_get_num_default_rss_queues(void)
2147 {
2148         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2149 }
2150 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2151
2152 static inline void __netif_reschedule(struct Qdisc *q)
2153 {
2154         struct softnet_data *sd;
2155         unsigned long flags;
2156
2157         local_irq_save(flags);
2158         sd = this_cpu_ptr(&softnet_data);
2159         q->next_sched = NULL;
2160         *sd->output_queue_tailp = q;
2161         sd->output_queue_tailp = &q->next_sched;
2162         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2163         local_irq_restore(flags);
2164 }
2165
2166 void __netif_schedule(struct Qdisc *q)
2167 {
2168         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2169                 __netif_reschedule(q);
2170 }
2171 EXPORT_SYMBOL(__netif_schedule);
2172
/*
 * Overlay for skb->cb (see get_kfree_skb_cb()) used to remember why an
 * skb queued by __dev_kfree_skb_irq() is being freed.
 */
struct dev_kfree_skb_cb {
        enum skb_free_reason reason;
};
2176
2177 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2178 {
2179         return (struct dev_kfree_skb_cb *)skb->cb;
2180 }
2181
2182 void netif_schedule_queue(struct netdev_queue *txq)
2183 {
2184         rcu_read_lock();
2185         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2186                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2187
2188                 __netif_schedule(q);
2189         }
2190         rcu_read_unlock();
2191 }
2192 EXPORT_SYMBOL(netif_schedule_queue);
2193
2194 /**
2195  *      netif_wake_subqueue - allow sending packets on subqueue
2196  *      @dev: network device
2197  *      @queue_index: sub queue index
2198  *
2199  * Resume individual transmit queue of a device with multiple transmit queues.
2200  */
2201 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2202 {
2203         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2204
2205         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2206                 struct Qdisc *q;
2207
2208                 rcu_read_lock();
2209                 q = rcu_dereference(txq->qdisc);
2210                 __netif_schedule(q);
2211                 rcu_read_unlock();
2212         }
2213 }
2214 EXPORT_SYMBOL(netif_wake_subqueue);
2215
2216 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2217 {
2218         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2219                 struct Qdisc *q;
2220
2221                 rcu_read_lock();
2222                 q = rcu_dereference(dev_queue->qdisc);
2223                 __netif_schedule(q);
2224                 rcu_read_unlock();
2225         }
2226 }
2227 EXPORT_SYMBOL(netif_tx_wake_queue);
2228
/*
 * Drop one reference on @skb and, if it was the last one, queue the skb on
 * this CPU's softnet completion queue and raise NET_TX_SOFTIRQ instead of
 * freeing it here.  @reason is stored in skb->cb for whoever processes the
 * completion queue.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
        unsigned long flags;

        if (likely(atomic_read(&skb->users) == 1)) {
                /* sole owner: skip the atomic decrement */
                smp_rmb();
                atomic_set(&skb->users, 0);
        } else if (likely(!atomic_dec_and_test(&skb->users))) {
                /* other references remain, nothing to free yet */
                return;
        }
        get_kfree_skb_cb(skb)->reason = reason;
        /* push onto the head of the per-cpu completion list with irqs off */
        local_irq_save(flags);
        skb->next = __this_cpu_read(softnet_data.completion_queue);
        __this_cpu_write(softnet_data.completion_queue, skb);
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2247
2248 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2249 {
2250         if (in_irq() || irqs_disabled())
2251                 __dev_kfree_skb_irq(skb, reason);
2252         else
2253                 dev_kfree_skb(skb);
2254 }
2255 EXPORT_SYMBOL(__dev_kfree_skb_any);
2256
2257
2258 /**
2259  * netif_device_detach - mark device as removed
2260  * @dev: network device
2261  *
2262  * Mark device as removed from system and therefore no longer available.
2263  */
2264 void netif_device_detach(struct net_device *dev)
2265 {
2266         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2267             netif_running(dev)) {
2268                 netif_tx_stop_all_queues(dev);
2269         }
2270 }
2271 EXPORT_SYMBOL(netif_device_detach);
2272
2273 /**
2274  * netif_device_attach - mark device as attached
2275  * @dev: network device
2276  *
2277  * Mark device as attached from system and restart if needed.
2278  */
2279 void netif_device_attach(struct net_device *dev)
2280 {
2281         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2282             netif_running(dev)) {
2283                 netif_tx_wake_all_queues(dev);
2284                 __netdev_watchdog_up(dev);
2285         }
2286 }
2287 EXPORT_SYMBOL(netif_device_attach);
2288
2289 static void skb_warn_bad_offload(const struct sk_buff *skb)
2290 {
2291         static const netdev_features_t null_features = 0;
2292         struct net_device *dev = skb->dev;
2293         const char *driver = "";
2294
2295         if (!net_ratelimit())
2296                 return;
2297
2298         if (dev && dev->dev.parent)
2299                 driver = dev_driver_string(dev->dev.parent);
2300
2301         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2302              "gso_type=%d ip_summed=%d\n",
2303              driver, dev ? &dev->features : &null_features,
2304              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2305              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2306              skb_shinfo(skb)->gso_type, skb->ip_summed);
2307 }
2308
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success with skb->ip_summed set to CHECKSUM_NONE, -EINVAL
 * if the skb has a non-zero gso_size, or the error returned by
 * __skb_linearize() / pskb_expand_head().  On error ip_summed is left
 * untouched.
 */
int skb_checksum_help(struct sk_buff *skb)
{
        __wsum csum;
        int ret = 0, offset;

        /* nothing to compute, just drop the CHECKSUM_COMPLETE marking */
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                goto out_set_summed;

        if (unlikely(skb_shinfo(skb)->gso_size)) {
                skb_warn_bad_offload(skb);
                return -EINVAL;
        }

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (skb_has_shared_frag(skb)) {
                ret = __skb_linearize(skb);
                if (ret)
                        goto out;
        }

        /* checksum everything from the checksum start to the end of the skb */
        offset = skb_checksum_start_offset(skb);
        BUG_ON(offset >= skb_headlen(skb));
        csum = skb_checksum(skb, offset, skb->len - offset, 0);

        /* offset now points at the checksum field, which must be in the head */
        offset += skb->csum_offset;
        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

        /* get a private copy of the head if the field is not writable */
        if (skb_cloned(skb) &&
            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
                if (ret)
                        goto out;
        }

        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
        skb->ip_summed = CHECKSUM_NONE;
out:
        return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
2356
2357 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2358 {
2359         unsigned int vlan_depth = skb->mac_len;
2360         __be16 type = skb->protocol;
2361
2362         /* Tunnel gso handlers can set protocol to ethernet. */
2363         if (type == htons(ETH_P_TEB)) {
2364                 struct ethhdr *eth;
2365
2366                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2367                         return 0;
2368
2369                 eth = (struct ethhdr *)skb_mac_header(skb);
2370                 type = eth->h_proto;
2371         }
2372
2373         /* if skb->protocol is 802.1Q/AD then the header should already be
2374          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2375          * ETH_HLEN otherwise
2376          */
2377         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2378                 if (vlan_depth) {
2379                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2380                                 return 0;
2381                         vlan_depth -= VLAN_HLEN;
2382                 } else {
2383                         vlan_depth = ETH_HLEN;
2384                 }
2385                 do {
2386                         struct vlan_hdr *vh;
2387
2388                         if (unlikely(!pskb_may_pull(skb,
2389                                                     vlan_depth + VLAN_HLEN)))
2390                                 return 0;
2391
2392                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2393                         type = vh->h_vlan_encapsulated_proto;
2394                         vlan_depth += VLAN_HLEN;
2395                 } while (type == htons(ETH_P_8021Q) ||
2396                          type == htons(ETH_P_8021AD));
2397         }
2398
2399         *depth = vlan_depth;
2400
2401         return type;
2402 }
2403
2404 /**
2405  *      skb_mac_gso_segment - mac layer segmentation handler.
2406  *      @skb: buffer to segment
2407  *      @features: features for the output path (see dev->features)
2408  */
2409 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2410                                     netdev_features_t features)
2411 {
2412         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2413         struct packet_offload *ptype;
2414         int vlan_depth = skb->mac_len;
2415         __be16 type = skb_network_protocol(skb, &vlan_depth);
2416
2417         if (unlikely(!type))
2418                 return ERR_PTR(-EINVAL);
2419
2420         __skb_pull(skb, vlan_depth);
2421
2422         rcu_read_lock();
2423         list_for_each_entry_rcu(ptype, &offload_base, list) {
2424                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2425                         segs = ptype->callbacks.gso_segment(skb, features);
2426                         break;
2427                 }
2428         }
2429         rcu_read_unlock();
2430
2431         __skb_push(skb, skb->data - skb_mac_header(skb));
2432
2433         return segs;
2434 }
2435 EXPORT_SYMBOL(skb_mac_gso_segment);
2436
2437
2438 /* openvswitch calls this on rx path, so we need a different check.
2439  */
2440 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2441 {
2442         if (tx_path)
2443                 return skb->ip_summed != CHECKSUM_PARTIAL;
2444         else
2445                 return skb->ip_summed == CHECKSUM_NONE;
2446 }
2447
2448 /**
2449  *      __skb_gso_segment - Perform segmentation on skb.
2450  *      @skb: buffer to segment
2451  *      @features: features for the output path (see dev->features)
2452  *      @tx_path: whether it is called in TX path
2453  *
2454  *      This function segments the given skb and returns a list of segments.
2455  *
2456  *      It may return NULL if the skb requires no segmentation.  This is
2457  *      only possible when GSO is used for verifying header integrity.
2458  */
2459 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2460                                   netdev_features_t features, bool tx_path)
2461 {
2462         if (unlikely(skb_needs_check(skb, tx_path))) {
2463                 int err;
2464
2465                 skb_warn_bad_offload(skb);
2466
2467                 err = skb_cow_head(skb, 0);
2468                 if (err < 0)
2469                         return ERR_PTR(err);
2470         }
2471
2472         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2473         SKB_GSO_CB(skb)->encap_level = 0;
2474
2475         skb_reset_mac_header(skb);
2476         skb_reset_mac_len(skb);
2477
2478         return skb_mac_gso_segment(skb, features);
2479 }
2480 EXPORT_SYMBOL(__skb_gso_segment);
2481
2482 /* Take action when hardware reception checksum errors are detected. */
2483 #ifdef CONFIG_BUG
2484 void netdev_rx_csum_fault(struct net_device *dev)
2485 {
2486         if (net_ratelimit()) {
2487                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2488                 dump_stack();
2489         }
2490 }
2491 EXPORT_SYMBOL(netdev_rx_csum_fault);
2492 #endif
2493
/* This check should go away as soon as we know that:
 * 1. An IOMMU is present and allows mapping all of memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb has a page fragment that @dev must not be handed:
 * a highmem page while the device lacks NETIF_F_HIGHDMA, or (when
 * PCI_DMA_BUS_IS_PHYS) a page beyond the parent device's DMA mask or a
 * parent without a DMA mask.  Returns 0 otherwise, and always when
 * CONFIG_HIGHMEM is off.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct device *parent;
        int n;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[n];

                        if (PageHighMem(skb_frag_page(frag)))
                                return 1;
                }
        }

        if (!(PCI_DMA_BUS_IS_PHYS))
                return 0;

        parent = dev->dev.parent;
        if (!parent)
                return 0;

        for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[n];
                dma_addr_t phys = page_to_phys(skb_frag_page(frag));

                if (!parent->dma_mask ||
                    phys + PAGE_SIZE - 1 > *parent->dma_mask)
                        return 1;
        }
#endif
        return 0;
}
2526
2527 /* If MPLS offload request, verify we are testing hardware MPLS features
2528  * instead of standard features for the netdev.
2529  */
2530 #ifdef CONFIG_NET_MPLS_GSO
2531 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2532                                            netdev_features_t features,
2533                                            __be16 type)
2534 {
2535         if (eth_p_mpls(type))
2536                 features &= skb->dev->mpls_features;
2537
2538         return features;
2539 }
2540 #else
2541 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2542                                            netdev_features_t features,
2543                                            __be16 type)
2544 {
2545         return features;
2546 }
2547 #endif
2548
2549 static netdev_features_t harmonize_features(struct sk_buff *skb,
2550         netdev_features_t features)
2551 {
2552         int tmp;
2553         __be16 type;
2554
2555         type = skb_network_protocol(skb, &tmp);
2556         features = net_mpls_features(skb, features, type);
2557
2558         if (skb->ip_summed != CHECKSUM_NONE &&
2559             !can_checksum_protocol(features, type)) {
2560                 features &= ~NETIF_F_ALL_CSUM;
2561         } else if (illegal_highdma(skb->dev, skb)) {
2562                 features &= ~NETIF_F_SG;
2563         }
2564
2565         return features;
2566 }
2567
2568 netdev_features_t netif_skb_features(struct sk_buff *skb)
2569 {
2570         const struct net_device *dev = skb->dev;
2571         netdev_features_t features = dev->features;
2572         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2573         __be16 protocol = skb->protocol;
2574
2575         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2576                 features &= ~NETIF_F_GSO_MASK;
2577
2578         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2579                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2580                 protocol = veh->h_vlan_encapsulated_proto;
2581         } else if (!vlan_tx_tag_present(skb)) {
2582                 return harmonize_features(skb, features);
2583         }
2584
2585         features = netdev_intersect_features(features,
2586                                              dev->vlan_features |
2587                                              NETIF_F_HW_VLAN_CTAG_TX |
2588                                              NETIF_F_HW_VLAN_STAG_TX);
2589
2590         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2591                 features = netdev_intersect_features(features,
2592                                                      NETIF_F_SG |
2593                                                      NETIF_F_HIGHDMA |
2594                                                      NETIF_F_FRAGLIST |
2595                                                      NETIF_F_GEN_CSUM |
2596                                                      NETIF_F_HW_VLAN_CTAG_TX |
2597                                                      NETIF_F_HW_VLAN_STAG_TX);
2598
2599         return harmonize_features(skb, features);
2600 }
2601 EXPORT_SYMBOL(netif_skb_features);
2602
2603 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2604                     struct netdev_queue *txq, bool more)
2605 {
2606         unsigned int len;
2607         int rc;
2608
2609         if (!list_empty(&ptype_all))
2610                 dev_queue_xmit_nit(skb, dev);
2611
2612         len = skb->len;
2613         trace_net_dev_start_xmit(skb, dev);
2614         rc = netdev_start_xmit(skb, dev, txq, more);
2615         trace_net_dev_xmit(skb, rc, dev, len);
2616
2617         return rc;
2618 }
2619
2620 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2621                                     struct netdev_queue *txq, int *ret)
2622 {
2623         struct sk_buff *skb = first;
2624         int rc = NETDEV_TX_OK;
2625
2626         while (skb) {
2627                 struct sk_buff *next = skb->next;
2628
2629                 skb->next = NULL;
2630                 rc = xmit_one(skb, dev, txq, next != NULL);
2631                 if (unlikely(!dev_xmit_complete(rc))) {
2632                         skb->next = next;
2633                         goto out;
2634                 }
2635
2636                 skb = next;
2637                 if (netif_xmit_stopped(txq) && skb) {
2638                         rc = NETDEV_TX_BUSY;
2639                         break;
2640                 }
2641         }
2642
2643 out:
2644         *ret = rc;
2645         return skb;
2646 }
2647
2648 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2649                                           netdev_features_t features)
2650 {
2651         if (vlan_tx_tag_present(skb) &&
2652             !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2653                 skb = __vlan_put_tag(skb, skb->vlan_proto,
2654                                      vlan_tx_tag_get(skb));
2655                 if (skb)
2656                         skb->vlan_tci = 0;
2657         }
2658         return skb;
2659 }
2660
2661 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2662 {
2663         netdev_features_t features;
2664
2665         if (skb->next)
2666                 return skb;
2667
2668         features = netif_skb_features(skb);
2669         skb = validate_xmit_vlan(skb, features);
2670         if (unlikely(!skb))
2671                 goto out_null;
2672
2673         /* If encapsulation offload request, verify we are testing
2674          * hardware encapsulation features instead of standard
2675          * features for the netdev
2676          */
2677         if (skb->encapsulation)
2678                 features &= dev->hw_enc_features;
2679
2680         if (netif_needs_gso(dev, skb, features)) {
2681                 struct sk_buff *segs;
2682
2683                 segs = skb_gso_segment(skb, features);
2684                 if (IS_ERR(segs)) {
2685                         segs = NULL;
2686                 } else if (segs) {
2687                         consume_skb(skb);
2688                         skb = segs;
2689                 }
2690         } else {
2691                 if (skb_needs_linearize(skb, features) &&
2692                     __skb_linearize(skb))
2693                         goto out_kfree_skb;
2694
2695                 /* If packet is not checksummed and device does not
2696                  * support checksumming for this protocol, complete
2697                  * checksumming here.
2698                  */
2699                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2700                         if (skb->encapsulation)
2701                                 skb_set_inner_transport_header(skb,
2702                                                                skb_checksum_start_offset(skb));
2703                         else
2704                                 skb_set_transport_header(skb,
2705                                                          skb_checksum_start_offset(skb));
2706                         if (!(features & NETIF_F_ALL_CSUM) &&
2707                             skb_checksum_help(skb))
2708                                 goto out_kfree_skb;
2709                 }
2710         }
2711
2712         return skb;
2713
2714 out_kfree_skb:
2715         kfree_skb(skb);
2716 out_null:
2717         return NULL;
2718 }
2719
2720 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2721 {
2722         struct sk_buff *next, *head = NULL, *tail;
2723
2724         for (; skb != NULL; skb = next) {
2725                 next = skb->next;
2726                 skb->next = NULL;
2727
2728                 /* in case skb wont be segmented, point to itself */
2729                 skb->prev = skb;
2730
2731                 skb = validate_xmit_skb(skb, dev);
2732                 if (!skb)
2733                         continue;
2734
2735                 if (!head)
2736                         head = skb;
2737                 else
2738                         tail->next = skb;
2739                 /* If skb was segmented, skb->prev points to
2740                  * the last segment. If not, it still contains skb.
2741                  */
2742                 tail = skb->prev;
2743         }
2744         return head;
2745 }
2746
2747 static void qdisc_pkt_len_init(struct sk_buff *skb)
2748 {
2749         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2750
2751         qdisc_skb_cb(skb)->pkt_len = skb->len;
2752
2753         /* To get more precise estimation of bytes sent on wire,
2754          * we add to pkt_len the headers size of all segments
2755          */
2756         if (shinfo->gso_size)  {
2757                 unsigned int hdr_len;
2758                 u16 gso_segs = shinfo->gso_segs;
2759
2760                 /* mac layer + network layer */
2761                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2762
2763                 /* + transport layer */
2764                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2765                         hdr_len += tcp_hdrlen(skb);
2766                 else
2767                         hdr_len += sizeof(struct udphdr);
2768
2769                 if (shinfo->gso_type & SKB_GSO_DODGY)
2770                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2771                                                 shinfo->gso_size);
2772
2773                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2774         }
2775 }
2776
/* Send @skb through qdisc @q of @txq on @dev.
 *
 * Under the qdisc root lock one of three things happens:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc has TCQ_F_CAN_BYPASS, is empty and could be marked running:
 *    the skb is transmitted directly with sch_direct_xmit() and
 *    NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is enqueued (result masked with NET_XMIT_MASK) and
 *    the qdisc is run if it was not already running.
 *
 * q->busylock is taken before the root lock only when the qdisc was seen
 * running on entry; it is dropped either just before __qdisc_run() or at
 * the end of the function, never both ("contended" is cleared on release).
 */
2777 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2778                                  struct net_device *dev,
2779                                  struct netdev_queue *txq)
2780 {
2781         spinlock_t *root_lock = qdisc_lock(q);
2782         bool contended;
2783         int rc;
2784
2785         qdisc_pkt_len_init(skb);
2786         qdisc_calculate_pkt_len(skb, q);
2787         /*
2788          * Heuristic to force contended enqueues to serialize on a
2789          * separate lock before trying to get qdisc main lock.
2790          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2791          * often and dequeue packets faster.
2792          */
2793         contended = qdisc_is_running(q);
2794         if (unlikely(contended))
2795                 spin_lock(&q->busylock);
2796
2797         spin_lock(root_lock);
2798         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2799                 kfree_skb(skb);
2800                 rc = NET_XMIT_DROP;
2801         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2802                    qdisc_run_begin(q)) {
2803                 /*
2804                  * This is a work-conserving queue; there are no old skbs
2805                  * waiting to be sent out; and the qdisc is not running -
2806                  * xmit the skb directly.
2807                  */
2808
2809                 qdisc_bstats_update(q, skb);
2810
                     /* A non-zero result from sch_direct_xmit() leads to
                      * __qdisc_run(); otherwise the running state taken
                      * above is released again.
                      */
2811                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2812                         if (unlikely(contended)) {
2813                                 spin_unlock(&q->busylock);
2814                                 contended = false;
2815                         }
2816                         __qdisc_run(q);
2817                 } else
2818                         qdisc_run_end(q);
2819
2820                 rc = NET_XMIT_SUCCESS;
2821         } else {
2822                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2823                 if (qdisc_run_begin(q)) {
2824                         if (unlikely(contended)) {
2825                                 spin_unlock(&q->busylock);
2826                                 contended = false;
2827                         }
2828                         __qdisc_run(q);
2829                 }
2830         }
2831         spin_unlock(root_lock);
             /* busylock still held only if it was not released above */
2832         if (unlikely(contended))
2833                 spin_unlock(&q->busylock);
2834         return rc;
2835 }
2836
2837 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2838 static void skb_update_prio(struct sk_buff *skb)
2839 {
2840         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2841
2842         if (!skb->priority && skb->sk && map) {
2843                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2844
2845                 if (prioidx < map->priomap_len)
2846                         skb->priority = map->priomap[prioidx];
2847         }
2848 }
2849 #else
2850 #define skb_update_prio(skb)
2851 #endif
2852
/* Per-CPU nesting depth of dev_hard_start_xmit() calls made from the
 * queue-less path of __dev_queue_xmit(): incremented before the call and
 * decremented after it.  Once the depth is above RECURSION_LIMIT the
 * packet is not transmitted and a "Dead loop" message is logged instead.
 */
2853 static DEFINE_PER_CPU(int, xmit_recursion);
2854 #define RECURSION_LIMIT 10
2855
2856 /**
2857  *      dev_loopback_xmit - loop back @skb
2858  *      @skb: buffer to transmit
2859  */
2860 int dev_loopback_xmit(struct sk_buff *skb)
2861 {
2862         skb_reset_mac_header(skb);
2863         __skb_pull(skb, skb_network_offset(skb));
2864         skb->pkt_type = PACKET_LOOPBACK;
2865         skb->ip_summed = CHECKSUM_UNNECESSARY;
2866         WARN_ON(!skb_dst(skb));
2867         skb_dst_force(skb);
2868         netif_rx_ni(skb);
2869         return 0;
2870 }
2871 EXPORT_SYMBOL(dev_loopback_xmit);
2872
2873 /**
2874  *      __dev_queue_xmit - transmit a buffer
2875  *      @skb: buffer to transmit
2876  *      @accel_priv: private data used for L2 forwarding offload
2877  *
2878  *      Queue a buffer for transmission to a network device. The caller must
2879  *      have set the device and priority and built the buffer before calling
2880  *      this function. The function can be called from an interrupt.
2881  *
2882  *      A negative errno code is returned on a failure. A success does not
2883  *      guarantee the frame will be transmitted as it may be dropped due
2884  *      to congestion or traffic shaping.
2885  *
2886  * -----------------------------------------------------------------------------------
2887  *      Note that this function can also return errors from the queue
2888  *      disciplines, including NET_XMIT_DROP, which is a positive value.  So,
2889  *      errors can also be positive.
2890  *
2891  *      Regardless of the return value, the skb is consumed, so it is currently
2892  *      difficult to retry a send to this method.  (You can bump the ref count
2893  *      before sending to hold a reference for retry if you are careful.)
2894  *
2895  *      When calling this method, interrupts MUST be enabled.  This is because
2896  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2897  *          --BLG
2898  */
2899 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2900 {
2901         struct net_device *dev = skb->dev;
2902         struct netdev_queue *txq;
2903         struct Qdisc *q;
             /* Returned if validate_xmit_skb() fails on the queue-less path */
2904         int rc = -ENOMEM;
2905
2906         skb_reset_mac_header(skb);
2907
2908         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2909                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2910
2911         /* Disable soft irqs for various locks below. Also
2912          * stops preemption for RCU.
2913          */
2914         rcu_read_lock_bh();
2915
2916         skb_update_prio(skb);
2917
2918         /* If device/qdisc don't need skb->dst, release it right now while
2919          * it's hot in this cpu cache.
2920          */
2921         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2922                 skb_dst_drop(skb);
2923         else
2924                 skb_dst_force(skb);
2925
2926         txq = netdev_pick_tx(dev, skb, accel_priv);
2927         q = rcu_dereference_bh(txq->qdisc);
2928
2929 #ifdef CONFIG_NET_CLS_ACT
2930         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2931 #endif
2932         trace_net_dev_queue(skb);
             /* A qdisc with an enqueue operation takes the packet from here */
2933         if (q->enqueue) {
2934                 rc = __dev_xmit_skb(skb, q, dev, txq);
2935                 goto out;
2936         }
2937
2938         /* The device has no queue. Common case for software devices:
2939            loopback, all the sorts of tunnels...
2940
2941            Really, it is unlikely that netif_tx_lock protection is necessary
2942            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2943            counters.)
2944            However, it is possible that they rely on the protection
2945            made by us here.
2946
2947            Check this and shoot the lock. It is not prone to deadlocks.
2948            Either shoot the noqueue qdisc, it is even simpler 8)
2949          */
2950         if (dev->flags & IFF_UP) {
2951                 int cpu = smp_processor_id(); /* ok because BHs are off */
2952
                     /* This CPU already owning the tx lock means we were
                      * re-entered from within a transmit on this queue.
                      */
2953                 if (txq->xmit_lock_owner != cpu) {
2954
2955                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2956                                 goto recursion_alert;
2957
2958                         skb = validate_xmit_skb(skb, dev);
2959                         if (!skb)
2960                                 goto drop;
2961
2962                         HARD_TX_LOCK(dev, txq, cpu);
2963
2964                         if (!netif_xmit_stopped(txq)) {
2965                                 __this_cpu_inc(xmit_recursion);
2966                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2967                                 __this_cpu_dec(xmit_recursion);
2968                                 if (dev_xmit_complete(rc)) {
2969                                         HARD_TX_UNLOCK(dev, txq);
2970                                         goto out;
2971                                 }
2972                         }
2973                         HARD_TX_UNLOCK(dev, txq);
2974                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2975                                              dev->name);
2976                 } else {
2977                         /* Recursion is detected! It is possible,
2978                          * unfortunately
2979                          */
2980 recursion_alert:
2981                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2982                                              dev->name);
2983                 }
2984         }
2985
             /* Device down, queue stopped, incomplete transmit or recursion:
              * all end up here and drop whatever is left of the skb list.
              */
2986         rc = -ENETDOWN;
2987 drop:
2988         rcu_read_unlock_bh();
2989
2990         atomic_long_inc(&dev->tx_dropped);
2991         kfree_skb_list(skb);
2992         return rc;
2993 out:
2994         rcu_read_unlock_bh();
2995         return rc;
2996 }
2997
2998 int dev_queue_xmit(struct sk_buff *skb)
2999 {
3000         return __dev_queue_xmit(skb, NULL);
3001 }
3002 EXPORT_SYMBOL(dev_queue_xmit);
3003
/**
 *      dev_queue_xmit_accel - transmit a buffer with acceleration data
 *      @skb: buffer to transmit
 *      @accel_priv: private data passed through to __dev_queue_xmit()
 *
 *      Returns the result of __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
        int rc = __dev_queue_xmit(skb, accel_priv);

        return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3009
3010
3011 /*=======================================================================
3012                         Receiver routines
3013   =======================================================================*/
3014
/* Limit for the length of a per-CPU input_pkt_queue: enqueue_to_backlog()
 * drops a packet when the queue is already longer than this value, and
 * skb_flow_limit() only becomes active above half of it.
 */
3015 int netdev_max_backlog __read_mostly = 1000;
3016 EXPORT_SYMBOL(netdev_max_backlog);
3017
/* Handed to net_timestamp_check() by netif_rx_internal() before queueing. */
3018 int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): the next two are not used in this part of the file;
 * presumably NAPI polling budget and weight - confirm at their users.
 */
3019 int netdev_budget __read_mostly = 300;
3020 int weight_p __read_mostly = 64;            /* old backlog weight */
3021
3022 /* Called with irq disabled */
3023 static inline void ____napi_schedule(struct softnet_data *sd,
3024                                      struct napi_struct *napi)
3025 {
3026         list_add_tail(&napi->poll_list, &sd->poll_list);
3027         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3028 }
3029
3030 #ifdef CONFIG_RPS
3031
3032 /* One global table that all flow-based protocols share.
      * Read under RCU by get_rps_cpu() to find the CPU wanted for a flow hash.
      */
3033 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3034 EXPORT_SYMBOL(rps_sock_flow_table);
3035
/* Static key tested by netif_rx_internal(); the RPS CPU selection is only
 * performed while it is enabled.
 */
3036 struct static_key rps_needed __read_mostly;
3037
/* Record @next_cpu as the CPU handling the flow described by @rflow.
 *
 * For a real CPU (not RPS_NO_CPU) and with CONFIG_RFS_ACCEL, the flow may
 * first be steered to another hardware queue: this is attempted only when
 * the skb has a recorded rx queue, the device has an rx_cpu_rmap and
 * NETIF_F_NTUPLE, the queue mapped to @next_cpu differs from the skb's
 * queue and that queue has a flow table.  If ndo_rx_flow_steer() succeeds,
 * the flow entry of the new queue's table takes over (its filter id is the
 * driver's return value) and the old entry's filter is reset when it
 * matches.  Afterwards last_qtail is set to the current input_queue_head
 * of @next_cpu.
 *
 * Returns the flow entry now in use, with ->cpu set to @next_cpu.
 */
3038 static struct rps_dev_flow *
3039 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3040             struct rps_dev_flow *rflow, u16 next_cpu)
3041 {
3042         if (next_cpu != RPS_NO_CPU) {
3043 #ifdef CONFIG_RFS_ACCEL
3044                 struct netdev_rx_queue *rxqueue;
3045                 struct rps_dev_flow_table *flow_table;
3046                 struct rps_dev_flow *old_rflow;
3047                 u32 flow_id;
3048                 u16 rxq_index;
3049                 int rc;
3050
3051                 /* Should we steer this flow to a different hardware queue? */
3052                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3053                     !(dev->features & NETIF_F_NTUPLE))
3054                         goto out;
3055                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3056                 if (rxq_index == skb_get_rx_queue(skb))
3057                         goto out;
3058
3059                 rxqueue = dev->_rx + rxq_index;
3060                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3061                 if (!flow_table)
3062                         goto out;
3063                 flow_id = skb_get_hash(skb) & flow_table->mask;
3064                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3065                                                         rxq_index, flow_id);
                     /* Negative result: keep using the current entry */
3066                 if (rc < 0)
3067                         goto out;
3068                 old_rflow = rflow;
3069                 rflow = &flow_table->flows[flow_id];
3070                 rflow->filter = rc;
3071                 if (old_rflow->filter == rflow->filter)
3072                         old_rflow->filter = RPS_NO_FILTER;
3073         out:
3074 #endif
3075                 rflow->last_qtail =
3076                         per_cpu(softnet_data, next_cpu).input_queue_head;
3077         }
3078
3079         rflow->cpu = next_cpu;
3080         return rflow;
3081 }
3082
3083 /*
3084  * get_rps_cpu is called from netif_receive_skb and returns the target
3085  * CPU from the RPS map of the receiving queue for a given skb.
3086  * rcu_read_lock must be held on entry.
3087  */
3088 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3089                        struct rps_dev_flow **rflowp)
3090 {
3091         struct netdev_rx_queue *rxqueue;
3092         struct rps_map *map;
3093         struct rps_dev_flow_table *flow_table;
3094         struct rps_sock_flow_table *sock_flow_table;
3095         int cpu = -1;
3096         u16 tcpu;
3097         u32 hash;
3098
3099         if (skb_rx_queue_recorded(skb)) {
3100                 u16 index = skb_get_rx_queue(skb);
3101                 if (unlikely(index >= dev->real_num_rx_queues)) {
3102                         WARN_ONCE(dev->real_num_rx_queues > 1,
3103                                   "%s received packet on queue %u, but number "
3104                                   "of RX queues is %u\n",
3105                                   dev->name, index, dev->real_num_rx_queues);
3106                         goto done;
3107                 }
3108                 rxqueue = dev->_rx + index;
3109         } else
3110                 rxqueue = dev->_rx;
3111
3112         map = rcu_dereference(rxqueue->rps_map);
3113         if (map) {
3114                 if (map->len == 1 &&
3115                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3116                         tcpu = map->cpus[0];
3117                         if (cpu_online(tcpu))
3118                                 cpu = tcpu;
3119                         goto done;
3120                 }
3121         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3122                 goto done;
3123         }
3124
3125         skb_reset_network_header(skb);
3126         hash = skb_get_hash(skb);
3127         if (!hash)
3128                 goto done;
3129
3130         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3131         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3132         if (flow_table && sock_flow_table) {
3133                 u16 next_cpu;
3134                 struct rps_dev_flow *rflow;
3135
3136                 rflow = &flow_table->flows[hash & flow_table->mask];
3137                 tcpu = rflow->cpu;
3138
3139                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3140
3141                 /*
3142                  * If the desired CPU (where last recvmsg was done) is
3143                  * different from current CPU (one in the rx-queue flow
3144                  * table entry), switch if one of the following holds:
3145                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3146                  *   - Current CPU is offline.
3147                  *   - The current CPU's queue tail has advanced beyond the
3148                  *     last packet that was enqueued using this table entry.
3149                  *     This guarantees that all previous packets for the flow
3150                  *     have been dequeued, thus preserving in order delivery.
3151                  */
3152                 if (unlikely(tcpu != next_cpu) &&
3153                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3154                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3155                       rflow->last_qtail)) >= 0)) {
3156                         tcpu = next_cpu;
3157                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3158                 }
3159
3160                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3161                         *rflowp = rflow;
3162                         cpu = tcpu;
3163                         goto done;
3164                 }
3165         }
3166
3167         if (map) {
3168                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3169                 if (cpu_online(tcpu)) {
3170                         cpu = tcpu;
3171                         goto done;
3172                 }
3173         }
3174
3175 done:
3176         return cpu;
3177 }
3178
3179 #ifdef CONFIG_RFS_ACCEL
3180
3181 /**
3182  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3183  * @dev: Device on which the filter was set
3184  * @rxq_index: RX queue index
3185  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3186  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3187  *
3188  * Drivers that implement ndo_rx_flow_steer() should periodically call
3189  * this function for each installed filter and remove the filters for
3190  * which it returns %true.
3191  */
3192 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3193                          u32 flow_id, u16 filter_id)
3194 {
3195         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3196         struct rps_dev_flow_table *flow_table;
3197         struct rps_dev_flow *rflow;
3198         bool expire = true;
3199         int cpu;
3200
3201         rcu_read_lock();
3202         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3203         if (flow_table && flow_id <= flow_table->mask) {
3204                 rflow = &flow_table->flows[flow_id];
3205                 cpu = ACCESS_ONCE(rflow->cpu);
3206                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3207                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3208                            rflow->last_qtail) <
3209                      (int)(10 * flow_table->mask)))
3210                         expire = false;
3211         }
3212         rcu_read_unlock();
3213         return expire;
3214 }
3215 EXPORT_SYMBOL(rps_may_expire_flow);
3216
3217 #endif /* CONFIG_RFS_ACCEL */
3218
3219 /* Called from hardirq (IPI) context */
3220 static void rps_trigger_softirq(void *data)
3221 {
3222         struct softnet_data *sd = data;
3223
3224         ____napi_schedule(sd, &sd->backlog);
3225         sd->received_rps++;
3226 }
3227
3228 #endif /* CONFIG_RPS */
3229
/*
 * Check whether @sd belongs to another CPU.
 * If so, link it into this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

        if (sd == mysd)
                return 0;

        /* Push @sd on the front of our pending-IPI list */
        sd->rps_ipi_next = mysd->rps_ipi_list;
        mysd->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
3250
3251 #ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): not used in this part of the file; presumably the size of
 * the per-CPU flow limit bucket table - confirm where it is allocated.
 */
3252 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3253 #endif
3254
3255 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3256 {
3257 #ifdef CONFIG_NET_FLOW_LIMIT
3258         struct sd_flow_limit *fl;
3259         struct softnet_data *sd;
3260         unsigned int old_flow, new_flow;
3261
3262         if (qlen < (netdev_max_backlog >> 1))
3263                 return false;
3264
3265         sd = this_cpu_ptr(&softnet_data);
3266
3267         rcu_read_lock();
3268         fl = rcu_dereference(sd->flow_limit);
3269         if (fl) {
3270                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3271                 old_flow = fl->history[fl->history_head];
3272                 fl->history[fl->history_head] = new_flow;
3273
3274                 fl->history_head++;
3275                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3276
3277                 if (likely(fl->buckets[old_flow]))
3278                         fl->buckets[old_flow]--;
3279
3280                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3281                         fl->count++;
3282                         rcu_read_unlock();
3283                         return true;
3284                 }
3285         }
3286         rcu_read_unlock();
3287 #endif
3288         return false;
3289 }
3290
3291 /*
3292  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3293  * queue (may be a remote CPU queue).
3294  */
3295 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3296                               unsigned int *qtail)
3297 {
3298         struct softnet_data *sd;
3299         unsigned long flags;
3300         unsigned int qlen;
3301
3302         sd = &per_cpu(softnet_data, cpu);
3303
3304         local_irq_save(flags);
3305
3306         rps_lock(sd);
3307         qlen = skb_queue_len(&sd->input_pkt_queue);
3308         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3309                 if (skb_queue_len(&sd->input_pkt_queue)) {
3310 enqueue:
3311                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3312                         input_queue_tail_incr_save(sd, qtail);
3313                         rps_unlock(sd);
3314                         local_irq_restore(flags);
3315                         return NET_RX_SUCCESS;
3316                 }
3317
3318                 /* Schedule NAPI for backlog device
3319                  * We can use non atomic operation since we own the queue lock
3320                  */
3321                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3322                         if (!rps_ipi_queued(sd))
3323                                 ____napi_schedule(sd, &sd->backlog);
3324                 }
3325                 goto enqueue;
3326         }
3327
3328         sd->dropped++;
3329         rps_unlock(sd);
3330
3331         local_irq_restore(flags);
3332
3333         atomic_long_inc(&skb->dev->rx_dropped);
3334         kfree_skb(skb);
3335         return NET_RX_DROP;
3336 }
3337
3338 static int netif_rx_internal(struct sk_buff *skb)
3339 {
3340         int ret;
3341
3342         net_timestamp_check(netdev_tstamp_prequeue, skb);
3343
3344         trace_netif_rx(skb);
3345 #ifdef CONFIG_RPS
3346         if (static_key_false(&rps_needed)) {
3347                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3348                 int cpu;
3349
3350                 preempt_disable();
3351                 rcu_read_lock();
3352
3353                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3354                 if (cpu < 0)
3355                         cpu = smp_processor_id();
3356
3357                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3358
3359                 rcu_read_unlock();
3360                 preempt_enable();
3361         } else
3362 #endif
3363         {
3364                 unsigned int qtail;
3365                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3366                 put_cpu();
3367         }
3368         return ret;
3369 }
3370
/**
 *      netif_rx        -       post buffer to the network code
 *      @skb: buffer to post
 *
 *      Takes a packet from a device driver and queues it for processing
 *      by the upper (protocol) levels.  The call itself always succeeds;
 *      the buffer may still be dropped during processing for congestion
 *      control or by the protocol layers.
 *
 *      return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
        int ret;

        trace_netif_rx_entry(skb);

        ret = netif_rx_internal(skb);
        return ret;
}
EXPORT_SYMBOL(netif_rx);
3393
/* Variant of netif_rx(): queues @skb with preemption disabled and, if a
 * softirq is pending afterwards, runs do_softirq() before returning.
 *
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        trace_netif_rx_ni_entry(skb);

        preempt_disable();

        ret = netif_rx_internal(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3409
3410 static void net_tx_action(struct softirq_action *h)
3411 {
3412         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3413
3414         if (sd->completion_queue) {
3415                 struct sk_buff *clist;
3416
3417                 local_irq_disable();
3418                 clist = sd->completion_queue;
3419                 sd->completion_queue = NULL;
3420                 local_irq_enable();
3421
3422                 while (clist) {
3423                         struct sk_buff *skb = clist;
3424                         clist = clist->next;
3425
3426                         WARN_ON(atomic_read(&skb->users));
3427                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3428                                 trace_consume_skb(skb);
3429                         else
3430                                 trace_kfree_skb(skb, net_tx_action);
3431                         __kfree_skb(skb);
3432                 }
3433         }
3434
3435         if (sd->output_queue) {
3436                 struct Qdisc *head;
3437
3438                 local_irq_disable();
3439                 head = sd->output_queue;
3440                 sd->output_queue = NULL;
3441                 sd->output_queue_tailp = &sd->output_queue;
3442                 local_irq_enable();
3443
3444                 while (head) {
3445                         struct Qdisc *q = head;
3446                         spinlock_t *root_lock;
3447
3448                         head = head->next_sched;
3449
3450                         root_lock = qdisc_lock(q);
3451                         if (spin_trylock(root_lock)) {
3452                                 smp_mb__before_atomic();
3453                                 clear_bit(__QDISC_STATE_SCHED,
3454                                           &q->state);
3455                                 qdisc_run(q);
3456                                 spin_unlock(root_lock);
3457                         } else {
3458                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3459                                               &q->state)) {
3460                                         __netif_reschedule(q);
3461                                 } else {
3462                                         smp_mb__before_atomic();
3463                                         clear_bit(__QDISC_STATE_SCHED,
3464                                                   &q->state);
3465                                 }
3466                         }
3467                 }
3468         }
3469 }
3470
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 *
 * It is a function pointer taking a net_device and an address buffer.
 * The definition is only built when both bridging and ATM LANE are
 * configured (built-in or modular) and it is exported to GPL modules.
 * Nothing in this block assigns the pointer.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3478
3479 #ifdef CONFIG_NET_CLS_ACT
3480 /* TODO: Maybe we should just force sch_ingress to be compiled in
3481  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3482  * a compare and 2 stores extra right now if we dont have it on
3483  * but have CONFIG_NET_CLS_ACT
3484  * NOTE: This doesn't stop any functionality; if you dont have
3485  * the ingress scheduler, you just can't add policies on ingress.
3486  *
3487  */
3488 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3489 {
3490         struct net_device *dev = skb->dev;
3491         u32 ttl = G_TC_RTTL(skb->tc_verd);
3492         int result = TC_ACT_OK;
3493         struct Qdisc *q;
3494
3495         if (unlikely(MAX_RED_LOOP < ttl++)) {
3496                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3497                                      skb->skb_iif, dev->ifindex);
3498                 return TC_ACT_SHOT;
3499         }
3500
3501         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3502         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3503
3504         q = rcu_dereference(rxq->qdisc);
3505         if (q != &noop_qdisc) {
3506                 spin_lock(qdisc_lock(q));
3507                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3508                         result = qdisc_enqueue_root(skb, q);
3509                 spin_unlock(qdisc_lock(q));
3510         }
3511
3512         return result;
3513 }
3514
3515 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3516                                          struct packet_type **pt_prev,
3517                                          int *ret, struct net_device *orig_dev)
3518 {
3519         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3520
3521         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3522                 goto out;
3523
3524         if (*pt_prev) {
3525                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3526                 *pt_prev = NULL;
3527         }
3528
3529         switch (ing_filter(skb, rxq)) {
3530         case TC_ACT_SHOT:
3531         case TC_ACT_STOLEN:
3532                 kfree_skb(skb);
3533                 return NULL;
3534         }
3535
3536 out:
3537         skb->tc_verd = 0;
3538         return skb;
3539 }
3540 #endif
3541
3542 /**
3543  *      netdev_rx_handler_register - register receive handler
3544  *      @dev: device to register a handler for
3545  *      @rx_handler: receive handler to register
3546  *      @rx_handler_data: data pointer that is used by rx handler
3547  *
3548  *      Register a receive handler for a device. This handler will then be
3549  *      called from __netif_receive_skb. A negative errno code is returned
3550  *      on a failure.
3551  *
3552  *      The caller must hold the rtnl_mutex.
3553  *
3554  *      For a general description of rx_handler, see enum rx_handler_result.
3555  */
3556 int netdev_rx_handler_register(struct net_device *dev,
3557                                rx_handler_func_t *rx_handler,
3558                                void *rx_handler_data)
3559 {
3560         ASSERT_RTNL();
3561
3562         if (dev->rx_handler)
3563                 return -EBUSY;
3564
3565         /* Note: rx_handler_data must be set before rx_handler */
3566         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3567         rcu_assign_pointer(dev->rx_handler, rx_handler);
3568
3569         return 0;
3570 }
3571 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3572
/**
 *      netdev_rx_handler_unregister - unregister receive handler
 *      @dev: device to unregister a handler from
 *
 *      Unregister a receive handler from a device. The handler pointer is
 *      cleared first and the handler data only after synchronize_net()
 *      has returned.
 *
 *      The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        /* Step 1: stop new readers from picking up the handler */
        RCU_INIT_POINTER(dev->rx_handler, NULL);
        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
         * section has a guarantee to see a non NULL rx_handler_data
         * as well.
         */
        synchronize_net();
        /* Step 2: only now drop the data the handler may have been using */
        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3594
3595 /*
3596  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3597  * the special handling of PFMEMALLOC skbs.
3598  */
3599 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3600 {
3601         switch (skb->protocol) {
3602         case htons(ETH_P_ARP):
3603         case htons(ETH_P_IP):
3604         case htons(ETH_P_IPV6):
3605         case htons(ETH_P_8021Q):
3606         case htons(ETH_P_8021AD):
3607                 return true;
3608         default:
3609                 return false;
3610         }
3611 }
3612
/*
 * Core of the receive path for a single skb.
 *
 * Under rcu_read_lock() the skb goes, in order, through: VLAN untagging,
 * the ptype_all taps (skipped when @pfmemalloc is true), the ingress
 * classifier (CONFIG_NET_CLS_ACT only), vlan_do_receive(), the device's
 * rx_handler, and finally the ptype_base handlers matching skb->protocol.
 *
 * Delivery lags one handler behind through pt_prev: every earlier match is
 * handed to deliver_skb() and the last one is called directly through
 * pt_prev->func().
 *
 * Returns the result of the last delivery performed, NET_RX_SUCCESS when
 * the rx_handler consumed the skb, or NET_RX_DROP when the skb was dropped
 * here or no delivery ever took place.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *null_or_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!netdev_tstamp_prequeue, skb);

        trace_netif_receive_skb(skb);

        /* Remember the device the skb arrived on; skb->dev may change below */
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

        rcu_read_lock();

        /* Re-entered from the VLAN and rx_handler paths further down */
another_round:
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
                skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto unlock;
        }

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and bypass both the taps and handle_ing() */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* pfmemalloc skbs are never shown to the ptype_all taps */
        if (pfmemalloc)
                goto skip_taps;

        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto unlock;
ncls:
#endif

        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
                goto drop;

        if (vlan_tx_tag_present(skb)) {
                /* Flush the pending handler before the skb may be retargeted */
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto unlock;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        ret = NET_RX_SUCCESS;
                        goto unlock;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (unlikely(vlan_tx_tag_present(skb))) {
                if (vlan_tx_tag_get_id(skb))
                        skb->pkt_type = PACKET_OTHERHOST;
                /* Note: we might in the future use prio bits
                 * and set skb->priority like in vlan_do_receive()
                 * For the time being, just ignore Priority Code Point
                 */
                skb->vlan_tci = 0;
        }

        /* deliver only exact match when indicated */
        null_or_dev = deliver_exact ? skb->dev : NULL;

        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* Last handler is called directly instead of via deliver_skb() */
                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
                        goto drop;
                else
                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                /* Also the target of the "goto drop" statements above */
drop:
                atomic_long_inc(&skb->dev->rx_dropped);
                kfree_skb(skb);
                /* Jamal, now you will not be able to escape explaining
                 * to me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

unlock:
        rcu_read_unlock();
        return ret;
}
3755
3756 static int __netif_receive_skb(struct sk_buff *skb)
3757 {
3758         int ret;
3759
3760         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3761                 unsigned long pflags = current->flags;
3762
3763                 /*
3764                  * PFMEMALLOC skbs are special, they should
3765                  * - be delivered to SOCK_MEMALLOC sockets only
3766                  * - stay away from userspace
3767                  * - have bounded memory usage
3768                  *
3769                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3770                  * context down to all allocation sites.
3771                  */
3772                 current->flags |= PF_MEMALLOC;
3773                 ret = __netif_receive_skb_core(skb, true);
3774                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3775         } else
3776                 ret = __netif_receive_skb_core(skb, false);
3777
3778         return ret;
3779 }
3780
3781 static int netif_receive_skb_internal(struct sk_buff *skb)
3782 {
3783         net_timestamp_check(netdev_tstamp_prequeue, skb);
3784
3785         if (skb_defer_rx_timestamp(skb))
3786                 return NET_RX_SUCCESS;
3787
3788 #ifdef CONFIG_RPS
3789         if (static_key_false(&rps_needed)) {
3790                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3791                 int cpu, ret;
3792
3793                 rcu_read_lock();
3794
3795                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3796
3797                 if (cpu >= 0) {
3798                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3799                         rcu_read_unlock();
3800                         return ret;
3801                 }
3802                 rcu_read_unlock();
3803         }
3804 #endif
3805         return __netif_receive_skb(skb);
3806 }
3807
/**
 *      netif_receive_skb - process receive buffer from network
 *      @skb: buffer to process
 *
 *      Main entry point for processing received data.  The call always
 *      succeeds; the buffer may nevertheless be dropped during processing
 *      for congestion control or by the protocol layers.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int ret;

        trace_netif_receive_skb_entry(skb);
        ret = netif_receive_skb_internal(skb);

        return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3830
3831 /* Network device is going away, flush any packets still pending
3832  * Called with irqs disabled.
3833  */
3834 static void flush_backlog(void *arg)
3835 {
3836         struct net_device *dev = arg;
3837         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3838         struct sk_buff *skb, *tmp;
3839
3840         rps_lock(sd);
3841         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3842                 if (skb->dev == dev) {
3843                         __skb_unlink(skb, &sd->input_pkt_queue);
3844                         kfree_skb(skb);
3845                         input_queue_head_incr(sd);
3846                 }
3847         }
3848         rps_unlock(sd);
3849
3850         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3851                 if (skb->dev == dev) {
3852                         __skb_unlink(skb, &sd->process_queue);
3853                         kfree_skb(skb);
3854                         input_queue_head_incr(sd);
3855                 }
3856         }
3857 }
3858
3859 static int napi_gro_complete(struct sk_buff *skb)
3860 {
3861         struct packet_offload *ptype;
3862         __be16 type = skb->protocol;
3863         struct list_head *head = &offload_base;
3864         int err = -ENOENT;
3865
3866         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3867
3868         if (NAPI_GRO_CB(skb)->count == 1) {
3869                 skb_shinfo(skb)->gso_size = 0;
3870                 goto out;
3871         }
3872
3873         rcu_read_lock();
3874         list_for_each_entry_rcu(ptype, head, list) {
3875                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3876                         continue;
3877
3878                 err = ptype->callbacks.gro_complete(skb, 0);
3879                 break;
3880         }
3881         rcu_read_unlock();
3882
3883         if (err) {
3884                 WARN_ON(&ptype->list == head);
3885                 kfree_skb(skb);
3886                 return NET_RX_SUCCESS;
3887         }
3888
3889 out:
3890         return netif_receive_skb_internal(skb);
3891 }
3892
3893 /* napi->gro_list contains packets ordered by age.
3894  * youngest packets at the head of it.
3895  * Complete skbs in reverse order to reduce latencies.
3896  */
3897 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3898 {
3899         struct sk_buff *skb, *prev = NULL;
3900
3901         /* scan list and build reverse chain */
3902         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3903                 skb->prev = prev;
3904                 prev = skb;
3905         }
3906
3907         for (skb = prev; skb; skb = prev) {
3908                 skb->next = NULL;
3909
3910                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3911                         return;
3912
3913                 prev = skb->prev;
3914                 napi_gro_complete(skb);
3915                 napi->gro_count--;
3916         }
3917
3918         napi->gro_list = NULL;
3919 }
3920 EXPORT_SYMBOL(napi_gro_flush);
3921
3922 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3923 {
3924         struct sk_buff *p;
3925         unsigned int maclen = skb->dev->hard_header_len;
3926         u32 hash = skb_get_hash_raw(skb);
3927
3928         for (p = napi->gro_list; p; p = p->next) {
3929                 unsigned long diffs;
3930
3931                 NAPI_GRO_CB(p)->flush = 0;
3932
3933                 if (hash != skb_get_hash_raw(p)) {
3934                         NAPI_GRO_CB(p)->same_flow = 0;
3935                         continue;
3936                 }
3937
3938                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3939                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3940                 if (maclen == ETH_HLEN)
3941                         diffs |= compare_ether_header(skb_mac_header(p),
3942                                                       skb_mac_header(skb));
3943                 else if (!diffs)
3944                         diffs = memcmp(skb_mac_header(p),
3945                                        skb_mac_header(skb),
3946                                        maclen);
3947                 NAPI_GRO_CB(p)->same_flow = !diffs;
3948         }
3949 }
3950
3951 static void skb_gro_reset_offset(struct sk_buff *skb)
3952 {
3953         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3954         const skb_frag_t *frag0 = &pinfo->frags[0];
3955
3956         NAPI_GRO_CB(skb)->data_offset = 0;
3957         NAPI_GRO_CB(skb)->frag0 = NULL;
3958         NAPI_GRO_CB(skb)->frag0_len = 0;
3959
3960         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3961             pinfo->nr_frags &&
3962             !PageHighMem(skb_frag_page(frag0))) {
3963                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3964                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3965         }
3966 }
3967
3968 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3969 {
3970         struct skb_shared_info *pinfo = skb_shinfo(skb);
3971
3972         BUG_ON(skb->end - skb->tail < grow);
3973
3974         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3975
3976         skb->data_len -= grow;
3977         skb->tail += grow;
3978
3979         pinfo->frags[0].page_offset += grow;
3980         skb_frag_size_sub(&pinfo->frags[0], grow);
3981
3982         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3983                 skb_frag_unref(skb, 0);
3984                 memmove(pinfo->frags, pinfo->frags + 1,
3985                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3986         }
3987 }
3988
/*
 * Try to merge @skb into one of the flows held on napi->gro_list.
 *
 * Returns:
 *   GRO_NORMAL       - GRO is off for the device, the skb is GSO, has a
 *                      frag list or a bad checksum, no offload handler with
 *                      gro_receive matches skb->protocol, or the handler
 *                      set the flush flag without marking same_flow.
 *   GRO_MERGED       - same_flow was set and the skb must be kept.
 *   GRO_MERGED_FREE  - same_flow was set and the ->free flag is set.
 *   GRO_HELD         - the skb was put at the head of napi->gro_list as a
 *                      new flow.
 *
 * For GRO_NORMAL and GRO_HELD any header bytes consumed from frag0 beyond
 * the linear area are pulled into the linear area before returning.
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_offload *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &offload_base;
        int same_flow;
        enum gro_result ret;
        int grow;

        if (!(skb->dev->features & NETIF_F_GRO))
                goto normal;

        if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
                goto normal;

        gro_list_prepare(napi, skb);

        rcu_read_lock();
        /* Only the first handler matching the protocol is tried */
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || !ptype->callbacks.gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                skb_reset_mac_len(skb);
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;
                NAPI_GRO_CB(skb)->udp_mark = 0;

                /* Setup for GRO checksum validation */
                switch (skb->ip_summed) {
                case CHECKSUM_COMPLETE:
                        NAPI_GRO_CB(skb)->csum = skb->csum;
                        NAPI_GRO_CB(skb)->csum_valid = 1;
                        NAPI_GRO_CB(skb)->csum_cnt = 0;
                        break;
                case CHECKSUM_UNNECESSARY:
                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
                        NAPI_GRO_CB(skb)->csum_valid = 0;
                        break;
                default:
                        NAPI_GRO_CB(skb)->csum_cnt = 0;
                        NAPI_GRO_CB(skb)->csum_valid = 0;
                }

                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The walk ended without a break: no handler for this protocol */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* A non-NULL pp points at the list link of a held skb to flush now */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush)
                goto normal;

        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
                struct sk_buff *nskb = napi->gro_list;

                /* locate the end of the list to select the 'oldest' flow */
                while (nskb->next) {
                        pp = &nskb->next;
                        nskb = *pp;
                }
                /* Unlink the oldest skb; gro_count is left unchanged because
                 * the new skb takes its place below.
                 */
                *pp = NULL;
                nskb->next = NULL;
                napi_gro_complete(nskb);
        } else {
                napi->gro_count++;
        }
        /* Start a new flow with this skb at the head of the list */
        NAPI_GRO_CB(skb)->count = 1;
        NAPI_GRO_CB(skb)->age = jiffies;
        NAPI_GRO_CB(skb)->last = skb;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /* Bytes parsed past the linear area must be copied in from frag0 */
        grow = skb_gro_offset(skb) - skb_headlen(skb);
        if (grow > 0)
                gro_pull_from_frag0(skb, grow);
ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
4094
4095 struct packet_offload *gro_find_receive_by_type(__be16 type)
4096 {
4097         struct list_head *offload_head = &offload_base;
4098         struct packet_offload *ptype;
4099
4100         list_for_each_entry_rcu(ptype, offload_head, list) {
4101                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4102                         continue;
4103                 return ptype;
4104         }
4105         return NULL;
4106 }
4107 EXPORT_SYMBOL(gro_find_receive_by_type);
4108
4109 struct packet_offload *gro_find_complete_by_type(__be16 type)
4110 {
4111         struct list_head *offload_head = &offload_base;
4112         struct packet_offload *ptype;
4113
4114         list_for_each_entry_rcu(ptype, offload_head, list) {
4115                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4116                         continue;
4117                 return ptype;
4118         }
4119         return NULL;
4120 }
4121 EXPORT_SYMBOL(gro_find_complete_by_type);
4122
4123 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4124 {
4125         switch (ret) {
4126         case GRO_NORMAL:
4127                 if (netif_receive_skb_internal(skb))
4128                         ret = GRO_DROP;
4129                 break;
4130
4131         case GRO_DROP:
4132                 kfree_skb(skb);
4133                 break;
4134
4135         case GRO_MERGED_FREE:
4136                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4137                         kmem_cache_free(skbuff_head_cache, skb);
4138                 else
4139                         __kfree_skb(skb);
4140                 break;
4141
4142         case GRO_HELD:
4143         case GRO_MERGED:
4144                 break;
4145         }
4146
4147         return ret;
4148 }
4149
4150 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4151 {
4152         trace_napi_gro_receive_entry(skb);
4153
4154         skb_gro_reset_offset(skb);
4155
4156         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4157 }
4158 EXPORT_SYMBOL(napi_gro_receive);
4159
4160 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4161 {
4162         if (unlikely(skb->pfmemalloc)) {
4163                 consume_skb(skb);
4164                 return;
4165         }
4166         __skb_pull(skb, skb_headlen(skb));
4167         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4168         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4169         skb->vlan_tci = 0;
4170         skb->dev = napi->dev;
4171         skb->skb_iif = 0;
4172         skb->encapsulation = 0;
4173         skb_shinfo(skb)->gso_type = 0;
4174         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4175
4176         napi->skb = skb;
4177 }
4178
4179 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4180 {
4181         struct sk_buff *skb = napi->skb;
4182
4183         if (!skb) {
4184                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4185                 napi->skb = skb;
4186         }
4187         return skb;
4188 }
4189 EXPORT_SYMBOL(napi_get_frags);
4190
4191 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4192                                       struct sk_buff *skb,
4193                                       gro_result_t ret)
4194 {
4195         switch (ret) {
4196         case GRO_NORMAL:
4197         case GRO_HELD:
4198                 __skb_push(skb, ETH_HLEN);
4199                 skb->protocol = eth_type_trans(skb, skb->dev);
4200                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4201                         ret = GRO_DROP;
4202                 break;
4203
4204         case GRO_DROP:
4205         case GRO_MERGED_FREE:
4206                 napi_reuse_skb(napi, skb);
4207                 break;
4208
4209         case GRO_MERGED:
4210                 break;
4211         }
4212
4213         return ret;
4214 }
4215
4216 /* Upper GRO stack assumes network header starts at gro_offset=0
4217  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4218  * We copy ethernet header into skb->data to have a common layout.
4219  */
4220 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4221 {
4222         struct sk_buff *skb = napi->skb;
4223         const struct ethhdr *eth;
4224         unsigned int hlen = sizeof(*eth);
4225
4226         napi->skb = NULL;
4227
4228         skb_reset_mac_header(skb);
4229         skb_gro_reset_offset(skb);
4230
4231         eth = skb_gro_header_fast(skb, 0);
4232         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4233                 eth = skb_gro_header_slow(skb, hlen, 0);
4234                 if (unlikely(!eth)) {
4235                         napi_reuse_skb(napi, skb);
4236                         return NULL;
4237                 }
4238         } else {
4239                 gro_pull_from_frag0(skb, hlen);
4240                 NAPI_GRO_CB(skb)->frag0 += hlen;
4241                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4242         }
4243         __skb_pull(skb, hlen);
4244
4245         /*
4246          * This works because the only protocols we care about don't require
4247          * special handling.
4248          * We'll fix it up properly in napi_frags_finish()
4249          */
4250         skb->protocol = eth->h_proto;
4251
4252         return skb;
4253 }
4254
4255 gro_result_t napi_gro_frags(struct napi_struct *napi)
4256 {
4257         struct sk_buff *skb = napi_frags_skb(napi);
4258
4259         if (!skb)
4260                 return GRO_DROP;
4261
4262         trace_napi_gro_frags_entry(skb);
4263
4264         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4265 }
4266 EXPORT_SYMBOL(napi_gro_frags);
4267
4268 /* Compute the checksum from gro_offset and return the folded value
4269  * after adding in any pseudo checksum.
4270  */
4271 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4272 {
4273         __wsum wsum;
4274         __sum16 sum;
4275
4276         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4277
4278         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4279         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4280         if (likely(!sum)) {
4281                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4282                     !skb->csum_complete_sw)
4283                         netdev_rx_csum_fault(skb->dev);
4284         }
4285
4286         NAPI_GRO_CB(skb)->csum = wsum;
4287         NAPI_GRO_CB(skb)->csum_valid = 1;
4288
4289         return sum;
4290 }
4291 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4292
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * With CONFIG_RPS the list at sd->rps_ipi_list is detached while irqs are
 * still off, then irqs are enabled and an async single-CPU function call
 * is issued for every listed softnet_data whose CPU is online.  Without
 * CONFIG_RPS, or when the list is empty, the function only enables irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remsd = sd->rps_ipi_list;

        if (remsd) {
                /* Detach the list before irqs are re-enabled */
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                while (remsd) {
                        /* Fetch the link before the entry is handed off */
                        struct softnet_data *next = remsd->rps_ipi_next;

                        if (cpu_online(remsd->cpu))
                                smp_call_function_single_async(remsd->cpu,
                                                           &remsd->csd);
                        remsd = next;
                }
        } else
#endif
                /* Body of the else above, or unconditional without RPS */
                local_irq_enable();
}
4320
4321 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4322 {
4323 #ifdef CONFIG_RPS
4324         return sd->rps_ipi_list != NULL;
4325 #else
4326         return false;
4327 #endif
4328 }
4329
4330 static int process_backlog(struct napi_struct *napi, int quota)
4331 {
4332         int work = 0;
4333         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4334
4335         /* Check if we have pending ipi, its better to send them now,
4336          * not waiting net_rx_action() end.
4337          */
4338         if (sd_has_rps_ipi_waiting(sd)) {
4339                 local_irq_disable();
4340                 net_rps_action_and_irq_enable(sd);
4341         }
4342
4343         napi->weight = weight_p;
4344         local_irq_disable();
4345         while (1) {
4346                 struct sk_buff *skb;
4347
4348                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4349                         local_irq_enable();
4350                         __netif_receive_skb(skb);
4351                         local_irq_disable();
4352                         input_queue_head_incr(sd);
4353                         if (++work >= quota) {
4354                                 local_irq_enable();
4355                                 return work;
4356                         }
4357                 }
4358
4359                 rps_lock(sd);
4360                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4361                         /*
4362                          * Inline a custom version of __napi_complete().
4363                          * only current cpu owns and manipulates this napi,
4364                          * and NAPI_STATE_SCHED is the only possible flag set
4365                          * on backlog.
4366                          * We can use a plain write instead of clear_bit(),
4367                          * and we dont need an smp_mb() memory barrier.
4368                          */
4369                         napi->state = 0;
4370                         rps_unlock(sd);
4371
4372                         break;
4373                 }
4374
4375                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4376                                            &sd->process_queue);
4377                 rps_unlock(sd);
4378         }
4379         local_irq_enable();
4380
4381         return work;
4382 }
4383
4384 /**
4385  * __napi_schedule - schedule for receive
4386  * @n: entry to schedule
4387  *
4388  * The entry's receive function will be scheduled to run.
4389  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4390  */
4391 void __napi_schedule(struct napi_struct *n)
4392 {
4393         unsigned long flags;
4394
4395         local_irq_save(flags);
4396         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4397         local_irq_restore(flags);
4398 }
4399 EXPORT_SYMBOL(__napi_schedule);
4400
4401 /**
4402  * __napi_schedule_irqoff - schedule for receive
4403  * @n: entry to schedule
4404  *
4405  * Variant of __napi_schedule() assuming hard irqs are masked
4406  */
4407 void __napi_schedule_irqoff(struct napi_struct *n)
4408 {
4409         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4410 }
4411 EXPORT_SYMBOL(__napi_schedule_irqoff);
4412
4413 void __napi_complete(struct napi_struct *n)
4414 {
4415         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4416
4417         list_del_init(&n->poll_list);
4418         smp_mb__before_atomic();
4419         clear_bit(NAPI_STATE_SCHED, &n->state);
4420 }
4421 EXPORT_SYMBOL(__napi_complete);
4422
4423 void napi_complete_done(struct napi_struct *n, int work_done)
4424 {
4425         unsigned long flags;
4426
4427         /*
4428          * don't let napi dequeue from the cpu poll list
4429          * just in case its running on a different cpu
4430          */
4431         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4432                 return;
4433
4434         if (n->gro_list) {
4435                 unsigned long timeout = 0;
4436
4437                 if (work_done)
4438                         timeout = n->dev->gro_flush_timeout;
4439
4440                 if (timeout)
4441                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4442                                       HRTIMER_MODE_REL_PINNED);
4443                 else
4444                         napi_gro_flush(n, false);
4445         }
4446         if (likely(list_empty(&n->poll_list))) {
4447                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4448         } else {
4449                 /* If n->poll_list is not empty, we need to mask irqs */
4450                 local_irq_save(flags);
4451                 __napi_complete(n);
4452                 local_irq_restore(flags);
4453         }
4454 }
4455 EXPORT_SYMBOL(napi_complete_done);
4456
4457 /* must be called under rcu_read_lock(), as we dont take a reference */
4458 struct napi_struct *napi_by_id(unsigned int napi_id)
4459 {
4460         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4461         struct napi_struct *napi;
4462
4463         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4464                 if (napi->napi_id == napi_id)
4465                         return napi;
4466
4467         return NULL;
4468 }
4469 EXPORT_SYMBOL_GPL(napi_by_id);
4470
4471 void napi_hash_add(struct napi_struct *napi)
4472 {
4473         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4474
4475                 spin_lock(&napi_hash_lock);
4476
4477                 /* 0 is not a valid id, we also skip an id that is taken
4478                  * we expect both events to be extremely rare
4479                  */
4480                 napi->napi_id = 0;
4481                 while (!napi->napi_id) {
4482                         napi->napi_id = ++napi_gen_id;
4483                         if (napi_by_id(napi->napi_id))
4484                                 napi->napi_id = 0;
4485                 }
4486
4487                 hlist_add_head_rcu(&napi->napi_hash_node,
4488                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4489
4490                 spin_unlock(&napi_hash_lock);
4491         }
4492 }
4493 EXPORT_SYMBOL_GPL(napi_hash_add);
4494
4495 /* Warning : caller is responsible to make sure rcu grace period
4496  * is respected before freeing memory containing @napi
4497  */
4498 void napi_hash_del(struct napi_struct *napi)
4499 {
4500         spin_lock(&napi_hash_lock);
4501
4502         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4503                 hlist_del_rcu(&napi->napi_hash_node);
4504
4505         spin_unlock(&napi_hash_lock);
4506 }
4507 EXPORT_SYMBOL_GPL(napi_hash_del);
4508
4509 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4510 {
4511         struct napi_struct *napi;
4512
4513         napi = container_of(timer, struct napi_struct, timer);
4514         if (napi->gro_list)
4515                 napi_schedule(napi);
4516
4517         return HRTIMER_NORESTART;
4518 }
4519
4520 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4521                     int (*poll)(struct napi_struct *, int), int weight)
4522 {
4523         INIT_LIST_HEAD(&napi->poll_list);
4524         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4525         napi->timer.function = napi_watchdog;
4526         napi->gro_count = 0;
4527         napi->gro_list = NULL;
4528         napi->skb = NULL;
4529         napi->poll = poll;
4530         if (weight > NAPI_POLL_WEIGHT)
4531                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4532                             weight, dev->name);
4533         napi->weight = weight;
4534         list_add(&napi->dev_list, &dev->napi_list);
4535         napi->dev = dev;
4536 #ifdef CONFIG_NETPOLL
4537         spin_lock_init(&napi->poll_lock);
4538         napi->poll_owner = -1;
4539 #endif
4540         set_bit(NAPI_STATE_SCHED, &napi->state);
4541 }
4542 EXPORT_SYMBOL(netif_napi_add);
4543
4544 void napi_disable(struct napi_struct *n)
4545 {
4546         might_sleep();
4547         set_bit(NAPI_STATE_DISABLE, &n->state);
4548
4549         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4550                 msleep(1);
4551
4552         hrtimer_cancel(&n->timer);
4553
4554         clear_bit(NAPI_STATE_DISABLE, &n->state);
4555 }
4556 EXPORT_SYMBOL(napi_disable);
4557
4558 void netif_napi_del(struct napi_struct *napi)
4559 {
4560         list_del_init(&napi->dev_list);
4561         napi_free_frags(napi);
4562
4563         kfree_skb_list(napi->gro_list);
4564         napi->gro_list = NULL;
4565         napi->gro_count = 0;
4566 }
4567 EXPORT_SYMBOL(netif_napi_del);
4568
4569 static void net_rx_action(struct softirq_action *h)
4570 {
4571         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4572         unsigned long time_limit = jiffies + 2;
4573         int budget = netdev_budget;
4574         LIST_HEAD(list);
4575         LIST_HEAD(repoll);
4576         void *have;
4577
4578         local_irq_disable();
4579         list_splice_init(&sd->poll_list, &list);
4580         local_irq_enable();
4581
4582         while (!list_empty(&list)) {
4583                 struct napi_struct *n;
4584                 int work, weight;
4585
4586                 /* If softirq window is exhausted then punt.
4587                  * Allow this to run for 2 jiffies since which will allow
4588                  * an average latency of 1.5/HZ.
4589                  */
4590                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4591                         goto softnet_break;
4592
4593
4594                 n = list_first_entry(&list, struct napi_struct, poll_list);
4595                 list_del_init(&n->poll_list);
4596
4597                 have = netpoll_poll_lock(n);
4598
4599                 weight = n->weight;
4600
4601                 /* This NAPI_STATE_SCHED test is for avoiding a race
4602                  * with netpoll's poll_napi().  Only the entity which
4603                  * obtains the lock and sees NAPI_STATE_SCHED set will
4604                  * actually make the ->poll() call.  Therefore we avoid
4605                  * accidentally calling ->poll() when NAPI is not scheduled.
4606                  */
4607                 work = 0;
4608                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4609                         work = n->poll(n, weight);
4610                         trace_napi_poll(n);
4611                 }
4612
4613                 WARN_ON_ONCE(work > weight);
4614
4615                 budget -= work;
4616
4617                 /* Drivers must not modify the NAPI state if they
4618                  * consume the entire weight.  In such cases this code
4619                  * still "owns" the NAPI instance and therefore can
4620                  * move the instance around on the list at-will.
4621                  */
4622                 if (unlikely(work == weight)) {
4623                         if (unlikely(napi_disable_pending(n))) {
4624                                 napi_complete(n);
4625                         } else {
4626                                 if (n->gro_list) {
4627                                         /* flush too old packets
4628                                          * If HZ < 1000, flush all packets.
4629                                          */
4630                                         napi_gro_flush(n, HZ >= 1000);
4631                                 }
4632                                 list_add_tail(&n->poll_list, &repoll);
4633                         }
4634                 }
4635
4636                 netpoll_poll_unlock(have);
4637         }
4638
4639         if (!sd_has_rps_ipi_waiting(sd) &&
4640             list_empty(&list) &&
4641             list_empty(&repoll))
4642                 return;
4643 out:
4644         local_irq_disable();
4645
4646         list_splice_tail_init(&sd->poll_list, &list);
4647         list_splice_tail(&repoll, &list);
4648         list_splice(&list, &sd->poll_list);
4649         if (!list_empty(&sd->poll_list))
4650                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4651
4652         net_rps_action_and_irq_enable(sd);
4653
4654         return;
4655
4656 softnet_break:
4657         sd->time_squeeze++;
4658         goto out;
4659 }
4660
4661 struct netdev_adjacent {
4662         struct net_device *dev;
4663
4664         /* upper master flag, there can only be one master device per list */
4665         bool master;
4666
4667         /* counter for the number of times this device was added to us */
4668         u16 ref_nr;
4669
4670         /* private field for the users */
4671         void *private;
4672
4673         struct list_head list;
4674         struct rcu_head rcu;
4675 };
4676
4677 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4678                                                  struct net_device *adj_dev,
4679                                                  struct list_head *adj_list)
4680 {
4681         struct netdev_adjacent *adj;
4682
4683         list_for_each_entry(adj, adj_list, list) {
4684                 if (adj->dev == adj_dev)
4685                         return adj;
4686         }
4687         return NULL;
4688 }
4689
4690 /**
4691  * netdev_has_upper_dev - Check if device is linked to an upper device
4692  * @dev: device
4693  * @upper_dev: upper device to check
4694  *
4695  * Find out if a device is linked to specified upper device and return true
4696  * in case it is. Note that this checks only immediate upper device,
4697  * not through a complete stack of devices. The caller must hold the RTNL lock.
4698  */
4699 bool netdev_has_upper_dev(struct net_device *dev,
4700                           struct net_device *upper_dev)
4701 {
4702         ASSERT_RTNL();
4703
4704         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4705 }
4706 EXPORT_SYMBOL(netdev_has_upper_dev);
4707
4708 /**
4709  * netdev_has_any_upper_dev - Check if device is linked to some device
4710  * @dev: device
4711  *
4712  * Find out if a device is linked to an upper device and return true in case
4713  * it is. The caller must hold the RTNL lock.
4714  */
4715 static bool netdev_has_any_upper_dev(struct net_device *dev)
4716 {
4717         ASSERT_RTNL();
4718
4719         return !list_empty(&dev->all_adj_list.upper);
4720 }
4721
4722 /**
4723  * netdev_master_upper_dev_get - Get master upper device
4724  * @dev: device
4725  *
4726  * Find a master upper device and return pointer to it or NULL in case
4727  * it's not there. The caller must hold the RTNL lock.
4728  */
4729 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4730 {
4731         struct netdev_adjacent *upper;
4732
4733         ASSERT_RTNL();
4734
4735         if (list_empty(&dev->adj_list.upper))
4736                 return NULL;
4737
4738         upper = list_first_entry(&dev->adj_list.upper,
4739                                  struct netdev_adjacent, list);
4740         if (likely(upper->master))
4741                 return upper->dev;
4742         return NULL;
4743 }
4744 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4745
4746 void *netdev_adjacent_get_private(struct list_head *adj_list)
4747 {
4748         struct netdev_adjacent *adj;
4749
4750         adj = list_entry(adj_list, struct netdev_adjacent, list);
4751
4752         return adj->private;
4753 }
4754 EXPORT_SYMBOL(netdev_adjacent_get_private);
4755
4756 /**
4757  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4758  * @dev: device
4759  * @iter: list_head ** of the current position
4760  *
4761  * Gets the next device from the dev's upper list, starting from iter
4762  * position. The caller must hold RCU read lock.
4763  */
4764 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4765                                                  struct list_head **iter)
4766 {
4767         struct netdev_adjacent *upper;
4768
4769         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4770
4771         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4772
4773         if (&upper->list == &dev->adj_list.upper)
4774                 return NULL;
4775
4776         *iter = &upper->list;
4777
4778         return upper->dev;
4779 }
4780 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4781
4782 /**
4783  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4784  * @dev: device
4785  * @iter: list_head ** of the current position
4786  *
4787  * Gets the next device from the dev's upper list, starting from iter
4788  * position. The caller must hold RCU read lock.
4789  */
4790 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4791                                                      struct list_head **iter)
4792 {
4793         struct netdev_adjacent *upper;
4794
4795         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4796
4797         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4798
4799         if (&upper->list == &dev->all_adj_list.upper)
4800                 return NULL;
4801
4802         *iter = &upper->list;
4803
4804         return upper->dev;
4805 }
4806 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4807
4808 /**
4809  * netdev_lower_get_next_private - Get the next ->private from the
4810  *                                 lower neighbour list
4811  * @dev: device
4812  * @iter: list_head ** of the current position
4813  *
4814  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4815  * list, starting from iter position. The caller must hold either hold the
4816  * RTNL lock or its own locking that guarantees that the neighbour lower
4817  * list will remain unchainged.
4818  */
4819 void *netdev_lower_get_next_private(struct net_device *dev,
4820                                     struct list_head **iter)
4821 {
4822         struct netdev_adjacent *lower;
4823
4824         lower = list_entry(*iter, struct netdev_adjacent, list);
4825
4826         if (&lower->list == &dev->adj_list.lower)
4827                 return NULL;
4828
4829         *iter = lower->list.next;
4830
4831         return lower->private;
4832 }
4833 EXPORT_SYMBOL(netdev_lower_get_next_private);
4834
4835 /**
4836  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4837  *                                     lower neighbour list, RCU
4838  *                                     variant
4839  * @dev: device
4840  * @iter: list_head ** of the current position
4841  *
4842  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4843  * list, starting from iter position. The caller must hold RCU read lock.
4844  */
4845 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4846                                         struct list_head **iter)
4847 {
4848         struct netdev_adjacent *lower;
4849
4850         WARN_ON_ONCE(!rcu_read_lock_held());
4851
4852         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4853
4854         if (&lower->list == &dev->adj_list.lower)
4855                 return NULL;
4856
4857         *iter = &lower->list;
4858
4859         return lower->private;
4860 }
4861 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4862
4863 /**
4864  * netdev_lower_get_next - Get the next device from the lower neighbour
4865  *                         list
4866  * @dev: device
4867  * @iter: list_head ** of the current position
4868  *
4869  * Gets the next netdev_adjacent from the dev's lower neighbour
4870  * list, starting from iter position. The caller must hold RTNL lock or
4871  * its own locking that guarantees that the neighbour lower
4872  * list will remain unchainged.
4873  */
4874 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4875 {
4876         struct netdev_adjacent *lower;
4877
4878         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4879
4880         if (&lower->list == &dev->adj_list.lower)
4881                 return NULL;
4882
4883         *iter = &lower->list;
4884
4885         return lower->dev;
4886 }
4887 EXPORT_SYMBOL(netdev_lower_get_next);
4888
4889 /**
4890  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4891  *                                     lower neighbour list, RCU
4892  *                                     variant
4893  * @dev: device
4894  *
4895  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4896  * list. The caller must hold RCU read lock.
4897  */
4898 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4899 {
4900         struct netdev_adjacent *lower;
4901
4902         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4903                         struct netdev_adjacent, list);
4904         if (lower)
4905                 return lower->private;
4906         return NULL;
4907 }
4908 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4909
4910 /**
4911  * netdev_master_upper_dev_get_rcu - Get master upper device
4912  * @dev: device
4913  *
4914  * Find a master upper device and return pointer to it or NULL in case
4915  * it's not there. The caller must hold the RCU read lock.
4916  */
4917 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4918 {
4919         struct netdev_adjacent *upper;
4920
4921         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4922                                        struct netdev_adjacent, list);
4923         if (upper && likely(upper->master))
4924                 return upper->dev;
4925         return NULL;
4926 }
4927 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4928
4929 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4930                               struct net_device *adj_dev,
4931                               struct list_head *dev_list)
4932 {
4933         char linkname[IFNAMSIZ+7];
4934         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4935                 "upper_%s" : "lower_%s", adj_dev->name);
4936         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4937                                  linkname);
4938 }
4939 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4940                                char *name,
4941                                struct list_head *dev_list)
4942 {
4943         char linkname[IFNAMSIZ+7];
4944         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4945                 "upper_%s" : "lower_%s", name);
4946         sysfs_remove_link(&(dev->dev.kobj), linkname);
4947 }
4948
4949 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4950                                                  struct net_device *adj_dev,
4951                                                  struct list_head *dev_list)
4952 {
4953         return (dev_list == &dev->adj_list.upper ||
4954                 dev_list == &dev->adj_list.lower) &&
4955                 net_eq(dev_net(dev), dev_net(adj_dev));
4956 }
4957
4958 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4959                                         struct net_device *adj_dev,
4960                                         struct list_head *dev_list,
4961                                         void *private, bool master)
4962 {
4963         struct netdev_adjacent *adj;
4964         int ret;
4965
4966         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4967
4968         if (adj) {
4969                 adj->ref_nr++;
4970                 return 0;
4971         }
4972
4973         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4974         if (!adj)
4975                 return -ENOMEM;
4976
4977         adj->dev = adj_dev;
4978         adj->master = master;
4979         adj->ref_nr = 1;
4980         adj->private = private;
4981         dev_hold(adj_dev);
4982
4983         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4984                  adj_dev->name, dev->name, adj_dev->name);
4985
4986         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4987                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4988                 if (ret)
4989                         goto free_adj;
4990         }
4991
4992         /* Ensure that master link is always the first item in list. */
4993         if (master) {
4994                 ret = sysfs_create_link(&(dev->dev.kobj),
4995                                         &(adj_dev->dev.kobj), "master");
4996                 if (ret)
4997                         goto remove_symlinks;
4998
4999                 list_add_rcu(&adj->list, dev_list);
5000         } else {
5001                 list_add_tail_rcu(&adj->list, dev_list);
5002         }
5003
5004         return 0;
5005
5006 remove_symlinks:
5007         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5008                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5009 free_adj:
5010         kfree(adj);
5011         dev_put(adj_dev);
5012
5013         return ret;
5014 }
5015
5016 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5017                                          struct net_device *adj_dev,
5018                                          struct list_head *dev_list)
5019 {
5020         struct netdev_adjacent *adj;
5021
5022         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5023
5024         if (!adj) {
5025                 pr_err("tried to remove device %s from %s\n",
5026                        dev->name, adj_dev->name);
5027                 BUG();
5028         }
5029
5030         if (adj->ref_nr > 1) {
5031                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5032                          adj->ref_nr-1);
5033                 adj->ref_nr--;
5034                 return;
5035         }
5036
5037         if (adj->master)
5038                 sysfs_remove_link(&(dev->dev.kobj), "master");
5039
5040         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5041                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5042
5043         list_del_rcu(&adj->list);
5044         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5045                  adj_dev->name, dev->name, adj_dev->name);
5046         dev_put(adj_dev);
5047         kfree_rcu(adj, rcu);
5048 }
5049
5050 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5051                                             struct net_device *upper_dev,
5052                                             struct list_head *up_list,
5053                                             struct list_head *down_list,
5054                                             void *private, bool master)
5055 {
5056         int ret;
5057
5058         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5059                                            master);
5060         if (ret)
5061                 return ret;
5062
5063         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5064                                            false);
5065         if (ret) {
5066                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5067                 return ret;
5068         }
5069
5070         return 0;
5071 }
5072
5073 static int __netdev_adjacent_dev_link(struct net_device *dev,
5074                                       struct net_device *upper_dev)
5075 {
5076         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5077                                                 &dev->all_adj_list.upper,
5078                                                 &upper_dev->all_adj_list.lower,
5079                                                 NULL, false);
5080 }
5081
/*
 * Undo __netdev_adjacent_dev_link_lists(): drop @upper_dev from @up_list
 * and then @dev from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction first, then the reverse link */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5090
5091 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5092                                          struct net_device *upper_dev)
5093 {
5094         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5095                                            &dev->all_adj_list.upper,
5096                                            &upper_dev->all_adj_list.lower);
5097 }
5098
5099 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5100                                                 struct net_device *upper_dev,
5101                                                 void *private, bool master)
5102 {
5103         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5104
5105         if (ret)
5106                 return ret;
5107
5108         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5109                                                &dev->adj_list.upper,
5110                                                &upper_dev->adj_list.lower,
5111                                                private, master);
5112         if (ret) {
5113                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5114                 return ret;
5115         }
5116
5117         return 0;
5118 }
5119
5120 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5121                                                    struct net_device *upper_dev)
5122 {
5123         __netdev_adjacent_dev_unlink(dev, upper_dev);
5124         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5125                                            &dev->adj_list.upper,
5126                                            &upper_dev->adj_list.lower);
5127 }
5128
5129 static int __netdev_upper_dev_link(struct net_device *dev,
5130                                    struct net_device *upper_dev, bool master,
5131                                    void *private)
5132 {
5133         struct netdev_adjacent *i, *j, *to_i, *to_j;
5134         int ret = 0;
5135
5136         ASSERT_RTNL();
5137
5138         if (dev == upper_dev)
5139                 return -EBUSY;
5140
5141         /* To prevent loops, check if dev is not upper device to upper_dev. */
5142         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5143                 return -EBUSY;
5144
5145         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5146                 return -EEXIST;
5147
5148         if (master && netdev_master_upper_dev_get(dev))
5149                 return -EBUSY;
5150
5151         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5152                                                    master);
5153         if (ret)
5154                 return ret;
5155
5156         /* Now that we linked these devs, make all the upper_dev's
5157          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5158          * versa, and don't forget the devices itself. All of these
5159          * links are non-neighbours.
5160          */
5161         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5162                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5163                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5164                                  i->dev->name, j->dev->name);
5165                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5166                         if (ret)
5167                                 goto rollback_mesh;
5168                 }
5169         }
5170
5171         /* add dev to every upper_dev's upper device */
5172         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5173                 pr_debug("linking %s's upper device %s with %s\n",
5174                          upper_dev->name, i->dev->name, dev->name);
5175                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5176                 if (ret)
5177                         goto rollback_upper_mesh;
5178         }
5179
5180         /* add upper_dev to every dev's lower device */
5181         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5182                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5183                          i->dev->name, upper_dev->name);
5184                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5185                 if (ret)
5186                         goto rollback_lower_mesh;
5187         }
5188
5189         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5190         return 0;
5191
5192 rollback_lower_mesh:
5193         to_i = i;
5194         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5195                 if (i == to_i)
5196                         break;
5197                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5198         }
5199
5200         i = NULL;
5201
5202 rollback_upper_mesh:
5203         to_i = i;
5204         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5205                 if (i == to_i)
5206                         break;
5207                 __netdev_adjacent_dev_unlink(dev, i->dev);
5208         }
5209
5210         i = j = NULL;
5211
5212 rollback_mesh:
5213         to_i = i;
5214         to_j = j;
5215         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5216                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5217                         if (i == to_i && j == to_j)
5218                                 break;
5219                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5220                 }
5221                 if (i == to_i)
5222                         break;
5223         }
5224
5225         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5226
5227         return ret;
5228 }
5229
5230 /**
5231  * netdev_upper_dev_link - Add a link to the upper device
5232  * @dev: device
5233  * @upper_dev: new upper device
5234  *
5235  * Adds a link to device which is upper to this one. The caller must hold
5236  * the RTNL lock. On a failure a negative errno code is returned.
5237  * On success the reference counts are adjusted and the function
5238  * returns zero.
5239  */
5240 int netdev_upper_dev_link(struct net_device *dev,
5241                           struct net_device *upper_dev)
5242 {
5243         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5244 }
5245 EXPORT_SYMBOL(netdev_upper_dev_link);
5246
5247 /**
5248  * netdev_master_upper_dev_link - Add a master link to the upper device
5249  * @dev: device
5250  * @upper_dev: new upper device
5251  *
5252  * Adds a link to device which is upper to this one. In this case, only
5253  * one master upper device can be linked, although other non-master devices
5254  * might be linked as well. The caller must hold the RTNL lock.
5255  * On a failure a negative errno code is returned. On success the reference
5256  * counts are adjusted and the function returns zero.
5257  */
5258 int netdev_master_upper_dev_link(struct net_device *dev,
5259                                  struct net_device *upper_dev)
5260 {
5261         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5262 }
5263 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5264
5265 int netdev_master_upper_dev_link_private(struct net_device *dev,
5266                                          struct net_device *upper_dev,
5267                                          void *private)
5268 {
5269         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5270 }
5271 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5272
5273 /**
5274  * netdev_upper_dev_unlink - Removes a link to upper device
5275  * @dev: device
5276  * @upper_dev: new upper device
5277  *
5278  * Removes a link to device which is upper to this one. The caller must hold
5279  * the RTNL lock.
5280  */
5281 void netdev_upper_dev_unlink(struct net_device *dev,
5282                              struct net_device *upper_dev)
5283 {
5284         struct netdev_adjacent *i, *j;
5285         ASSERT_RTNL();
5286
5287         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5288
5289         /* Here is the tricky part. We must remove all dev's lower
5290          * devices from all upper_dev's upper devices and vice
5291          * versa, to maintain the graph relationship.
5292          */
5293         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5294                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5295                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5296
5297         /* remove also the devices itself from lower/upper device
5298          * list
5299          */
5300         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5301                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5302
5303         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5304                 __netdev_adjacent_dev_unlink(dev, i->dev);
5305
5306         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5307 }
5308 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5309
5310 void netdev_adjacent_add_links(struct net_device *dev)
5311 {
5312         struct netdev_adjacent *iter;
5313
5314         struct net *net = dev_net(dev);
5315
5316         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5317                 if (!net_eq(net,dev_net(iter->dev)))
5318                         continue;
5319                 netdev_adjacent_sysfs_add(iter->dev, dev,
5320                                           &iter->dev->adj_list.lower);
5321                 netdev_adjacent_sysfs_add(dev, iter->dev,
5322                                           &dev->adj_list.upper);
5323         }
5324
5325         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5326                 if (!net_eq(net,dev_net(iter->dev)))
5327                         continue;
5328                 netdev_adjacent_sysfs_add(iter->dev, dev,
5329                                           &iter->dev->adj_list.upper);
5330                 netdev_adjacent_sysfs_add(dev, iter->dev,
5331                                           &dev->adj_list.lower);
5332         }
5333 }
5334
5335 void netdev_adjacent_del_links(struct net_device *dev)
5336 {
5337         struct netdev_adjacent *iter;
5338
5339         struct net *net = dev_net(dev);
5340
5341         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5342                 if (!net_eq(net,dev_net(iter->dev)))
5343                         continue;
5344                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5345                                           &iter->dev->adj_list.lower);
5346                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5347                                           &dev->adj_list.upper);
5348         }
5349
5350         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5351                 if (!net_eq(net,dev_net(iter->dev)))
5352                         continue;
5353                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5354                                           &iter->dev->adj_list.upper);
5355                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5356                                           &dev->adj_list.lower);
5357         }
5358 }
5359
5360 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5361 {
5362         struct netdev_adjacent *iter;
5363
5364         struct net *net = dev_net(dev);
5365
5366         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5367                 if (!net_eq(net,dev_net(iter->dev)))
5368                         continue;
5369                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5370                                           &iter->dev->adj_list.lower);
5371                 netdev_adjacent_sysfs_add(iter->dev, dev,
5372                                           &iter->dev->adj_list.lower);
5373         }
5374
5375         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5376                 if (!net_eq(net,dev_net(iter->dev)))
5377                         continue;
5378                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5379                                           &iter->dev->adj_list.upper);
5380                 netdev_adjacent_sysfs_add(iter->dev, dev,
5381                                           &iter->dev->adj_list.upper);
5382         }
5383 }
5384
5385 void *netdev_lower_dev_get_private(struct net_device *dev,
5386                                    struct net_device *lower_dev)
5387 {
5388         struct netdev_adjacent *lower;
5389
5390         if (!lower_dev)
5391                 return NULL;
5392         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5393         if (!lower)
5394                 return NULL;
5395
5396         return lower->private;
5397 }
5398 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5399
5400
5401 int dev_get_nest_level(struct net_device *dev,
5402                        bool (*type_check)(struct net_device *dev))
5403 {
5404         struct net_device *lower = NULL;
5405         struct list_head *iter;
5406         int max_nest = -1;
5407         int nest;
5408
5409         ASSERT_RTNL();
5410
5411         netdev_for_each_lower_dev(dev, lower, iter) {
5412                 nest = dev_get_nest_level(lower, type_check);
5413                 if (max_nest < nest)
5414                         max_nest = nest;
5415         }
5416
5417         if (type_check(dev))
5418                 max_nest++;
5419
5420         return max_nest;
5421 }
5422 EXPORT_SYMBOL(dev_get_nest_level);
5423
5424 static void dev_change_rx_flags(struct net_device *dev, int flags)
5425 {
5426         const struct net_device_ops *ops = dev->netdev_ops;
5427
5428         if (ops->ndo_change_rx_flags)
5429                 ops->ndo_change_rx_flags(dev, flags);
5430 }
5431
5432 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5433 {
5434         unsigned int old_flags = dev->flags;
5435         kuid_t uid;
5436         kgid_t gid;
5437
5438         ASSERT_RTNL();
5439
5440         dev->flags |= IFF_PROMISC;
5441         dev->promiscuity += inc;
5442         if (dev->promiscuity == 0) {
5443                 /*
5444                  * Avoid overflow.
5445                  * If inc causes overflow, untouch promisc and return error.
5446                  */
5447                 if (inc < 0)
5448                         dev->flags &= ~IFF_PROMISC;
5449                 else {
5450                         dev->promiscuity -= inc;
5451                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5452                                 dev->name);
5453                         return -EOVERFLOW;
5454                 }
5455         }
5456         if (dev->flags != old_flags) {
5457                 pr_info("device %s %s promiscuous mode\n",
5458                         dev->name,
5459                         dev->flags & IFF_PROMISC ? "entered" : "left");
5460                 if (audit_enabled) {
5461                         current_uid_gid(&uid, &gid);
5462                         audit_log(current->audit_context, GFP_ATOMIC,
5463                                 AUDIT_ANOM_PROMISCUOUS,
5464                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5465                                 dev->name, (dev->flags & IFF_PROMISC),
5466                                 (old_flags & IFF_PROMISC),
5467                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5468                                 from_kuid(&init_user_ns, uid),
5469                                 from_kgid(&init_user_ns, gid),
5470                                 audit_get_sessionid(current));
5471                 }
5472
5473                 dev_change_rx_flags(dev, IFF_PROMISC);
5474         }
5475         if (notify)
5476                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5477         return 0;
5478 }
5479
5480 /**
5481  *      dev_set_promiscuity     - update promiscuity count on a device
5482  *      @dev: device
5483  *      @inc: modifier
5484  *
5485  *      Add or remove promiscuity from a device. While the count in the device
5486  *      remains above zero the interface remains promiscuous. Once it hits zero
5487  *      the device reverts back to normal filtering operation. A negative inc
5488  *      value is used to drop promiscuity on the device.
5489  *      Return 0 if successful or a negative errno code on error.
5490  */
5491 int dev_set_promiscuity(struct net_device *dev, int inc)
5492 {
5493         unsigned int old_flags = dev->flags;
5494         int err;
5495
5496         err = __dev_set_promiscuity(dev, inc, true);
5497         if (err < 0)
5498                 return err;
5499         if (dev->flags != old_flags)
5500                 dev_set_rx_mode(dev);
5501         return err;
5502 }
5503 EXPORT_SYMBOL(dev_set_promiscuity);
5504
5505 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5506 {
5507         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5508
5509         ASSERT_RTNL();
5510
5511         dev->flags |= IFF_ALLMULTI;
5512         dev->allmulti += inc;
5513         if (dev->allmulti == 0) {
5514                 /*
5515                  * Avoid overflow.
5516                  * If inc causes overflow, untouch allmulti and return error.
5517                  */
5518                 if (inc < 0)
5519                         dev->flags &= ~IFF_ALLMULTI;
5520                 else {
5521                         dev->allmulti -= inc;
5522                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5523                                 dev->name);
5524                         return -EOVERFLOW;
5525                 }
5526         }
5527         if (dev->flags ^ old_flags) {
5528                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5529                 dev_set_rx_mode(dev);
5530                 if (notify)
5531                         __dev_notify_flags(dev, old_flags,
5532                                            dev->gflags ^ old_gflags);
5533         }
5534         return 0;
5535 }
5536
5537 /**
5538  *      dev_set_allmulti        - update allmulti count on a device
5539  *      @dev: device
5540  *      @inc: modifier
5541  *
5542  *      Add or remove reception of all multicast frames to a device. While the
5543  *      count in the device remains above zero the interface remains listening
5544  *      to all interfaces. Once it hits zero the device reverts back to normal
5545  *      filtering operation. A negative @inc value is used to drop the counter
5546  *      when releasing a resource needing all multicasts.
5547  *      Return 0 if successful or a negative errno code on error.
5548  */
5549
5550 int dev_set_allmulti(struct net_device *dev, int inc)
5551 {
5552         return __dev_set_allmulti(dev, inc, true);
5553 }
5554 EXPORT_SYMBOL(dev_set_allmulti);
5555
5556 /*
5557  *      Upload unicast and multicast address lists to device and
5558  *      configure RX filtering. When the device doesn't support unicast
5559  *      filtering it is put in promiscuous mode while unicast addresses
5560  *      are present.
5561  */
5562 void __dev_set_rx_mode(struct net_device *dev)
5563 {
5564         const struct net_device_ops *ops = dev->netdev_ops;
5565
5566         /* dev_open will call this function so the list will stay sane. */
5567         if (!(dev->flags&IFF_UP))
5568                 return;
5569
5570         if (!netif_device_present(dev))
5571                 return;
5572
5573         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5574                 /* Unicast addresses changes may only happen under the rtnl,
5575                  * therefore calling __dev_set_promiscuity here is safe.
5576                  */
5577                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5578                         __dev_set_promiscuity(dev, 1, false);
5579                         dev->uc_promisc = true;
5580                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5581                         __dev_set_promiscuity(dev, -1, false);
5582                         dev->uc_promisc = false;
5583                 }
5584         }
5585
5586         if (ops->ndo_set_rx_mode)
5587                 ops->ndo_set_rx_mode(dev);
5588 }
5589
/* Run __dev_set_rx_mode() on @dev between netif_addr_lock_bh() and
 * netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
5596
5597 /**
5598  *      dev_get_flags - get flags reported to userspace
5599  *      @dev: device
5600  *
5601  *      Get the combination of flag bits exported through APIs to userspace.
5602  */
5603 unsigned int dev_get_flags(const struct net_device *dev)
5604 {
5605         unsigned int flags;
5606
5607         flags = (dev->flags & ~(IFF_PROMISC |
5608                                 IFF_ALLMULTI |
5609                                 IFF_RUNNING |
5610                                 IFF_LOWER_UP |
5611                                 IFF_DORMANT)) |
5612                 (dev->gflags & (IFF_PROMISC |
5613                                 IFF_ALLMULTI));
5614
5615         if (netif_running(dev)) {
5616                 if (netif_oper_up(dev))
5617                         flags |= IFF_RUNNING;
5618                 if (netif_carrier_ok(dev))
5619                         flags |= IFF_LOWER_UP;
5620                 if (netif_dormant(dev))
5621                         flags |= IFF_DORMANT;
5622         }
5623
5624         return flags;
5625 }
5626 EXPORT_SYMBOL(dev_get_flags);
5627
5628 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5629 {
5630         unsigned int old_flags = dev->flags;
5631         int ret;
5632
5633         ASSERT_RTNL();
5634
5635         /*
5636          *      Set the flags on our device.
5637          */
5638
5639         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5640                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5641                                IFF_AUTOMEDIA)) |
5642                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5643                                     IFF_ALLMULTI));
5644
5645         /*
5646          *      Load in the correct multicast list now the flags have changed.
5647          */
5648
5649         if ((old_flags ^ flags) & IFF_MULTICAST)
5650                 dev_change_rx_flags(dev, IFF_MULTICAST);
5651
5652         dev_set_rx_mode(dev);
5653
5654         /*
5655          *      Have we downed the interface. We handle IFF_UP ourselves
5656          *      according to user attempts to set it, rather than blindly
5657          *      setting it.
5658          */
5659
5660         ret = 0;
5661         if ((old_flags ^ flags) & IFF_UP)
5662                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5663
5664         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5665                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5666                 unsigned int old_flags = dev->flags;
5667
5668                 dev->gflags ^= IFF_PROMISC;
5669
5670                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5671                         if (dev->flags != old_flags)
5672                                 dev_set_rx_mode(dev);
5673         }
5674
5675         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5676            is important. Some (broken) drivers set IFF_PROMISC, when
5677            IFF_ALLMULTI is requested not asking us and not reporting.
5678          */
5679         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5680                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5681
5682                 dev->gflags ^= IFF_ALLMULTI;
5683                 __dev_set_allmulti(dev, inc, false);
5684         }
5685
5686         return ret;
5687 }
5688
5689 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5690                         unsigned int gchanges)
5691 {
5692         unsigned int changes = dev->flags ^ old_flags;
5693
5694         if (gchanges)
5695                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5696
5697         if (changes & IFF_UP) {
5698                 if (dev->flags & IFF_UP)
5699                         call_netdevice_notifiers(NETDEV_UP, dev);
5700                 else
5701                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5702         }
5703
5704         if (dev->flags & IFF_UP &&
5705             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5706                 struct netdev_notifier_change_info change_info;
5707
5708                 change_info.flags_changed = changes;
5709                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5710                                               &change_info.info);
5711         }
5712 }
5713
5714 /**
5715  *      dev_change_flags - change device settings
5716  *      @dev: device
5717  *      @flags: device state flags
5718  *
5719  *      Change settings on device based state flags. The flags are
5720  *      in the userspace exported format.
5721  */
5722 int dev_change_flags(struct net_device *dev, unsigned int flags)
5723 {
5724         int ret;
5725         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5726
5727         ret = __dev_change_flags(dev, flags);
5728         if (ret < 0)
5729                 return ret;
5730
5731         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5732         __dev_notify_flags(dev, old_flags, changes);
5733         return ret;
5734 }
5735 EXPORT_SYMBOL(dev_change_flags);
5736
5737 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5738 {
5739         const struct net_device_ops *ops = dev->netdev_ops;
5740
5741         if (ops->ndo_change_mtu)
5742                 return ops->ndo_change_mtu(dev, new_mtu);
5743
5744         dev->mtu = new_mtu;
5745         return 0;
5746 }
5747
5748 /**
5749  *      dev_set_mtu - Change maximum transfer unit
5750  *      @dev: device
5751  *      @new_mtu: new transfer unit
5752  *
5753  *      Change the maximum transfer size of the network device.
5754  */
5755 int dev_set_mtu(struct net_device *dev, int new_mtu)
5756 {
5757         int err, orig_mtu;
5758
5759         if (new_mtu == dev->mtu)
5760                 return 0;
5761
5762         /*      MTU must be positive.    */
5763         if (new_mtu < 0)
5764                 return -EINVAL;
5765
5766         if (!netif_device_present(dev))
5767                 return -ENODEV;
5768
5769         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5770         err = notifier_to_errno(err);
5771         if (err)
5772                 return err;
5773
5774         orig_mtu = dev->mtu;
5775         err = __dev_set_mtu(dev, new_mtu);
5776
5777         if (!err) {
5778                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5779                 err = notifier_to_errno(err);
5780                 if (err) {
5781                         /* setting mtu back and notifying everyone again,
5782                          * so that they have a chance to revert changes.
5783                          */
5784                         __dev_set_mtu(dev, orig_mtu);
5785                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5786                 }
5787         }
5788         return err;
5789 }
5790 EXPORT_SYMBOL(dev_set_mtu);
5791
/**
 *	dev_set_group - Change group this device belongs to
 *	@dev: device
 *	@new_group: group this device should belong to
 *
 *	Stores @new_group in dev->group. No validation is done and no
 *	notification is sent from here.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
5802
5803 /**
5804  *      dev_set_mac_address - Change Media Access Control Address
5805  *      @dev: device
5806  *      @sa: new address
5807  *
5808  *      Change the hardware (MAC) address of the device
5809  */
5810 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5811 {
5812         const struct net_device_ops *ops = dev->netdev_ops;
5813         int err;
5814
5815         if (!ops->ndo_set_mac_address)
5816                 return -EOPNOTSUPP;
5817         if (sa->sa_family != dev->type)
5818                 return -EINVAL;
5819         if (!netif_device_present(dev))
5820                 return -ENODEV;
5821         err = ops->ndo_set_mac_address(dev, sa);
5822         if (err)
5823                 return err;
5824         dev->addr_assign_type = NET_ADDR_SET;
5825         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5826         add_device_randomness(dev->dev_addr, dev->addr_len);
5827         return 0;
5828 }
5829 EXPORT_SYMBOL(dev_set_mac_address);
5830
5831 /**
5832  *      dev_change_carrier - Change device carrier
5833  *      @dev: device
5834  *      @new_carrier: new value
5835  *
5836  *      Change device carrier
5837  */
5838 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5839 {
5840         const struct net_device_ops *ops = dev->netdev_ops;
5841
5842         if (!ops->ndo_change_carrier)
5843                 return -EOPNOTSUPP;
5844         if (!netif_device_present(dev))
5845                 return -ENODEV;
5846         return ops->ndo_change_carrier(dev, new_carrier);
5847 }
5848 EXPORT_SYMBOL(dev_change_carrier);
5849
5850 /**
5851  *      dev_get_phys_port_id - Get device physical port ID
5852  *      @dev: device
5853  *      @ppid: port ID
5854  *
5855  *      Get device physical port ID
5856  */
5857 int dev_get_phys_port_id(struct net_device *dev,
5858                          struct netdev_phys_port_id *ppid)
5859 {
5860         const struct net_device_ops *ops = dev->netdev_ops;
5861
5862         if (!ops->ndo_get_phys_port_id)
5863                 return -EOPNOTSUPP;
5864         return ops->ndo_get_phys_port_id(dev, ppid);
5865 }
5866 EXPORT_SYMBOL(dev_get_phys_port_id);
5867
5868 /**
5869  *      dev_new_index   -       allocate an ifindex
5870  *      @net: the applicable net namespace
5871  *
5872  *      Returns a suitable unique value for a new device interface
5873  *      number.  The caller must hold the rtnl semaphore or the
5874  *      dev_base_lock to be sure it remains unique.
5875  */
5876 static int dev_new_index(struct net *net)
5877 {
5878         int ifindex = net->ifindex;
5879         for (;;) {
5880                 if (++ifindex <= 0)
5881                         ifindex = 1;
5882                 if (!__dev_get_by_index(net, ifindex))
5883                         return net->ifindex = ifindex;
5884         }
5885 }
5886
/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5890
5891 static void net_set_todo(struct net_device *dev)
5892 {
5893         list_add_tail(&dev->todo_list, &net_todo_list);
5894         dev_net(dev)->dev_unreg_count++;
5895 }
5896
5897 static void rollback_registered_many(struct list_head *head)
5898 {
5899         struct net_device *dev, *tmp;
5900         LIST_HEAD(close_head);
5901
5902         BUG_ON(dev_boot_phase);
5903         ASSERT_RTNL();
5904
5905         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5906                 /* Some devices call without registering
5907                  * for initialization unwind. Remove those
5908                  * devices and proceed with the remaining.
5909                  */
5910                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5911                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5912                                  dev->name, dev);
5913
5914                         WARN_ON(1);
5915                         list_del(&dev->unreg_list);
5916                         continue;
5917                 }
5918                 dev->dismantle = true;
5919                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5920         }
5921
5922         /* If device is running, close it first. */
5923         list_for_each_entry(dev, head, unreg_list)
5924                 list_add_tail(&dev->close_list, &close_head);
5925         dev_close_many(&close_head);
5926
5927         list_for_each_entry(dev, head, unreg_list) {
5928                 /* And unlink it from device chain. */
5929                 unlist_netdevice(dev);
5930
5931                 dev->reg_state = NETREG_UNREGISTERING;
5932         }
5933
5934         synchronize_net();
5935
5936         list_for_each_entry(dev, head, unreg_list) {
5937                 /* Shutdown queueing discipline. */
5938                 dev_shutdown(dev);
5939
5940
5941                 /* Notify protocols, that we are about to destroy
5942                    this device. They should clean all the things.
5943                 */
5944                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5945
5946                 /*
5947                  *      Flush the unicast and multicast chains
5948                  */
5949                 dev_uc_flush(dev);
5950                 dev_mc_flush(dev);
5951
5952                 if (dev->netdev_ops->ndo_uninit)
5953                         dev->netdev_ops->ndo_uninit(dev);
5954
5955                 if (!dev->rtnl_link_ops ||
5956                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5957                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5958
5959                 /* Notifier chain MUST detach us all upper devices. */
5960                 WARN_ON(netdev_has_any_upper_dev(dev));
5961
5962                 /* Remove entries from kobject tree */
5963                 netdev_unregister_kobject(dev);
5964 #ifdef CONFIG_XPS
5965                 /* Remove XPS queueing entries */
5966                 netif_reset_xps_queues_gt(dev, 0);
5967 #endif
5968         }
5969
5970         synchronize_net();
5971
5972         list_for_each_entry(dev, head, unreg_list)
5973                 dev_put(dev);
5974 }
5975
5976 static void rollback_registered(struct net_device *dev)
5977 {
5978         LIST_HEAD(single);
5979
5980         list_add(&dev->unreg_list, &single);
5981         rollback_registered_many(&single);
5982         list_del(&single);
5983 }
5984
5985 static netdev_features_t netdev_fix_features(struct net_device *dev,
5986         netdev_features_t features)
5987 {
5988         /* Fix illegal checksum combinations */
5989         if ((features & NETIF_F_HW_CSUM) &&
5990             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5991                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5992                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5993         }
5994
5995         /* TSO requires that SG is present as well. */
5996         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5997                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5998                 features &= ~NETIF_F_ALL_TSO;
5999         }
6000
6001         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6002                                         !(features & NETIF_F_IP_CSUM)) {
6003                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6004                 features &= ~NETIF_F_TSO;
6005                 features &= ~NETIF_F_TSO_ECN;
6006         }
6007
6008         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6009                                          !(features & NETIF_F_IPV6_CSUM)) {
6010                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6011                 features &= ~NETIF_F_TSO6;
6012         }
6013
6014         /* TSO ECN requires that TSO is present as well. */
6015         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6016                 features &= ~NETIF_F_TSO_ECN;
6017
6018         /* Software GSO depends on SG. */
6019         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6020                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6021                 features &= ~NETIF_F_GSO;
6022         }
6023
6024         /* UFO needs SG and checksumming */
6025         if (features & NETIF_F_UFO) {
6026                 /* maybe split UFO into V4 and V6? */
6027                 if (!((features & NETIF_F_GEN_CSUM) ||
6028                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6029                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6030                         netdev_dbg(dev,
6031                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6032                         features &= ~NETIF_F_UFO;
6033                 }
6034
6035                 if (!(features & NETIF_F_SG)) {
6036                         netdev_dbg(dev,
6037                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6038                         features &= ~NETIF_F_UFO;
6039                 }
6040         }
6041
6042 #ifdef CONFIG_NET_RX_BUSY_POLL
6043         if (dev->netdev_ops->ndo_busy_poll)
6044                 features |= NETIF_F_BUSY_POLL;
6045         else
6046 #endif
6047                 features &= ~NETIF_F_BUSY_POLL;
6048
6049         return features;
6050 }
6051
6052 int __netdev_update_features(struct net_device *dev)
6053 {
6054         netdev_features_t features;
6055         int err = 0;
6056
6057         ASSERT_RTNL();
6058
6059         features = netdev_get_wanted_features(dev);
6060
6061         if (dev->netdev_ops->ndo_fix_features)
6062                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6063
6064         /* driver might be less strict about feature dependencies */
6065         features = netdev_fix_features(dev, features);
6066
6067         if (dev->features == features)
6068                 return 0;
6069
6070         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6071                 &dev->features, &features);
6072
6073         if (dev->netdev_ops->ndo_set_features)
6074                 err = dev->netdev_ops->ndo_set_features(dev, features);
6075
6076         if (unlikely(err < 0)) {
6077                 netdev_err(dev,
6078                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6079                         err, &features, &dev->features);
6080                 return -1;
6081         }
6082
6083         if (!err)
6084                 dev->features = features;
6085
6086         return 1;
6087 }
6088
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate dev->features through __netdev_update_features() and
 *      send a features-change notification whenever that helper returns
 *      non-zero. Call this after driver or hardware dependent conditions
 *      that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        if (__netdev_update_features(dev) != 0)
                netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6103
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate dev->features and send the notification
 *      unconditionally, even when nothing changed. Use this instead of
 *      netdev_update_features() when dev->vlan_features might have
 *      changed as well, so that the change can be propagated to stacked
 *      VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is deliberately ignored: we always notify. */
        (void)__netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6120
6121 /**
6122  *      netif_stacked_transfer_operstate -      transfer operstate
6123  *      @rootdev: the root or lower level device to transfer state from
6124  *      @dev: the device to transfer operstate to
6125  *
6126  *      Transfer operational state from root to device. This is normally
6127  *      called when a stacking relationship exists between the root
6128  *      device and the device(a leaf device).
6129  */
6130 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6131                                         struct net_device *dev)
6132 {
6133         if (rootdev->operstate == IF_OPER_DORMANT)
6134                 netif_dormant_on(dev);
6135         else
6136                 netif_dormant_off(dev);
6137
6138         if (netif_carrier_ok(rootdev)) {
6139                 if (!netif_carrier_ok(dev))
6140                         netif_carrier_on(dev);
6141         } else {
6142                 if (netif_carrier_ok(dev))
6143                         netif_carrier_off(dev);
6144         }
6145 }
6146 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6147
#ifdef CONFIG_SYSFS
/* Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nr_queues < 1);

        queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (idx = 0; idx < nr_queues; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
6167
6168 static void netdev_init_one_queue(struct net_device *dev,
6169                                   struct netdev_queue *queue, void *_unused)
6170 {
6171         /* Initialize queue lock */
6172         spin_lock_init(&queue->_xmit_lock);
6173         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6174         queue->xmit_lock_owner = -1;
6175         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6176         queue->dev = dev;
6177 #ifdef CONFIG_BQL
6178         dql_init(&queue->dql, HZ);
6179 #endif
6180 }
6181
6182 static void netif_free_tx_queues(struct net_device *dev)
6183 {
6184         kvfree(dev->_tx);
6185 }
6186
6187 static int netif_alloc_netdev_queues(struct net_device *dev)
6188 {
6189         unsigned int count = dev->num_tx_queues;
6190         struct netdev_queue *tx;
6191         size_t sz = count * sizeof(*tx);
6192
6193         BUG_ON(count < 1 || count > 0xffff);
6194
6195         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6196         if (!tx) {
6197                 tx = vzalloc(sz);
6198                 if (!tx)
6199                         return -ENOMEM;
6200         }
6201         dev->_tx = tx;
6202
6203         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6204         spin_lock_init(&dev->tx_global_lock);
6205
6206         return 0;
6207 }
6208
6209 /**
6210  *      register_netdevice      - register a network device
6211  *      @dev: device to register
6212  *
6213  *      Take a completed network device structure and add it to the kernel
6214  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6215  *      chain. 0 is returned on success. A negative errno code is returned
6216  *      on a failure to set up the device, or if the name is a duplicate.
6217  *
6218  *      Callers must hold the rtnl semaphore. You may want
6219  *      register_netdev() instead of this.
6220  *
6221  *      BUGS:
6222  *      The locking appears insufficient to guarantee two parallel registers
6223  *      will not get the same name.
6224  */
6225
6226 int register_netdevice(struct net_device *dev)
6227 {
6228         int ret;
6229         struct net *net = dev_net(dev);
6230
6231         BUG_ON(dev_boot_phase);
6232         ASSERT_RTNL();
6233
6234         might_sleep();
6235
6236         /* When net_device's are persistent, this will be fatal. */
6237         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6238         BUG_ON(!net);
6239
6240         spin_lock_init(&dev->addr_list_lock);
6241         netdev_set_addr_lockdep_class(dev);
6242
6243         dev->iflink = -1;
6244
6245         ret = dev_get_valid_name(net, dev, dev->name);
6246         if (ret < 0)
6247                 goto out;
6248
6249         /* Init, if this function is available */
6250         if (dev->netdev_ops->ndo_init) {
6251                 ret = dev->netdev_ops->ndo_init(dev);
6252                 if (ret) {
6253                         if (ret > 0)
6254                                 ret = -EIO;
6255                         goto out;
6256                 }
6257         }
6258
6259         if (((dev->hw_features | dev->features) &
6260              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6261             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6262              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6263                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6264                 ret = -EINVAL;
6265                 goto err_uninit;
6266         }
6267
6268         ret = -EBUSY;
6269         if (!dev->ifindex)
6270                 dev->ifindex = dev_new_index(net);
6271         else if (__dev_get_by_index(net, dev->ifindex))
6272                 goto err_uninit;
6273
6274         if (dev->iflink == -1)
6275                 dev->iflink = dev->ifindex;
6276
6277         /* Transfer changeable features to wanted_features and enable
6278          * software offloads (GSO and GRO).
6279          */
6280         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6281         dev->features |= NETIF_F_SOFT_FEATURES;
6282         dev->wanted_features = dev->features & dev->hw_features;
6283
6284         if (!(dev->flags & IFF_LOOPBACK)) {
6285                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6286         }
6287
6288         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6289          */
6290         dev->vlan_features |= NETIF_F_HIGHDMA;
6291
6292         /* Make NETIF_F_SG inheritable to tunnel devices.
6293          */
6294         dev->hw_enc_features |= NETIF_F_SG;
6295
6296         /* Make NETIF_F_SG inheritable to MPLS.
6297          */
6298         dev->mpls_features |= NETIF_F_SG;
6299
6300         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6301         ret = notifier_to_errno(ret);
6302         if (ret)
6303                 goto err_uninit;
6304
6305         ret = netdev_register_kobject(dev);
6306         if (ret)
6307                 goto err_uninit;
6308         dev->reg_state = NETREG_REGISTERED;
6309
6310         __netdev_update_features(dev);
6311
6312         /*
6313          *      Default initial state at registry is that the
6314          *      device is present.
6315          */
6316
6317         set_bit(__LINK_STATE_PRESENT, &dev->state);
6318
6319         linkwatch_init_dev(dev);
6320
6321         dev_init_scheduler(dev);
6322         dev_hold(dev);
6323         list_netdevice(dev);
6324         add_device_randomness(dev->dev_addr, dev->addr_len);
6325
6326         /* If the device has permanent device address, driver should
6327          * set dev_addr and also addr_assign_type should be set to
6328          * NET_ADDR_PERM (default value).
6329          */
6330         if (dev->addr_assign_type == NET_ADDR_PERM)
6331                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6332
6333         /* Notify protocols, that a new device appeared. */
6334         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6335         ret = notifier_to_errno(ret);
6336         if (ret) {
6337                 rollback_registered(dev);
6338                 dev->reg_state = NETREG_UNREGISTERED;
6339         }
6340         /*
6341          *      Prevent userspace races by waiting until the network
6342          *      device is fully setup before sending notifications.
6343          */
6344         if (!dev->rtnl_link_ops ||
6345             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6346                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6347
6348 out:
6349         return ret;
6350
6351 err_uninit:
6352         if (dev->netdev_ops->ndo_uninit)
6353                 dev->netdev_ops->ndo_uninit(dev);
6354         goto out;
6355 }
6356 EXPORT_SYMBOL(register_netdevice);
6357
6358 /**
6359  *      init_dummy_netdev       - init a dummy network device for NAPI
6360  *      @dev: device to init
6361  *
6362  *      This takes a network device structure and initialize the minimum
6363  *      amount of fields so it can be used to schedule NAPI polls without
6364  *      registering a full blown interface. This is to be used by drivers
6365  *      that need to tie several hardware interfaces to a single NAPI
6366  *      poll scheduler due to HW limitations.
6367  */
6368 int init_dummy_netdev(struct net_device *dev)
6369 {
6370         /* Clear everything. Note we don't initialize spinlocks
6371          * are they aren't supposed to be taken by any of the
6372          * NAPI code and this dummy netdev is supposed to be
6373          * only ever used for NAPI polls
6374          */
6375         memset(dev, 0, sizeof(struct net_device));
6376
6377         /* make sure we BUG if trying to hit standard
6378          * register/unregister code path
6379          */
6380         dev->reg_state = NETREG_DUMMY;
6381
6382         /* NAPI wants this */
6383         INIT_LIST_HEAD(&dev->napi_list);
6384
6385         /* a dummy interface is started by default */
6386         set_bit(__LINK_STATE_PRESENT, &dev->state);
6387         set_bit(__LINK_STATE_START, &dev->state);
6388
6389         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6390          * because users of this 'device' dont need to change
6391          * its refcount.
6392          */
6393
6394         return 0;
6395 }
6396 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6397
6398
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is a wrapper around register_netdevice() that takes the rtnl
 *      semaphore for the duration of the call; the device name is expanded
 *      if you passed a format string to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
        int ret;

        rtnl_lock();
        ret = register_netdevice(dev);
        rtnl_unlock();

        return ret;
}
EXPORT_SYMBOL(register_netdev);
6422
6423 int netdev_refcnt_read(const struct net_device *dev)
6424 {
6425         int i, refcnt = 0;
6426
6427         for_each_possible_cpu(i)
6428                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6429         return refcnt;
6430 }
6431 EXPORT_SYMBOL(netdev_refcnt_read);
6432
6433 /**
6434  * netdev_wait_allrefs - wait until all references are gone.
6435  * @dev: target net_device
6436  *
6437  * This is called when unregistering network devices.
6438  *
6439  * Any protocol or device that holds a reference should register
6440  * for netdevice notification, and cleanup and put back the
6441  * reference if they receive an UNREGISTER event.
6442  * We can get stuck here if buggy protocols don't correctly
6443  * call dev_put.
6444  */
6445 static void netdev_wait_allrefs(struct net_device *dev)
6446 {
6447         unsigned long rebroadcast_time, warning_time;
6448         int refcnt;
6449
6450         linkwatch_forget_dev(dev);
6451
6452         rebroadcast_time = warning_time = jiffies;
6453         refcnt = netdev_refcnt_read(dev);
6454
6455         while (refcnt != 0) {
6456                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6457                         rtnl_lock();
6458
6459                         /* Rebroadcast unregister notification */
6460                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6461
6462                         __rtnl_unlock();
6463                         rcu_barrier();
6464                         rtnl_lock();
6465
6466                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6467                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6468                                      &dev->state)) {
6469                                 /* We must not have linkwatch events
6470                                  * pending on unregister. If this
6471                                  * happens, we simply run the queue
6472                                  * unscheduled, resulting in a noop
6473                                  * for this device.
6474                                  */
6475                                 linkwatch_run_queue();
6476                         }
6477
6478                         __rtnl_unlock();
6479
6480                         rebroadcast_time = jiffies;
6481                 }
6482
6483                 msleep(250);
6484
6485                 refcnt = netdev_refcnt_read(dev);
6486
6487                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6488                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6489                                  dev->name, refcnt);
6490                         warning_time = jiffies;
6491                 }
6492         }
6493 }
6494
6495 /* The sequence is:
6496  *
6497  *      rtnl_lock();
6498  *      ...
6499  *      register_netdevice(x1);
6500  *      register_netdevice(x2);
6501  *      ...
6502  *      unregister_netdevice(y1);
6503  *      unregister_netdevice(y2);
6504  *      ...
6505  *      rtnl_unlock();
6506  *      free_netdev(y1);
6507  *      free_netdev(y2);
6508  *
6509  * We are invoked by rtnl_unlock().
6510  * This allows us to deal with problems:
6511  * 1) We can delete sysfs objects which invoke hotplug
6512  *    without deadlocking with linkwatch via keventd.
6513  * 2) Since we run with the RTNL semaphore not held, we can sleep
6514  *    safely in order to wait for the netdev refcnt to drop to zero.
6515  *
6516  * We must not return until all unregister events added during
6517  * the interval the lock was held have been completed.
6518  */
6519 void netdev_run_todo(void)
6520 {
6521         struct list_head list;
6522
6523         /* Snapshot list, allow later requests */
6524         list_replace_init(&net_todo_list, &list);
6525
6526         __rtnl_unlock();
6527
6528
6529         /* Wait for rcu callbacks to finish before next phase */
6530         if (!list_empty(&list))
6531                 rcu_barrier();
6532
6533         while (!list_empty(&list)) {
6534                 struct net_device *dev
6535                         = list_first_entry(&list, struct net_device, todo_list);
6536                 list_del(&dev->todo_list);
6537
6538                 rtnl_lock();
6539                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6540                 __rtnl_unlock();
6541
6542                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6543                         pr_err("network todo '%s' but state %d\n",
6544                                dev->name, dev->reg_state);
6545                         dump_stack();
6546                         continue;
6547                 }
6548
6549                 dev->reg_state = NETREG_UNREGISTERED;
6550
6551                 on_each_cpu(flush_backlog, dev, 1);
6552
6553                 netdev_wait_allrefs(dev);
6554
6555                 /* paranoia */
6556                 BUG_ON(netdev_refcnt_read(dev));
6557                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6558                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6559                 WARN_ON(dev->dn_ptr);
6560
6561                 if (dev->destructor)
6562                         dev->destructor(dev);
6563
6564                 /* Report a network device has been unregistered */
6565                 rtnl_lock();
6566                 dev_net(dev)->dev_unreg_count--;
6567                 __rtnl_unlock();
6568                 wake_up(&netdev_unregistering_wq);
6569
6570                 /* Free network device */
6571                 kobject_put(&dev->dev.kobj);
6572         }
6573 }
6574
6575 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6576  * fields in the same order, with only the type differing.
6577  */
6578 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6579                              const struct net_device_stats *netdev_stats)
6580 {
6581 #if BITS_PER_LONG == 64
6582         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6583         memcpy(stats64, netdev_stats, sizeof(*stats64));
6584 #else
6585         size_t i, n = sizeof(*stats64) / sizeof(u64);
6586         const unsigned long *src = (const unsigned long *)netdev_stats;
6587         u64 *dst = (u64 *)stats64;
6588
6589         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6590                      sizeof(*stats64) / sizeof(u64));
6591         for (i = 0; i < n; i++)
6592                 dst[i] = src[i];
6593 #endif
6594 }
6595 EXPORT_SYMBOL(netdev_stats_to_stats64);
6596
6597 /**
6598  *      dev_get_stats   - get network device statistics
6599  *      @dev: device to get statistics from
6600  *      @storage: place to store stats
6601  *
6602  *      Get network statistics from device. Return @storage.
6603  *      The device driver may provide its own method by setting
6604  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6605  *      otherwise the internal statistics structure is used.
6606  */
6607 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6608                                         struct rtnl_link_stats64 *storage)
6609 {
6610         const struct net_device_ops *ops = dev->netdev_ops;
6611
6612         if (ops->ndo_get_stats64) {
6613                 memset(storage, 0, sizeof(*storage));
6614                 ops->ndo_get_stats64(dev, storage);
6615         } else if (ops->ndo_get_stats) {
6616                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6617         } else {
6618                 netdev_stats_to_stats64(storage, &dev->stats);
6619         }
6620         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6621         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6622         return storage;
6623 }
6624 EXPORT_SYMBOL(dev_get_stats);
6625
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT the queue is
 * created on first use, attached to the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.  Without
 * the option, whatever dev_ingress_queue() reports is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!ingress) {
                ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
                if (ingress) {
                        netdev_init_one_queue(dev, ingress, NULL);
                        ingress->qdisc = &noop_qdisc;
                        ingress->qdisc_sleeping = &noop_qdisc;
                        rcu_assign_pointer(dev->ingress_queue, ingress);
                }
        }
#endif
        return ingress;
}
6643
6644 static const struct ethtool_ops default_ethtool_ops;
6645
6646 void netdev_set_default_ethtool_ops(struct net_device *dev,
6647                                     const struct ethtool_ops *ops)
6648 {
6649         if (dev->ethtool_ops == &default_ethtool_ops)
6650                 dev->ethtool_ops = ops;
6651 }
6652 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6653
6654 void netdev_freemem(struct net_device *dev)
6655 {
6656         char *addr = (char *)dev - dev->padded;
6657
6658         kvfree(addr);
6659 }
6660
6661 /**
6662  *      alloc_netdev_mqs - allocate network device
6663  *      @sizeof_priv:           size of private data to allocate space for
6664  *      @name:                  device name format string
6665  *      @name_assign_type:      origin of device name
6666  *      @setup:                 callback to initialize device
6667  *      @txqs:                  the number of TX subqueues to allocate
6668  *      @rxqs:                  the number of RX subqueues to allocate
6669  *
6670  *      Allocates a struct net_device with private data area for driver use
6671  *      and performs basic initialization.  Also allocates subqueue structs
6672  *      for each queue on the device.
6673  */
6674 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6675                 unsigned char name_assign_type,
6676                 void (*setup)(struct net_device *),
6677                 unsigned int txqs, unsigned int rxqs)
6678 {
6679         struct net_device *dev;
6680         size_t alloc_size;
6681         struct net_device *p;
6682
6683         BUG_ON(strlen(name) >= sizeof(dev->name));
6684
6685         if (txqs < 1) {
6686                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6687                 return NULL;
6688         }
6689
6690 #ifdef CONFIG_SYSFS
6691         if (rxqs < 1) {
6692                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6693                 return NULL;
6694         }
6695 #endif
6696
6697         alloc_size = sizeof(struct net_device);
6698         if (sizeof_priv) {
6699                 /* ensure 32-byte alignment of private area */
6700                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6701                 alloc_size += sizeof_priv;
6702         }
6703         /* ensure 32-byte alignment of whole construct */
6704         alloc_size += NETDEV_ALIGN - 1;
6705
6706         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6707         if (!p)
6708                 p = vzalloc(alloc_size);
6709         if (!p)
6710                 return NULL;
6711
6712         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6713         dev->padded = (char *)dev - (char *)p;
6714
6715         dev->pcpu_refcnt = alloc_percpu(int);
6716         if (!dev->pcpu_refcnt)
6717                 goto free_dev;
6718
6719         if (dev_addr_init(dev))
6720                 goto free_pcpu;
6721
6722         dev_mc_init(dev);
6723         dev_uc_init(dev);
6724
6725         dev_net_set(dev, &init_net);
6726
6727         dev->gso_max_size = GSO_MAX_SIZE;
6728         dev->gso_max_segs = GSO_MAX_SEGS;
6729         dev->gso_min_segs = 0;
6730
6731         INIT_LIST_HEAD(&dev->napi_list);
6732         INIT_LIST_HEAD(&dev->unreg_list);
6733         INIT_LIST_HEAD(&dev->close_list);
6734         INIT_LIST_HEAD(&dev->link_watch_list);
6735         INIT_LIST_HEAD(&dev->adj_list.upper);
6736         INIT_LIST_HEAD(&dev->adj_list.lower);
6737         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6738         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6739         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6740         setup(dev);
6741
6742         dev->num_tx_queues = txqs;
6743         dev->real_num_tx_queues = txqs;
6744         if (netif_alloc_netdev_queues(dev))
6745                 goto free_all;
6746
6747 #ifdef CONFIG_SYSFS
6748         dev->num_rx_queues = rxqs;
6749         dev->real_num_rx_queues = rxqs;
6750         if (netif_alloc_rx_queues(dev))
6751                 goto free_all;
6752 #endif
6753
6754         strcpy(dev->name, name);
6755         dev->name_assign_type = name_assign_type;
6756         dev->group = INIT_NETDEV_GROUP;
6757         if (!dev->ethtool_ops)
6758                 dev->ethtool_ops = &default_ethtool_ops;
6759         return dev;
6760
6761 free_all:
6762         free_netdev(dev);
6763         return NULL;
6764
6765 free_pcpu:
6766         free_percpu(dev->pcpu_refcnt);
6767 free_dev:
6768         netdev_freemem(dev);
6769         return NULL;
6770 }
6771 EXPORT_SYMBOL(alloc_netdev_mqs);
6772
6773 /**
6774  *      free_netdev - free network device
6775  *      @dev: device
6776  *
6777  *      This function does the last stage of destroying an allocated device
6778  *      interface. The reference to the device object is released.
6779  *      If this is the last reference then it will be freed.
6780  */
6781 void free_netdev(struct net_device *dev)
6782 {
6783         struct napi_struct *p, *n;
6784
6785         release_net(dev_net(dev));
6786
6787         netif_free_tx_queues(dev);
6788 #ifdef CONFIG_SYSFS
6789         kfree(dev->_rx);
6790 #endif
6791
6792         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6793
6794         /* Flush device addresses */
6795         dev_addr_flush(dev);
6796
6797         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6798                 netif_napi_del(p);
6799
6800         free_percpu(dev->pcpu_refcnt);
6801         dev->pcpu_refcnt = NULL;
6802
6803         /*  Compatibility with error handling in drivers */
6804         if (dev->reg_state == NETREG_UNINITIALIZED) {
6805                 netdev_freemem(dev);
6806                 return;
6807         }
6808
6809         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6810         dev->reg_state = NETREG_RELEASED;
6811
6812         /* will free via device release */
6813         put_device(&dev->dev);
6814 }
6815 EXPORT_SYMBOL(free_netdev);
6816
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait for packets currently being received to be done.
 *      Does not block later packets from starting.
 *
 *      May sleep. The expedited RCU variant is used while the RTNL
 *      lock is held, the regular one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6832
6833 /**
6834  *      unregister_netdevice_queue - remove device from the kernel
6835  *      @dev: device
6836  *      @head: list
6837  *
6838  *      This function shuts down a device interface and removes it
6839  *      from the kernel tables.
6840  *      If head not NULL, device is queued to be unregistered later.
6841  *
6842  *      Callers must hold the rtnl semaphore.  You may want
6843  *      unregister_netdev() instead of this.
6844  */
6845
6846 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6847 {
6848         ASSERT_RTNL();
6849
6850         if (head) {
6851                 list_move_tail(&dev->unreg_list, head);
6852         } else {
6853                 rollback_registered(dev);
6854                 /* Finish processing unregister after unlock */
6855                 net_set_todo(dev);
6856         }
6857 }
6858 EXPORT_SYMBOL(unregister_netdevice_queue);
6859
6860 /**
6861  *      unregister_netdevice_many - unregister many devices
6862  *      @head: list of devices
6863  *
6864  *  Note: As most callers use a stack allocated list_head,
6865  *  we force a list_del() to make sure stack wont be corrupted later.
6866  */
6867 void unregister_netdevice_many(struct list_head *head)
6868 {
6869         struct net_device *dev;
6870
6871         if (!list_empty(head)) {
6872                 rollback_registered_many(head);
6873                 list_for_each_entry(dev, head, unreg_list)
6874                         net_set_todo(dev);
6875                 list_del(head);
6876         }
6877 }
6878 EXPORT_SYMBOL(unregister_netdevice_many);
6879
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device to unregister
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  Use
 *	this unless the rtnl semaphore is already held by the caller.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6898
6899 /**
6900  *      dev_change_net_namespace - move device to different nethost namespace
6901  *      @dev: device
6902  *      @net: network namespace
6903  *      @pat: If not NULL name pattern to try if the current device name
6904  *            is already taken in the destination network namespace.
6905  *
6906  *      This function shuts down a device interface and moves it
6907  *      to a new network namespace. On success 0 is returned, on
6908  *      a failure a netagive errno code is returned.
6909  *
6910  *      Callers must hold the rtnl semaphore.
6911  */
6912
6913 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6914 {
6915         int err;
6916
6917         ASSERT_RTNL();
6918
6919         /* Don't allow namespace local devices to be moved. */
6920         err = -EINVAL;
6921         if (dev->features & NETIF_F_NETNS_LOCAL)
6922                 goto out;
6923
6924         /* Ensure the device has been registrered */
6925         if (dev->reg_state != NETREG_REGISTERED)
6926                 goto out;
6927
6928         /* Get out if there is nothing todo */
6929         err = 0;
6930         if (net_eq(dev_net(dev), net))
6931                 goto out;
6932
6933         /* Pick the destination device name, and ensure
6934          * we can use it in the destination network namespace.
6935          */
6936         err = -EEXIST;
6937         if (__dev_get_by_name(net, dev->name)) {
6938                 /* We get here if we can't use the current device name */
6939                 if (!pat)
6940                         goto out;
6941                 if (dev_get_valid_name(net, dev, pat) < 0)
6942                         goto out;
6943         }
6944
6945         /*
6946          * And now a mini version of register_netdevice unregister_netdevice.
6947          */
6948
6949         /* If device is running close it first. */
6950         dev_close(dev);
6951
6952         /* And unlink it from device chain */
6953         err = -ENODEV;
6954         unlist_netdevice(dev);
6955
6956         synchronize_net();
6957
6958         /* Shutdown queueing discipline. */
6959         dev_shutdown(dev);
6960
6961         /* Notify protocols, that we are about to destroy
6962            this device. They should clean all the things.
6963
6964            Note that dev->reg_state stays at NETREG_REGISTERED.
6965            This is wanted because this way 8021q and macvlan know
6966            the device is just moving and can keep their slaves up.
6967         */
6968         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6969         rcu_barrier();
6970         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6971         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6972
6973         /*
6974          *      Flush the unicast and multicast chains
6975          */
6976         dev_uc_flush(dev);
6977         dev_mc_flush(dev);
6978
6979         /* Send a netdev-removed uevent to the old namespace */
6980         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6981         netdev_adjacent_del_links(dev);
6982
6983         /* Actually switch the network namespace */
6984         dev_net_set(dev, net);
6985
6986         /* If there is an ifindex conflict assign a new one */
6987         if (__dev_get_by_index(net, dev->ifindex)) {
6988                 int iflink = (dev->iflink == dev->ifindex);
6989                 dev->ifindex = dev_new_index(net);
6990                 if (iflink)
6991                         dev->iflink = dev->ifindex;
6992         }
6993
6994         /* Send a netdev-add uevent to the new namespace */
6995         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6996         netdev_adjacent_add_links(dev);
6997
6998         /* Fixup kobjects */
6999         err = device_rename(&dev->dev, dev->name);
7000         WARN_ON(err);
7001
7002         /* Add the device back in the hashes */
7003         list_netdevice(dev);
7004
7005         /* Notify protocols, that a new device appeared. */
7006         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7007
7008         /*
7009          *      Prevent userspace races by waiting until the network
7010          *      device is fully setup before sending notifications.
7011          */
7012         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7013
7014         synchronize_net();
7015         err = 0;
7016 out:
7017         return err;
7018 }
7019 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7020
7021 static int dev_cpu_callback(struct notifier_block *nfb,
7022                             unsigned long action,
7023                             void *ocpu)
7024 {
7025         struct sk_buff **list_skb;
7026         struct sk_buff *skb;
7027         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7028         struct softnet_data *sd, *oldsd;
7029
7030         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7031                 return NOTIFY_OK;
7032
7033         local_irq_disable();
7034         cpu = smp_processor_id();
7035         sd = &per_cpu(softnet_data, cpu);
7036         oldsd = &per_cpu(softnet_data, oldcpu);
7037
7038         /* Find end of our completion_queue. */
7039         list_skb = &sd->completion_queue;
7040         while (*list_skb)
7041                 list_skb = &(*list_skb)->next;
7042         /* Append completion queue from offline CPU. */
7043         *list_skb = oldsd->completion_queue;
7044         oldsd->completion_queue = NULL;
7045
7046         /* Append output queue from offline CPU. */
7047         if (oldsd->output_queue) {
7048                 *sd->output_queue_tailp = oldsd->output_queue;
7049                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7050                 oldsd->output_queue = NULL;
7051                 oldsd->output_queue_tailp = &oldsd->output_queue;
7052         }
7053         /* Append NAPI poll list from offline CPU. */
7054         if (!list_empty(&oldsd->poll_list)) {
7055                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
7056                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
7057         }
7058
7059         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7060         local_irq_enable();
7061
7062         /* Process offline CPU's input_pkt_queue */
7063         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7064                 netif_rx_internal(skb);
7065                 input_queue_head_incr(oldsd);
7066         }
7067         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7068                 netif_rx_internal(skb);
7069                 input_queue_head_incr(oldsd);
7070         }
7071
7072         return NOTIFY_OK;
7073 }
7074
7075
7076 /**
7077  *      netdev_increment_features - increment feature set by one
7078  *      @all: current feature set
7079  *      @one: new feature set
7080  *      @mask: mask feature set
7081  *
7082  *      Computes a new feature set after adding a device with feature set
7083  *      @one to the master device with current feature set @all.  Will not
7084  *      enable anything that is off in @mask. Returns the new feature set.
7085  */
7086 netdev_features_t netdev_increment_features(netdev_features_t all,
7087         netdev_features_t one, netdev_features_t mask)
7088 {
7089         if (mask & NETIF_F_GEN_CSUM)
7090                 mask |= NETIF_F_ALL_CSUM;
7091         mask |= NETIF_F_VLAN_CHALLENGED;
7092
7093         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7094         all &= one | ~NETIF_F_ALL_FOR_ALL;
7095
7096         /* If one device supports hw checksumming, set for all. */
7097         if (all & NETIF_F_GEN_CSUM)
7098                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7099
7100         return all;
7101 }
7102 EXPORT_SYMBOL(netdev_increment_features);
7103
7104 static struct hlist_head * __net_init netdev_create_hash(void)
7105 {
7106         int i;
7107         struct hlist_head *hash;
7108
7109         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7110         if (hash != NULL)
7111                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7112                         INIT_HLIST_HEAD(&hash[i]);
7113
7114         return hash;
7115 }
7116
7117 /* Initialize per network namespace state */
7118 static int __net_init netdev_init(struct net *net)
7119 {
7120         if (net != &init_net)
7121                 INIT_LIST_HEAD(&net->dev_base_head);
7122
7123         net->dev_name_head = netdev_create_hash();
7124         if (net->dev_name_head == NULL)
7125                 goto err_name;
7126
7127         net->dev_index_head = netdev_create_hash();
7128         if (net->dev_index_head == NULL)
7129                 goto err_idx;
7130
7131         return 0;
7132
7133 err_idx:
7134         kfree(net->dev_name_head);
7135 err_name:
7136         return -ENOMEM;
7137 }
7138
7139 /**
7140  *      netdev_drivername - network driver for the device
7141  *      @dev: network device
7142  *
7143  *      Determine network driver for device.
7144  */
7145 const char *netdev_drivername(const struct net_device *dev)
7146 {
7147         const struct device_driver *driver;
7148         const struct device *parent;
7149         const char *empty = "";
7150
7151         parent = dev->dev.parent;
7152         if (!parent)
7153                 return empty;
7154
7155         driver = parent->driver;
7156         if (driver && driver->name)
7157                 return driver->name;
7158         return empty;
7159 }
7160
7161 static void __netdev_printk(const char *level, const struct net_device *dev,
7162                             struct va_format *vaf)
7163 {
7164         if (dev && dev->dev.parent) {
7165                 dev_printk_emit(level[1] - '0',
7166                                 dev->dev.parent,
7167                                 "%s %s %s%s: %pV",
7168                                 dev_driver_string(dev->dev.parent),
7169                                 dev_name(dev->dev.parent),
7170                                 netdev_name(dev), netdev_reg_state(dev),
7171                                 vaf);
7172         } else if (dev) {
7173                 printk("%s%s%s: %pV",
7174                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7175         } else {
7176                 printk("%s(NULL net_device): %pV", level, vaf);
7177         }
7178 }
7179
7180 void netdev_printk(const char *level, const struct net_device *dev,
7181                    const char *format, ...)
7182 {
7183         struct va_format vaf;
7184         va_list args;
7185
7186         va_start(args, format);
7187
7188         vaf.fmt = format;
7189         vaf.va = &args;
7190
7191         __netdev_printk(level, dev, &vaf);
7192
7193         va_end(args);
7194 }
7195 EXPORT_SYMBOL(netdev_printk);
7196
7197 #define define_netdev_printk_level(func, level)                 \
7198 void func(const struct net_device *dev, const char *fmt, ...)   \
7199 {                                                               \
7200         struct va_format vaf;                                   \
7201         va_list args;                                           \
7202                                                                 \
7203         va_start(args, fmt);                                    \
7204                                                                 \
7205         vaf.fmt = fmt;                                          \
7206         vaf.va = &args;                                         \
7207                                                                 \
7208         __netdev_printk(level, dev, &vaf);                      \
7209                                                                 \
7210         va_end(args);                                           \
7211 }                                                               \
7212 EXPORT_SYMBOL(func);
7213
7214 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7215 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7216 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7217 define_netdev_printk_level(netdev_err, KERN_ERR);
7218 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7219 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7220 define_netdev_printk_level(netdev_info, KERN_INFO);
7221
7222 static void __net_exit netdev_exit(struct net *net)
7223 {
7224         kfree(net->dev_name_head);
7225         kfree(net->dev_index_head);
7226 }
7227
7228 static struct pernet_operations __net_initdata netdev_net_ops = {
7229         .init = netdev_init,
7230         .exit = netdev_exit,
7231 };
7232
7233 static void __net_exit default_device_exit(struct net *net)
7234 {
7235         struct net_device *dev, *aux;
7236         /*
7237          * Push all migratable network devices back to the
7238          * initial network namespace
7239          */
7240         rtnl_lock();
7241         for_each_netdev_safe(net, dev, aux) {
7242                 int err;
7243                 char fb_name[IFNAMSIZ];
7244
7245                 /* Ignore unmoveable devices (i.e. loopback) */
7246                 if (dev->features & NETIF_F_NETNS_LOCAL)
7247                         continue;
7248
7249                 /* Leave virtual devices for the generic cleanup */
7250                 if (dev->rtnl_link_ops)
7251                         continue;
7252
7253                 /* Push remaining network devices to init_net */
7254                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7255                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7256                 if (err) {
7257                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7258                                  __func__, dev->name, err);
7259                         BUG();
7260                 }
7261         }
7262         rtnl_unlock();
7263 }
7264
7265 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7266 {
7267         /* Return with the rtnl_lock held when there are no network
7268          * devices unregistering in any network namespace in net_list.
7269          */
7270         struct net *net;
7271         bool unregistering;
7272         DEFINE_WAIT(wait);
7273
7274         for (;;) {
7275                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7276                                 TASK_UNINTERRUPTIBLE);
7277                 unregistering = false;
7278                 rtnl_lock();
7279                 list_for_each_entry(net, net_list, exit_list) {
7280                         if (net->dev_unreg_count > 0) {
7281                                 unregistering = true;
7282                                 break;
7283                         }
7284                 }
7285                 if (!unregistering)
7286                         break;
7287                 __rtnl_unlock();
7288                 schedule();
7289         }
7290         finish_wait(&netdev_unregistering_wq, &wait);
7291 }
7292
7293 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7294 {
7295         /* At exit all network devices most be removed from a network
7296          * namespace.  Do this in the reverse order of registration.
7297          * Do this across as many network namespaces as possible to
7298          * improve batching efficiency.
7299          */
7300         struct net_device *dev;
7301         struct net *net;
7302         LIST_HEAD(dev_kill_list);
7303
7304         /* To prevent network device cleanup code from dereferencing
7305          * loopback devices or network devices that have been freed
7306          * wait here for all pending unregistrations to complete,
7307          * before unregistring the loopback device and allowing the
7308          * network namespace be freed.
7309          *
7310          * The netdev todo list containing all network devices
7311          * unregistrations that happen in default_device_exit_batch
7312          * will run in the rtnl_unlock() at the end of
7313          * default_device_exit_batch.
7314          */
7315         rtnl_lock_unregistering(net_list);
7316         list_for_each_entry(net, net_list, exit_list) {
7317                 for_each_netdev_reverse(net, dev) {
7318                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7319                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7320                         else
7321                                 unregister_netdevice_queue(dev, &dev_kill_list);
7322                 }
7323         }
7324         unregister_netdevice_many(&dev_kill_list);
7325         rtnl_unlock();
7326 }
7327
7328 static struct pernet_operations __net_initdata default_device_ops = {
7329         .exit = default_device_exit,
7330         .exit_batch = default_device_exit_batch,
7331 };
7332
7333 /*
7334  *      Initialize the DEV module. At boot time this walks the device list and
7335  *      unhooks any devices that fail to initialise (normally hardware not
7336  *      present) and leaves us with a valid list of present and active devices.
7337  *
7338  */
7339
7340 /*
7341  *       This is called single threaded during boot, so no need
7342  *       to take the rtnl semaphore.
7343  */
7344 static int __init net_dev_init(void)
7345 {
7346         int i, rc = -ENOMEM;
7347
7348         BUG_ON(!dev_boot_phase);
7349
7350         if (dev_proc_init())
7351                 goto out;
7352
7353         if (netdev_kobject_init())
7354                 goto out;
7355
7356         INIT_LIST_HEAD(&ptype_all);
7357         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7358                 INIT_LIST_HEAD(&ptype_base[i]);
7359
7360         INIT_LIST_HEAD(&offload_base);
7361
7362         if (register_pernet_subsys(&netdev_net_ops))
7363                 goto out;
7364
7365         /*
7366          *      Initialise the packet receive queues.
7367          */
7368
7369         for_each_possible_cpu(i) {
7370                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7371
7372                 skb_queue_head_init(&sd->input_pkt_queue);
7373                 skb_queue_head_init(&sd->process_queue);
7374                 INIT_LIST_HEAD(&sd->poll_list);
7375                 sd->output_queue_tailp = &sd->output_queue;
7376 #ifdef CONFIG_RPS
7377                 sd->csd.func = rps_trigger_softirq;
7378                 sd->csd.info = sd;
7379                 sd->cpu = i;
7380 #endif
7381
7382                 sd->backlog.poll = process_backlog;
7383                 sd->backlog.weight = weight_p;
7384         }
7385
7386         dev_boot_phase = 0;
7387
7388         /* The loopback device is special if any other network devices
7389          * is present in a network namespace the loopback device must
7390          * be present. Since we now dynamically allocate and free the
7391          * loopback device ensure this invariant is maintained by
7392          * keeping the loopback device as the first device on the
7393          * list of network devices.  Ensuring the loopback devices
7394          * is the first device that appears and the last network device
7395          * that disappears.
7396          */
7397         if (register_pernet_device(&loopback_net_ops))
7398                 goto out;
7399
7400         if (register_pernet_device(&default_device_ops))
7401                 goto out;
7402
7403         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7404         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7405
7406         hotcpu_notifier(dev_cpu_callback, 0);
7407         dst_init();
7408         rc = 0;
7409 out:
7410         return rc;
7411 }
7412
7413 subsys_initcall(net_dev_init);