net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135 #include <linux/errqueue.h>
 136
 137 #include "net-sysfs.h"
 138
/*
 * Fixed cap of 8 on GRO skbs.
 * NOTE(review): presumably the number of packets held for merging at
 * once - confirm at the use site.
 * Instead of increasing this, you should create a hash table.
 */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
/* Held around every add/remove on the ptype_base[] and ptype_all lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Held around every add/remove on the offload_base list. */
static DEFINE_SPINLOCK(offload_lock);
/* Protocol handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
struct list_head ptype_all __read_mostly;	/* Taps */
/* Registered packet_offload handlers. */
static struct list_head offload_base __read_mostly;

/* Forward declarations of static helpers used before their definitions. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);
 155
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold either dev_base_lock for reading or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they walk the dev_base_head
 * list, and additionally take dev_base_lock for writing around the actual
 * updates.  This lets pure readers access the list even while a writer
 * is preparing to update it.
 *
 * In other words, dev_base_lock is held for writing only to exclude pure
 * readers; the rtnl semaphore is what excludes other writers.
 *
 * For example usages see register_netdevice() and unregister_netdevice(),
 * both of which must be called with the rtnl semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 177
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* NOTE(review): presumably the most recently issued NAPI id - confirm at use site */
static unsigned int napi_gen_id;
/* NAPI hash table, declared with 8 hash bits. */
static DEFINE_HASHTABLE(napi_hash, 8);

/*
 * Sequence counter sampled by netdev_get_name() so that it can retry its
 * copy of a device name when the counter changed underneath it.
 */
static seqcount_t devnet_rename_seq;
 185
 186 static inline void dev_base_seq_inc(struct net *net)
 187 {
 188         while (++net->dev_base_seq == 0);
 189 }
 190
 191 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 192 {
 193         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 194
 195         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 196 }
 197
 198 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 199 {
 200         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 201 }
 202
/*
 * Take the spinlock of @sd's input_pkt_queue.
 * Compiles to an empty function unless CONFIG_RPS is defined.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}
 209
/*
 * Release the spinlock of @sd's input_pkt_queue (counterpart of rps_lock()).
 * Compiles to an empty function unless CONFIG_RPS is defined.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
 216
 217 /* Device list insertion */
 218 static void list_netdevice(struct net_device *dev)
 219 {
 220         struct net *net = dev_net(dev);
 221
 222         ASSERT_RTNL();
 223
 224         write_lock_bh(&dev_base_lock);
 225         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 226         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 227         hlist_add_head_rcu(&dev->index_hlist,
 228                            dev_index_hash(net, dev->ifindex));
 229         write_unlock_bh(&dev_base_lock);
 230
 231         dev_base_seq_inc(net);
 232 }
 233
 234 /* Device list removal
 235  * caller must respect a RCU grace period before freeing/reusing dev
 236  */
 237 static void unlist_netdevice(struct net_device *dev)
 238 {
 239         ASSERT_RTNL();
 240
 241         /* Unlink dev from the device chain */
 242         write_lock_bh(&dev_base_lock);
 243         list_del_rcu(&dev->dev_list);
 244         hlist_del_rcu(&dev->name_hlist);
 245         hlist_del_rcu(&dev->index_hlist);
 246         write_unlock_bh(&dev_base_lock);
 247
 248         dev_base_seq_inc(dev_net(dev));
 249 }
 250
/*
 *	Our notifier list
 *
 *	NOTE(review): declared as a raw notifier head, so serialization is
 *	presumably left to the callers (likely RTNL) - confirm.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Per-CPU softnet_data instance, exported to the rest of the kernel.
 *
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 264
 265 #ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
 * position found by netdev_lock_pos() in the first one indexes the name
 * table and both lock_class_key arrays below, so the two tables must be
 * kept in the same order.  The last entry (ARPHRD_NONE) is also the
 * fallback for device types that are not listed.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lock class names; entry i names the device type in netdev_lock_type[i]. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per listed device type, for the xmit and addr-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 306
 307 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 308 {
 309         int i;
 310
 311         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 312                 if (netdev_lock_type[i] == dev_type)
 313                         return i;
 314         /* the last key is used by default */
 315         return ARRAY_SIZE(netdev_lock_type) - 1;
 316 }
 317
 318 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 319                                                  unsigned short dev_type)
 320 {
 321         int i;
 322
 323         i = netdev_lock_pos(dev_type);
 324         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 325                                    netdev_lock_name[i]);
 326 }
 327
 328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev->type);
 333         lockdep_set_class_and_name(&dev->addr_list_lock,
 334                                    &netdev_addr_lock_key[i],
 335                                    netdev_lock_name[i]);
 336 }
 337 #else
/* CONFIG_LOCKDEP is not defined: both class setters are empty stubs. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 345 #endif
 346
 347 /*******************************************************************************
 348
 349                 Protocol management and registration routines
 350
 351 *******************************************************************************/
 352
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets MUST BE
 *	last in their hash buckets, and the check of protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets were
 *	first on the list, it would not be able to sense that the packet
 *	is cloned and should be copied-on-write, so it would change the
 *	packet and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 368
 369 static inline struct list_head *ptype_head(const struct packet_type *pt)
 370 {
 371         if (pt->type == htons(ETH_P_ALL))
 372                 return &ptype_all;
 373         else
 374                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 375 }
 376
 377 /**
 378  *      dev_add_pack - add packet handler
 379  *      @pt: packet type declaration
 380  *
 381  *      Add a protocol handler to the networking stack. The passed &packet_type
 382  *      is linked into kernel lists and may not be freed until it has been
 383  *      removed from the kernel lists.
 384  *
 385  *      This call does not sleep therefore it can not
 386  *      guarantee all CPU's that are in middle of receiving packets
 387  *      will see the new packet type (until the next received packet).
 388  */
 389
 390 void dev_add_pack(struct packet_type *pt)
 391 {
 392         struct list_head *head = ptype_head(pt);
 393
 394         spin_lock(&ptype_lock);
 395         list_add_rcu(&pt->list, head);
 396         spin_unlock(&ptype_lock);
 397 }
 398 EXPORT_SYMBOL(dev_add_pack);
 399
 400 /**
 401  *      __dev_remove_pack        - remove packet handler
 402  *      @pt: packet type declaration
 403  *
 404  *      Remove a protocol handler that was previously added to the kernel
 405  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 406  *      from the kernel lists and can be freed or reused once this function
 407  *      returns.
 408  *
 409  *      The packet type might still be in use by receivers
 410  *      and must not be freed until after all the CPU's have gone
 411  *      through a quiescent state.
 412  */
 413 void __dev_remove_pack(struct packet_type *pt)
 414 {
 415         struct list_head *head = ptype_head(pt);
 416         struct packet_type *pt1;
 417
 418         spin_lock(&ptype_lock);
 419
 420         list_for_each_entry(pt1, head, list) {
 421                 if (pt == pt1) {
 422                         list_del_rcu(&pt->list);
 423                         goto out;
 424                 }
 425         }
 426
 427         pr_warn("dev_remove_pack: %p not found\n", pt);
 428 out:
 429         spin_unlock(&ptype_lock);
 430 }
 431 EXPORT_SYMBOL(__dev_remove_pack);
 432
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The handler is unlinked with
 *	__dev_remove_pack() and synchronize_net() is called before
 *	returning, so the passed &packet_type can be freed or reused once
 *	this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
 452
 453
 454 /**
 455  *      dev_add_offload - register offload handlers
 456  *      @po: protocol offload declaration
 457  *
 458  *      Add protocol offload handlers to the networking stack. The passed
 459  *      &proto_offload is linked into kernel lists and may not be freed until
 460  *      it has been removed from the kernel lists.
 461  *
 462  *      This call does not sleep therefore it can not
 463  *      guarantee all CPU's that are in middle of receiving packets
 464  *      will see the new offload handlers (until the next received packet).
 465  */
 466 void dev_add_offload(struct packet_offload *po)
 467 {
 468         struct list_head *head = &offload_base;
 469
 470         spin_lock(&offload_lock);
 471         list_add_rcu(&po->list, head);
 472         spin_unlock(&offload_lock);
 473 }
 474 EXPORT_SYMBOL(dev_add_offload);
 475
 476 /**
 477  *      __dev_remove_offload     - remove offload handler
 478  *      @po: packet offload declaration
 479  *
 480  *      Remove a protocol offload handler that was previously added to the
 481  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 482  *      is removed from the kernel lists and can be freed or reused once this
 483  *      function returns.
 484  *
 485  *      The packet type might still be in use by receivers
 486  *      and must not be freed until after all the CPU's have gone
 487  *      through a quiescent state.
 488  */
 489 static void __dev_remove_offload(struct packet_offload *po)
 490 {
 491         struct list_head *head = &offload_base;
 492         struct packet_offload *po1;
 493
 494         spin_lock(&offload_lock);
 495
 496         list_for_each_entry(po1, head, list) {
 497                 if (po == po1) {
 498                         list_del_rcu(&po->list);
 499                         goto out;
 500                 }
 501         }
 502
 503         pr_warn("dev_remove_offload: %p not found\n", po);
 504 out:
 505         spin_unlock(&offload_lock);
 506 }
 507
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The handler is
 *	unlinked with __dev_remove_offload() and synchronize_net() is
 *	called before returning, so the passed &packet_offload can be
 *	freed or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the offload
 *	handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
 527
 528 /******************************************************************************
 529
 530                       Device Boot-time Settings Routines
 531
 532 *******************************************************************************/
 533
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as unused by
 * netdev_boot_setup_add() and netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 536
 537 /**
 538  *      netdev_boot_setup_add   - add new setup entry
 539  *      @name: name of the device
 540  *      @map: configured settings for the device
 541  *
 542  *      Adds new setup entry to the dev_boot_setup list.  The function
 543  *      returns 0 on error and 1 on success.  This is a generic routine to
 544  *      all netdevices.
 545  */
 546 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 547 {
 548         struct netdev_boot_setup *s;
 549         int i;
 550
 551         s = dev_boot_setup;
 552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 553                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 554                         memset(s[i].name, 0, sizeof(s[i].name));
 555                         strlcpy(s[i].name, name, IFNAMSIZ);
 556                         memcpy(&s[i].map, map, sizeof(s[i].map));
 557                         break;
 558                 }
 559         }
 560
 561         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 562 }
 563
 564 /**
 565  *      netdev_boot_setup_check - check boot time settings
 566  *      @dev: the netdevice
 567  *
 568  *      Check boot time settings for the device.
 569  *      The found settings are set for the device to be used
 570  *      later in the device probing.
 571  *      Returns 0 if no settings found, 1 if they are.
 572  */
 573 int netdev_boot_setup_check(struct net_device *dev)
 574 {
 575         struct netdev_boot_setup *s = dev_boot_setup;
 576         int i;
 577
 578         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 579                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 580                     !strcmp(dev->name, s[i].name)) {
 581                         dev->irq        = s[i].map.irq;
 582                         dev->base_addr  = s[i].map.base_addr;
 583                         dev->mem_start  = s[i].map.mem_start;
 584                         dev->mem_end    = s[i].map.mem_end;
 585                         return 1;
 586                 }
 587         }
 588         return 0;
 589 }
 590 EXPORT_SYMBOL(netdev_boot_setup_check);
 591
 592
 593 /**
 594  *      netdev_boot_base        - get address from boot time settings
 595  *      @prefix: prefix for network device
 596  *      @unit: id for network device
 597  *
 598  *      Check boot time settings for the base address of device.
 599  *      The found settings are set for the device to be used
 600  *      later in the device probing.
 601  *      Returns 0 if no settings found.
 602  */
 603 unsigned long netdev_boot_base(const char *prefix, int unit)
 604 {
 605         const struct netdev_boot_setup *s = dev_boot_setup;
 606         char name[IFNAMSIZ];
 607         int i;
 608
 609         sprintf(name, "%s%d", prefix, unit);
 610
 611         /*
 612          * If device already registered then return base of 1
 613          * to indicate not to probe for this interface
 614          */
 615         if (__dev_get_by_name(&init_net, name))
 616                 return 1;
 617
 618         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 619                 if (!strcmp(name, s[i].name))
 620                         return s[i].map.base_addr;
 621         return 0;
 622 }
 623
 624 /*
 625  * Saves at boot time configured settings for any netdevice.
 626  */
 627 int __init netdev_boot_setup(char *str)
 628 {
 629         int ints[5];
 630         struct ifmap map;
 631
 632         str = get_options(str, ARRAY_SIZE(ints), ints);
 633         if (!str || !*str)
 634                 return 0;
 635
 636         /* Save settings */
 637         memset(&map, 0, sizeof(map));
 638         if (ints[0] > 0)
 639                 map.irq = ints[1];
 640         if (ints[0] > 1)
 641                 map.base_addr = ints[2];
 642         if (ints[0] > 2)
 643                 map.mem_start = ints[3];
 644         if (ints[0] > 3)
 645                 map.mem_end = ints[4];
 646
 647         /* Add new entry to the list */
 648         return netdev_boot_setup_add(str, &map);
 649 }
 650
 651 __setup("netdev=", netdev_boot_setup);
 652
 653 /*******************************************************************************
 654
 655                             Device Interface Subroutines
 656
 657 *******************************************************************************/
 658
 659 /**
 660  *      __dev_get_by_name       - find a device by its name
 661  *      @net: the applicable net namespace
 662  *      @name: name to find
 663  *
 664  *      Find an interface by name. Must be called under RTNL semaphore
 665  *      or @dev_base_lock. If the name is found a pointer to the device
 666  *      is returned. If the name is not found then %NULL is returned. The
 667  *      reference counters are not incremented so the caller must be
 668  *      careful with locks.
 669  */
 670
 671 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 672 {
 673         struct net_device *dev;
 674         struct hlist_head *head = dev_name_hash(net, name);
 675
 676         hlist_for_each_entry(dev, head, name_hlist)
 677                 if (!strncmp(dev->name, name, IFNAMSIZ))
 678                         return dev;
 679
 680         return NULL;
 681 }
 682 EXPORT_SYMBOL(__dev_get_by_name);
 683
 684 /**
 685  *      dev_get_by_name_rcu     - find a device by its name
 686  *      @net: the applicable net namespace
 687  *      @name: name to find
 688  *
 689  *      Find an interface by name.
 690  *      If the name is found a pointer to the device is returned.
 691  *      If the name is not found then %NULL is returned.
 692  *      The reference counters are not incremented so the caller must be
 693  *      careful with locks. The caller must hold RCU lock.
 694  */
 695
 696 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 697 {
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_name_hash(net, name);
 700
 701         hlist_for_each_entry_rcu(dev, head, name_hlist)
 702                 if (!strncmp(dev->name, name, IFNAMSIZ))
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(dev_get_by_name_rcu);
 708
 709 /**
 710  *      dev_get_by_name         - find a device by its name
 711  *      @net: the applicable net namespace
 712  *      @name: name to find
 713  *
 714  *      Find an interface by name. This can be called from any
 715  *      context and does its own locking. The returned handle has
 716  *      the usage count incremented and the caller must use dev_put() to
 717  *      release it when it is no longer needed. %NULL is returned if no
 718  *      matching device is found.
 719  */
 720
 721 struct net_device *dev_get_by_name(struct net *net, const char *name)
 722 {
 723         struct net_device *dev;
 724
 725         rcu_read_lock();
 726         dev = dev_get_by_name_rcu(net, name);
 727         if (dev)
 728                 dev_hold(dev);
 729         rcu_read_unlock();
 730         return dev;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_name);
 733
 734 /**
 735  *      __dev_get_by_index - find a device by its ifindex
 736  *      @net: the applicable net namespace
 737  *      @ifindex: index of device
 738  *
 739  *      Search for an interface by index. Returns %NULL if the device
 740  *      is not found or a pointer to the device. The device has not
 741  *      had its reference counter increased so the caller must be careful
 742  *      about locking. The caller must hold either the RTNL semaphore
 743  *      or @dev_base_lock.
 744  */
 745
 746 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 747 {
 748         struct net_device *dev;
 749         struct hlist_head *head = dev_index_hash(net, ifindex);
 750
 751         hlist_for_each_entry(dev, head, index_hlist)
 752                 if (dev->ifindex == ifindex)
 753                         return dev;
 754
 755         return NULL;
 756 }
 757 EXPORT_SYMBOL(__dev_get_by_index);
 758
 759 /**
 760  *      dev_get_by_index_rcu - find a device by its ifindex
 761  *      @net: the applicable net namespace
 762  *      @ifindex: index of device
 763  *
 764  *      Search for an interface by index. Returns %NULL if the device
 765  *      is not found or a pointer to the device. The device has not
 766  *      had its reference counter increased so the caller must be careful
 767  *      about locking. The caller must hold RCU lock.
 768  */
 769
 770 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 771 {
 772         struct net_device *dev;
 773         struct hlist_head *head = dev_index_hash(net, ifindex);
 774
 775         hlist_for_each_entry_rcu(dev, head, index_hlist)
 776                 if (dev->ifindex == ifindex)
 777                         return dev;
 778
 779         return NULL;
 780 }
 781 EXPORT_SYMBOL(dev_get_by_index_rcu);
 782
 783
 784 /**
 785  *      dev_get_by_index - find a device by its ifindex
 786  *      @net: the applicable net namespace
 787  *      @ifindex: index of device
 788  *
 789  *      Search for an interface by index. Returns NULL if the device
 790  *      is not found or a pointer to the device. The device returned has
 791  *      had a reference added and the pointer is safe until the user calls
 792  *      dev_put to indicate they have finished with it.
 793  */
 794
 795 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 796 {
 797         struct net_device *dev;
 798
 799         rcu_read_lock();
 800         dev = dev_get_by_index_rcu(net, ifindex);
 801         if (dev)
 802                 dev_hold(dev);
 803         rcu_read_unlock();
 804         return dev;
 805 }
 806 EXPORT_SYMBOL(dev_get_by_index);
 807
 808 /**
 809  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 810  *      @net: network namespace
 811  *      @name: a pointer to the buffer where the name will be stored.
 812  *      @ifindex: the ifindex of the interface to get the name from.
 813  *
 814  *      The use of raw_seqcount_begin() and cond_resched() before
 815  *      retrying is required as we want to give the writers a chance
 816  *      to complete when CONFIG_PREEMPT is not set.
 817  */
 818 int netdev_get_name(struct net *net, char *name, int ifindex)
 819 {
 820         struct net_device *dev;
 821         unsigned int seq;
 822
 823 retry:
 824         seq = raw_seqcount_begin(&devnet_rename_seq);
 825         rcu_read_lock();
 826         dev = dev_get_by_index_rcu(net, ifindex);
 827         if (!dev) {
 828                 rcu_read_unlock();
 829                 return -ENODEV;
 830         }
 831
 832         strcpy(name, dev->name);
 833         rcu_read_unlock();
 834         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 835                 cond_resched();
 836                 goto retry;
 837         }
 838
 839         return 0;
 840 }
 841
 842 /**
 843  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 844  *      @net: the applicable net namespace
 845  *      @type: media type of device
 846  *      @ha: hardware address
 847  *
 848  *      Search for an interface by MAC address. Returns NULL if the device
 849  *      is not found or a pointer to the device.
 850  *      The caller must hold RCU or RTNL.
 851  *      The returned device has not had its ref count increased
 852  *      and the caller must therefore be careful about locking
 853  *
 854  */
 855
 856 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 857                                        const char *ha)
 858 {
 859         struct net_device *dev;
 860
 861         for_each_netdev_rcu(net, dev)
 862                 if (dev->type == type &&
 863                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 864                         return dev;
 865
 866         return NULL;
 867 }
 868 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 869
 870 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 871 {
 872         struct net_device *dev;
 873
 874         ASSERT_RTNL();
 875         for_each_netdev(net, dev)
 876                 if (dev->type == type)
 877                         return dev;
 878
 879         return NULL;
 880 }
 881 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 882
 883 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 884 {
 885         struct net_device *dev, *ret = NULL;
 886
 887         rcu_read_lock();
 888         for_each_netdev_rcu(net, dev)
 889                 if (dev->type == type) {
 890                         dev_hold(dev);
 891                         ret = dev;
 892                         break;
 893                 }
 894         rcu_read_unlock();
 895         return ret;
 896 }
 897 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 898
 899 /**
 900  *      dev_get_by_flags_rcu - find any device with given flags
 901  *      @net: the applicable net namespace
 902  *      @if_flags: IFF_* values
 903  *      @mask: bitmask of bits in if_flags to check
 904  *
 905  *      Search for any interface with the given flags. Returns NULL if a device
 906  *      is not found or a pointer to the device. Must be called inside
 907  *      rcu_read_lock(), and result refcount is unchanged.
 908  */
 909
 910 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 911                                     unsigned short mask)
 912 {
 913         struct net_device *dev, *ret;
 914
 915         ret = NULL;
 916         for_each_netdev_rcu(net, dev) {
 917                 if (((dev->flags ^ if_flags) & mask) == 0) {
 918                         ret = dev;
 919                         break;
 920                 }
 921         }
 922         return ret;
 923 }
 924 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 925
 926 /**
 927  *      dev_valid_name - check if name is okay for network device
 928  *      @name: name string
 929  *
 930  *      Network device names need to be valid file names to
 931  *      to allow sysfs to work.  We also disallow any kind of
 932  *      whitespace.
 933  */
 934 bool dev_valid_name(const char *name)
 935 {
 936         if (*name == '\0')
 937                 return false;
 938         if (strlen(name) >= IFNAMSIZ)
 939                 return false;
 940         if (!strcmp(name, ".") || !strcmp(name, ".."))
 941                 return false;
 942
 943         while (*name) {
 944                 if (*name == '/' || isspace(*name))
 945                         return false;
 946                 name++;
 947         }
 948         return true;
 949 }
 950 EXPORT_SYMBOL(dev_valid_name);
 951
/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code:
 *      -EINVAL if the format is not exactly one "%d", -ENOMEM if the bitmap
 *      page cannot be allocated, -ENFILE if the resulting name is already
 *      in use. When @name contains no '%' no scan is done and the unit
 *      number returned on success is 0.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        /* Only the first IFNAMSIZ-1 characters are searched for '%'. */
        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be either one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                /* Mark every unit number already used by a device in @net. */
                for_each_netdev(net, d) {
                        /* Skip devices whose name does not parse with the format. */
                        if (!sscanf(d->name, name, &i))
                                continue;
                        /* Units outside the bitmap cannot be recorded. */
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                /* Lowest unit not marked; equals max_netdevices if all are set. */
                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        /* Format the candidate unless the caller passed the same buffer for both. */
        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        /* Final check: the formatted name must not already exist in @net. */
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
1017
1018 /**
1019  *      dev_alloc_name - allocate a name for a device
1020  *      @dev: device
1021  *      @name: name format string
1022  *
1023  *      Passed a format string - eg "lt%d" it will try and find a suitable
1024  *      id. It scans list of devices to build up a free map, then chooses
1025  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1026  *      while allocating the name and adding the device in order to avoid
1027  *      duplicates.
1028  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1029  *      Returns the number of the unit assigned or a negative errno code.
1030  */
1031
1032 int dev_alloc_name(struct net_device *dev, const char *name)
1033 {
1034         char buf[IFNAMSIZ];
1035         struct net *net;
1036         int ret;
1037
1038         BUG_ON(!dev_net(dev));
1039         net = dev_net(dev);
1040         ret = __dev_alloc_name(net, name, buf);
1041         if (ret >= 0)
1042                 strlcpy(dev->name, buf, IFNAMSIZ);
1043         return ret;
1044 }
1045 EXPORT_SYMBOL(dev_alloc_name);
1046
1047 static int dev_alloc_name_ns(struct net *net,
1048                              struct net_device *dev,
1049                              const char *name)
1050 {
1051         char buf[IFNAMSIZ];
1052         int ret;
1053
1054         ret = __dev_alloc_name(net, name, buf);
1055         if (ret >= 0)
1056                 strlcpy(dev->name, buf, IFNAMSIZ);
1057         return ret;
1058 }
1059
1060 static int dev_get_valid_name(struct net *net,
1061                               struct net_device *dev,
1062                               const char *name)
1063 {
1064         BUG_ON(!net);
1065
1066         if (!dev_valid_name(name))
1067                 return -EINVAL;
1068
1069         if (strchr(name, '%'))
1070                 return dev_alloc_name_ns(net, dev, name);
1071         else if (__dev_get_by_name(net, name))
1072                 return -EEXIST;
1073         else if (dev->name != name)
1074                 strlcpy(dev->name, name, IFNAMSIZ);
1075
1076         return 0;
1077 }
1078
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string); the buffer must be at least
 *                IFNAMSIZ bytes because IFNAMSIZ bytes are copied from it
 *                on the rollback path
 *
 *      Change name of a device. A format string such as "eth%d" can be
 *      passed for wildcarding.
 *
 *      Returns -EBUSY if the device is up, 0 if the name is unchanged, a
 *      negative errno from dev_get_valid_name() or device_rename(), or the
 *      errno reported by a NETDEV_CHANGENAME notifier (after the old name
 *      has been restored). On success the non-negative value returned by
 *      dev_get_valid_name() is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /* Every return below that is reached with the seqcount open closes it first. */
        write_seqcount_begin(&devnet_rename_seq);

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the current name so it can be restored on failure. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* On success this has already written the new name into dev->name. */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * This label is entered twice at most: once for the forward rename
         * and once more, with oldname/dev->name swapped, to undo it after a
         * notifier refused the change.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /* Unhash under the old name, wait for RCU readers, rehash under the new one. */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First notifier failure: record it, swap the names
                         * and assign types, and run the rename again in
                         * reverse.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The undo itself was refused; report it and give up. */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1167
1168 /**
1169  *      dev_set_alias - change ifalias of a device
1170  *      @dev: device
1171  *      @alias: name up to IFALIASZ
1172  *      @len: limit of bytes to copy from info
1173  *
1174  *      Set ifalias for a device,
1175  */
1176 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1177 {
1178         char *new_ifalias;
1179
1180         ASSERT_RTNL();
1181
1182         if (len >= IFALIASZ)
1183                 return -EINVAL;
1184
1185         if (!len) {
1186                 kfree(dev->ifalias);
1187                 dev->ifalias = NULL;
1188                 return 0;
1189         }
1190
1191         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1192         if (!new_ifalias)
1193                 return -ENOMEM;
1194         dev->ifalias = new_ifalias;
1195
1196         strlcpy(dev->ifalias, alias, len+1);
1197         return len;
1198 }
1199
1200
1201 /**
1202  *      netdev_features_change - device changes features
1203  *      @dev: device to cause notification
1204  *
1205  *      Called to indicate a device has changed features.
1206  */
1207 void netdev_features_change(struct net_device *dev)
1208 {
1209         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1210 }
1211 EXPORT_SYMBOL(netdev_features_change);
1212
1213 /**
1214  *      netdev_state_change - device changes state
1215  *      @dev: device to cause notification
1216  *
1217  *      Called to indicate a device has changed state. This function calls
1218  *      the notifier chains for netdev_chain and sends a NEWLINK message
1219  *      to the routing socket.
1220  */
1221 void netdev_state_change(struct net_device *dev)
1222 {
1223         if (dev->flags & IFF_UP) {
1224                 struct netdev_notifier_change_info change_info;
1225
1226                 change_info.flags_changed = 0;
1227                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1228                                               &change_info.info);
1229                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1230         }
1231 }
1232 EXPORT_SYMBOL(netdev_state_change);
1233
1234 /**
1235  *      netdev_notify_peers - notify network peers about existence of @dev
1236  *      @dev: network device
1237  *
1238  * Generate traffic such that interested network peers are aware of
1239  * @dev, such as by generating a gratuitous ARP. This may be used when
1240  * a device wants to inform the rest of the network about some sort of
1241  * reconfiguration such as a failover event or virtual machine
1242  * migration.
1243  */
1244 void netdev_notify_peers(struct net_device *dev)
1245 {
1246         rtnl_lock();
1247         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1248         rtnl_unlock();
1249 }
1250 EXPORT_SYMBOL(netdev_notify_peers);
1251
1252 static int __dev_open(struct net_device *dev)
1253 {
1254         const struct net_device_ops *ops = dev->netdev_ops;
1255         int ret;
1256
1257         ASSERT_RTNL();
1258
1259         if (!netif_device_present(dev))
1260                 return -ENODEV;
1261
1262         /* Block netpoll from trying to do any rx path servicing.
1263          * If we don't do this there is a chance ndo_poll_controller
1264          * or ndo_poll may be running while we open the device
1265          */
1266         netpoll_poll_disable(dev);
1267
1268         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1269         ret = notifier_to_errno(ret);
1270         if (ret)
1271                 return ret;
1272
1273         set_bit(__LINK_STATE_START, &dev->state);
1274
1275         if (ops->ndo_validate_addr)
1276                 ret = ops->ndo_validate_addr(dev);
1277
1278         if (!ret && ops->ndo_open)
1279                 ret = ops->ndo_open(dev);
1280
1281         netpoll_poll_enable(dev);
1282
1283         if (ret)
1284                 clear_bit(__LINK_STATE_START, &dev->state);
1285         else {
1286                 dev->flags |= IFF_UP;
1287                 net_dmaengine_get();
1288                 dev_set_rx_mode(dev);
1289                 dev_activate(dev);
1290                 add_device_randomness(dev->dev_addr, dev->addr_len);
1291         }
1292
1293         return ret;
1294 }
1295
1296 /**
1297  *      dev_open        - prepare an interface for use.
1298  *      @dev:   device to open
1299  *
1300  *      Takes a device from down to up state. The device's private open
1301  *      function is invoked and then the multicast lists are loaded. Finally
1302  *      the device is moved into the up state and a %NETDEV_UP message is
1303  *      sent to the netdev notifier chain.
1304  *
1305  *      Calling this function on an active interface is a nop. On a failure
1306  *      a negative errno code is returned.
1307  */
1308 int dev_open(struct net_device *dev)
1309 {
1310         int ret;
1311
1312         if (dev->flags & IFF_UP)
1313                 return 0;
1314
1315         ret = __dev_open(dev);
1316         if (ret < 0)
1317                 return ret;
1318
1319         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1320         call_netdevice_notifiers(NETDEV_UP, dev);
1321
1322         return ret;
1323 }
1324 EXPORT_SYMBOL(dev_open);
1325
1326 static int __dev_close_many(struct list_head *head)
1327 {
1328         struct net_device *dev;
1329
1330         ASSERT_RTNL();
1331         might_sleep();
1332
1333         list_for_each_entry(dev, head, close_list) {
1334                 /* Temporarily disable netpoll until the interface is down */
1335                 netpoll_poll_disable(dev);
1336
1337                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1338
1339                 clear_bit(__LINK_STATE_START, &dev->state);
1340
1341                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1342                  * can be even on different cpu. So just clear netif_running().
1343                  *
1344                  * dev->stop() will invoke napi_disable() on all of it's
1345                  * napi_struct instances on this device.
1346                  */
1347                 smp_mb__after_atomic(); /* Commit netif_running(). */
1348         }
1349
1350         dev_deactivate_many(head);
1351
1352         list_for_each_entry(dev, head, close_list) {
1353                 const struct net_device_ops *ops = dev->netdev_ops;
1354
1355                 /*
1356                  *      Call the device specific close. This cannot fail.
1357                  *      Only if device is UP
1358                  *
1359                  *      We allow it to be called even after a DETACH hot-plug
1360                  *      event.
1361                  */
1362                 if (ops->ndo_stop)
1363                         ops->ndo_stop(dev);
1364
1365                 dev->flags &= ~IFF_UP;
1366                 net_dmaengine_put();
1367                 netpoll_poll_enable(dev);
1368         }
1369
1370         return 0;
1371 }
1372
1373 static int __dev_close(struct net_device *dev)
1374 {
1375         int retval;
1376         LIST_HEAD(single);
1377
1378         list_add(&dev->close_list, &single);
1379         retval = __dev_close_many(&single);
1380         list_del(&single);
1381
1382         return retval;
1383 }
1384
1385 static int dev_close_many(struct list_head *head)
1386 {
1387         struct net_device *dev, *tmp;
1388
1389         /* Remove the devices that don't need to be closed */
1390         list_for_each_entry_safe(dev, tmp, head, close_list)
1391                 if (!(dev->flags & IFF_UP))
1392                         list_del_init(&dev->close_list);
1393
1394         __dev_close_many(head);
1395
1396         list_for_each_entry_safe(dev, tmp, head, close_list) {
1397                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1398                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1399                 list_del_init(&dev->close_list);
1400         }
1401
1402         return 0;
1403 }
1404
1405 /**
1406  *      dev_close - shutdown an interface.
1407  *      @dev: device to shutdown
1408  *
1409  *      This function moves an active device into down state. A
1410  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1411  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1412  *      chain.
1413  */
1414 int dev_close(struct net_device *dev)
1415 {
1416         if (dev->flags & IFF_UP) {
1417                 LIST_HEAD(single);
1418
1419                 list_add(&dev->close_list, &single);
1420                 dev_close_many(&single);
1421                 list_del(&single);
1422         }
1423         return 0;
1424 }
1425 EXPORT_SYMBOL(dev_close);
1426
1427
1428 /**
1429  *      dev_disable_lro - disable Large Receive Offload on a device
1430  *      @dev: device
1431  *
1432  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1433  *      called under RTNL.  This is needed if received packets may be
1434  *      forwarded to another interface.
1435  */
1436 void dev_disable_lro(struct net_device *dev)
1437 {
1438         /*
1439          * If we're trying to disable lro on a vlan device
1440          * use the underlying physical device instead
1441          */
1442         if (is_vlan_dev(dev))
1443                 dev = vlan_dev_real_dev(dev);
1444
1445         /* the same for macvlan devices */
1446         if (netif_is_macvlan(dev))
1447                 dev = macvlan_dev_real_dev(dev);
1448
1449         dev->wanted_features &= ~NETIF_F_LRO;
1450         netdev_update_features(dev);
1451
1452         if (unlikely(dev->features & NETIF_F_LRO))
1453                 netdev_WARN(dev, "failed to disable LRO!\n");
1454 }
1455 EXPORT_SYMBOL(dev_disable_lro);
1456
1457 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1458                                    struct net_device *dev)
1459 {
1460         struct netdev_notifier_info info;
1461
1462         netdev_notifier_info_init(&info, dev);
1463         return nb->notifier_call(nb, val, &info);
1464 }
1465
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * does not replay existing devices to a newly registered notifier.
 */
static int dev_boot_phase = 1;
1467
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered all registration and up events are replayed
 *      to the new notifier to allow device to have a race free
 *      view of the network device list.
 *
 *      If the notifier rejects a replayed NETDEV_REGISTER, the devices
 *      already replayed are unwound with synthesized down/unregister
 *      events and the notifier is removed from the chain again.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* No replay while dev_boot_phase is still set. */
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        /* The result of the replayed NETDEV_UP is ignored. */
                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /* Walk the devices again and stop at the one that failed. */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1534
1535 /**
1536  *      unregister_netdevice_notifier - unregister a network notifier block
1537  *      @nb: notifier
1538  *
1539  *      Unregister a notifier previously registered by
1540  *      register_netdevice_notifier(). The notifier is unlinked into the
1541  *      kernel structures and may then be reused. A negative errno code
1542  *      is returned on a failure.
1543  *
1544  *      After unregistering unregister and down device events are synthesized
1545  *      for all devices on the device list to the removed notifier to remove
1546  *      the need for special case cleanup code.
1547  */
1548
1549 int unregister_netdevice_notifier(struct notifier_block *nb)
1550 {
1551         struct net_device *dev;
1552         struct net *net;
1553         int err;
1554
1555         rtnl_lock();
1556         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1557         if (err)
1558                 goto unlock;
1559
1560         for_each_net(net) {
1561                 for_each_netdev(net, dev) {
1562                         if (dev->flags & IFF_UP) {
1563                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1564                                                         dev);
1565                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1566                         }
1567                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1568                 }
1569         }
1570 unlock:
1571         rtnl_unlock();
1572         return err;
1573 }
1574 EXPORT_SYMBOL(unregister_netdevice_notifier);
1575
1576 /**
1577  *      call_netdevice_notifiers_info - call all network notifier blocks
1578  *      @val: value passed unmodified to notifier function
1579  *      @dev: net_device pointer passed unmodified to notifier function
1580  *      @info: notifier information data
1581  *
1582  *      Call all network notifier blocks.  Parameters and return value
1583  *      are as for raw_notifier_call_chain().
1584  */
1585
1586 static int call_netdevice_notifiers_info(unsigned long val,
1587                                          struct net_device *dev,
1588                                          struct netdev_notifier_info *info)
1589 {
1590         ASSERT_RTNL();
1591         netdev_notifier_info_init(info, dev);
1592         return raw_notifier_call_chain(&netdev_chain, val, info);
1593 }
1594
1595 /**
1596  *      call_netdevice_notifiers - call all network notifier blocks
1597  *      @val: value passed unmodified to notifier function
1598  *      @dev: net_device pointer passed unmodified to notifier function
1599  *
1600  *      Call all network notifier blocks.  Parameters and return value
1601  *      are as for raw_notifier_call_chain().
1602  */
1603
1604 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1605 {
1606         struct netdev_notifier_info info;
1607
1608         return call_netdevice_notifiers_info(val, dev, &info);
1609 }
1610 EXPORT_SYMBOL(call_netdevice_notifiers);
1611
/*
 * Static key raised by net_enable_timestamp() and lowered by
 * net_disable_timestamp(); the timestamp helpers below only stamp an skb
 * while it is enabled.
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 * This counter holds the number of such deferred decrements; it is
 * drained by the next net_enable_timestamp() call.
 */
static atomic_t netstamp_needed_deferred;
#endif
1620
1621 void net_enable_timestamp(void)
1622 {
1623 #ifdef HAVE_JUMP_LABEL
1624         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1625
1626         if (deferred) {
1627                 while (--deferred)
1628                         static_key_slow_dec(&netstamp_needed);
1629                 return;
1630         }
1631 #endif
1632         static_key_slow_inc(&netstamp_needed);
1633 }
1634 EXPORT_SYMBOL(net_enable_timestamp);
1635
1636 void net_disable_timestamp(void)
1637 {
1638 #ifdef HAVE_JUMP_LABEL
1639         if (in_interrupt()) {
1640                 atomic_inc(&netstamp_needed_deferred);
1641                 return;
1642         }
1643 #endif
1644         static_key_slow_dec(&netstamp_needed);
1645 }
1646 EXPORT_SYMBOL(net_disable_timestamp);
1647
1648 static inline void net_timestamp_set(struct sk_buff *skb)
1649 {
1650         skb->tstamp.tv64 = 0;
1651         if (static_key_false(&netstamp_needed))
1652                 __net_timestamp(skb);
1653 }
1654
/*
 * Stamp SKB when timestamping is enabled, COND holds and SKB has no
 * timestamp yet. Wrapped in do { } while (0) so the macro behaves as a
 * single statement, including inside an unbraced if/else. COND is only
 * evaluated when the netstamp_needed key is enabled.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1660
1661 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1662 {
1663         unsigned int len;
1664
1665         if (!(dev->flags & IFF_UP))
1666                 return false;
1667
1668         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1669         if (skb->len <= len)
1670                 return true;
1671
1672         /* if TSO is enabled, we don't care about the length as the packet
1673          * could be forwarded without being segmented before
1674          */
1675         if (skb_is_gso(skb))
1676                 return true;
1677
1678         return false;
1679 }
1680 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1681
1682 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1683 {
1684         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1685                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1686                         atomic_long_inc(&dev->rx_dropped);
1687                         kfree_skb(skb);
1688                         return NET_RX_DROP;
1689                 }
1690         }
1691
1692         if (unlikely(!is_skb_forwardable(dev, skb))) {
1693                 atomic_long_inc(&dev->rx_dropped);
1694                 kfree_skb(skb);
1695                 return NET_RX_DROP;
1696         }
1697
1698         skb_scrub_packet(skb, true);
1699         skb->protocol = eth_type_trans(skb, dev);
1700
1701         return 0;
1702 }
1703 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1704
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The skb is first prepared by __dev_forward_skb(); a non-zero result
 * from that step is returned as is, otherwise the skb is handed to
 * netif_rx_internal() and its result is returned.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc = __dev_forward_skb(dev, skb);

        if (rc)
                return rc;

        return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1728
1729 static inline int deliver_skb(struct sk_buff *skb,
1730                               struct packet_type *pt_prev,
1731                               struct net_device *orig_dev)
1732 {
1733         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1734                 return -ENOMEM;
1735         atomic_inc(&skb->users);
1736         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1737 }
1738
1739 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1740 {
1741         if (!ptype->af_packet_priv || !skb->sk)
1742                 return false;
1743
1744         if (ptype->id_match)
1745                 return ptype->id_match(ptype, skb->sk);
1746         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1747                 return true;
1748
1749         return false;
1750 }
1751
1752 /*
1753  *      Support routine. Sends outgoing frames to any network
1754  *      taps currently in use.
1755  */
1756
1757 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1758 {
1759         struct packet_type *ptype;
1760         struct sk_buff *skb2 = NULL;
1761         struct packet_type *pt_prev = NULL;
1762
1763         rcu_read_lock();
1764         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1765                 /* Never send packets back to the socket
1766                  * they originated from - MvS (miquels@drinkel.ow.org)
1767                  */
1768                 if ((ptype->dev == dev || !ptype->dev) &&
1769                     (!skb_loop_sk(ptype, skb))) {
1770                         if (pt_prev) {
1771                                 deliver_skb(skb2, pt_prev, skb->dev);
1772                                 pt_prev = ptype;
1773                                 continue;
1774                         }
1775
1776                         skb2 = skb_clone(skb, GFP_ATOMIC);
1777                         if (!skb2)
1778                                 break;
1779
1780                         net_timestamp_set(skb2);
1781
1782                         /* skb->nh should be correctly
1783                            set by sender, so that the second statement is
1784                            just protection against buggy protocols.
1785                          */
1786                         skb_reset_mac_header(skb2);
1787
1788                         if (skb_network_header(skb2) < skb2->data ||
1789                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1790                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1791                                                      ntohs(skb2->protocol),
1792                                                      dev->name);
1793                                 skb_reset_network_header(skb2);
1794                         }
1795
1796                         skb2->transport_header = skb2->network_header;
1797                         skb2->pkt_type = PACKET_OUTGOING;
1798                         pt_prev = ptype;
1799                 }
1800         }
1801         if (pt_prev)
1802                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1803         rcu_read_unlock();
1804 }
1805
1806 /**
1807  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1808  * @dev: Network device
1809  * @txq: number of queues available
1810  *
1811  * If real_num_tx_queues is changed the tc mappings may no longer be
1812  * valid. To resolve this verify the tc mapping remains valid and if
1813  * not NULL the mapping. With no priorities mapping to this
1814  * offset/count pair it will no longer be used. In the worst case TC0
1815  * is invalid nothing can be done so disable priority mappings. If is
1816  * expected that drivers will fix this mapping if they can before
1817  * calling netif_set_real_num_tx_queues.
1818  */
1819 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1820 {
1821         int i;
1822         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1823
1824         /* If TC0 is invalidated disable TC mapping */
1825         if (tc->offset + tc->count > txq) {
1826                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1827                 dev->num_tc = 0;
1828                 return;
1829         }
1830
1831         /* Invalidated prio to tc mappings set to TC0 */
1832         for (i = 1; i < TC_BITMASK + 1; i++) {
1833                 int q = netdev_get_prio_tc_map(dev, i);
1834
1835                 tc = &dev->tc_to_txq[q];
1836                 if (tc->offset + tc->count > txq) {
1837                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1838                                 i, q);
1839                         netdev_set_prio_tc_map(dev, i, 0);
1840                 }
1841         }
1842 }
1843
#ifdef CONFIG_XPS
/* Held by the XPS routines below while they read or modify the XPS maps. */
static DEFINE_MUTEX(xps_map_mutex);
/*
 * Dereference an RCU-protected XPS pointer on the update side; lockdep
 * checks that xps_map_mutex is held.
 */
#define xmap_dereference(P)             \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1848
1849 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1850                                         int cpu, u16 index)
1851 {
1852         struct xps_map *map = NULL;
1853         int pos;
1854
1855         if (dev_maps)
1856                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1857
1858         for (pos = 0; map && pos < map->len; pos++) {
1859                 if (map->queues[pos] == index) {
1860                         if (map->len > 1) {
1861                                 map->queues[pos] = map->queues[--map->len];
1862                         } else {
1863                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1864                                 kfree_rcu(map, rcu);
1865                                 map = NULL;
1866                         }
1867                         break;
1868                 }
1869         }
1870
1871         return map;
1872 }
1873
1874 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1875 {
1876         struct xps_dev_maps *dev_maps;
1877         int cpu, i;
1878         bool active = false;
1879
1880         mutex_lock(&xps_map_mutex);
1881         dev_maps = xmap_dereference(dev->xps_maps);
1882
1883         if (!dev_maps)
1884                 goto out_no_maps;
1885
1886         for_each_possible_cpu(cpu) {
1887                 for (i = index; i < dev->num_tx_queues; i++) {
1888                         if (!remove_xps_queue(dev_maps, cpu, i))
1889                                 break;
1890                 }
1891                 if (i == dev->num_tx_queues)
1892                         active = true;
1893         }
1894
1895         if (!active) {
1896                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1897                 kfree_rcu(dev_maps, rcu);
1898         }
1899
1900         for (i = index; i < dev->num_tx_queues; i++)
1901                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1902                                              NUMA_NO_NODE);
1903
1904 out_no_maps:
1905         mutex_unlock(&xps_map_mutex);
1906 }
1907
1908 static struct xps_map *expand_xps_map(struct xps_map *map,
1909                                       int cpu, u16 index)
1910 {
1911         struct xps_map *new_map;
1912         int alloc_len = XPS_MIN_MAP_ALLOC;
1913         int i, pos;
1914
1915         for (pos = 0; map && pos < map->len; pos++) {
1916                 if (map->queues[pos] != index)
1917                         continue;
1918                 return map;
1919         }
1920
1921         /* Need to add queue to this CPU's existing map */
1922         if (map) {
1923                 if (pos < map->alloc_len)
1924                         return map;
1925
1926                 alloc_len = map->alloc_len * 2;
1927         }
1928
1929         /* Need to allocate new map to store queue on this CPU's map */
1930         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1931                                cpu_to_node(cpu));
1932         if (!new_map)
1933                 return NULL;
1934
1935         for (i = 0; i < pos; i++)
1936                 new_map->queues[i] = map->queues[i];
1937         new_map->alloc_len = alloc_len;
1938         new_map->len = pos;
1939
1940         return new_map;
1941 }
1942
/*
 * netif_set_xps_queue - set the XPS CPU mask of tx queue @index on @dev
 *
 * Builds a new xps_dev_maps in which every online CPU in @mask has @index
 * in its map, publishes it in dev->xps_maps, and removes @index from the
 * maps of all other CPUs.  Old maps that were replaced are released via
 * kfree_rcu().  All of this runs under xps_map_mutex.
 *
 * Returns 0 on success or -ENOMEM if an allocation failed; on failure the
 * maps allocated by this call are freed and dev->xps_maps is untouched.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
        struct xps_map *map, *new_map;
        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
        /* -2: no CPU seen yet, -1: CPUs on differing nodes, >= 0: the node */
        int cpu, numa_node_id = -2;
        bool active = false;

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps);

        /* allocate memory for queue storage */
        for_each_online_cpu(cpu) {
                if (!cpumask_test_cpu(cpu, mask))
                        continue;

                /* The device map is allocated on the first selected CPU. */
                if (!new_dev_maps)
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
                        mutex_unlock(&xps_map_mutex);
                        return -ENOMEM;
                }

                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;

                /* May return the old map itself if it has room. */
                map = expand_xps_map(map, cpu, index);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
        }

        /* No online CPU selected: only remove the queue from old maps. */
        if (!new_dev_maps)
                goto out_no_new_maps;

        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
                        /* add queue to CPU maps */
                        int pos = 0;

                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (numa_node_id == -2)
                                numa_node_id = cpu_to_node(cpu);
                        else if (numa_node_id != cpu_to_node(cpu))
                                numa_node_id = -1;
#endif
                } else if (dev_maps) {
                        /* fill in the new device map from the old device map */
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
                }

        }

        /* Publish the new device map. */
        rcu_assign_pointer(dev->xps_maps, new_dev_maps);

        /* Cleanup old maps */
        if (dev_maps) {
                for_each_possible_cpu(cpu) {
                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        /* Only free per-cpu maps that were replaced. */
                        if (map && map != new_map)
                                kfree_rcu(map, rcu);
                }

                kfree_rcu(dev_maps, rcu);
        }

        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        /* update Tx queue numa node */
        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                     (numa_node_id >= 0) ? numa_node_id :
                                     NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes queue from unused CPUs */
        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
                        continue;

                if (remove_xps_queue(dev_maps, cpu, index))
                        active = true;
        }

        /* free map if not active */
        if (!active) {
                RCU_INIT_POINTER(dev->xps_maps, NULL);
                kfree_rcu(dev_maps, rcu);
        }

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for_each_possible_cpu(cpu) {
                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;
                /* Maps shared with the old device map must stay alive. */
                if (new_map && new_map != map)
                        kfree(new_map);
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2067
2068 #endif
2069 /*
2070  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2071  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2072  */
2073 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2074 {
2075         int rc;
2076
2077         if (txq < 1 || txq > dev->num_tx_queues)
2078                 return -EINVAL;
2079
2080         if (dev->reg_state == NETREG_REGISTERED ||
2081             dev->reg_state == NETREG_UNREGISTERING) {
2082                 ASSERT_RTNL();
2083
2084                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2085                                                   txq);
2086                 if (rc)
2087                         return rc;
2088
2089                 if (dev->num_tc)
2090                         netif_setup_tc(dev, txq);
2091
2092                 if (txq < dev->real_num_tx_queues) {
2093                         qdisc_reset_all_tx_gt(dev, txq);
2094 #ifdef CONFIG_XPS
2095                         netif_reset_xps_queues_gt(dev, txq);
2096 #endif
2097                 }
2098         }
2099
2100         dev->real_num_tx_queues = txq;
2101         return 0;
2102 }
2103 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2104
2105 #ifdef CONFIG_SYSFS
2106 /**
2107  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2108  *      @dev: Network device
2109  *      @rxq: Actual number of RX queues
2110  *
2111  *      This must be called either with the rtnl_lock held or before
2112  *      registration of the net device.  Returns 0 on success, or a
2113  *      negative error code.  If called before registration, it always
2114  *      succeeds.
2115  */
2116 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2117 {
2118         int rc;
2119
2120         if (rxq < 1 || rxq > dev->num_rx_queues)
2121                 return -EINVAL;
2122
2123         if (dev->reg_state == NETREG_REGISTERED) {
2124                 ASSERT_RTNL();
2125
2126                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2127                                                   rxq);
2128                 if (rc)
2129                         return rc;
2130         }
2131
2132         dev->real_num_rx_queues = rxq;
2133         return 0;
2134 }
2135 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2136 #endif
2137
2138 /**
2139  * netif_get_num_default_rss_queues - default number of RSS queues
2140  *
2141  * This routine should set an upper limit on the number of RSS queues
2142  * used by default by multiqueue devices.
2143  */
2144 int netif_get_num_default_rss_queues(void)
2145 {
2146         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2147 }
2148 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2149
2150 static inline void __netif_reschedule(struct Qdisc *q)
2151 {
2152         struct softnet_data *sd;
2153         unsigned long flags;
2154
2155         local_irq_save(flags);
2156         sd = &__get_cpu_var(softnet_data);
2157         q->next_sched = NULL;
2158         *sd->output_queue_tailp = q;
2159         sd->output_queue_tailp = &q->next_sched;
2160         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2161         local_irq_restore(flags);
2162 }
2163
2164 void __netif_schedule(struct Qdisc *q)
2165 {
2166         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2167                 __netif_reschedule(q);
2168 }
2169 EXPORT_SYMBOL(__netif_schedule);
2170
/*
 * Layout of skb->cb while an skb waits on the completion queue: records
 * the reason passed to __dev_kfree_skb_irq().
 */
struct dev_kfree_skb_cb {
        enum skb_free_reason reason;
};
2174
2175 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2176 {
2177         return (struct dev_kfree_skb_cb *)skb->cb;
2178 }
2179
2180 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2181 {
2182         unsigned long flags;
2183
2184         if (likely(atomic_read(&skb->users) == 1)) {
2185                 smp_rmb();
2186                 atomic_set(&skb->users, 0);
2187         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2188                 return;
2189         }
2190         get_kfree_skb_cb(skb)->reason = reason;
2191         local_irq_save(flags);
2192         skb->next = __this_cpu_read(softnet_data.completion_queue);
2193         __this_cpu_write(softnet_data.completion_queue, skb);
2194         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2195         local_irq_restore(flags);
2196 }
2197 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2198
2199 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2200 {
2201         if (in_irq() || irqs_disabled())
2202                 __dev_kfree_skb_irq(skb, reason);
2203         else
2204                 dev_kfree_skb(skb);
2205 }
2206 EXPORT_SYMBOL(__dev_kfree_skb_any);
2207
2208
2209 /**
2210  * netif_device_detach - mark device as removed
2211  * @dev: network device
2212  *
2213  * Mark device as removed from system and therefore no longer available.
2214  */
2215 void netif_device_detach(struct net_device *dev)
2216 {
2217         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2218             netif_running(dev)) {
2219                 netif_tx_stop_all_queues(dev);
2220         }
2221 }
2222 EXPORT_SYMBOL(netif_device_detach);
2223
2224 /**
2225  * netif_device_attach - mark device as attached
2226  * @dev: network device
2227  *
2228  * Mark device as attached from system and restart if needed.
2229  */
2230 void netif_device_attach(struct net_device *dev)
2231 {
2232         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2233             netif_running(dev)) {
2234                 netif_tx_wake_all_queues(dev);
2235                 __netdev_watchdog_up(dev);
2236         }
2237 }
2238 EXPORT_SYMBOL(netif_device_attach);
2239
2240 static void skb_warn_bad_offload(const struct sk_buff *skb)
2241 {
2242         static const netdev_features_t null_features = 0;
2243         struct net_device *dev = skb->dev;
2244         const char *driver = "";
2245
2246         if (!net_ratelimit())
2247                 return;
2248
2249         if (dev && dev->dev.parent)
2250                 driver = dev_driver_string(dev->dev.parent);
2251
2252         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2253              "gso_type=%d ip_summed=%d\n",
2254              driver, dev ? &dev->features : &null_features,
2255              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2256              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2257              skb_shinfo(skb)->gso_type, skb->ip_summed);
2258 }
2259
2260 /*
2261  * Invalidate hardware checksum when packet is to be mangled, and
2262  * complete checksum manually on outgoing path.
2263  */
2264 int skb_checksum_help(struct sk_buff *skb)
2265 {
2266         __wsum csum;
2267         int ret = 0, offset;
2268
2269         if (skb->ip_summed == CHECKSUM_COMPLETE)
2270                 goto out_set_summed;
2271
2272         if (unlikely(skb_shinfo(skb)->gso_size)) {
2273                 skb_warn_bad_offload(skb);
2274                 return -EINVAL;
2275         }
2276
2277         /* Before computing a checksum, we should make sure no frag could
2278          * be modified by an external entity : checksum could be wrong.
2279          */
2280         if (skb_has_shared_frag(skb)) {
2281                 ret = __skb_linearize(skb);
2282                 if (ret)
2283                         goto out;
2284         }
2285
2286         offset = skb_checksum_start_offset(skb);
2287         BUG_ON(offset >= skb_headlen(skb));
2288         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2289
2290         offset += skb->csum_offset;
2291         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2292
2293         if (skb_cloned(skb) &&
2294             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2295                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2296                 if (ret)
2297                         goto out;
2298         }
2299
2300         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2301 out_set_summed:
2302         skb->ip_summed = CHECKSUM_NONE;
2303 out:
2304         return ret;
2305 }
2306 EXPORT_SYMBOL(skb_checksum_help);
2307
2308 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2309 {
2310         unsigned int vlan_depth = skb->mac_len;
2311         __be16 type = skb->protocol;
2312
2313         /* Tunnel gso handlers can set protocol to ethernet. */
2314         if (type == htons(ETH_P_TEB)) {
2315                 struct ethhdr *eth;
2316
2317                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2318                         return 0;
2319
2320                 eth = (struct ethhdr *)skb_mac_header(skb);
2321                 type = eth->h_proto;
2322         }
2323
2324         /* if skb->protocol is 802.1Q/AD then the header should already be
2325          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2326          * ETH_HLEN otherwise
2327          */
2328         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2329                 if (vlan_depth) {
2330                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2331                                 return 0;
2332                         vlan_depth -= VLAN_HLEN;
2333                 } else {
2334                         vlan_depth = ETH_HLEN;
2335                 }
2336                 do {
2337                         struct vlan_hdr *vh;
2338
2339                         if (unlikely(!pskb_may_pull(skb,
2340                                                     vlan_depth + VLAN_HLEN)))
2341                                 return 0;
2342
2343                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2344                         type = vh->h_vlan_encapsulated_proto;
2345                         vlan_depth += VLAN_HLEN;
2346                 } while (type == htons(ETH_P_8021Q) ||
2347                          type == htons(ETH_P_8021AD));
2348         }
2349
2350         *depth = vlan_depth;
2351
2352         return type;
2353 }
2354
/**
 *      skb_mac_gso_segment - mac layer segmentation handler.
 *      @skb: buffer to segment
 *      @features: features for the output path (see dev->features)
 *
 *      Looks up the packet_offload matching the skb's network protocol
 *      and calls its gso_segment callback.  Returns that callback's
 *      result, ERR_PTR(-EINVAL) if the protocol could not be determined,
 *      or ERR_PTR(-EPROTONOSUPPORT) if no matching offload was found.
 *      For skbs that are not CHECKSUM_PARTIAL, gso_send_check is called
 *      first; its error is returned as ERR_PTR(), and a zero result with
 *      skb_gso_ok() true ends the lookup with ERR_PTR(0).
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
                                    netdev_features_t features)
{
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_offload *ptype;
        int vlan_depth = skb->mac_len;
        __be16 type = skb_network_protocol(skb, &vlan_depth);

        if (unlikely(!type))
                return ERR_PTR(-EINVAL);

        /* Advance skb->data past the mac (and VLAN) headers. */
        __skb_pull(skb, vlan_depth);

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &offload_base, list) {
                if (ptype->type == type && ptype->callbacks.gso_segment) {
                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                int err;

                                err = ptype->callbacks.gso_send_check(skb);
                                segs = ERR_PTR(err);
                                if (err || skb_gso_ok(skb, features))
                                        break;
                                /* Move skb->data back to the network header. */
                                __skb_push(skb, (skb->data -
                                                 skb_network_header(skb)));
                        }
                        segs = ptype->callbacks.gso_segment(skb, features);
                        break;
                }
        }
        rcu_read_unlock();

        /* Restore skb->data to the mac header. */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
2397
2398
2399 /* openvswitch calls this on rx path, so we need a different check.
2400  */
2401 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2402 {
2403         if (tx_path)
2404                 return skb->ip_summed != CHECKSUM_PARTIAL;
2405         else
2406                 return skb->ip_summed == CHECKSUM_NONE;
2407 }
2408
2409 /**
2410  *      __skb_gso_segment - Perform segmentation on skb.
2411  *      @skb: buffer to segment
2412  *      @features: features for the output path (see dev->features)
2413  *      @tx_path: whether it is called in TX path
2414  *
2415  *      This function segments the given skb and returns a list of segments.
2416  *
2417  *      It may return NULL if the skb requires no segmentation.  This is
2418  *      only possible when GSO is used for verifying header integrity.
2419  */
2420 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2421                                   netdev_features_t features, bool tx_path)
2422 {
2423         if (unlikely(skb_needs_check(skb, tx_path))) {
2424                 int err;
2425
2426                 skb_warn_bad_offload(skb);
2427
2428                 err = skb_cow_head(skb, 0);
2429                 if (err < 0)
2430                         return ERR_PTR(err);
2431         }
2432
2433         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2434         SKB_GSO_CB(skb)->encap_level = 0;
2435
2436         skb_reset_mac_header(skb);
2437         skb_reset_mac_len(skb);
2438
2439         return skb_mac_gso_segment(skb, features);
2440 }
2441 EXPORT_SYMBOL(__skb_gso_segment);
2442
2443 /* Take action when hardware reception checksum errors are detected. */
2444 #ifdef CONFIG_BUG
2445 void netdev_rx_csum_fault(struct net_device *dev)
2446 {
2447         if (net_ratelimit()) {
2448                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2449                 dump_stack();
2450         }
2451 }
2452 EXPORT_SYMBOL(netdev_rx_csum_fault);
2453 #endif
2454
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if a page fragment of @skb cannot be DMA'd by @dev (highmem
 * page without NETIF_F_HIGHDMA, or page beyond the parent's dma_mask),
 * 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int idx;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (idx = 0; idx < shinfo->nr_frags; idx++) {
                        if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
                                return 1;
                }
        }

        if (PCI_DMA_BUS_IS_PHYS) {
                struct device *parent = dev->dev.parent;

                if (!parent)
                        return 0;

                for (idx = 0; idx < shinfo->nr_frags; idx++) {
                        skb_frag_t *frag = &shinfo->frags[idx];
                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));

                        if (!parent->dma_mask ||
                            addr + PAGE_SIZE - 1 > *parent->dma_mask)
                                return 1;
                }
        }
#endif
        return 0;
}
2487
/*
 * Layout of skb->cb for an skb whose GSO segments hang off skb->next:
 * holds the skb's own destructor while dev_gso_skb_destructor is installed.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* Access the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2493
2494 static void dev_gso_skb_destructor(struct sk_buff *skb)
2495 {
2496         struct dev_gso_cb *cb;
2497
2498         kfree_skb_list(skb->next);
2499         skb->next = NULL;
2500
2501         cb = DEV_GSO_CB(skb);
2502         if (cb->destructor)
2503                 cb->destructor(skb);
2504 }
2505
2506 /**
2507  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2508  *      @skb: buffer to segment
2509  *      @features: device features as applicable to this skb
2510  *
2511  *      This function segments the given skb and stores the list of segments
2512  *      in skb->next.
2513  */
2514 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2515 {
2516         struct sk_buff *segs;
2517
2518         segs = skb_gso_segment(skb, features);
2519
2520         /* Verifying header integrity only. */
2521         if (!segs)
2522                 return 0;
2523
2524         if (IS_ERR(segs))
2525                 return PTR_ERR(segs);
2526
2527         skb->next = segs;
2528         DEV_GSO_CB(skb)->destructor = skb->destructor;
2529         skb->destructor = dev_gso_skb_destructor;
2530
2531         return 0;
2532 }
2533
2534 /* If MPLS offload request, verify we are testing hardware MPLS features
2535  * instead of standard features for the netdev.
2536  */
2537 #ifdef CONFIG_NET_MPLS_GSO
2538 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2539                                            netdev_features_t features,
2540                                            __be16 type)
2541 {
2542         if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2543                 features &= skb->dev->mpls_features;
2544
2545         return features;
2546 }
2547 #else
2548 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2549                                            netdev_features_t features,
2550                                            __be16 type)
2551 {
2552         return features;
2553 }
2554 #endif
2555
2556 static netdev_features_t harmonize_features(struct sk_buff *skb,
2557         netdev_features_t features)
2558 {
2559         int tmp;
2560         __be16 type;
2561
2562         type = skb_network_protocol(skb, &tmp);
2563         features = net_mpls_features(skb, features, type);
2564
2565         if (skb->ip_summed != CHECKSUM_NONE &&
2566             !can_checksum_protocol(features, type)) {
2567                 features &= ~NETIF_F_ALL_CSUM;
2568         } else if (illegal_highdma(skb->dev, skb)) {
2569                 features &= ~NETIF_F_SG;
2570         }
2571
2572         return features;
2573 }
2574
2575 netdev_features_t netif_skb_features(struct sk_buff *skb)
2576 {
2577         __be16 protocol = skb->protocol;
2578         netdev_features_t features = skb->dev->features;
2579
2580         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2581                 features &= ~NETIF_F_GSO_MASK;
2582
2583         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2584                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2585                 protocol = veh->h_vlan_encapsulated_proto;
2586         } else if (!vlan_tx_tag_present(skb)) {
2587                 return harmonize_features(skb, features);
2588         }
2589
2590         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2591                                                NETIF_F_HW_VLAN_STAG_TX);
2592
2593         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2594                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2595                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2596                                 NETIF_F_HW_VLAN_STAG_TX;
2597
2598         return harmonize_features(skb, features);
2599 }
2600 EXPORT_SYMBOL(netif_skb_features);
2601
2602 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2603                         struct netdev_queue *txq)
2604 {
2605         const struct net_device_ops *ops = dev->netdev_ops;
2606         int rc = NETDEV_TX_OK;
2607         unsigned int skb_len;
2608
2609         if (likely(!skb->next)) {
2610                 netdev_features_t features;
2611
2612                 /*
2613                  * If device doesn't need skb->dst, release it right now while
2614                  * its hot in this cpu cache
2615                  */
2616                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2617                         skb_dst_drop(skb);
2618
2619                 features = netif_skb_features(skb);
2620
2621                 if (vlan_tx_tag_present(skb) &&
2622                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2623                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2624                                              vlan_tx_tag_get(skb));
2625                         if (unlikely(!skb))
2626                                 goto out;
2627
2628                         skb->vlan_tci = 0;
2629                 }
2630
2631                 /* If encapsulation offload request, verify we are testing
2632                  * hardware encapsulation features instead of standard
2633                  * features for the netdev
2634                  */
2635                 if (skb->encapsulation)
2636                         features &= dev->hw_enc_features;
2637
2638                 if (netif_needs_gso(skb, features)) {
2639                         if (unlikely(dev_gso_segment(skb, features)))
2640                                 goto out_kfree_skb;
2641                         if (skb->next)
2642                                 goto gso;
2643                 } else {
2644                         if (skb_needs_linearize(skb, features) &&
2645                             __skb_linearize(skb))
2646                                 goto out_kfree_skb;
2647
2648                         /* If packet is not checksummed and device does not
2649                          * support checksumming for this protocol, complete
2650                          * checksumming here.
2651                          */
2652                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2653                                 if (skb->encapsulation)
2654                                         skb_set_inner_transport_header(skb,
2655                                                 skb_checksum_start_offset(skb));
2656                                 else
2657                                         skb_set_transport_header(skb,
2658                                                 skb_checksum_start_offset(skb));
2659                                 if (!(features & NETIF_F_ALL_CSUM) &&
2660                                      skb_checksum_help(skb))
2661                                         goto out_kfree_skb;
2662                         }
2663                 }
2664
2665                 if (!list_empty(&ptype_all))
2666                         dev_queue_xmit_nit(skb, dev);
2667
2668                 skb_len = skb->len;
2669                 trace_net_dev_start_xmit(skb, dev);
2670                 rc = ops->ndo_start_xmit(skb, dev);
2671                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2672                 if (rc == NETDEV_TX_OK)
2673                         txq_trans_update(txq);
2674                 return rc;
2675         }
2676
2677 gso:
2678         do {
2679                 struct sk_buff *nskb = skb->next;
2680
2681                 skb->next = nskb->next;
2682                 nskb->next = NULL;
2683
2684                 if (!list_empty(&ptype_all))
2685                         dev_queue_xmit_nit(nskb, dev);
2686
2687                 skb_len = nskb->len;
2688                 trace_net_dev_start_xmit(nskb, dev);
2689                 rc = ops->ndo_start_xmit(nskb, dev);
2690                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2691                 if (unlikely(rc != NETDEV_TX_OK)) {
2692                         if (rc & ~NETDEV_TX_MASK)
2693                                 goto out_kfree_gso_skb;
2694                         nskb->next = skb->next;
2695                         skb->next = nskb;
2696                         return rc;
2697                 }
2698                 txq_trans_update(txq);
2699                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2700                         return NETDEV_TX_BUSY;
2701         } while (skb->next);
2702
2703 out_kfree_gso_skb:
2704         if (likely(skb->next == NULL)) {
2705                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2706                 consume_skb(skb);
2707                 return rc;
2708         }
2709 out_kfree_skb:
2710         kfree_skb(skb);
2711 out:
2712         return rc;
2713 }
2714 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2715
2716 static void qdisc_pkt_len_init(struct sk_buff *skb)
2717 {
2718         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2719
2720         qdisc_skb_cb(skb)->pkt_len = skb->len;
2721
2722         /* To get more precise estimation of bytes sent on wire,
2723          * we add to pkt_len the headers size of all segments
2724          */
2725         if (shinfo->gso_size)  {
2726                 unsigned int hdr_len;
2727                 u16 gso_segs = shinfo->gso_segs;
2728
2729                 /* mac layer + network layer */
2730                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2731
2732                 /* + transport layer */
2733                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2734                         hdr_len += tcp_hdrlen(skb);
2735                 else
2736                         hdr_len += sizeof(struct udphdr);
2737
2738                 if (shinfo->gso_type & SKB_GSO_DODGY)
2739                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2740                                                 shinfo->gso_size);
2741
2742                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2743         }
2744 }
2745
2746 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2747                                  struct net_device *dev,
2748                                  struct netdev_queue *txq)
2749 {
2750         spinlock_t *root_lock = qdisc_lock(q);
2751         bool contended;
2752         int rc;
2753
2754         qdisc_pkt_len_init(skb);
2755         qdisc_calculate_pkt_len(skb, q);
2756         /*
2757          * Heuristic to force contended enqueues to serialize on a
2758          * separate lock before trying to get qdisc main lock.
2759          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2760          * often and dequeue packets faster.
2761          */
2762         contended = qdisc_is_running(q);
2763         if (unlikely(contended))
2764                 spin_lock(&q->busylock);
2765
2766         spin_lock(root_lock);
2767         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2768                 kfree_skb(skb);
2769                 rc = NET_XMIT_DROP;
2770         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2771                    qdisc_run_begin(q)) {
2772                 /*
2773                  * This is a work-conserving queue; there are no old skbs
2774                  * waiting to be sent out; and the qdisc is not running -
2775                  * xmit the skb directly.
2776                  */
2777                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2778                         skb_dst_force(skb);
2779
2780                 qdisc_bstats_update(q, skb);
2781
2782                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2783                         if (unlikely(contended)) {
2784                                 spin_unlock(&q->busylock);
2785                                 contended = false;
2786                         }
2787                         __qdisc_run(q);
2788                 } else
2789                         qdisc_run_end(q);
2790
2791                 rc = NET_XMIT_SUCCESS;
2792         } else {
2793                 skb_dst_force(skb);
2794                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2795                 if (qdisc_run_begin(q)) {
2796                         if (unlikely(contended)) {
2797                                 spin_unlock(&q->busylock);
2798                                 contended = false;
2799                         }
2800                         __qdisc_run(q);
2801                 }
2802         }
2803         spin_unlock(root_lock);
2804         if (unlikely(contended))
2805                 spin_unlock(&q->busylock);
2806         return rc;
2807 }
2808
2809 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2810 static void skb_update_prio(struct sk_buff *skb)
2811 {
2812         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2813
2814         if (!skb->priority && skb->sk && map) {
2815                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2816
2817                 if (prioidx < map->priomap_len)
2818                         skb->priority = map->priomap[prioidx];
2819         }
2820 }
2821 #else
2822 #define skb_update_prio(skb)
2823 #endif
2824
2825 static DEFINE_PER_CPU(int, xmit_recursion);
2826 #define RECURSION_LIMIT 10
2827
2828 /**
2829  *      dev_loopback_xmit - loop back @skb
2830  *      @skb: buffer to transmit
2831  */
2832 int dev_loopback_xmit(struct sk_buff *skb)
2833 {
2834         skb_reset_mac_header(skb);
2835         __skb_pull(skb, skb_network_offset(skb));
2836         skb->pkt_type = PACKET_LOOPBACK;
2837         skb->ip_summed = CHECKSUM_UNNECESSARY;
2838         WARN_ON(!skb_dst(skb));
2839         skb_dst_force(skb);
2840         netif_rx_ni(skb);
2841         return 0;
2842 }
2843 EXPORT_SYMBOL(dev_loopback_xmit);
2844
2845 /**
2846  *      __dev_queue_xmit - transmit a buffer
2847  *      @skb: buffer to transmit
2848  *      @accel_priv: private data used for L2 forwarding offload
2849  *
2850  *      Queue a buffer for transmission to a network device. The caller must
2851  *      have set the device and priority and built the buffer before calling
2852  *      this function. The function can be called from an interrupt.
2853  *
2854  *      A negative errno code is returned on a failure. A success does not
2855  *      guarantee the frame will be transmitted as it may be dropped due
2856  *      to congestion or traffic shaping.
2857  *
2858  * -----------------------------------------------------------------------------------
2859  *      I notice this method can also return errors from the queue disciplines,
2860  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2861  *      be positive.
2862  *
2863  *      Regardless of the return value, the skb is consumed, so it is currently
2864  *      difficult to retry a send to this method.  (You can bump the ref count
2865  *      before sending to hold a reference for retry if you are careful.)
2866  *
2867  *      When calling this method, interrupts MUST be enabled.  This is because
2868  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2869  *          --BLG
2870  */
2871 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2872 {
2873         struct net_device *dev = skb->dev;
2874         struct netdev_queue *txq;
2875         struct Qdisc *q;
2876         int rc = -ENOMEM;
2877
2878         skb_reset_mac_header(skb);
2879
2880         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2881                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2882
2883         /* Disable soft irqs for various locks below. Also
2884          * stops preemption for RCU.
2885          */
2886         rcu_read_lock_bh();
2887
2888         skb_update_prio(skb);
2889
2890         txq = netdev_pick_tx(dev, skb, accel_priv);
2891         q = rcu_dereference_bh(txq->qdisc);
2892
2893 #ifdef CONFIG_NET_CLS_ACT
2894         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2895 #endif
2896         trace_net_dev_queue(skb);
2897         if (q->enqueue) {
2898                 rc = __dev_xmit_skb(skb, q, dev, txq);
2899                 goto out;
2900         }
2901
2902         /* The device has no queue. Common case for software devices:
2903            loopback, all the sorts of tunnels...
2904
2905            Really, it is unlikely that netif_tx_lock protection is necessary
2906            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2907            counters.)
2908            However, it is possible, that they rely on protection
2909            made by us here.
2910
2911            Check this and shot the lock. It is not prone from deadlocks.
2912            Either shot noqueue qdisc, it is even simpler 8)
2913          */
2914         if (dev->flags & IFF_UP) {
2915                 int cpu = smp_processor_id(); /* ok because BHs are off */
2916
2917                 if (txq->xmit_lock_owner != cpu) {
2918
2919                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2920                                 goto recursion_alert;
2921
2922                         HARD_TX_LOCK(dev, txq, cpu);
2923
2924                         if (!netif_xmit_stopped(txq)) {
2925                                 __this_cpu_inc(xmit_recursion);
2926                                 rc = dev_hard_start_xmit(skb, dev, txq);
2927                                 __this_cpu_dec(xmit_recursion);
2928                                 if (dev_xmit_complete(rc)) {
2929                                         HARD_TX_UNLOCK(dev, txq);
2930                                         goto out;
2931                                 }
2932                         }
2933                         HARD_TX_UNLOCK(dev, txq);
2934                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2935                                              dev->name);
2936                 } else {
2937                         /* Recursion is detected! It is possible,
2938                          * unfortunately
2939                          */
2940 recursion_alert:
2941                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2942                                              dev->name);
2943                 }
2944         }
2945
2946         rc = -ENETDOWN;
2947         rcu_read_unlock_bh();
2948
2949         atomic_long_inc(&dev->tx_dropped);
2950         kfree_skb(skb);
2951         return rc;
2952 out:
2953         rcu_read_unlock_bh();
2954         return rc;
2955 }
2956
2957 int dev_queue_xmit(struct sk_buff *skb)
2958 {
2959         return __dev_queue_xmit(skb, NULL);
2960 }
2961 EXPORT_SYMBOL(dev_queue_xmit);
2962
/* Queue @skb for transmission, forwarding @accel_priv unchanged to
 * __dev_queue_xmit().  Returns what __dev_queue_xmit() returns.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
2968
2969
2970 /*=======================================================================
2971                         Receiver routines
2972   =======================================================================*/
2973
2974 int netdev_max_backlog __read_mostly = 1000;
2975 EXPORT_SYMBOL(netdev_max_backlog);
2976
2977 int netdev_tstamp_prequeue __read_mostly = 1;
2978 int netdev_budget __read_mostly = 300;
2979 int weight_p __read_mostly = 64;            /* old backlog weight */
2980
2981 /* Called with irq disabled */
2982 static inline void ____napi_schedule(struct softnet_data *sd,
2983                                      struct napi_struct *napi)
2984 {
2985         list_add_tail(&napi->poll_list, &sd->poll_list);
2986         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2987 }
2988
2989 #ifdef CONFIG_RPS
2990
2991 /* One global table that all flow-based protocols share. */
2992 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2993 EXPORT_SYMBOL(rps_sock_flow_table);
2994
2995 struct static_key rps_needed __read_mostly;
2996
2997 static struct rps_dev_flow *
2998 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2999             struct rps_dev_flow *rflow, u16 next_cpu)
3000 {
3001         if (next_cpu != RPS_NO_CPU) {
3002 #ifdef CONFIG_RFS_ACCEL
3003                 struct netdev_rx_queue *rxqueue;
3004                 struct rps_dev_flow_table *flow_table;
3005                 struct rps_dev_flow *old_rflow;
3006                 u32 flow_id;
3007                 u16 rxq_index;
3008                 int rc;
3009
3010                 /* Should we steer this flow to a different hardware queue? */
3011                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3012                     !(dev->features & NETIF_F_NTUPLE))
3013                         goto out;
3014                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3015                 if (rxq_index == skb_get_rx_queue(skb))
3016                         goto out;
3017
3018                 rxqueue = dev->_rx + rxq_index;
3019                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3020                 if (!flow_table)
3021                         goto out;
3022                 flow_id = skb_get_hash(skb) & flow_table->mask;
3023                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3024                                                         rxq_index, flow_id);
3025                 if (rc < 0)
3026                         goto out;
3027                 old_rflow = rflow;
3028                 rflow = &flow_table->flows[flow_id];
3029                 rflow->filter = rc;
3030                 if (old_rflow->filter == rflow->filter)
3031                         old_rflow->filter = RPS_NO_FILTER;
3032         out:
3033 #endif
3034                 rflow->last_qtail =
3035                         per_cpu(softnet_data, next_cpu).input_queue_head;
3036         }
3037
3038         rflow->cpu = next_cpu;
3039         return rflow;
3040 }
3041
3042 /*
3043  * get_rps_cpu is called from netif_receive_skb and returns the target
3044  * CPU from the RPS map of the receiving queue for a given skb.
3045  * rcu_read_lock must be held on entry.
3046  */
3047 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3048                        struct rps_dev_flow **rflowp)
3049 {
3050         struct netdev_rx_queue *rxqueue;
3051         struct rps_map *map;
3052         struct rps_dev_flow_table *flow_table;
3053         struct rps_sock_flow_table *sock_flow_table;
3054         int cpu = -1;
3055         u16 tcpu;
3056         u32 hash;
3057
3058         if (skb_rx_queue_recorded(skb)) {
3059                 u16 index = skb_get_rx_queue(skb);
3060                 if (unlikely(index >= dev->real_num_rx_queues)) {
3061                         WARN_ONCE(dev->real_num_rx_queues > 1,
3062                                   "%s received packet on queue %u, but number "
3063                                   "of RX queues is %u\n",
3064                                   dev->name, index, dev->real_num_rx_queues);
3065                         goto done;
3066                 }
3067                 rxqueue = dev->_rx + index;
3068         } else
3069                 rxqueue = dev->_rx;
3070
3071         map = rcu_dereference(rxqueue->rps_map);
3072         if (map) {
3073                 if (map->len == 1 &&
3074                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3075                         tcpu = map->cpus[0];
3076                         if (cpu_online(tcpu))
3077                                 cpu = tcpu;
3078                         goto done;
3079                 }
3080         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3081                 goto done;
3082         }
3083
3084         skb_reset_network_header(skb);
3085         hash = skb_get_hash(skb);
3086         if (!hash)
3087                 goto done;
3088
3089         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3090         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3091         if (flow_table && sock_flow_table) {
3092                 u16 next_cpu;
3093                 struct rps_dev_flow *rflow;
3094
3095                 rflow = &flow_table->flows[hash & flow_table->mask];
3096                 tcpu = rflow->cpu;
3097
3098                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3099
3100                 /*
3101                  * If the desired CPU (where last recvmsg was done) is
3102                  * different from current CPU (one in the rx-queue flow
3103                  * table entry), switch if one of the following holds:
3104                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3105                  *   - Current CPU is offline.
3106                  *   - The current CPU's queue tail has advanced beyond the
3107                  *     last packet that was enqueued using this table entry.
3108                  *     This guarantees that all previous packets for the flow
3109                  *     have been dequeued, thus preserving in order delivery.
3110                  */
3111                 if (unlikely(tcpu != next_cpu) &&
3112                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3113                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3114                       rflow->last_qtail)) >= 0)) {
3115                         tcpu = next_cpu;
3116                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3117                 }
3118
3119                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3120                         *rflowp = rflow;
3121                         cpu = tcpu;
3122                         goto done;
3123                 }
3124         }
3125
3126         if (map) {
3127                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3128                 if (cpu_online(tcpu)) {
3129                         cpu = tcpu;
3130                         goto done;
3131                 }
3132         }
3133
3134 done:
3135         return cpu;
3136 }
3137
3138 #ifdef CONFIG_RFS_ACCEL
3139
3140 /**
3141  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3142  * @dev: Device on which the filter was set
3143  * @rxq_index: RX queue index
3144  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3145  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3146  *
3147  * Drivers that implement ndo_rx_flow_steer() should periodically call
3148  * this function for each installed filter and remove the filters for
3149  * which it returns %true.
3150  */
3151 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3152                          u32 flow_id, u16 filter_id)
3153 {
3154         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3155         struct rps_dev_flow_table *flow_table;
3156         struct rps_dev_flow *rflow;
3157         bool expire = true;
3158         int cpu;
3159
3160         rcu_read_lock();
3161         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3162         if (flow_table && flow_id <= flow_table->mask) {
3163                 rflow = &flow_table->flows[flow_id];
3164                 cpu = ACCESS_ONCE(rflow->cpu);
3165                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3166                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3167                            rflow->last_qtail) <
3168                      (int)(10 * flow_table->mask)))
3169                         expire = false;
3170         }
3171         rcu_read_unlock();
3172         return expire;
3173 }
3174 EXPORT_SYMBOL(rps_may_expire_flow);
3175
3176 #endif /* CONFIG_RFS_ACCEL */
3177
3178 /* Called from hardirq (IPI) context */
3179 static void rps_trigger_softirq(void *data)
3180 {
3181         struct softnet_data *sd = data;
3182
3183         ____napi_schedule(sd, &sd->backlog);
3184         sd->received_rps++;
3185 }
3186
3187 #endif /* CONFIG_RPS */
3188
/*
 * Check if this softnet_data structure belongs to another cpu.
 * If yes, chain it onto this cpu's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.
 * If no (or when CONFIG_RPS is disabled), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* Push @sd on the front of the local IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3209
3210 #ifdef CONFIG_NET_FLOW_LIMIT
3211 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3212 #endif
3213
3214 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3215 {
3216 #ifdef CONFIG_NET_FLOW_LIMIT
3217         struct sd_flow_limit *fl;
3218         struct softnet_data *sd;
3219         unsigned int old_flow, new_flow;
3220
3221         if (qlen < (netdev_max_backlog >> 1))
3222                 return false;
3223
3224         sd = &__get_cpu_var(softnet_data);
3225
3226         rcu_read_lock();
3227         fl = rcu_dereference(sd->flow_limit);
3228         if (fl) {
3229                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3230                 old_flow = fl->history[fl->history_head];
3231                 fl->history[fl->history_head] = new_flow;
3232
3233                 fl->history_head++;
3234                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3235
3236                 if (likely(fl->buckets[old_flow]))
3237                         fl->buckets[old_flow]--;
3238
3239                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3240                         fl->count++;
3241                         rcu_read_unlock();
3242                         return true;
3243                 }
3244         }
3245         rcu_read_unlock();
3246 #endif
3247         return false;
3248 }
3249
3250 /*
3251  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3252  * queue (may be a remote CPU queue).
3253  */
3254 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3255                               unsigned int *qtail)
3256 {
3257         struct softnet_data *sd;
3258         unsigned long flags;
3259         unsigned int qlen;
3260
3261         sd = &per_cpu(softnet_data, cpu);
3262
3263         local_irq_save(flags);
3264
3265         rps_lock(sd);
3266         qlen = skb_queue_len(&sd->input_pkt_queue);
3267         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3268                 if (skb_queue_len(&sd->input_pkt_queue)) {
3269 enqueue:
3270                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3271                         input_queue_tail_incr_save(sd, qtail);
3272                         rps_unlock(sd);
3273                         local_irq_restore(flags);
3274                         return NET_RX_SUCCESS;
3275                 }
3276
3277                 /* Schedule NAPI for backlog device
3278                  * We can use non atomic operation since we own the queue lock
3279                  */
3280                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3281                         if (!rps_ipi_queued(sd))
3282                                 ____napi_schedule(sd, &sd->backlog);
3283                 }
3284                 goto enqueue;
3285         }
3286
3287         sd->dropped++;
3288         rps_unlock(sd);
3289
3290         local_irq_restore(flags);
3291
3292         atomic_long_inc(&skb->dev->rx_dropped);
3293         kfree_skb(skb);
3294         return NET_RX_DROP;
3295 }
3296
3297 static int netif_rx_internal(struct sk_buff *skb)
3298 {
3299         int ret;
3300
3301         net_timestamp_check(netdev_tstamp_prequeue, skb);
3302
3303         trace_netif_rx(skb);
3304 #ifdef CONFIG_RPS
3305         if (static_key_false(&rps_needed)) {
3306                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3307                 int cpu;
3308
3309                 preempt_disable();
3310                 rcu_read_lock();
3311
3312                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3313                 if (cpu < 0)
3314                         cpu = smp_processor_id();
3315
3316                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3317
3318                 rcu_read_unlock();
3319                 preempt_enable();
3320         } else
3321 #endif
3322         {
3323                 unsigned int qtail;
3324                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3325                 put_cpu();
3326         }
3327         return ret;
3328 }
3329
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
3352
/*
 * netif_rx_ni - post @skb like netif_rx(), then run pending softirqs
 * @skb: buffer to post
 *
 * Queues the buffer through netif_rx_internal() with preemption disabled
 * and, if any softirq is pending afterwards, calls do_softirq() before
 * re-enabling preemption.
 *
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3368
3369 static void net_tx_action(struct softirq_action *h)
3370 {
3371         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3372
3373         if (sd->completion_queue) {
3374                 struct sk_buff *clist;
3375
3376                 local_irq_disable();
3377                 clist = sd->completion_queue;
3378                 sd->completion_queue = NULL;
3379                 local_irq_enable();
3380
3381                 while (clist) {
3382                         struct sk_buff *skb = clist;
3383                         clist = clist->next;
3384
3385                         WARN_ON(atomic_read(&skb->users));
3386                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3387                                 trace_consume_skb(skb);
3388                         else
3389                                 trace_kfree_skb(skb, net_tx_action);
3390                         __kfree_skb(skb);
3391                 }
3392         }
3393
3394         if (sd->output_queue) {
3395                 struct Qdisc *head;
3396
3397                 local_irq_disable();
3398                 head = sd->output_queue;
3399                 sd->output_queue = NULL;
3400                 sd->output_queue_tailp = &sd->output_queue;
3401                 local_irq_enable();
3402
3403                 while (head) {
3404                         struct Qdisc *q = head;
3405                         spinlock_t *root_lock;
3406
3407                         head = head->next_sched;
3408
3409                         root_lock = qdisc_lock(q);
3410                         if (spin_trylock(root_lock)) {
3411                                 smp_mb__before_atomic();
3412                                 clear_bit(__QDISC_STATE_SCHED,
3413                                           &q->state);
3414                                 qdisc_run(q);
3415                                 spin_unlock(root_lock);
3416                         } else {
3417                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3418                                               &q->state)) {
3419                                         __netif_reschedule(q);
3420                                 } else {
3421                                         smp_mb__before_atomic();
3422                                         clear_bit(__QDISC_STATE_SCHED,
3423                                                   &q->state);
3424                                 }
3425                         }
3426                 }
3427         }
3428 }
3429
3430 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3431     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3432 /* This hook is defined here for ATM LANE.
      *
      * It is a function pointer taking a net_device and an address, and
      * it only exists when both bridge and ATM LANE support are configured
      * (either built in or as modules).  No initializer is given here, so
      * the pointer starts out NULL; it is exported (GPL only) so that code
      * outside this file can set and call it.
      */
3433 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3434                              unsigned char *addr) __read_mostly;
3435 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3436 #endif
3437
3438 #ifdef CONFIG_NET_CLS_ACT
3439 /* TODO: Maybe we should just force sch_ingress to be compiled in
3440  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3441  * a compare and 2 stores extra right now if we dont have it on
3442  * but have CONFIG_NET_CLS_ACT
3443  * NOTE: This doesn't stop any functionality; if you dont have
3444  * the ingress scheduler, you just can't add policies on ingress.
3445  *
3446  */
3447 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3448 {
3449         struct net_device *dev = skb->dev;
3450         u32 ttl = G_TC_RTTL(skb->tc_verd);
3451         int result = TC_ACT_OK;
3452         struct Qdisc *q;
3453
3454         if (unlikely(MAX_RED_LOOP < ttl++)) {
3455                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3456                                      skb->skb_iif, dev->ifindex);
3457                 return TC_ACT_SHOT;
3458         }
3459
3460         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3461         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3462
3463         q = rxq->qdisc;
3464         if (q != &noop_qdisc) {
3465                 spin_lock(qdisc_lock(q));
3466                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3467                         result = qdisc_enqueue_root(skb, q);
3468                 spin_unlock(qdisc_lock(q));
3469         }
3470
3471         return result;
3472 }
3473
3474 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3475                                          struct packet_type **pt_prev,
3476                                          int *ret, struct net_device *orig_dev)
3477 {
3478         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3479
3480         if (!rxq || rxq->qdisc == &noop_qdisc)
3481                 goto out;
3482
3483         if (*pt_prev) {
3484                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3485                 *pt_prev = NULL;
3486         }
3487
3488         switch (ing_filter(skb, rxq)) {
3489         case TC_ACT_SHOT:
3490         case TC_ACT_STOLEN:
3491                 kfree_skb(skb);
3492                 return NULL;
3493         }
3494
3495 out:
3496         skb->tc_verd = 0;
3497         return skb;
3498 }
3499 #endif
3500
3501 /**
3502  *      netdev_rx_handler_register - register receive handler
3503  *      @dev: device to register a handler for
3504  *      @rx_handler: receive handler to register
3505  *      @rx_handler_data: data pointer that is used by rx handler
3506  *
3507  *      Register a receive handler for a device. This handler will then be
3508  *      called from __netif_receive_skb. A negative errno code is returned
3509  *      on a failure.
3510  *
3511  *      The caller must hold the rtnl_mutex.
3512  *
3513  *      For a general description of rx_handler, see enum rx_handler_result.
3514  */
3515 int netdev_rx_handler_register(struct net_device *dev,
3516                                rx_handler_func_t *rx_handler,
3517                                void *rx_handler_data)
3518 {
3519         ASSERT_RTNL();
3520
3521         if (dev->rx_handler)
3522                 return -EBUSY;
3523
3524         /* Note: rx_handler_data must be set before rx_handler */
3525         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3526         rcu_assign_pointer(dev->rx_handler, rx_handler);
3527
3528         return 0;
3529 }
3530 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3531
3532 /**
3533  *      netdev_rx_handler_unregister - unregister receive handler
3534  *      @dev: device to unregister a handler from
3535  *
3536  *      Unregister a receive handler from a device.
3537  *
3538  *      The caller must hold the rtnl_mutex.
3539  */
3540 void netdev_rx_handler_unregister(struct net_device *dev)
3541 {
3542
3543         ASSERT_RTNL();
3544         RCU_INIT_POINTER(dev->rx_handler, NULL);
3545         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3546          * section has a guarantee to see a non NULL rx_handler_data
3547          * as well.
3548          */
3549         synchronize_net();
3550         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3551 }
3552 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3553
3554 /*
3555  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3556  * the special handling of PFMEMALLOC skbs.
3557  */
3558 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3559 {
3560         switch (skb->protocol) {
3561         case htons(ETH_P_ARP):
3562         case htons(ETH_P_IP):
3563         case htons(ETH_P_IPV6):
3564         case htons(ETH_P_8021Q):
3565         case htons(ETH_P_8021AD):
3566                 return true;
3567         default:
3568                 return false;
3569         }
3570 }
3571
/*
 * __netif_receive_skb_core - run one received skb through the receive path
 * @skb: buffer to process
 * @pfmemalloc: when true, the ptype_all handlers are skipped and the skb
 *              is dropped unless skb_pfmemalloc_protocol() accepts it
 *
 * Inside an rcu_read_lock() section the skb is offered, in this order, to
 * the ptype_all handlers, the ingress filter (CONFIG_NET_CLS_ACT only),
 * VLAN processing, the device's rx_handler and finally the ptype_base
 * handlers hashed on skb->protocol.
 *
 * Delivery is deferred by one step through pt_prev: a matching handler is
 * remembered and only called through deliver_skb() once a later stage
 * needs the skb; the last remembered handler is called directly through
 * its ->func.
 *
 * Returns the result of the last delivery that was made, NET_RX_SUCCESS
 * when the rx_handler consumed the skb, or NET_RX_DROP (also the initial
 * value of ret) when the skb was dropped here.
 */
3572 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3573 {
3574         struct packet_type *ptype, *pt_prev;
3575         rx_handler_func_t *rx_handler;
3576         struct net_device *orig_dev;
3577         struct net_device *null_or_dev;
3578         bool deliver_exact = false;
3579         int ret = NET_RX_DROP;
3580         __be16 type;
3581
3582         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3583
3584         trace_netif_receive_skb(skb);
3585
             /* Remember the arrival device; skb->dev is re-read after each stage. */
3586         orig_dev = skb->dev;
3587
3588         skb_reset_network_header(skb);
3589         if (!skb_transport_header_was_set(skb))
3590                 skb_reset_transport_header(skb);
3591         skb_reset_mac_len(skb);
3592
3593         pt_prev = NULL;
3594
3595         rcu_read_lock();
3596
             /*
              * Restart point, taken again when vlan_do_receive() succeeds or
              * when the rx_handler returns RX_HANDLER_ANOTHER.
              */
3597 another_round:
3598         skb->skb_iif = skb->dev->ifindex;
3599
3600         __this_cpu_inc(softnet_data.processed);
3601
3602         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3603             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3604                 skb = skb_vlan_untag(skb);
3605                 if (unlikely(!skb))
3606                         goto unlock;
3607         }
3608
3609 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip both the taps and the ingress filter. */
3610         if (skb->tc_verd & TC_NCLS) {
3611                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3612                 goto ncls;
3613         }
3614 #endif
3615
3616         if (pfmemalloc)
3617                 goto skip_taps;
3618
             /* Handlers on ptype_all: unbound ones, or bound to skb->dev. */
3619         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3620                 if (!ptype->dev || ptype->dev == skb->dev) {
3621                         if (pt_prev)
3622                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3623                         pt_prev = ptype;
3624                 }
3625         }
3626
3627 skip_taps:
3628 #ifdef CONFIG_NET_CLS_ACT
3629         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3630         if (!skb)
3631                 goto unlock;
3632 ncls:
3633 #endif
3634
3635         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3636                 goto drop;
3637
3638         if (vlan_tx_tag_present(skb)) {
3639                 if (pt_prev) {
3640                         ret = deliver_skb(skb, pt_prev, orig_dev);
3641                         pt_prev = NULL;
3642                 }
3643                 if (vlan_do_receive(&skb))
3644                         goto another_round;
3645                 else if (unlikely(!skb))
3646                         goto unlock;
3647         }
3648
3649         rx_handler = rcu_dereference(skb->dev->rx_handler);
3650         if (rx_handler) {
3651                 if (pt_prev) {
3652                         ret = deliver_skb(skb, pt_prev, orig_dev);
3653                         pt_prev = NULL;
3654                 }
3655                 switch (rx_handler(&skb)) {
3656                 case RX_HANDLER_CONSUMED:
3657                         ret = NET_RX_SUCCESS;
3658                         goto unlock;
3659                 case RX_HANDLER_ANOTHER:
3660                         goto another_round;
3661                 case RX_HANDLER_EXACT:
3662                         deliver_exact = true;
                             /* fall through */
3663                 case RX_HANDLER_PASS:
3664                         break;
3665                 default:
3666                         BUG();
3667                 }
3668         }
3669
             /* A VLAN tag still present here was not taken by vlan_do_receive(). */
3670         if (unlikely(vlan_tx_tag_present(skb))) {
3671                 if (vlan_tx_tag_get_id(skb))
3672                         skb->pkt_type = PACKET_OTHERHOST;
3673                 /* Note: we might in the future use prio bits
3674                  * and set skb->priority like in vlan_do_receive()
3675                  * For the time being, just ignore Priority Code Point
3676                  */
3677                 skb->vlan_tci = 0;
3678         }
3679
3680         /* deliver only exact match when indicated */
3681         null_or_dev = deliver_exact ? skb->dev : NULL;
3682
             /*
              * Protocol handlers: the type must match, and the handler's
              * device must be null_or_dev, the current skb->dev or orig_dev.
              */
3683         type = skb->protocol;
3684         list_for_each_entry_rcu(ptype,
3685                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3686                 if (ptype->type == type &&
3687                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3688                      ptype->dev == orig_dev)) {
3689                         if (pt_prev)
3690                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3691                         pt_prev = ptype;
3692                 }
3693         }
3694
             /*
              * The last remembered handler is called through ->func directly
              * instead of deliver_skb(); with no handler left the skb is
              * counted in rx_dropped and freed.
              */
3695         if (pt_prev) {
3696                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3697                         goto drop;
3698                 else
3699                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3700         } else {
3701 drop:
3702                 atomic_long_inc(&skb->dev->rx_dropped);
3703                 kfree_skb(skb);
3704                 /* Jamal, now you will not able to escape explaining
3705                  * me how you were going to use this. :-)
3706                  */
3707                 ret = NET_RX_DROP;
3708         }
3709
3710 unlock:
3711         rcu_read_unlock();
3712         return ret;
3713 }
3714
3715 static int __netif_receive_skb(struct sk_buff *skb)
3716 {
3717         int ret;
3718
3719         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3720                 unsigned long pflags = current->flags;
3721
3722                 /*
3723                  * PFMEMALLOC skbs are special, they should
3724                  * - be delivered to SOCK_MEMALLOC sockets only
3725                  * - stay away from userspace
3726                  * - have bounded memory usage
3727                  *
3728                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3729                  * context down to all allocation sites.
3730                  */
3731                 current->flags |= PF_MEMALLOC;
3732                 ret = __netif_receive_skb_core(skb, true);
3733                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3734         } else
3735                 ret = __netif_receive_skb_core(skb, false);
3736
3737         return ret;
3738 }
3739
3740 static int netif_receive_skb_internal(struct sk_buff *skb)
3741 {
3742         net_timestamp_check(netdev_tstamp_prequeue, skb);
3743
3744         if (skb_defer_rx_timestamp(skb))
3745                 return NET_RX_SUCCESS;
3746
3747 #ifdef CONFIG_RPS
3748         if (static_key_false(&rps_needed)) {
3749                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3750                 int cpu, ret;
3751
3752                 rcu_read_lock();
3753
3754                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3755
3756                 if (cpu >= 0) {
3757                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3758                         rcu_read_unlock();
3759                         return ret;
3760                 }
3761                 rcu_read_unlock();
3762         }
3763 #endif
3764         return __netif_receive_skb(skb);
3765 }
3766
/**
 *      netif_receive_skb - process receive buffer from network
 *      @skb: buffer to process
 *
 *      Main receive data processing function: fires the entry tracepoint
 *      and hands the buffer to netif_receive_skb_internal(). It always
 *      succeeds. The buffer may be dropped during processing for
 *      congestion control or by the protocol layers.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int rc;

        trace_netif_receive_skb_entry(skb);
        rc = netif_receive_skb_internal(skb);

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
3789
3790 /* Network device is going away, flush any packets still pending
3791  * Called with irqs disabled.
3792  */
3793 static void flush_backlog(void *arg)
3794 {
3795         struct net_device *dev = arg;
3796         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3797         struct sk_buff *skb, *tmp;
3798
3799         rps_lock(sd);
3800         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3801                 if (skb->dev == dev) {
3802                         __skb_unlink(skb, &sd->input_pkt_queue);
3803                         kfree_skb(skb);
3804                         input_queue_head_incr(sd);
3805                 }
3806         }
3807         rps_unlock(sd);
3808
3809         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3810                 if (skb->dev == dev) {
3811                         __skb_unlink(skb, &sd->process_queue);
3812                         kfree_skb(skb);
3813                         input_queue_head_incr(sd);
3814                 }
3815         }
3816 }
3817
3818 static int napi_gro_complete(struct sk_buff *skb)
3819 {
3820         struct packet_offload *ptype;
3821         __be16 type = skb->protocol;
3822         struct list_head *head = &offload_base;
3823         int err = -ENOENT;
3824
3825         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3826
3827         if (NAPI_GRO_CB(skb)->count == 1) {
3828                 skb_shinfo(skb)->gso_size = 0;
3829                 goto out;
3830         }
3831
3832         rcu_read_lock();
3833         list_for_each_entry_rcu(ptype, head, list) {
3834                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3835                         continue;
3836
3837                 err = ptype->callbacks.gro_complete(skb, 0);
3838                 break;
3839         }
3840         rcu_read_unlock();
3841
3842         if (err) {
3843                 WARN_ON(&ptype->list == head);
3844                 kfree_skb(skb);
3845                 return NET_RX_SUCCESS;
3846         }
3847
3848 out:
3849         return netif_receive_skb_internal(skb);
3850 }
3851
3852 /* napi->gro_list contains packets ordered by age.
3853  * youngest packets at the head of it.
3854  * Complete skbs in reverse order to reduce latencies.
3855  */
3856 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3857 {
3858         struct sk_buff *skb, *prev = NULL;
3859
3860         /* scan list and build reverse chain */
3861         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3862                 skb->prev = prev;
3863                 prev = skb;
3864         }
3865
3866         for (skb = prev; skb; skb = prev) {
3867                 skb->next = NULL;
3868
3869                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3870                         return;
3871
3872                 prev = skb->prev;
3873                 napi_gro_complete(skb);
3874                 napi->gro_count--;
3875         }
3876
3877         napi->gro_list = NULL;
3878 }
3879 EXPORT_SYMBOL(napi_gro_flush);
3880
3881 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3882 {
3883         struct sk_buff *p;
3884         unsigned int maclen = skb->dev->hard_header_len;
3885         u32 hash = skb_get_hash_raw(skb);
3886
3887         for (p = napi->gro_list; p; p = p->next) {
3888                 unsigned long diffs;
3889
3890                 NAPI_GRO_CB(p)->flush = 0;
3891
3892                 if (hash != skb_get_hash_raw(p)) {
3893                         NAPI_GRO_CB(p)->same_flow = 0;
3894                         continue;
3895                 }
3896
3897                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3898                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3899                 if (maclen == ETH_HLEN)
3900                         diffs |= compare_ether_header(skb_mac_header(p),
3901                                                       skb_mac_header(skb));
3902                 else if (!diffs)
3903                         diffs = memcmp(skb_mac_header(p),
3904                                        skb_mac_header(skb),
3905                                        maclen);
3906                 NAPI_GRO_CB(p)->same_flow = !diffs;
3907         }
3908 }
3909
3910 static void skb_gro_reset_offset(struct sk_buff *skb)
3911 {
3912         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3913         const skb_frag_t *frag0 = &pinfo->frags[0];
3914
3915         NAPI_GRO_CB(skb)->data_offset = 0;
3916         NAPI_GRO_CB(skb)->frag0 = NULL;
3917         NAPI_GRO_CB(skb)->frag0_len = 0;
3918
3919         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3920             pinfo->nr_frags &&
3921             !PageHighMem(skb_frag_page(frag0))) {
3922                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3923                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3924         }
3925 }
3926
3927 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3928 {
3929         struct skb_shared_info *pinfo = skb_shinfo(skb);
3930
3931         BUG_ON(skb->end - skb->tail < grow);
3932
3933         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3934
3935         skb->data_len -= grow;
3936         skb->tail += grow;
3937
3938         pinfo->frags[0].page_offset += grow;
3939         skb_frag_size_sub(&pinfo->frags[0], grow);
3940
3941         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3942                 skb_frag_unref(skb, 0);
3943                 memmove(pinfo->frags, pinfo->frags + 1,
3944                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3945         }
3946 }
3947
/*
 * dev_gro_receive - offer an skb to the GRO engine of a NAPI context
 * @napi: NAPI context; napi->gro_list holds the packets currently held
 * @skb: newly received packet
 *
 * Calls the gro_receive callback of the first packet_offload entry whose
 * type matches skb->protocol, then acts on what the callback left in the
 * GRO control block (same_flow, flush, free) and on the list entry it
 * returned, if any.
 *
 * Returns:
 *  GRO_NORMAL      - skb must take the regular receive path (GRO disabled
 *                    on the device, skb is GSO or has a frag_list, no
 *                    matching offload, or the callback asked for a flush)
 *  GRO_MERGED /
 *  GRO_MERGED_FREE - the callback marked skb as same_flow
 *  GRO_HELD        - skb was put at the head of napi->gro_list
 */
3948 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3949 {
3950         struct sk_buff **pp = NULL;
3951         struct packet_offload *ptype;
3952         __be16 type = skb->protocol;
3953         struct list_head *head = &offload_base;
3954         int same_flow;
3955         enum gro_result ret;
3956         int grow;
3957
3958         if (!(skb->dev->features & NETIF_F_GRO))
3959                 goto normal;
3960
3961         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3962                 goto normal;
3963
3964         gro_list_prepare(napi, skb);
3965
             /* Seed the GRO checksum state from a CHECKSUM_COMPLETE skb. */
3966         if (skb->ip_summed == CHECKSUM_COMPLETE) {
3967                 NAPI_GRO_CB(skb)->csum = skb->csum;
3968                 NAPI_GRO_CB(skb)->csum_valid = 1;
3969         } else {
3970                 NAPI_GRO_CB(skb)->csum_valid = 0;
3971         }
3972
3973         rcu_read_lock();
3974         list_for_each_entry_rcu(ptype, head, list) {
3975                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3976                         continue;
3977
                     /* Reset the per-packet GRO state before the callback runs. */
3978                 skb_set_network_header(skb, skb_gro_offset(skb));
3979                 skb_reset_mac_len(skb);
3980                 NAPI_GRO_CB(skb)->same_flow = 0;
3981                 NAPI_GRO_CB(skb)->flush = 0;
3982                 NAPI_GRO_CB(skb)->free = 0;
3983                 NAPI_GRO_CB(skb)->udp_mark = 0;
3984                 NAPI_GRO_CB(skb)->encapsulation = 0;
3985
3986                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3987                 break;
3988         }
3989         rcu_read_unlock();
3990
             /* The loop ran off the end of the list: no offload matched. */
3991         if (&ptype->list == head)
3992                 goto normal;
3993
3994         same_flow = NAPI_GRO_CB(skb)->same_flow;
3995         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3996
             /* The callback returned a held packet: unlink and complete it. */
3997         if (pp) {
3998                 struct sk_buff *nskb = *pp;
3999
4000                 *pp = nskb->next;
4001                 nskb->next = NULL;
4002                 napi_gro_complete(nskb);
4003                 napi->gro_count--;
4004         }
4005
             /* same_flow: return without the frag0 pull done at "pull". */
4006         if (same_flow)
4007                 goto ok;
4008
4009         if (NAPI_GRO_CB(skb)->flush)
4010                 goto normal;
4011
             /*
              * skb is going to be held.  With the list already at
              * MAX_GRO_SKBS entries, complete the last entry instead of
              * raising gro_count.
              */
4012         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4013                 struct sk_buff *nskb = napi->gro_list;
4014
4015                 /* locate the end of the list to select the 'oldest' flow */
4016                 while (nskb->next) {
4017                         pp = &nskb->next;
4018                         nskb = *pp;
4019                 }
4020                 *pp = NULL;
4021                 nskb->next = NULL;
4022                 napi_gro_complete(nskb);
4023         } else {
4024                 napi->gro_count++;
4025         }
4026         NAPI_GRO_CB(skb)->count = 1;
4027         NAPI_GRO_CB(skb)->age = jiffies;
4028         NAPI_GRO_CB(skb)->last = skb;
4029         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4030         skb->next = napi->gro_list;
4031         napi->gro_list = skb;
4032         ret = GRO_HELD;
4033
             /*
              * Reached for GRO_HELD and GRO_NORMAL: if the GRO offset lies
              * beyond the linear area, copy the missing bytes out of frag0.
              */
4034 pull:
4035         grow = skb_gro_offset(skb) - skb_headlen(skb);
4036         if (grow > 0)
4037                 gro_pull_from_frag0(skb, grow);
4038 ok:
4039         return ret;
4040
4041 normal:
4042         ret = GRO_NORMAL;
4043         goto pull;
4044 }
4045
4046 struct packet_offload *gro_find_receive_by_type(__be16 type)
4047 {
4048         struct list_head *offload_head = &offload_base;
4049         struct packet_offload *ptype;
4050
4051         list_for_each_entry_rcu(ptype, offload_head, list) {
4052                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4053                         continue;
4054                 return ptype;
4055         }
4056         return NULL;
4057 }
4058 EXPORT_SYMBOL(gro_find_receive_by_type);
4059
4060 struct packet_offload *gro_find_complete_by_type(__be16 type)
4061 {
4062         struct list_head *offload_head = &offload_base;
4063         struct packet_offload *ptype;
4064
4065         list_for_each_entry_rcu(ptype, offload_head, list) {
4066                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4067                         continue;
4068                 return ptype;
4069         }
4070         return NULL;
4071 }
4072 EXPORT_SYMBOL(gro_find_complete_by_type);
4073
4074 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4075 {
4076         switch (ret) {
4077         case GRO_NORMAL:
4078                 if (netif_receive_skb_internal(skb))
4079                         ret = GRO_DROP;
4080                 break;
4081
4082         case GRO_DROP:
4083                 kfree_skb(skb);
4084                 break;
4085
4086         case GRO_MERGED_FREE:
4087                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4088                         kmem_cache_free(skbuff_head_cache, skb);
4089                 else
4090                         __kfree_skb(skb);
4091                 break;
4092
4093         case GRO_HELD:
4094         case GRO_MERGED:
4095                 break;
4096         }
4097
4098         return ret;
4099 }
4100
4101 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4102 {
4103         trace_napi_gro_receive_entry(skb);
4104
4105         skb_gro_reset_offset(skb);
4106
4107         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4108 }
4109 EXPORT_SYMBOL(napi_gro_receive);
4110
4111 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4112 {
4113         __skb_pull(skb, skb_headlen(skb));
4114         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4115         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4116         skb->vlan_tci = 0;
4117         skb->dev = napi->dev;
4118         skb->skb_iif = 0;
4119         skb->encapsulation = 0;
4120         skb_shinfo(skb)->gso_type = 0;
4121         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4122
4123         napi->skb = skb;
4124 }
4125
4126 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4127 {
4128         struct sk_buff *skb = napi->skb;
4129
4130         if (!skb) {
4131                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4132                 napi->skb = skb;
4133         }
4134         return skb;
4135 }
4136 EXPORT_SYMBOL(napi_get_frags);
4137
4138 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4139                                       struct sk_buff *skb,
4140                                       gro_result_t ret)
4141 {
4142         switch (ret) {
4143         case GRO_NORMAL:
4144         case GRO_HELD:
4145                 __skb_push(skb, ETH_HLEN);
4146                 skb->protocol = eth_type_trans(skb, skb->dev);
4147                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4148                         ret = GRO_DROP;
4149                 break;
4150
4151         case GRO_DROP:
4152         case GRO_MERGED_FREE:
4153                 napi_reuse_skb(napi, skb);
4154                 break;
4155
4156         case GRO_MERGED:
4157                 break;
4158         }
4159
4160         return ret;
4161 }
4162
4163 /* Upper GRO stack assumes network header starts at gro_offset=0
4164  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4165  * We copy ethernet header into skb->data to have a common layout.
4166  */
4167 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4168 {
4169         struct sk_buff *skb = napi->skb;
4170         const struct ethhdr *eth;
4171         unsigned int hlen = sizeof(*eth);
4172
4173         napi->skb = NULL;
4174
4175         skb_reset_mac_header(skb);
4176         skb_gro_reset_offset(skb);
4177
4178         eth = skb_gro_header_fast(skb, 0);
4179         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4180                 eth = skb_gro_header_slow(skb, hlen, 0);
4181                 if (unlikely(!eth)) {
4182                         napi_reuse_skb(napi, skb);
4183                         return NULL;
4184                 }
4185         } else {
4186                 gro_pull_from_frag0(skb, hlen);
4187                 NAPI_GRO_CB(skb)->frag0 += hlen;
4188                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4189         }
4190         __skb_pull(skb, hlen);
4191
4192         /*
4193          * This works because the only protocols we care about don't require
4194          * special handling.
4195          * We'll fix it up properly in napi_frags_finish()
4196          */
4197         skb->protocol = eth->h_proto;
4198
4199         return skb;
4200 }
4201
4202 gro_result_t napi_gro_frags(struct napi_struct *napi)
4203 {
4204         struct sk_buff *skb = napi_frags_skb(napi);
4205
4206         if (!skb)
4207                 return GRO_DROP;
4208
4209         trace_napi_gro_frags_entry(skb);
4210
4211         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4212 }
4213 EXPORT_SYMBOL(napi_gro_frags);
4214
4215 /* Compute the checksum from gro_offset and return the folded value
4216  * after adding in any pseudo checksum.
4217  */
4218 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4219 {
4220         __wsum wsum;
4221         __sum16 sum;
4222
4223         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4224
4225         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4226         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4227         if (likely(!sum)) {
4228                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4229                     !skb->csum_complete_sw)
4230                         netdev_rx_csum_fault(skb->dev);
4231         }
4232
4233         NAPI_GRO_CB(skb)->csum = wsum;
4234         NAPI_GRO_CB(skb)->csum_valid = 1;
4235
4236         return sum;
4237 }
4238 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4239
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * With CONFIG_RPS, the list hanging off sd->rps_ipi_list is detached
 * while interrupts are still off, and every online CPU on it is then
 * signalled with interrupts enabled.  Without CONFIG_RPS, or when the
 * list is empty, the function only enables interrupts.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remsd = sd->rps_ipi_list;

        if (remsd) {
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                do {
                        struct softnet_data *next = remsd->rps_ipi_next;

                        if (cpu_online(remsd->cpu))
                                smp_call_function_single_async(remsd->cpu,
                                                               &remsd->csd);
                        remsd = next;
                } while (remsd);

                return;
        }
#endif
        local_irq_enable();
}
4267
/*
 * process_backlog - process packets queued on a CPU's backlog
 * @napi: the backlog napi_struct embedded in a struct softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * The (napi, quota) signature suggests this is installed as the poll
 * callback of sd->backlog - NOTE(review): confirm where it is registered.
 *
 * Packets are taken from sd->process_queue and passed to
 * __netif_receive_skb() with interrupts enabled.  Whenever process_queue
 * runs empty it is refilled, under rps_lock(), with everything currently
 * on sd->input_pkt_queue.  The function stops when @quota packets have
 * been handled, or when input_pkt_queue is found empty, in which case the
 * napi is removed from its poll list and its state is cleared.
 *
 * Returns the number of packets processed.
 */
4268 static int process_backlog(struct napi_struct *napi, int quota)
4269 {
4270         int work = 0;
4271         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4272
4273 #ifdef CONFIG_RPS
4274         /* Check if we have pending ipi, its better to send them now,
4275          * not waiting net_rx_action() end.
4276          */
4277         if (sd->rps_ipi_list) {
4278                 local_irq_disable();
4279                 net_rps_action_and_irq_enable(sd);
4280         }
4281 #endif
4282         napi->weight = weight_p;
             /* Both queues are only manipulated with local irqs disabled. */
4283         local_irq_disable();
4284         while (1) {
4285                 struct sk_buff *skb;
4286
4287                 while ((skb = __skb_dequeue(&sd->process_queue))) {
                             /* Irqs are enabled only around the actual packet processing. */
4288                         local_irq_enable();
4289                         __netif_receive_skb(skb);
4290                         local_irq_disable();
4291                         input_queue_head_incr(sd);
4292                         if (++work >= quota) {
                                     /* Quota used up: return without completing the napi. */
4293                                 local_irq_enable();
4294                                 return work;
4295                         }
4296                 }
4297
4298                 rps_lock(sd);
4299                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4300                         /*
4301                          * Inline a custom version of __napi_complete().
4302                          * only current cpu owns and manipulates this napi,
4303                          * and NAPI_STATE_SCHED is the only possible flag set
4304                          * on backlog.
4305                          * We can use a plain write instead of clear_bit(),
4306                          * and we dont need an smp_mb() memory barrier.
4307                          */
4308                         list_del(&napi->poll_list);
4309                         napi->state = 0;
4310                         rps_unlock(sd);
4311
4312                         break;
4313                 }
4314
                     /* Move everything queued so far over to process_queue. */
4315                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4316                                            &sd->process_queue);
4317                 rps_unlock(sd);
4318         }
4319         local_irq_enable();
4320
4321         return work;
4322 }
4323
4324 /**
4325  * __napi_schedule - schedule for receive
4326  * @n: entry to schedule
4327  *
4328  * The entry's receive function will be scheduled to run
4329  */
4330 void __napi_schedule(struct napi_struct *n)
4331 {
4332         unsigned long flags;
4333
4334         local_irq_save(flags);
4335         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4336         local_irq_restore(flags);
4337 }
4338 EXPORT_SYMBOL(__napi_schedule);
4339
4340 void __napi_complete(struct napi_struct *n)
4341 {
4342         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4343         BUG_ON(n->gro_list);
4344
4345         list_del(&n->poll_list);
4346         smp_mb__before_atomic();
4347         clear_bit(NAPI_STATE_SCHED, &n->state);
4348 }
4349 EXPORT_SYMBOL(__napi_complete);
4350
4351 void napi_complete(struct napi_struct *n)
4352 {
4353         unsigned long flags;
4354
4355         /*
4356          * don't let napi dequeue from the cpu poll list
4357          * just in case its running on a different cpu
4358          */
4359         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4360                 return;
4361
4362         napi_gro_flush(n, false);
4363         local_irq_save(flags);
4364         __napi_complete(n);
4365         local_irq_restore(flags);
4366 }
4367 EXPORT_SYMBOL(napi_complete);
4368
4369 /* must be called under rcu_read_lock(), as we dont take a reference */
4370 struct napi_struct *napi_by_id(unsigned int napi_id)
4371 {
4372         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4373         struct napi_struct *napi;
4374
4375         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4376                 if (napi->napi_id == napi_id)
4377                         return napi;
4378
4379         return NULL;
4380 }
4381 EXPORT_SYMBOL_GPL(napi_by_id);
4382
4383 void napi_hash_add(struct napi_struct *napi)
4384 {
4385         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4386
4387                 spin_lock(&napi_hash_lock);
4388
4389                 /* 0 is not a valid id, we also skip an id that is taken
4390                  * we expect both events to be extremely rare
4391                  */
4392                 napi->napi_id = 0;
4393                 while (!napi->napi_id) {
4394                         napi->napi_id = ++napi_gen_id;
4395                         if (napi_by_id(napi->napi_id))
4396                                 napi->napi_id = 0;
4397                 }
4398
4399                 hlist_add_head_rcu(&napi->napi_hash_node,
4400                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4401
4402                 spin_unlock(&napi_hash_lock);
4403         }
4404 }
4405 EXPORT_SYMBOL_GPL(napi_hash_add);
4406
4407 /* Warning : caller is responsible to make sure rcu grace period
4408  * is respected before freeing memory containing @napi
4409  */
4410 void napi_hash_del(struct napi_struct *napi)
4411 {
4412         spin_lock(&napi_hash_lock);
4413
4414         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4415                 hlist_del_rcu(&napi->napi_hash_node);
4416
4417         spin_unlock(&napi_hash_lock);
4418 }
4419 EXPORT_SYMBOL_GPL(napi_hash_del);
4420
4421 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4422                     int (*poll)(struct napi_struct *, int), int weight)
4423 {
4424         INIT_LIST_HEAD(&napi->poll_list);
4425         napi->gro_count = 0;
4426         napi->gro_list = NULL;
4427         napi->skb = NULL;
4428         napi->poll = poll;
4429         if (weight > NAPI_POLL_WEIGHT)
4430                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4431                             weight, dev->name);
4432         napi->weight = weight;
4433         list_add(&napi->dev_list, &dev->napi_list);
4434         napi->dev = dev;
4435 #ifdef CONFIG_NETPOLL
4436         spin_lock_init(&napi->poll_lock);
4437         napi->poll_owner = -1;
4438 #endif
4439         set_bit(NAPI_STATE_SCHED, &napi->state);
4440 }
4441 EXPORT_SYMBOL(netif_napi_add);
4442
4443 void netif_napi_del(struct napi_struct *napi)
4444 {
4445         list_del_init(&napi->dev_list);
4446         napi_free_frags(napi);
4447
4448         kfree_skb_list(napi->gro_list);
4449         napi->gro_list = NULL;
4450         napi->gro_count = 0;
4451 }
4452 EXPORT_SYMBOL(netif_napi_del);
4453
4454 static void net_rx_action(struct softirq_action *h)
4455 {
4456         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4457         unsigned long time_limit = jiffies + 2;
4458         int budget = netdev_budget;
4459         void *have;
4460
4461         local_irq_disable();
4462
4463         while (!list_empty(&sd->poll_list)) {
4464                 struct napi_struct *n;
4465                 int work, weight;
4466
4467                 /* If softirq window is exhuasted then punt.
4468                  * Allow this to run for 2 jiffies since which will allow
4469                  * an average latency of 1.5/HZ.
4470                  */
4471                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4472                         goto softnet_break;
4473
4474                 local_irq_enable();
4475
4476                 /* Even though interrupts have been re-enabled, this
4477                  * access is safe because interrupts can only add new
4478                  * entries to the tail of this list, and only ->poll()
4479                  * calls can remove this head entry from the list.
4480                  */
4481                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4482
4483                 have = netpoll_poll_lock(n);
4484
4485                 weight = n->weight;
4486
4487                 /* This NAPI_STATE_SCHED test is for avoiding a race
4488                  * with netpoll's poll_napi().  Only the entity which
4489                  * obtains the lock and sees NAPI_STATE_SCHED set will
4490                  * actually make the ->poll() call.  Therefore we avoid
4491                  * accidentally calling ->poll() when NAPI is not scheduled.
4492                  */
4493                 work = 0;
4494                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4495                         work = n->poll(n, weight);
4496                         trace_napi_poll(n);
4497                 }
4498
4499                 WARN_ON_ONCE(work > weight);
4500
4501                 budget -= work;
4502
4503                 local_irq_disable();
4504
4505                 /* Drivers must not modify the NAPI state if they
4506                  * consume the entire weight.  In such cases this code
4507                  * still "owns" the NAPI instance and therefore can
4508                  * move the instance around on the list at-will.
4509                  */
4510                 if (unlikely(work == weight)) {
4511                         if (unlikely(napi_disable_pending(n))) {
4512                                 local_irq_enable();
4513                                 napi_complete(n);
4514                                 local_irq_disable();
4515                         } else {
4516                                 if (n->gro_list) {
4517                                         /* flush too old packets
4518                                          * If HZ < 1000, flush all packets.
4519                                          */
4520                                         local_irq_enable();
4521                                         napi_gro_flush(n, HZ >= 1000);
4522                                         local_irq_disable();
4523                                 }
4524                                 list_move_tail(&n->poll_list, &sd->poll_list);
4525                         }
4526                 }
4527
4528                 netpoll_poll_unlock(have);
4529         }
4530 out:
4531         net_rps_action_and_irq_enable(sd);
4532
4533 #ifdef CONFIG_NET_DMA
4534         /*
4535          * There may not be any more sk_buffs coming right now, so push
4536          * any pending DMA copies to hardware
4537          */
4538         dma_issue_pending_all();
4539 #endif
4540
4541         return;
4542
4543 softnet_break:
4544         sd->time_squeeze++;
4545         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4546         goto out;
4547 }
4548
/*
 * One entry on a device's adjacency list (upper or lower); it records
 * the device at the other end of the link.
 */
struct netdev_adjacent {
        /* the adjacent device; compared by __netdev_find_adj() */
        struct net_device *dev;

        /* upper master flag, there can only be one master device per list */
        bool master;

        /* counter for the number of times this device was added to us */
        u16 ref_nr;

        /* private field for the users */
        void *private;

        /* node linking this entry into the owning adjacency list */
        struct list_head list;
        /* used by kfree_rcu() when the entry is removed */
        struct rcu_head rcu;
};
4564
4565 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4566                                                  struct net_device *adj_dev,
4567                                                  struct list_head *adj_list)
4568 {
4569         struct netdev_adjacent *adj;
4570
4571         list_for_each_entry(adj, adj_list, list) {
4572                 if (adj->dev == adj_dev)
4573                         return adj;
4574         }
4575         return NULL;
4576 }
4577
4578 /**
4579  * netdev_has_upper_dev - Check if device is linked to an upper device
4580  * @dev: device
4581  * @upper_dev: upper device to check
4582  *
4583  * Find out if a device is linked to specified upper device and return true
4584  * in case it is. Note that this checks only immediate upper device,
4585  * not through a complete stack of devices. The caller must hold the RTNL lock.
4586  */
4587 bool netdev_has_upper_dev(struct net_device *dev,
4588                           struct net_device *upper_dev)
4589 {
4590         ASSERT_RTNL();
4591
4592         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4593 }
4594 EXPORT_SYMBOL(netdev_has_upper_dev);
4595
4596 /**
4597  * netdev_has_any_upper_dev - Check if device is linked to some device
4598  * @dev: device
4599  *
4600  * Find out if a device is linked to an upper device and return true in case
4601  * it is. The caller must hold the RTNL lock.
4602  */
4603 static bool netdev_has_any_upper_dev(struct net_device *dev)
4604 {
4605         ASSERT_RTNL();
4606
4607         return !list_empty(&dev->all_adj_list.upper);
4608 }
4609
4610 /**
4611  * netdev_master_upper_dev_get - Get master upper device
4612  * @dev: device
4613  *
4614  * Find a master upper device and return pointer to it or NULL in case
4615  * it's not there. The caller must hold the RTNL lock.
4616  */
4617 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4618 {
4619         struct netdev_adjacent *upper;
4620
4621         ASSERT_RTNL();
4622
4623         if (list_empty(&dev->adj_list.upper))
4624                 return NULL;
4625
4626         upper = list_first_entry(&dev->adj_list.upper,
4627                                  struct netdev_adjacent, list);
4628         if (likely(upper->master))
4629                 return upper->dev;
4630         return NULL;
4631 }
4632 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4633
4634 void *netdev_adjacent_get_private(struct list_head *adj_list)
4635 {
4636         struct netdev_adjacent *adj;
4637
4638         adj = list_entry(adj_list, struct netdev_adjacent, list);
4639
4640         return adj->private;
4641 }
4642 EXPORT_SYMBOL(netdev_adjacent_get_private);
4643
4644 /**
4645  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4646  * @dev: device
4647  * @iter: list_head ** of the current position
4648  *
4649  * Gets the next device from the dev's upper list, starting from iter
4650  * position. The caller must hold RCU read lock.
4651  */
4652 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4653                                                  struct list_head **iter)
4654 {
4655         struct netdev_adjacent *upper;
4656
4657         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4658
4659         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4660
4661         if (&upper->list == &dev->adj_list.upper)
4662                 return NULL;
4663
4664         *iter = &upper->list;
4665
4666         return upper->dev;
4667 }
4668 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4669
4670 /**
4671  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4672  * @dev: device
4673  * @iter: list_head ** of the current position
4674  *
4675  * Gets the next device from the dev's upper list, starting from iter
4676  * position. The caller must hold RCU read lock.
4677  */
4678 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4679                                                      struct list_head **iter)
4680 {
4681         struct netdev_adjacent *upper;
4682
4683         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4684
4685         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4686
4687         if (&upper->list == &dev->all_adj_list.upper)
4688                 return NULL;
4689
4690         *iter = &upper->list;
4691
4692         return upper->dev;
4693 }
4694 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4695
4696 /**
4697  * netdev_lower_get_next_private - Get the next ->private from the
4698  *                                 lower neighbour list
4699  * @dev: device
4700  * @iter: list_head ** of the current position
4701  *
4702  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4703  * list, starting from iter position. The caller must hold either hold the
4704  * RTNL lock or its own locking that guarantees that the neighbour lower
4705  * list will remain unchainged.
4706  */
4707 void *netdev_lower_get_next_private(struct net_device *dev,
4708                                     struct list_head **iter)
4709 {
4710         struct netdev_adjacent *lower;
4711
4712         lower = list_entry(*iter, struct netdev_adjacent, list);
4713
4714         if (&lower->list == &dev->adj_list.lower)
4715                 return NULL;
4716
4717         *iter = lower->list.next;
4718
4719         return lower->private;
4720 }
4721 EXPORT_SYMBOL(netdev_lower_get_next_private);
4722
4723 /**
4724  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4725  *                                     lower neighbour list, RCU
4726  *                                     variant
4727  * @dev: device
4728  * @iter: list_head ** of the current position
4729  *
4730  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4731  * list, starting from iter position. The caller must hold RCU read lock.
4732  */
4733 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4734                                         struct list_head **iter)
4735 {
4736         struct netdev_adjacent *lower;
4737
4738         WARN_ON_ONCE(!rcu_read_lock_held());
4739
4740         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4741
4742         if (&lower->list == &dev->adj_list.lower)
4743                 return NULL;
4744
4745         *iter = &lower->list;
4746
4747         return lower->private;
4748 }
4749 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4750
4751 /**
4752  * netdev_lower_get_next - Get the next device from the lower neighbour
4753  *                         list
4754  * @dev: device
4755  * @iter: list_head ** of the current position
4756  *
4757  * Gets the next netdev_adjacent from the dev's lower neighbour
4758  * list, starting from iter position. The caller must hold RTNL lock or
4759  * its own locking that guarantees that the neighbour lower
4760  * list will remain unchainged.
4761  */
4762 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4763 {
4764         struct netdev_adjacent *lower;
4765
4766         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4767
4768         if (&lower->list == &dev->adj_list.lower)
4769                 return NULL;
4770
4771         *iter = &lower->list;
4772
4773         return lower->dev;
4774 }
4775 EXPORT_SYMBOL(netdev_lower_get_next);
4776
4777 /**
4778  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4779  *                                     lower neighbour list, RCU
4780  *                                     variant
4781  * @dev: device
4782  *
4783  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4784  * list. The caller must hold RCU read lock.
4785  */
4786 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4787 {
4788         struct netdev_adjacent *lower;
4789
4790         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4791                         struct netdev_adjacent, list);
4792         if (lower)
4793                 return lower->private;
4794         return NULL;
4795 }
4796 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4797
4798 /**
4799  * netdev_master_upper_dev_get_rcu - Get master upper device
4800  * @dev: device
4801  *
4802  * Find a master upper device and return pointer to it or NULL in case
4803  * it's not there. The caller must hold the RCU read lock.
4804  */
4805 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4806 {
4807         struct netdev_adjacent *upper;
4808
4809         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4810                                        struct netdev_adjacent, list);
4811         if (upper && likely(upper->master))
4812                 return upper->dev;
4813         return NULL;
4814 }
4815 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4816
4817 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4818                               struct net_device *adj_dev,
4819                               struct list_head *dev_list)
4820 {
4821         char linkname[IFNAMSIZ+7];
4822         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4823                 "upper_%s" : "lower_%s", adj_dev->name);
4824         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4825                                  linkname);
4826 }
4827 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4828                                char *name,
4829                                struct list_head *dev_list)
4830 {
4831         char linkname[IFNAMSIZ+7];
4832         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4833                 "upper_%s" : "lower_%s", name);
4834         sysfs_remove_link(&(dev->dev.kobj), linkname);
4835 }
4836
/*
 * True when @dev_list is one of @dev's direct-neighbour lists
 * (adj_list.upper or adj_list.lower). Arguments are parenthesized so
 * that arbitrary expressions can be passed safely.
 */
#define netdev_adjacent_is_neigh_list(dev, dev_list) \
                ((dev_list) == &(dev)->adj_list.upper || \
                 (dev_list) == &(dev)->adj_list.lower)
4840
4841 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4842                                         struct net_device *adj_dev,
4843                                         struct list_head *dev_list,
4844                                         void *private, bool master)
4845 {
4846         struct netdev_adjacent *adj;
4847         int ret;
4848
4849         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4850
4851         if (adj) {
4852                 adj->ref_nr++;
4853                 return 0;
4854         }
4855
4856         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4857         if (!adj)
4858                 return -ENOMEM;
4859
4860         adj->dev = adj_dev;
4861         adj->master = master;
4862         adj->ref_nr = 1;
4863         adj->private = private;
4864         dev_hold(adj_dev);
4865
4866         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4867                  adj_dev->name, dev->name, adj_dev->name);
4868
4869         if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4870                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4871                 if (ret)
4872                         goto free_adj;
4873         }
4874
4875         /* Ensure that master link is always the first item in list. */
4876         if (master) {
4877                 ret = sysfs_create_link(&(dev->dev.kobj),
4878                                         &(adj_dev->dev.kobj), "master");
4879                 if (ret)
4880                         goto remove_symlinks;
4881
4882                 list_add_rcu(&adj->list, dev_list);
4883         } else {
4884                 list_add_tail_rcu(&adj->list, dev_list);
4885         }
4886
4887         return 0;
4888
4889 remove_symlinks:
4890         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4891                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4892 free_adj:
4893         kfree(adj);
4894         dev_put(adj_dev);
4895
4896         return ret;
4897 }
4898
4899 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4900                                          struct net_device *adj_dev,
4901                                          struct list_head *dev_list)
4902 {
4903         struct netdev_adjacent *adj;
4904
4905         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4906
4907         if (!adj) {
4908                 pr_err("tried to remove device %s from %s\n",
4909                        dev->name, adj_dev->name);
4910                 BUG();
4911         }
4912
4913         if (adj->ref_nr > 1) {
4914                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4915                          adj->ref_nr-1);
4916                 adj->ref_nr--;
4917                 return;
4918         }
4919
4920         if (adj->master)
4921                 sysfs_remove_link(&(dev->dev.kobj), "master");
4922
4923         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4924                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4925
4926         list_del_rcu(&adj->list);
4927         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4928                  adj_dev->name, dev->name, adj_dev->name);
4929         dev_put(adj_dev);
4930         kfree_rcu(adj, rcu);
4931 }
4932
4933 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4934                                             struct net_device *upper_dev,
4935                                             struct list_head *up_list,
4936                                             struct list_head *down_list,
4937                                             void *private, bool master)
4938 {
4939         int ret;
4940
4941         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4942                                            master);
4943         if (ret)
4944                 return ret;
4945
4946         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4947                                            false);
4948         if (ret) {
4949                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4950                 return ret;
4951         }
4952
4953         return 0;
4954 }
4955
4956 static int __netdev_adjacent_dev_link(struct net_device *dev,
4957                                       struct net_device *upper_dev)
4958 {
4959         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4960                                                 &dev->all_adj_list.upper,
4961                                                 &upper_dev->all_adj_list.lower,
4962                                                 NULL, false);
4963 }
4964
/*
 * __netdev_adjacent_dev_unlink_lists - remove @upper_dev from @up_list
 * (owned by @dev), then @dev from @down_list (owned by @upper_dev).
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        /* Upward entry first, downward entry second. */
        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);

        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4973
4974 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4975                                          struct net_device *upper_dev)
4976 {
4977         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4978                                            &dev->all_adj_list.upper,
4979                                            &upper_dev->all_adj_list.lower);
4980 }
4981
4982 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4983                                                 struct net_device *upper_dev,
4984                                                 void *private, bool master)
4985 {
4986         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4987
4988         if (ret)
4989                 return ret;
4990
4991         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4992                                                &dev->adj_list.upper,
4993                                                &upper_dev->adj_list.lower,
4994                                                private, master);
4995         if (ret) {
4996                 __netdev_adjacent_dev_unlink(dev, upper_dev);
4997                 return ret;
4998         }
4999
5000         return 0;
5001 }
5002
5003 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5004                                                    struct net_device *upper_dev)
5005 {
5006         __netdev_adjacent_dev_unlink(dev, upper_dev);
5007         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5008                                            &dev->adj_list.upper,
5009                                            &upper_dev->adj_list.lower);
5010 }
5011
5012 static int __netdev_upper_dev_link(struct net_device *dev,
5013                                    struct net_device *upper_dev, bool master,
5014                                    void *private)
5015 {
5016         struct netdev_adjacent *i, *j, *to_i, *to_j;
5017         int ret = 0;
5018
5019         ASSERT_RTNL();
5020
5021         if (dev == upper_dev)
5022                 return -EBUSY;
5023
5024         /* To prevent loops, check if dev is not upper device to upper_dev. */
5025         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5026                 return -EBUSY;
5027
5028         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5029                 return -EEXIST;
5030
5031         if (master && netdev_master_upper_dev_get(dev))
5032                 return -EBUSY;
5033
5034         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5035                                                    master);
5036         if (ret)
5037                 return ret;
5038
5039         /* Now that we linked these devs, make all the upper_dev's
5040          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5041          * versa, and don't forget the devices itself. All of these
5042          * links are non-neighbours.
5043          */
5044         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5045                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5046                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5047                                  i->dev->name, j->dev->name);
5048                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5049                         if (ret)
5050                                 goto rollback_mesh;
5051                 }
5052         }
5053
5054         /* add dev to every upper_dev's upper device */
5055         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5056                 pr_debug("linking %s's upper device %s with %s\n",
5057                          upper_dev->name, i->dev->name, dev->name);
5058                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5059                 if (ret)
5060                         goto rollback_upper_mesh;
5061         }
5062
5063         /* add upper_dev to every dev's lower device */
5064         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5065                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5066                          i->dev->name, upper_dev->name);
5067                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5068                 if (ret)
5069                         goto rollback_lower_mesh;
5070         }
5071
5072         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5073         return 0;
5074
5075 rollback_lower_mesh:
5076         to_i = i;
5077         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5078                 if (i == to_i)
5079                         break;
5080                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5081         }
5082
5083         i = NULL;
5084
5085 rollback_upper_mesh:
5086         to_i = i;
5087         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5088                 if (i == to_i)
5089                         break;
5090                 __netdev_adjacent_dev_unlink(dev, i->dev);
5091         }
5092
5093         i = j = NULL;
5094
5095 rollback_mesh:
5096         to_i = i;
5097         to_j = j;
5098         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5099                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5100                         if (i == to_i && j == to_j)
5101                                 break;
5102                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5103                 }
5104                 if (i == to_i)
5105                         break;
5106         }
5107
5108         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5109
5110         return ret;
5111 }
5112
5113 /**
5114  * netdev_upper_dev_link - Add a link to the upper device
5115  * @dev: device
5116  * @upper_dev: new upper device
5117  *
5118  * Adds a link to device which is upper to this one. The caller must hold
5119  * the RTNL lock. On a failure a negative errno code is returned.
5120  * On success the reference counts are adjusted and the function
5121  * returns zero.
5122  */
5123 int netdev_upper_dev_link(struct net_device *dev,
5124                           struct net_device *upper_dev)
5125 {
5126         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5127 }
5128 EXPORT_SYMBOL(netdev_upper_dev_link);
5129
5130 /**
5131  * netdev_master_upper_dev_link - Add a master link to the upper device
5132  * @dev: device
5133  * @upper_dev: new upper device
5134  *
5135  * Adds a link to device which is upper to this one. In this case, only
5136  * one master upper device can be linked, although other non-master devices
5137  * might be linked as well. The caller must hold the RTNL lock.
5138  * On a failure a negative errno code is returned. On success the reference
5139  * counts are adjusted and the function returns zero.
5140  */
5141 int netdev_master_upper_dev_link(struct net_device *dev,
5142                                  struct net_device *upper_dev)
5143 {
5144         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5145 }
5146 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5147
5148 int netdev_master_upper_dev_link_private(struct net_device *dev,
5149                                          struct net_device *upper_dev,
5150                                          void *private)
5151 {
5152         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5153 }
5154 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5155
5156 /**
5157  * netdev_upper_dev_unlink - Removes a link to upper device
5158  * @dev: device
5159  * @upper_dev: new upper device
5160  *
5161  * Removes a link to device which is upper to this one. The caller must hold
5162  * the RTNL lock.
5163  */
5164 void netdev_upper_dev_unlink(struct net_device *dev,
5165                              struct net_device *upper_dev)
5166 {
5167         struct netdev_adjacent *i, *j;
5168         ASSERT_RTNL();
5169
5170         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5171
5172         /* Here is the tricky part. We must remove all dev's lower
5173          * devices from all upper_dev's upper devices and vice
5174          * versa, to maintain the graph relationship.
5175          */
5176         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5177                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5178                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5179
5180         /* remove also the devices itself from lower/upper device
5181          * list
5182          */
5183         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5184                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5185
5186         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5187                 __netdev_adjacent_dev_unlink(dev, i->dev);
5188
5189         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5190 }
5191 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5192
5193 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5194 {
5195         struct netdev_adjacent *iter;
5196
5197         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5198                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5199                                           &iter->dev->adj_list.lower);
5200                 netdev_adjacent_sysfs_add(iter->dev, dev,
5201                                           &iter->dev->adj_list.lower);
5202         }
5203
5204         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5205                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5206                                           &iter->dev->adj_list.upper);
5207                 netdev_adjacent_sysfs_add(iter->dev, dev,
5208                                           &iter->dev->adj_list.upper);
5209         }
5210 }
5211
5212 void *netdev_lower_dev_get_private(struct net_device *dev,
5213                                    struct net_device *lower_dev)
5214 {
5215         struct netdev_adjacent *lower;
5216
5217         if (!lower_dev)
5218                 return NULL;
5219         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5220         if (!lower)
5221                 return NULL;
5222
5223         return lower->private;
5224 }
5225 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5226
5227
5228 int dev_get_nest_level(struct net_device *dev,
5229                        bool (*type_check)(struct net_device *dev))
5230 {
5231         struct net_device *lower = NULL;
5232         struct list_head *iter;
5233         int max_nest = -1;
5234         int nest;
5235
5236         ASSERT_RTNL();
5237
5238         netdev_for_each_lower_dev(dev, lower, iter) {
5239                 nest = dev_get_nest_level(lower, type_check);
5240                 if (max_nest < nest)
5241                         max_nest = nest;
5242         }
5243
5244         if (type_check(dev))
5245                 max_nest++;
5246
5247         return max_nest;
5248 }
5249 EXPORT_SYMBOL(dev_get_nest_level);
5250
5251 static void dev_change_rx_flags(struct net_device *dev, int flags)
5252 {
5253         const struct net_device_ops *ops = dev->netdev_ops;
5254
5255         if (ops->ndo_change_rx_flags)
5256                 ops->ndo_change_rx_flags(dev, flags);
5257 }
5258
5259 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5260 {
5261         unsigned int old_flags = dev->flags;
5262         kuid_t uid;
5263         kgid_t gid;
5264
5265         ASSERT_RTNL();
5266
5267         dev->flags |= IFF_PROMISC;
5268         dev->promiscuity += inc;
5269         if (dev->promiscuity == 0) {
5270                 /*
5271                  * Avoid overflow.
5272                  * If inc causes overflow, untouch promisc and return error.
5273                  */
5274                 if (inc < 0)
5275                         dev->flags &= ~IFF_PROMISC;
5276                 else {
5277                         dev->promiscuity -= inc;
5278                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5279                                 dev->name);
5280                         return -EOVERFLOW;
5281                 }
5282         }
5283         if (dev->flags != old_flags) {
5284                 pr_info("device %s %s promiscuous mode\n",
5285                         dev->name,
5286                         dev->flags & IFF_PROMISC ? "entered" : "left");
5287                 if (audit_enabled) {
5288                         current_uid_gid(&uid, &gid);
5289                         audit_log(current->audit_context, GFP_ATOMIC,
5290                                 AUDIT_ANOM_PROMISCUOUS,
5291                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5292                                 dev->name, (dev->flags & IFF_PROMISC),
5293                                 (old_flags & IFF_PROMISC),
5294                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5295                                 from_kuid(&init_user_ns, uid),
5296                                 from_kgid(&init_user_ns, gid),
5297                                 audit_get_sessionid(current));
5298                 }
5299
5300                 dev_change_rx_flags(dev, IFF_PROMISC);
5301         }
5302         if (notify)
5303                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5304         return 0;
5305 }
5306
5307 /**
5308  *      dev_set_promiscuity     - update promiscuity count on a device
5309  *      @dev: device
5310  *      @inc: modifier
5311  *
5312  *      Add or remove promiscuity from a device. While the count in the device
5313  *      remains above zero the interface remains promiscuous. Once it hits zero
5314  *      the device reverts back to normal filtering operation. A negative inc
5315  *      value is used to drop promiscuity on the device.
5316  *      Return 0 if successful or a negative errno code on error.
5317  */
5318 int dev_set_promiscuity(struct net_device *dev, int inc)
5319 {
5320         unsigned int old_flags = dev->flags;
5321         int err;
5322
5323         err = __dev_set_promiscuity(dev, inc, true);
5324         if (err < 0)
5325                 return err;
5326         if (dev->flags != old_flags)
5327                 dev_set_rx_mode(dev);
5328         return err;
5329 }
5330 EXPORT_SYMBOL(dev_set_promiscuity);
5331
5332 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5333 {
5334         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5335
5336         ASSERT_RTNL();
5337
5338         dev->flags |= IFF_ALLMULTI;
5339         dev->allmulti += inc;
5340         if (dev->allmulti == 0) {
5341                 /*
5342                  * Avoid overflow.
5343                  * If inc causes overflow, untouch allmulti and return error.
5344                  */
5345                 if (inc < 0)
5346                         dev->flags &= ~IFF_ALLMULTI;
5347                 else {
5348                         dev->allmulti -= inc;
5349                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5350                                 dev->name);
5351                         return -EOVERFLOW;
5352                 }
5353         }
5354         if (dev->flags ^ old_flags) {
5355                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5356                 dev_set_rx_mode(dev);
5357                 if (notify)
5358                         __dev_notify_flags(dev, old_flags,
5359                                            dev->gflags ^ old_gflags);
5360         }
5361         return 0;
5362 }
5363
5364 /**
5365  *      dev_set_allmulti        - update allmulti count on a device
5366  *      @dev: device
5367  *      @inc: modifier
5368  *
5369  *      Add or remove reception of all multicast frames to a device. While the
5370  *      count in the device remains above zero the interface remains listening
5371  *      to all interfaces. Once it hits zero the device reverts back to normal
5372  *      filtering operation. A negative @inc value is used to drop the counter
5373  *      when releasing a resource needing all multicasts.
5374  *      Return 0 if successful or a negative errno code on error.
5375  */
5376
5377 int dev_set_allmulti(struct net_device *dev, int inc)
5378 {
5379         return __dev_set_allmulti(dev, inc, true);
5380 }
5381 EXPORT_SYMBOL(dev_set_allmulti);
5382
5383 /*
5384  *      Upload unicast and multicast address lists to device and
5385  *      configure RX filtering. When the device doesn't support unicast
5386  *      filtering it is put in promiscuous mode while unicast addresses
5387  *      are present.
5388  */
5389 void __dev_set_rx_mode(struct net_device *dev)
5390 {
5391         const struct net_device_ops *ops = dev->netdev_ops;
5392
5393         /* dev_open will call this function so the list will stay sane. */
5394         if (!(dev->flags&IFF_UP))
5395                 return;
5396
5397         if (!netif_device_present(dev))
5398                 return;
5399
5400         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5401                 /* Unicast addresses changes may only happen under the rtnl,
5402                  * therefore calling __dev_set_promiscuity here is safe.
5403                  */
5404                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5405                         __dev_set_promiscuity(dev, 1, false);
5406                         dev->uc_promisc = true;
5407                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5408                         __dev_set_promiscuity(dev, -1, false);
5409                         dev->uc_promisc = false;
5410                 }
5411         }
5412
5413         if (ops->ndo_set_rx_mode)
5414                 ops->ndo_set_rx_mode(dev);
5415 }
5416
/*
 * Locked variant of __dev_set_rx_mode(): the call is wrapped between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
5423
5424 /**
5425  *      dev_get_flags - get flags reported to userspace
5426  *      @dev: device
5427  *
5428  *      Get the combination of flag bits exported through APIs to userspace.
5429  */
5430 unsigned int dev_get_flags(const struct net_device *dev)
5431 {
5432         unsigned int flags;
5433
5434         flags = (dev->flags & ~(IFF_PROMISC |
5435                                 IFF_ALLMULTI |
5436                                 IFF_RUNNING |
5437                                 IFF_LOWER_UP |
5438                                 IFF_DORMANT)) |
5439                 (dev->gflags & (IFF_PROMISC |
5440                                 IFF_ALLMULTI));
5441
5442         if (netif_running(dev)) {
5443                 if (netif_oper_up(dev))
5444                         flags |= IFF_RUNNING;
5445                 if (netif_carrier_ok(dev))
5446                         flags |= IFF_LOWER_UP;
5447                 if (netif_dormant(dev))
5448                         flags |= IFF_DORMANT;
5449         }
5450
5451         return flags;
5452 }
5453 EXPORT_SYMBOL(dev_get_flags);
5454
5455 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5456 {
5457         unsigned int old_flags = dev->flags;
5458         int ret;
5459
5460         ASSERT_RTNL();
5461
5462         /*
5463          *      Set the flags on our device.
5464          */
5465
5466         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5467                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5468                                IFF_AUTOMEDIA)) |
5469                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5470                                     IFF_ALLMULTI));
5471
5472         /*
5473          *      Load in the correct multicast list now the flags have changed.
5474          */
5475
5476         if ((old_flags ^ flags) & IFF_MULTICAST)
5477                 dev_change_rx_flags(dev, IFF_MULTICAST);
5478
5479         dev_set_rx_mode(dev);
5480
5481         /*
5482          *      Have we downed the interface. We handle IFF_UP ourselves
5483          *      according to user attempts to set it, rather than blindly
5484          *      setting it.
5485          */
5486
5487         ret = 0;
5488         if ((old_flags ^ flags) & IFF_UP)
5489                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5490
5491         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5492                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5493                 unsigned int old_flags = dev->flags;
5494
5495                 dev->gflags ^= IFF_PROMISC;
5496
5497                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5498                         if (dev->flags != old_flags)
5499                                 dev_set_rx_mode(dev);
5500         }
5501
5502         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5503            is important. Some (broken) drivers set IFF_PROMISC, when
5504            IFF_ALLMULTI is requested not asking us and not reporting.
5505          */
5506         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5507                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5508
5509                 dev->gflags ^= IFF_ALLMULTI;
5510                 __dev_set_allmulti(dev, inc, false);
5511         }
5512
5513         return ret;
5514 }
5515
5516 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5517                         unsigned int gchanges)
5518 {
5519         unsigned int changes = dev->flags ^ old_flags;
5520
5521         if (gchanges)
5522                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5523
5524         if (changes & IFF_UP) {
5525                 if (dev->flags & IFF_UP)
5526                         call_netdevice_notifiers(NETDEV_UP, dev);
5527                 else
5528                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5529         }
5530
5531         if (dev->flags & IFF_UP &&
5532             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5533                 struct netdev_notifier_change_info change_info;
5534
5535                 change_info.flags_changed = changes;
5536                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5537                                               &change_info.info);
5538         }
5539 }
5540
5541 /**
5542  *      dev_change_flags - change device settings
5543  *      @dev: device
5544  *      @flags: device state flags
5545  *
5546  *      Change settings on device based state flags. The flags are
5547  *      in the userspace exported format.
5548  */
5549 int dev_change_flags(struct net_device *dev, unsigned int flags)
5550 {
5551         int ret;
5552         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5553
5554         ret = __dev_change_flags(dev, flags);
5555         if (ret < 0)
5556                 return ret;
5557
5558         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5559         __dev_notify_flags(dev, old_flags, changes);
5560         return ret;
5561 }
5562 EXPORT_SYMBOL(dev_change_flags);
5563
5564 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5565 {
5566         const struct net_device_ops *ops = dev->netdev_ops;
5567
5568         if (ops->ndo_change_mtu)
5569                 return ops->ndo_change_mtu(dev, new_mtu);
5570
5571         dev->mtu = new_mtu;
5572         return 0;
5573 }
5574
5575 /**
5576  *      dev_set_mtu - Change maximum transfer unit
5577  *      @dev: device
5578  *      @new_mtu: new transfer unit
5579  *
5580  *      Change the maximum transfer size of the network device.
5581  */
5582 int dev_set_mtu(struct net_device *dev, int new_mtu)
5583 {
5584         int err, orig_mtu;
5585
5586         if (new_mtu == dev->mtu)
5587                 return 0;
5588
5589         /*      MTU must be positive.    */
5590         if (new_mtu < 0)
5591                 return -EINVAL;
5592
5593         if (!netif_device_present(dev))
5594                 return -ENODEV;
5595
5596         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5597         err = notifier_to_errno(err);
5598         if (err)
5599                 return err;
5600
5601         orig_mtu = dev->mtu;
5602         err = __dev_set_mtu(dev, new_mtu);
5603
5604         if (!err) {
5605                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5606                 err = notifier_to_errno(err);
5607                 if (err) {
5608                         /* setting mtu back and notifying everyone again,
5609                          * so that they have a chance to revert changes.
5610                          */
5611                         __dev_set_mtu(dev, orig_mtu);
5612                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5613                 }
5614         }
5615         return err;
5616 }
5617 EXPORT_SYMBOL(dev_set_mtu);
5618
/**
 *      dev_set_group - Change group this device belongs to
 *      @dev: device
 *      @new_group: group this device should belong to
 *
 *      Stores @new_group in dev->group. No validation is performed and no
 *      notification is sent from here.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
        dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
5629
5630 /**
5631  *      dev_set_mac_address - Change Media Access Control Address
5632  *      @dev: device
5633  *      @sa: new address
5634  *
5635  *      Change the hardware (MAC) address of the device
5636  */
5637 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5638 {
5639         const struct net_device_ops *ops = dev->netdev_ops;
5640         int err;
5641
5642         if (!ops->ndo_set_mac_address)
5643                 return -EOPNOTSUPP;
5644         if (sa->sa_family != dev->type)
5645                 return -EINVAL;
5646         if (!netif_device_present(dev))
5647                 return -ENODEV;
5648         err = ops->ndo_set_mac_address(dev, sa);
5649         if (err)
5650                 return err;
5651         dev->addr_assign_type = NET_ADDR_SET;
5652         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5653         add_device_randomness(dev->dev_addr, dev->addr_len);
5654         return 0;
5655 }
5656 EXPORT_SYMBOL(dev_set_mac_address);
5657
5658 /**
5659  *      dev_change_carrier - Change device carrier
5660  *      @dev: device
5661  *      @new_carrier: new value
5662  *
5663  *      Change device carrier
5664  */
5665 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5666 {
5667         const struct net_device_ops *ops = dev->netdev_ops;
5668
5669         if (!ops->ndo_change_carrier)
5670                 return -EOPNOTSUPP;
5671         if (!netif_device_present(dev))
5672                 return -ENODEV;
5673         return ops->ndo_change_carrier(dev, new_carrier);
5674 }
5675 EXPORT_SYMBOL(dev_change_carrier);
5676
5677 /**
5678  *      dev_get_phys_port_id - Get device physical port ID
5679  *      @dev: device
5680  *      @ppid: port ID
5681  *
5682  *      Get device physical port ID
5683  */
5684 int dev_get_phys_port_id(struct net_device *dev,
5685                          struct netdev_phys_port_id *ppid)
5686 {
5687         const struct net_device_ops *ops = dev->netdev_ops;
5688
5689         if (!ops->ndo_get_phys_port_id)
5690                 return -EOPNOTSUPP;
5691         return ops->ndo_get_phys_port_id(dev, ppid);
5692 }
5693 EXPORT_SYMBOL(dev_get_phys_port_id);
5694
5695 /**
5696  *      dev_new_index   -       allocate an ifindex
5697  *      @net: the applicable net namespace
5698  *
5699  *      Returns a suitable unique value for a new device interface
5700  *      number.  The caller must hold the rtnl semaphore or the
5701  *      dev_base_lock to be sure it remains unique.
5702  */
5703 static int dev_new_index(struct net *net)
5704 {
5705         int ifindex = net->ifindex;
5706         for (;;) {
5707                 if (++ifindex <= 0)
5708                         ifindex = 1;
5709                 if (!__dev_get_by_index(net, ifindex))
5710                         return net->ifindex = ifindex;
5711         }
5712 }
5713
/* Delayed registration/unregistration: devices queued by net_set_todo(). */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5717
5718 static void net_set_todo(struct net_device *dev)
5719 {
5720         list_add_tail(&dev->todo_list, &net_todo_list);
5721         dev_net(dev)->dev_unreg_count++;
5722 }
5723
5724 static void rollback_registered_many(struct list_head *head)
5725 {
5726         struct net_device *dev, *tmp;
5727         LIST_HEAD(close_head);
5728
5729         BUG_ON(dev_boot_phase);
5730         ASSERT_RTNL();
5731
5732         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5733                 /* Some devices call without registering
5734                  * for initialization unwind. Remove those
5735                  * devices and proceed with the remaining.
5736                  */
5737                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5738                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5739                                  dev->name, dev);
5740
5741                         WARN_ON(1);
5742                         list_del(&dev->unreg_list);
5743                         continue;
5744                 }
5745                 dev->dismantle = true;
5746                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5747         }
5748
5749         /* If device is running, close it first. */
5750         list_for_each_entry(dev, head, unreg_list)
5751                 list_add_tail(&dev->close_list, &close_head);
5752         dev_close_many(&close_head);
5753
5754         list_for_each_entry(dev, head, unreg_list) {
5755                 /* And unlink it from device chain. */
5756                 unlist_netdevice(dev);
5757
5758                 dev->reg_state = NETREG_UNREGISTERING;
5759         }
5760
5761         synchronize_net();
5762
5763         list_for_each_entry(dev, head, unreg_list) {
5764                 /* Shutdown queueing discipline. */
5765                 dev_shutdown(dev);
5766
5767
5768                 /* Notify protocols, that we are about to destroy
5769                    this device. They should clean all the things.
5770                 */
5771                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5772
5773                 /*
5774                  *      Flush the unicast and multicast chains
5775                  */
5776                 dev_uc_flush(dev);
5777                 dev_mc_flush(dev);
5778
5779                 if (dev->netdev_ops->ndo_uninit)
5780                         dev->netdev_ops->ndo_uninit(dev);
5781
5782                 if (!dev->rtnl_link_ops ||
5783                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5784                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5785
5786                 /* Notifier chain MUST detach us all upper devices. */
5787                 WARN_ON(netdev_has_any_upper_dev(dev));
5788
5789                 /* Remove entries from kobject tree */
5790                 netdev_unregister_kobject(dev);
5791 #ifdef CONFIG_XPS
5792                 /* Remove XPS queueing entries */
5793                 netif_reset_xps_queues_gt(dev, 0);
5794 #endif
5795         }
5796
5797         synchronize_net();
5798
5799         list_for_each_entry(dev, head, unreg_list)
5800                 dev_put(dev);
5801 }
5802
5803 static void rollback_registered(struct net_device *dev)
5804 {
5805         LIST_HEAD(single);
5806
5807         list_add(&dev->unreg_list, &single);
5808         rollback_registered_many(&single);
5809         list_del(&single);
5810 }
5811
5812 static netdev_features_t netdev_fix_features(struct net_device *dev,
5813         netdev_features_t features)
5814 {
5815         /* Fix illegal checksum combinations */
5816         if ((features & NETIF_F_HW_CSUM) &&
5817             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5818                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5819                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5820         }
5821
5822         /* TSO requires that SG is present as well. */
5823         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5824                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5825                 features &= ~NETIF_F_ALL_TSO;
5826         }
5827
5828         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5829                                         !(features & NETIF_F_IP_CSUM)) {
5830                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5831                 features &= ~NETIF_F_TSO;
5832                 features &= ~NETIF_F_TSO_ECN;
5833         }
5834
5835         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5836                                          !(features & NETIF_F_IPV6_CSUM)) {
5837                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5838                 features &= ~NETIF_F_TSO6;
5839         }
5840
5841         /* TSO ECN requires that TSO is present as well. */
5842         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5843                 features &= ~NETIF_F_TSO_ECN;
5844
5845         /* Software GSO depends on SG. */
5846         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5847                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5848                 features &= ~NETIF_F_GSO;
5849         }
5850
5851         /* UFO needs SG and checksumming */
5852         if (features & NETIF_F_UFO) {
5853                 /* maybe split UFO into V4 and V6? */
5854                 if (!((features & NETIF_F_GEN_CSUM) ||
5855                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5856                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5857                         netdev_dbg(dev,
5858                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5859                         features &= ~NETIF_F_UFO;
5860                 }
5861
5862                 if (!(features & NETIF_F_SG)) {
5863                         netdev_dbg(dev,
5864                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5865                         features &= ~NETIF_F_UFO;
5866                 }
5867         }
5868
5869 #ifdef CONFIG_NET_RX_BUSY_POLL
5870         if (dev->netdev_ops->ndo_busy_poll)
5871                 features |= NETIF_F_BUSY_POLL;
5872         else
5873 #endif
5874                 features &= ~NETIF_F_BUSY_POLL;
5875
5876         return features;
5877 }
5878
5879 int __netdev_update_features(struct net_device *dev)
5880 {
5881         netdev_features_t features;
5882         int err = 0;
5883
5884         ASSERT_RTNL();
5885
5886         features = netdev_get_wanted_features(dev);
5887
5888         if (dev->netdev_ops->ndo_fix_features)
5889                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5890
5891         /* driver might be less strict about feature dependencies */
5892         features = netdev_fix_features(dev, features);
5893
5894         if (dev->features == features)
5895                 return 0;
5896
5897         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5898                 &dev->features, &features);
5899
5900         if (dev->netdev_ops->ndo_set_features)
5901                 err = dev->netdev_ops->ndo_set_features(dev, features);
5902
5903         if (unlikely(err < 0)) {
5904                 netdev_err(dev,
5905                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5906                         err, &features, &dev->features);
5907                 return -1;
5908         }
5909
5910         if (!err)
5911                 dev->features = features;
5912
5913         return 1;
5914 }
5915
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate dev->features set and send notifications if it
 *      has changed. Should be called after driver or hardware dependent
 *      conditions might have changed that influence the features.
 *
 *      The notification is sent for any non-zero result of
 *      __netdev_update_features(), which includes its error return.
 */
void netdev_update_features(struct net_device *dev)
{
        int result = __netdev_update_features(dev);

        if (result)
                netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5930
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate dev->features set and send notifications even
 *      if they have not changed. Should be called instead of
 *      netdev_update_features() if also dev->vlan_features might
 *      have changed to allow the changes to be propagated to stacked
 *      VLAN devices.
 *
 *      The result of __netdev_update_features() is deliberately not
 *      inspected: the notification is unconditional.
 */
void netdev_change_features(struct net_device *dev)
{
        __netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5947
5948 /**
5949  *      netif_stacked_transfer_operstate -      transfer operstate
5950  *      @rootdev: the root or lower level device to transfer state from
5951  *      @dev: the device to transfer operstate to
5952  *
5953  *      Transfer operational state from root to device. This is normally
5954  *      called when a stacking relationship exists between the root
5955  *      device and the device(a leaf device).
5956  */
5957 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5958                                         struct net_device *dev)
5959 {
5960         if (rootdev->operstate == IF_OPER_DORMANT)
5961                 netif_dormant_on(dev);
5962         else
5963                 netif_dormant_off(dev);
5964
5965         if (netif_carrier_ok(rootdev)) {
5966                 if (!netif_carrier_ok(dev))
5967                         netif_carrier_on(dev);
5968         } else {
5969                 if (netif_carrier_ok(dev))
5970                         netif_carrier_off(dev);
5971         }
5972 }
5973 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5974
#ifdef CONFIG_SYSFS
/*
 * Allocate a zeroed array of dev->num_rx_queues RX queue structures, point
 * every entry back at @dev and store the array in dev->_rx.
 *
 * Returns 0 on success or -ENOMEM when the array cannot be allocated.
 * Hits BUG_ON() when dev->num_rx_queues is zero.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nr_queues < 1);

        queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
        if (!queues)
                return -ENOMEM;

        for (idx = 0; idx < nr_queues; idx++)
                queues[idx].dev = dev;

        dev->_rx = queues;

        return 0;
}
#endif
5994
/*
 * Put one netdev_queue of @dev into its initial state: initialise
 * _xmit_lock and hand it, together with dev->type, to
 * netdev_set_xmit_lockdep_class(); mark the lock as unowned (-1); write
 * NUMA_NO_NODE as the queue's NUMA node; set the back-pointer to @dev;
 * and, with CONFIG_BQL, call dql_init() on the queue's dql with HZ.
 *
 * @_unused is ignored.  The three-argument shape is what allows this
 * function to be passed to netdev_for_each_tx_queue().
 */
5995 static void netdev_init_one_queue(struct net_device *dev,
5996                                   struct netdev_queue *queue, void *_unused)
5997 {
5998         /* Initialize queue lock */
5999         spin_lock_init(&queue->_xmit_lock);
6000         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
        /* -1: no CPU currently owns the xmit lock */
6001         queue->xmit_lock_owner = -1;
6002         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6003         queue->dev = dev;
6004 #ifdef CONFIG_BQL
6005         dql_init(&queue->dql, HZ);
6006 #endif
6007 }
6008
6009 static void netif_free_tx_queues(struct net_device *dev)
6010 {
6011         kvfree(dev->_tx);
6012 }
6013
6014 static int netif_alloc_netdev_queues(struct net_device *dev)
6015 {
6016         unsigned int count = dev->num_tx_queues;
6017         struct netdev_queue *tx;
6018         size_t sz = count * sizeof(*tx);
6019
6020         BUG_ON(count < 1 || count > 0xffff);
6021
6022         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6023         if (!tx) {
6024                 tx = vzalloc(sz);
6025                 if (!tx)
6026                         return -ENOMEM;
6027         }
6028         dev->_tx = tx;
6029
6030         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6031         spin_lock_init(&dev->tx_global_lock);
6032
6033         return 0;
6034 }
6035
6036 /**
6037  *      register_netdevice      - register a network device
6038  *      @dev: device to register
6039  *
6040  *      Take a completed network device structure and add it to the kernel
6041  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6042  *      chain. 0 is returned on success. A negative errno code is returned
6043  *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      Order of operations: the name is validated through
 *      dev_get_valid_name(), the optional ndo_init() hook is run, an
 *      ifindex is assigned or checked for collision (-EBUSY), default
 *      feature bits are merged in, %NETDEV_POST_INIT is sent, the kobject
 *      is registered, the device is listed and finally %NETDEV_REGISTER
 *      is sent.  A positive return value from ndo_init() is reported as
 *      -EIO.  Failures that occur after ndo_init() succeeded and before
 *      the kobject is registered call ndo_uninit() if the driver
 *      provides it.
6044  *
6045  *      Callers must hold the rtnl semaphore. You may want
6046  *      register_netdev() instead of this.
6047  *
6048  *      BUGS:
6049  *      The locking appears insufficient to guarantee two parallel registers
6050  *      will not get the same name.
6051  */
6052
6053 int register_netdevice(struct net_device *dev)
6054 {
6055         int ret;
6056         struct net *net = dev_net(dev);
6057
6058         BUG_ON(dev_boot_phase);
6059         ASSERT_RTNL();
6060
6061         might_sleep();
6062
6063         /* When net_device's are persistent, this will be fatal. */
6064         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6065         BUG_ON(!net);
6066
6067         spin_lock_init(&dev->addr_list_lock);
6068         netdev_set_addr_lockdep_class(dev);
6069
        /* -1 means "not set"; if it is still -1 after ndo_init() it is
         * replaced by dev->ifindex further down.
         */
6070         dev->iflink = -1;
6071
6072         ret = dev_get_valid_name(net, dev, dev->name);
6073         if (ret < 0)
6074                 goto out;
6075
6076         /* Init, if this function is available */
6077         if (dev->netdev_ops->ndo_init) {
6078                 ret = dev->netdev_ops->ndo_init(dev);
6079                 if (ret) {
                        /* Positive values are not errnos: report -EIO.
                         * ndo_init() itself failed, so ndo_uninit() is
                         * not called (goto out, not err_uninit).
                         */
6080                         if (ret > 0)
6081                                 ret = -EIO;
6082                         goto out;
6083                 }
6084         }
6085
        /* Advertising VLAN CTAG filtering requires both vid callbacks. */
6086         if (((dev->hw_features | dev->features) &
6087              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6088             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6089              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6090                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6091                 ret = -EINVAL;
6092                 goto err_uninit;
6093         }
6094
        /* Preset the error for the ifindex collision exit just below. */
6095         ret = -EBUSY;
6096         if (!dev->ifindex)
6097                 dev->ifindex = dev_new_index(net);
6098         else if (__dev_get_by_index(net, dev->ifindex))
6099                 goto err_uninit;
6100
6101         if (dev->iflink == -1)
6102                 dev->iflink = dev->ifindex;
6103
6104         /* Transfer changeable features to wanted_features and enable
6105          * software offloads (GSO and GRO).
6106          */
6107         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6108         dev->features |= NETIF_F_SOFT_FEATURES;
6109         dev->wanted_features = dev->features & dev->hw_features;
6110
        /* NOCACHE_COPY becomes changeable, but is not enabled, for
         * every device except loopback.
         */
6111         if (!(dev->flags & IFF_LOOPBACK)) {
6112                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6113         }
6114
6115         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6116          */
6117         dev->vlan_features |= NETIF_F_HIGHDMA;
6118
6119         /* Make NETIF_F_SG inheritable to tunnel devices.
6120          */
6121         dev->hw_enc_features |= NETIF_F_SG;
6122
6123         /* Make NETIF_F_SG inheritable to MPLS.
6124          */
6125         dev->mpls_features |= NETIF_F_SG;
6126
6127         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6128         ret = notifier_to_errno(ret);
6129         if (ret)
6130                 goto err_uninit;
6131
6132         ret = netdev_register_kobject(dev);
6133         if (ret)
6134                 goto err_uninit;
6135         dev->reg_state = NETREG_REGISTERED;
6136
6137         __netdev_update_features(dev);
6138
6139         /*
6140          *      Default initial state at registry is that the
6141          *      device is present.
6142          */
6143
6144         set_bit(__LINK_STATE_PRESENT, &dev->state);
6145
6146         linkwatch_init_dev(dev);
6147
6148         dev_init_scheduler(dev);
6149         dev_hold(dev);
6150         list_netdevice(dev);
6151         add_device_randomness(dev->dev_addr, dev->addr_len);
6152
6153         /* If the device has permanent device address, driver should
6154          * set dev_addr and also addr_assign_type should be set to
6155          * NET_ADDR_PERM (default value).
6156          */
6157         if (dev->addr_assign_type == NET_ADDR_PERM)
6158                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6159
6160         /* Notify protocols, that a new device appeared. */
6161         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6162         ret = notifier_to_errno(ret);
6163         if (ret) {
6164                 rollback_registered(dev);
6165                 dev->reg_state = NETREG_UNREGISTERED;
6166         }
        /* NOTE(review): when a notifier vetoed the registration above,
         * control still falls through to the RTM_NEWLINK message below
         * for a device that has just been rolled back -- confirm whether
         * this is intended.
         */
6167         /*
6168          *      Prevent userspace races by waiting until the network
6169          *      device is fully setup before sending notifications.
6170          */
6171         if (!dev->rtnl_link_ops ||
6172             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6173                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6174
6175 out:
6176         return ret;
6177
        /* Undo a successful ndo_init() before reporting the error. */
6178 err_uninit:
6179         if (dev->netdev_ops->ndo_uninit)
6180                 dev->netdev_ops->ndo_uninit(dev);
6181         goto out;
6182 }
6183 EXPORT_SYMBOL(register_netdevice);
6184
6185 /**
6186  *      init_dummy_netdev       - init a dummy network device for NAPI
6187  *      @dev: device to init
6188  *
6189  *      This takes a network device structure and initialize the minimum
6190  *      amount of fields so it can be used to schedule NAPI polls without
6191  *      registering a full blown interface. This is to be used by drivers
6192  *      that need to tie several hardware interfaces to a single NAPI
6193  *      poll scheduler due to HW limitations.
6194  */
6195 int init_dummy_netdev(struct net_device *dev)
6196 {
6197         /* Clear everything. Note we don't initialize spinlocks
6198          * are they aren't supposed to be taken by any of the
6199          * NAPI code and this dummy netdev is supposed to be
6200          * only ever used for NAPI polls
6201          */
6202         memset(dev, 0, sizeof(struct net_device));
6203
6204         /* make sure we BUG if trying to hit standard
6205          * register/unregister code path
6206          */
6207         dev->reg_state = NETREG_DUMMY;
6208
6209         /* NAPI wants this */
6210         INIT_LIST_HEAD(&dev->napi_list);
6211
6212         /* a dummy interface is started by default */
6213         set_bit(__LINK_STATE_PRESENT, &dev->state);
6214         set_bit(__LINK_STATE_START, &dev->state);
6215
6216         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6217          * because users of this 'device' dont need to change
6218          * its refcount.
6219          */
6220
6221         return 0;
6222 }
6223 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6224
6225
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Locked wrapper around register_netdevice(): takes the rtnl
 *      semaphore, registers @dev and drops the semaphore again.
 *      The device name is expanded if a format string was passed to
 *      alloc_netdev.
 *
 *      Returns whatever register_netdevice() returned: 0 on success, a
 *      negative errno code on a failure to set up the device or if the
 *      name is a duplicate.
 */
int register_netdev(struct net_device *dev)
{
        int ret;

        rtnl_lock();
        ret = register_netdevice(dev);
        rtnl_unlock();

        return ret;
}
EXPORT_SYMBOL(register_netdev);
6249
6250 int netdev_refcnt_read(const struct net_device *dev)
6251 {
6252         int i, refcnt = 0;
6253
6254         for_each_possible_cpu(i)
6255                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6256         return refcnt;
6257 }
6258 EXPORT_SYMBOL(netdev_refcnt_read);
6259
6260 /**
6261  * netdev_wait_allrefs - wait until all references are gone.
6262  * @dev: target net_device
6263  *
6264  * This is called when unregistering network devices.
6265  *
6266  * Any protocol or device that holds a reference should register
6267  * for netdevice notification, and cleanup and put back the
6268  * reference if they receive an UNREGISTER event.
6269  * We can get stuck here if buggy protocols don't correctly
6270  * call dev_put.
 *
 * The function polls netdev_refcnt_read() and sleeps 250 ms between
 * polls.  While the count is non-zero, the UNREGISTER and
 * UNREGISTER_FINAL notifications are re-sent under the RTNL once more
 * than one second has passed since the previous rebroadcast, and a
 * pr_emerg() message is printed once more than ten seconds have passed
 * since the previous warning.  It takes and releases the RTNL itself.
6271  */
6272 static void netdev_wait_allrefs(struct net_device *dev)
6273 {
6274         unsigned long rebroadcast_time, warning_time;
6275         int refcnt;
6276
6277         linkwatch_forget_dev(dev);
6278
6279         rebroadcast_time = warning_time = jiffies;
6280         refcnt = netdev_refcnt_read(dev);
6281
6282         while (refcnt != 0) {
6283                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6284                         rtnl_lock();
6285
6286                         /* Rebroadcast unregister notification */
6287                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6288
                        /* Drop the RTNL around rcu_barrier(), then take
                         * it again for the FINAL notification.
                         */
6289                         __rtnl_unlock();
6290                         rcu_barrier();
6291                         rtnl_lock();
6292
6293                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6294                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6295                                      &dev->state)) {
6296                                 /* We must not have linkwatch events
6297                                  * pending on unregister. If this
6298                                  * happens, we simply run the queue
6299                                  * unscheduled, resulting in a noop
6300                                  * for this device.
6301                                  */
6302                                 linkwatch_run_queue();
6303                         }
6304
6305                         __rtnl_unlock();
6306
6307                         rebroadcast_time = jiffies;
6308                 }
6309
6310                 msleep(250);
6311
6312                 refcnt = netdev_refcnt_read(dev);
6313
                /* Complain at most once every ten seconds. */
6314                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6315                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6316                                  dev->name, refcnt);
6317                         warning_time = jiffies;
6318                 }
6319         }
6320 }
6321
6322 /* The sequence is:
6323  *
6324  *      rtnl_lock();
6325  *      ...
6326  *      register_netdevice(x1);
6327  *      register_netdevice(x2);
6328  *      ...
6329  *      unregister_netdevice(y1);
6330  *      unregister_netdevice(y2);
6331  *      ...
6332  *      rtnl_unlock();
6333  *      free_netdev(y1);
6334  *      free_netdev(y2);
6335  *
6336  * We are invoked by rtnl_unlock().
6337  * This allows us to deal with problems:
6338  * 1) We can delete sysfs objects which invoke hotplug
6339  *    without deadlocking with linkwatch via keventd.
6340  * 2) Since we run with the RTNL semaphore not held, we can sleep
6341  *    safely in order to wait for the netdev refcnt to drop to zero.
6342  *
6343  * We must not return until all unregister events added during
6344  * the interval the lock was held have been completed.
 *
 * For every device taken from net_todo_list this function sends
 * NETDEV_UNREGISTER_FINAL, moves the device from NETREG_UNREGISTERING
 * to NETREG_UNREGISTERED, flushes the per-CPU backlogs, waits for the
 * refcount to reach zero, runs dev->destructor (if set), decrements
 * the namespace's dev_unreg_count and finally drops the device kobject.
6345  */
6346 void netdev_run_todo(void)
6347 {
6348         struct list_head list;
6349
6350         /* Snapshot list, allow later requests */
6351         list_replace_init(&net_todo_list, &list);
6352
        /* The RTNL is released here and only re-taken for short
         * sections below.
         */
6353         __rtnl_unlock();
6354
6355
6356         /* Wait for rcu callbacks to finish before next phase */
6357         if (!list_empty(&list))
6358                 rcu_barrier();
6359
6360         while (!list_empty(&list)) {
6361                 struct net_device *dev
6362                         = list_first_entry(&list, struct net_device, todo_list);
6363                 list_del(&dev->todo_list);
6364
6365                 rtnl_lock();
6366                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6367                 __rtnl_unlock();
6368
                /* A device in any other state is reported and skipped;
                 * none of the teardown steps below run for it.
                 */
6369                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6370                         pr_err("network todo '%s' but state %d\n",
6371                                dev->name, dev->reg_state);
6372                         dump_stack();
6373                         continue;
6374                 }
6375
6376                 dev->reg_state = NETREG_UNREGISTERED;
6377
6378                 on_each_cpu(flush_backlog, dev, 1);
6379
6380                 netdev_wait_allrefs(dev);
6381
6382                 /* paranoia */
6383                 BUG_ON(netdev_refcnt_read(dev));
6384                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6385                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6386                 WARN_ON(dev->dn_ptr);
6387
6388                 if (dev->destructor)
6389                         dev->destructor(dev);
6390
6391                 /* Report a network device has been unregistered */
6392                 rtnl_lock();
6393                 dev_net(dev)->dev_unreg_count--;
6394                 __rtnl_unlock();
6395                 wake_up(&netdev_unregistering_wq);
6396
6397                 /* Free network device */
6398                 kobject_put(&dev->dev.kobj);
6399         }
6400 }
6401
6402 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6403  * fields in the same order, with only the type differing.
6404  */
6405 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6406                              const struct net_device_stats *netdev_stats)
6407 {
6408 #if BITS_PER_LONG == 64
6409         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6410         memcpy(stats64, netdev_stats, sizeof(*stats64));
6411 #else
6412         size_t i, n = sizeof(*stats64) / sizeof(u64);
6413         const unsigned long *src = (const unsigned long *)netdev_stats;
6414         u64 *dst = (u64 *)stats64;
6415
6416         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6417                      sizeof(*stats64) / sizeof(u64));
6418         for (i = 0; i < n; i++)
6419                 dst[i] = src[i];
6420 #endif
6421 }
6422 EXPORT_SYMBOL(netdev_stats_to_stats64);
6423
6424 /**
6425  *      dev_get_stats   - get network device statistics
6426  *      @dev: device to get statistics from
6427  *      @storage: place to store stats
6428  *
6429  *      Get network statistics from device. Return @storage.
6430  *      The device driver may provide its own method by setting
6431  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6432  *      otherwise the internal statistics structure is used.
6433  */
6434 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6435                                         struct rtnl_link_stats64 *storage)
6436 {
6437         const struct net_device_ops *ops = dev->netdev_ops;
6438
6439         if (ops->ndo_get_stats64) {
6440                 memset(storage, 0, sizeof(*storage));
6441                 ops->ndo_get_stats64(dev, storage);
6442         } else if (ops->ndo_get_stats) {
6443                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6444         } else {
6445                 netdev_stats_to_stats64(storage, &dev->stats);
6446         }
6447         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6448         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6449         return storage;
6450 }
6451 EXPORT_SYMBOL(dev_get_stats);
6452
/*
 * Return the ingress queue of @dev as reported by dev_ingress_queue().
 *
 * With CONFIG_NET_CLS_ACT a missing queue is created on the spot: it is
 * allocated zeroed, initialised via netdev_init_one_queue(), both of its
 * qdisc pointers are set to &noop_qdisc, and it is then published in
 * dev->ingress_queue with rcu_assign_pointer().  NULL is returned when
 * that allocation fails.  Without CONFIG_NET_CLS_ACT nothing is created
 * and the result of dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!ingress) {
                ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
                if (ingress) {
                        netdev_init_one_queue(dev, ingress, NULL);
                        ingress->qdisc = &noop_qdisc;
                        ingress->qdisc_sleeping = &noop_qdisc;
                        rcu_assign_pointer(dev->ingress_queue, ingress);
                }
        }
#endif
        return ingress;
}
6470
6471 static const struct ethtool_ops default_ethtool_ops;
6472
6473 void netdev_set_default_ethtool_ops(struct net_device *dev,
6474                                     const struct ethtool_ops *ops)
6475 {
6476         if (dev->ethtool_ops == &default_ethtool_ops)
6477                 dev->ethtool_ops = ops;
6478 }
6479 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6480
6481 void netdev_freemem(struct net_device *dev)
6482 {
6483         char *addr = (char *)dev - dev->padded;
6484
6485         kvfree(addr);
6486 }
6487
6488 /**
6489  *      alloc_netdev_mqs - allocate network device
6490  *      @sizeof_priv:           size of private data to allocate space for
6491  *      @name:                  device name format string
6492  *      @name_assign_type:      origin of device name
6493  *      @setup:                 callback to initialize device
6494  *      @txqs:                  the number of TX subqueues to allocate
6495  *      @rxqs:                  the number of RX subqueues to allocate
6496  *
6497  *      Allocates a struct net_device with private data area for driver use
6498  *      and performs basic initialization.  Also allocates subqueue structs
6499  *      for each queue on the device.
 *
 *      @name must be shorter than the dev->name buffer (BUG_ON()
 *      otherwise).  @setup is called before the queue counts are stored
 *      and before the name is copied into the device.
 *
 *      Returns the new device, or NULL when @txqs is zero, when @rxqs
 *      is zero with CONFIG_SYSFS, or when any allocation fails.
6500  */
6501 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6502                 unsigned char name_assign_type,
6503                 void (*setup)(struct net_device *),
6504                 unsigned int txqs, unsigned int rxqs)
6505 {
6506         struct net_device *dev;
6507         size_t alloc_size;
6508         struct net_device *p;
6509
6510         BUG_ON(strlen(name) >= sizeof(dev->name));
6511
6512         if (txqs < 1) {
6513                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6514                 return NULL;
6515         }
6516
6517 #ifdef CONFIG_SYSFS
6518         if (rxqs < 1) {
6519                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6520                 return NULL;
6521         }
6522 #endif
6523
6524         alloc_size = sizeof(struct net_device);
6525         if (sizeof_priv) {
6526                 /* ensure 32-byte alignment of private area */
6527                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6528                 alloc_size += sizeof_priv;
6529         }
6530         /* ensure 32-byte alignment of whole construct */
6531         alloc_size += NETDEV_ALIGN - 1;
6532
        /* Try kzalloc() quietly first, fall back to vzalloc(). */
6533         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6534         if (!p)
6535                 p = vzalloc(alloc_size);
6536         if (!p)
6537                 return NULL;
6538
        /* The device starts at the first aligned address inside the
         * allocation; padded records the offset needed to recover the
         * original pointer when the memory is freed.
         */
6539         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6540         dev->padded = (char *)dev - (char *)p;
6541
6542         dev->pcpu_refcnt = alloc_percpu(int);
6543         if (!dev->pcpu_refcnt)
6544                 goto free_dev;
6545
6546         if (dev_addr_init(dev))
6547                 goto free_pcpu;
6548
6549         dev_mc_init(dev);
6550         dev_uc_init(dev);
6551
6552         dev_net_set(dev, &init_net);
6553
6554         dev->gso_max_size = GSO_MAX_SIZE;
6555         dev->gso_max_segs = GSO_MAX_SEGS;
6556
6557         INIT_LIST_HEAD(&dev->napi_list);
6558         INIT_LIST_HEAD(&dev->unreg_list);
6559         INIT_LIST_HEAD(&dev->close_list);
6560         INIT_LIST_HEAD(&dev->link_watch_list);
6561         INIT_LIST_HEAD(&dev->adj_list.upper);
6562         INIT_LIST_HEAD(&dev->adj_list.lower);
6563         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6564         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6565         dev->priv_flags = IFF_XMIT_DST_RELEASE;
        /* Driver/type specific initialisation. */
6566         setup(dev);
6567
6568         dev->num_tx_queues = txqs;
6569         dev->real_num_tx_queues = txqs;
6570         if (netif_alloc_netdev_queues(dev))
6571                 goto free_all;
6572
6573 #ifdef CONFIG_SYSFS
6574         dev->num_rx_queues = rxqs;
6575         dev->real_num_rx_queues = rxqs;
6576         if (netif_alloc_rx_queues(dev))
6577                 goto free_all;
6578 #endif
6579
        /* Length was checked by the BUG_ON() at the top. */
6580         strcpy(dev->name, name);
6581         dev->name_assign_type = name_assign_type;
6582         dev->group = INIT_NETDEV_GROUP;
        /* Keep any ethtool_ops installed by setup(). */
6583         if (!dev->ethtool_ops)
6584                 dev->ethtool_ops = &default_ethtool_ops;
6585         return dev;
6586
        /* Past setup(): let free_netdev() undo everything. */
6587 free_all:
6588         free_netdev(dev);
6589         return NULL;
6590
        /* Early failures: only the per-cpu counter and/or the device
         * memory itself exist yet.
         */
6591 free_pcpu:
6592         free_percpu(dev->pcpu_refcnt);
6593 free_dev:
6594         netdev_freemem(dev);
6595         return NULL;
6596 }
6597 EXPORT_SYMBOL(alloc_netdev_mqs);
6598
6599 /**
6600  *      free_netdev - free network device
6601  *      @dev: device
6602  *
6603  *      This function does the last stage of destroying an allocated device
6604  *      interface. The reference to the device object is released.
6605  *      If this is the last reference then it will be freed.
 *
 *      The TX/RX queue arrays, the ingress queue, the device addresses,
 *      all NAPI instances on dev->napi_list and the per-cpu refcount are
 *      released first.  A device that is still %NETREG_UNINITIALIZED is
 *      then freed directly with netdev_freemem().  Any other device must
 *      be %NETREG_UNREGISTERED (BUG_ON() otherwise); it is marked
 *      %NETREG_RELEASED and its memory is released through put_device().
6606  */
6607 void free_netdev(struct net_device *dev)
6608 {
6609         struct napi_struct *p, *n;
6610
6611         release_net(dev_net(dev));
6612
6613         netif_free_tx_queues(dev);
6614 #ifdef CONFIG_SYSFS
6615         kfree(dev->_rx);
6616 #endif
6617
6618         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6619
6620         /* Flush device addresses */
6621         dev_addr_flush(dev);
6622
        /* _safe variant: the loop body is expected to unlink entries
         * -- TODO confirm against netif_napi_del().
         */
6623         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6624                 netif_napi_del(p);
6625
6626         free_percpu(dev->pcpu_refcnt);
6627         dev->pcpu_refcnt = NULL;
6628
6629         /*  Compatibility with error handling in drivers */
6630         if (dev->reg_state == NETREG_UNINITIALIZED) {
6631                 netdev_freemem(dev);
6632                 return;
6633         }
6634
6635         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6636         dev->reg_state = NETREG_RELEASED;
6637
6638         /* will free via device release */
6639         put_device(&dev->dev);
6640 }
6641 EXPORT_SYMBOL(free_netdev);
6642
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Waits until packets that are currently being received are done;
 *      packets that start later are not blocked.  May sleep.  The
 *      expedited RCU grace period is used when rtnl_is_locked() reports
 *      the RTNL as locked, the normal one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6658
6659 /**
6660  *      unregister_netdevice_queue - remove device from the kernel
6661  *      @dev: device
6662  *      @head: list
6663  *
6664  *      This function shuts down a device interface and removes it
6665  *      from the kernel tables.
6666  *      If head not NULL, device is queued to be unregistered later.
6667  *
6668  *      Callers must hold the rtnl semaphore.  You may want
6669  *      unregister_netdev() instead of this.
6670  */
6671
6672 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6673 {
6674         ASSERT_RTNL();
6675
6676         if (head) {
6677                 list_move_tail(&dev->unreg_list, head);
6678         } else {
6679                 rollback_registered(dev);
6680                 /* Finish processing unregister after unlock */
6681                 net_set_todo(dev);
6682         }
6683 }
6684 EXPORT_SYMBOL(unregister_netdevice_queue);
6685
6686 /**
6687  *      unregister_netdevice_many - unregister many devices
6688  *      @head: list of devices
6689  *
6690  *  Note: As most callers use a stack allocated list_head,
6691  *  we force a list_del() to make sure stack wont be corrupted later.
6692  */
6693 void unregister_netdevice_many(struct list_head *head)
6694 {
6695         struct net_device *dev;
6696
6697         if (!list_empty(head)) {
6698                 rollback_registered_many(head);
6699                 list_for_each_entry(dev, head, unreg_list)
6700                         net_set_todo(dev);
6701                 list_del(head);
6702         }
6703 }
6704 EXPORT_SYMBOL(unregister_netdevice_many);
6705
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and removes it from the kernel
 *      tables.
 *
 *      Locked wrapper: the rtnl semaphore is taken, unregister_netdevice()
 *      is called and the semaphore is released again.  In general this is
 *      the function to use, rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6724
6725 /**
6726  *      dev_change_net_namespace - move device to different network namespace
6727  *      @dev: device
6728  *      @net: network namespace
6729  *      @pat: If not NULL name pattern to try if the current device name
6730  *            is already taken in the destination network namespace.
6731  *
6732  *      This function shuts down a device interface and moves it
6733  *      to a new network namespace. On success 0 is returned, on
6734  *      a failure a negative errno code is returned.
 *
 *      -EINVAL is returned for devices flagged NETIF_F_NETNS_LOCAL and
 *      for devices that are not in state NETREG_REGISTERED.  -EEXIST is
 *      returned when the name is taken in @net and @pat is NULL or does
 *      not yield a usable name.  Moving a device to the namespace it is
 *      already in succeeds without doing anything.
6735  *
6736  *      Callers must hold the rtnl semaphore.
6737  */
6738
6739 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6740 {
6741         int err;
6742
6743         ASSERT_RTNL();
6744
6745         /* Don't allow namespace local devices to be moved. */
6746         err = -EINVAL;
6747         if (dev->features & NETIF_F_NETNS_LOCAL)
6748                 goto out;
6749
6750         /* Ensure the device has been registered */
6751         if (dev->reg_state != NETREG_REGISTERED)
6752                 goto out;
6753
6754         /* Get out if there is nothing to do */
6755         err = 0;
6756         if (net_eq(dev_net(dev), net))
6757                 goto out;
6758
6759         /* Pick the destination device name, and ensure
6760          * we can use it in the destination network namespace.
6761          */
6762         err = -EEXIST;
6763         if (__dev_get_by_name(net, dev->name)) {
6764                 /* We get here if we can't use the current device name */
6765                 if (!pat)
6766                         goto out;
6767                 if (dev_get_valid_name(net, dev, pat) < 0)
6768                         goto out;
6769         }
6770
6771         /*
6772          * And now a mini version of register_netdevice unregister_netdevice.
6773          */
6774
6775         /* If device is running close it first. */
6776         dev_close(dev);
6777
6778         /* And unlink it from device chain */
        /* NOTE(review): this -ENODEV is never returned; no exit path
         * exists between here and the next assignment to err.
         */
6779         err = -ENODEV;
6780         unlist_netdevice(dev);
6781
6782         synchronize_net();
6783
6784         /* Shutdown queueing discipline. */
6785         dev_shutdown(dev);
6786
6787         /* Notify protocols, that we are about to destroy
6788            this device. They should clean all the things.
6789
6790            Note that dev->reg_state stays at NETREG_REGISTERED.
6791            This is wanted because this way 8021q and macvlan know
6792            the device is just moving and can keep their slaves up.
6793         */
6794         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6795         rcu_barrier();
6796         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6797         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6798
6799         /*
6800          *      Flush the unicast and multicast chains
6801          */
6802         dev_uc_flush(dev);
6803         dev_mc_flush(dev);
6804
6805         /* Send a netdev-removed uevent to the old namespace */
6806         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6807
6808         /* Actually switch the network namespace */
6809         dev_net_set(dev, net);
6810
6811         /* If there is an ifindex conflict assign a new one */
6812         if (__dev_get_by_index(net, dev->ifindex)) {
                /* Remember whether iflink mirrored ifindex so that it
                 * can follow the new index.
                 */
6813                 int iflink = (dev->iflink == dev->ifindex);
6814                 dev->ifindex = dev_new_index(net);
6815                 if (iflink)
6816                         dev->iflink = dev->ifindex;
6817         }
6818
6819         /* Send a netdev-add uevent to the new namespace */
6820         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6821
6822         /* Fixup kobjects */
        /* A rename failure is only warned about; the move continues
         * and 0 is still returned.
         */
6823         err = device_rename(&dev->dev, dev->name);
6824         WARN_ON(err);
6825
6826         /* Add the device back in the hashes */
6827         list_netdevice(dev);
6828
6829         /* Notify protocols, that a new device appeared. */
6830         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6831
6832         /*
6833          *      Prevent userspace races by waiting until the network
6834          *      device is fully setup before sending notifications.
6835          */
6836         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6837
6838         synchronize_net();
6839         err = 0;
6840 out:
6841         return err;
6842 }
6843 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6844
/*
 * CPU notifier callback.  Only CPU_DEAD and CPU_DEAD_FROZEN are acted
 * upon; every other action returns NOTIFY_OK immediately.
 *
 * For a dead CPU (@ocpu carries its number) the pending work in that
 * CPU's softnet_data is moved to the CPU running this callback: the
 * completion queue, the output queue and the NAPI poll list are spliced
 * over with interrupts disabled, then the packets left in process_queue
 * and input_pkt_queue are re-injected through netif_rx_internal().
 * Always returns NOTIFY_OK.
 */
6845 static int dev_cpu_callback(struct notifier_block *nfb,
6846                             unsigned long action,
6847                             void *ocpu)
6848 {
6849         struct sk_buff **list_skb;
6850         struct sk_buff *skb;
6851         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6852         struct softnet_data *sd, *oldsd;
6853
6854         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6855                 return NOTIFY_OK;
6856
6857         local_irq_disable();
6858         cpu = smp_processor_id();
6859         sd = &per_cpu(softnet_data, cpu);
6860         oldsd = &per_cpu(softnet_data, oldcpu);
6861
6862         /* Find end of our completion_queue. */
6863         list_skb = &sd->completion_queue;
6864         while (*list_skb)
6865                 list_skb = &(*list_skb)->next;
6866         /* Append completion queue from offline CPU. */
6867         *list_skb = oldsd->completion_queue;
6868         oldsd->completion_queue = NULL;
6869
6870         /* Append output queue from offline CPU. */
6871         if (oldsd->output_queue) {
6872                 *sd->output_queue_tailp = oldsd->output_queue;
6873                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6874                 oldsd->output_queue = NULL;
                /* An empty queue's tail pointer refers to its own head. */
6875                 oldsd->output_queue_tailp = &oldsd->output_queue;
6876         }
6877         /* Append NAPI poll list from offline CPU. */
6878         if (!list_empty(&oldsd->poll_list)) {
6879                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6880                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6881         }
6882
        /* TX softirq is raised unconditionally. */
6883         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6884         local_irq_enable();
6885
6886         /* Process offline CPU's input_pkt_queue */
        /* process_queue is drained first, then input_pkt_queue. */
6887         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6888                 netif_rx_internal(skb);
6889                 input_queue_head_incr(oldsd);
6890         }
6891         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6892                 netif_rx_internal(skb);
6893                 input_queue_head_incr(oldsd);
6894         }
6895
6896         return NOTIFY_OK;
6897 }
6898
6899
6900 /**
6901  *      netdev_increment_features - increment feature set by one
6902  *      @all: current feature set
6903  *      @one: new feature set
6904  *      @mask: mask feature set
6905  *
6906  *      Computes a new feature set after adding a device with feature set
6907  *      @one to the master device with current feature set @all.  Will not
6908  *      enable anything that is off in @mask. Returns the new feature set.
6909  */
6910 netdev_features_t netdev_increment_features(netdev_features_t all,
6911         netdev_features_t one, netdev_features_t mask)
6912 {
6913         if (mask & NETIF_F_GEN_CSUM)
6914                 mask |= NETIF_F_ALL_CSUM;
6915         mask |= NETIF_F_VLAN_CHALLENGED;
6916
6917         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6918         all &= one | ~NETIF_F_ALL_FOR_ALL;
6919
6920         /* If one device supports hw checksumming, set for all. */
6921         if (all & NETIF_F_GEN_CSUM)
6922                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6923
6924         return all;
6925 }
6926 EXPORT_SYMBOL(netdev_increment_features);
6927
6928 static struct hlist_head * __net_init netdev_create_hash(void)
6929 {
6930         int i;
6931         struct hlist_head *hash;
6932
6933         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6934         if (hash != NULL)
6935                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6936                         INIT_HLIST_HEAD(&hash[i]);
6937
6938         return hash;
6939 }
6940
6941 /* Initialize per network namespace state */
6942 static int __net_init netdev_init(struct net *net)
6943 {
6944         if (net != &init_net)
6945                 INIT_LIST_HEAD(&net->dev_base_head);
6946
6947         net->dev_name_head = netdev_create_hash();
6948         if (net->dev_name_head == NULL)
6949                 goto err_name;
6950
6951         net->dev_index_head = netdev_create_hash();
6952         if (net->dev_index_head == NULL)
6953                 goto err_idx;
6954
6955         return 0;
6956
6957 err_idx:
6958         kfree(net->dev_name_head);
6959 err_name:
6960         return -ENOMEM;
6961 }
6962
6963 /**
6964  *      netdev_drivername - network driver for the device
6965  *      @dev: network device
6966  *
6967  *      Determine network driver for device.
6968  */
6969 const char *netdev_drivername(const struct net_device *dev)
6970 {
6971         const struct device_driver *driver;
6972         const struct device *parent;
6973         const char *empty = "";
6974
6975         parent = dev->dev.parent;
6976         if (!parent)
6977                 return empty;
6978
6979         driver = parent->driver;
6980         if (driver && driver->name)
6981                 return driver->name;
6982         return empty;
6983 }
6984
6985 static int __netdev_printk(const char *level, const struct net_device *dev,
6986                            struct va_format *vaf)
6987 {
6988         int r;
6989
6990         if (dev && dev->dev.parent) {
6991                 r = dev_printk_emit(level[1] - '0',
6992                                     dev->dev.parent,
6993                                     "%s %s %s%s: %pV",
6994                                     dev_driver_string(dev->dev.parent),
6995                                     dev_name(dev->dev.parent),
6996                                     netdev_name(dev), netdev_reg_state(dev),
6997                                     vaf);
6998         } else if (dev) {
6999                 r = printk("%s%s%s: %pV", level, netdev_name(dev),
7000                            netdev_reg_state(dev), vaf);
7001         } else {
7002                 r = printk("%s(NULL net_device): %pV", level, vaf);
7003         }
7004
7005         return r;
7006 }
7007
7008 int netdev_printk(const char *level, const struct net_device *dev,
7009                   const char *format, ...)
7010 {
7011         struct va_format vaf;
7012         va_list args;
7013         int r;
7014
7015         va_start(args, format);
7016
7017         vaf.fmt = format;
7018         vaf.va = &args;
7019
7020         r = __netdev_printk(level, dev, &vaf);
7021
7022         va_end(args);
7023
7024         return r;
7025 }
7026 EXPORT_SYMBOL(netdev_printk);
7027
7028 #define define_netdev_printk_level(func, level)                 \
7029 int func(const struct net_device *dev, const char *fmt, ...)    \
7030 {                                                               \
7031         int r;                                                  \
7032         struct va_format vaf;                                   \
7033         va_list args;                                           \
7034                                                                 \
7035         va_start(args, fmt);                                    \
7036                                                                 \
7037         vaf.fmt = fmt;                                          \
7038         vaf.va = &args;                                         \
7039                                                                 \
7040         r = __netdev_printk(level, dev, &vaf);                  \
7041                                                                 \
7042         va_end(args);                                           \
7043                                                                 \
7044         return r;                                               \
7045 }                                                               \
7046 EXPORT_SYMBOL(func);
7047
7048 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7049 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7050 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7051 define_netdev_printk_level(netdev_err, KERN_ERR);
7052 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7053 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7054 define_netdev_printk_level(netdev_info, KERN_INFO);
7055
7056 static void __net_exit netdev_exit(struct net *net)
7057 {
7058         kfree(net->dev_name_head);
7059         kfree(net->dev_index_head);
7060 }
7061
7062 static struct pernet_operations __net_initdata netdev_net_ops = {
7063         .init = netdev_init,
7064         .exit = netdev_exit,
7065 };
7066
7067 static void __net_exit default_device_exit(struct net *net)
7068 {
7069         struct net_device *dev, *aux;
7070         /*
7071          * Push all migratable network devices back to the
7072          * initial network namespace
7073          */
7074         rtnl_lock();
7075         for_each_netdev_safe(net, dev, aux) {
7076                 int err;
7077                 char fb_name[IFNAMSIZ];
7078
7079                 /* Ignore unmoveable devices (i.e. loopback) */
7080                 if (dev->features & NETIF_F_NETNS_LOCAL)
7081                         continue;
7082
7083                 /* Leave virtual devices for the generic cleanup */
7084                 if (dev->rtnl_link_ops)
7085                         continue;
7086
7087                 /* Push remaining network devices to init_net */
7088                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7089                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7090                 if (err) {
7091                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7092                                  __func__, dev->name, err);
7093                         BUG();
7094                 }
7095         }
7096         rtnl_unlock();
7097 }
7098
7099 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7100 {
7101         /* Return with the rtnl_lock held when there are no network
7102          * devices unregistering in any network namespace in net_list.
7103          */
7104         struct net *net;
7105         bool unregistering;
7106         DEFINE_WAIT(wait);
7107
7108         for (;;) {
7109                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7110                                 TASK_UNINTERRUPTIBLE);
7111                 unregistering = false;
7112                 rtnl_lock();
7113                 list_for_each_entry(net, net_list, exit_list) {
7114                         if (net->dev_unreg_count > 0) {
7115                                 unregistering = true;
7116                                 break;
7117                         }
7118                 }
7119                 if (!unregistering)
7120                         break;
7121                 __rtnl_unlock();
7122                 schedule();
7123         }
7124         finish_wait(&netdev_unregistering_wq, &wait);
7125 }
7126
7127 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7128 {
7129         /* At exit all network devices most be removed from a network
7130          * namespace.  Do this in the reverse order of registration.
7131          * Do this across as many network namespaces as possible to
7132          * improve batching efficiency.
7133          */
7134         struct net_device *dev;
7135         struct net *net;
7136         LIST_HEAD(dev_kill_list);
7137
7138         /* To prevent network device cleanup code from dereferencing
7139          * loopback devices or network devices that have been freed
7140          * wait here for all pending unregistrations to complete,
7141          * before unregistring the loopback device and allowing the
7142          * network namespace be freed.
7143          *
7144          * The netdev todo list containing all network devices
7145          * unregistrations that happen in default_device_exit_batch
7146          * will run in the rtnl_unlock() at the end of
7147          * default_device_exit_batch.
7148          */
7149         rtnl_lock_unregistering(net_list);
7150         list_for_each_entry(net, net_list, exit_list) {
7151                 for_each_netdev_reverse(net, dev) {
7152                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7153                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7154                         else
7155                                 unregister_netdevice_queue(dev, &dev_kill_list);
7156                 }
7157         }
7158         unregister_netdevice_many(&dev_kill_list);
7159         rtnl_unlock();
7160 }
7161
7162 static struct pernet_operations __net_initdata default_device_ops = {
7163         .exit = default_device_exit,
7164         .exit_batch = default_device_exit_batch,
7165 };
7166
7167 /*
7168  *      Initialize the DEV module. At boot time this walks the device list and
7169  *      unhooks any devices that fail to initialise (normally hardware not
7170  *      present) and leaves us with a valid list of present and active devices.
7171  *
7172  */
7173
7174 /*
7175  *       This is called single threaded during boot, so no need
7176  *       to take the rtnl semaphore.
7177  */
7178 static int __init net_dev_init(void)
7179 {
7180         int i, rc = -ENOMEM;
7181
7182         BUG_ON(!dev_boot_phase);
7183
7184         if (dev_proc_init())
7185                 goto out;
7186
7187         if (netdev_kobject_init())
7188                 goto out;
7189
7190         INIT_LIST_HEAD(&ptype_all);
7191         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7192                 INIT_LIST_HEAD(&ptype_base[i]);
7193
7194         INIT_LIST_HEAD(&offload_base);
7195
7196         if (register_pernet_subsys(&netdev_net_ops))
7197                 goto out;
7198
7199         /*
7200          *      Initialise the packet receive queues.
7201          */
7202
7203         for_each_possible_cpu(i) {
7204                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7205
7206                 skb_queue_head_init(&sd->input_pkt_queue);
7207                 skb_queue_head_init(&sd->process_queue);
7208                 INIT_LIST_HEAD(&sd->poll_list);
7209                 sd->output_queue_tailp = &sd->output_queue;
7210 #ifdef CONFIG_RPS
7211                 sd->csd.func = rps_trigger_softirq;
7212                 sd->csd.info = sd;
7213                 sd->cpu = i;
7214 #endif
7215
7216                 sd->backlog.poll = process_backlog;
7217                 sd->backlog.weight = weight_p;
7218         }
7219
7220         dev_boot_phase = 0;
7221
7222         /* The loopback device is special if any other network devices
7223          * is present in a network namespace the loopback device must
7224          * be present. Since we now dynamically allocate and free the
7225          * loopback device ensure this invariant is maintained by
7226          * keeping the loopback device as the first device on the
7227          * list of network devices.  Ensuring the loopback devices
7228          * is the first device that appears and the last network device
7229          * that disappears.
7230          */
7231         if (register_pernet_device(&loopback_net_ops))
7232                 goto out;
7233
7234         if (register_pernet_device(&default_device_ops))
7235                 goto out;
7236
7237         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7238         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7239
7240         hotcpu_notifier(dev_cpu_callback, 0);
7241         dst_init();
7242         rc = 0;
7243 out:
7244         return rc;
7245 }
7246
7247 subsys_initcall(net_dev_init);