net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 static DEFINE_SPINLOCK(ptype_lock);
 145 static DEFINE_SPINLOCK(offload_lock);
 146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147 struct list_head ptype_all __read_mostly;       /* Taps */
 148 static struct list_head offload_base __read_mostly;
 149
 150 static int netif_rx_internal(struct sk_buff *skb);
 151 static int call_netdevice_notifiers_info(unsigned long val,
 152                                          struct net_device *dev,
 153                                          struct netdev_notifier_info *info);
 154
 155 /*
 156  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 157  * semaphore.
 158  *
 159  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 160  *
 161  * Writers must hold the rtnl semaphore while they loop through the
 162  * dev_base_head list, and hold dev_base_lock for writing when they do the
 163  * actual updates.  This allows pure readers to access the list even
 164  * while a writer is preparing to update it.
 165  *
 166  * To put it another way, dev_base_lock is held for writing only to
 167  * protect against pure readers; the rtnl semaphore provides the
 168  * protection against other writers.
 169  *
 170  * See, for example usages, register_netdevice() and
 171  * unregister_netdevice(), which must be called with the rtnl
 172  * semaphore held.
 173  */
 174 DEFINE_RWLOCK(dev_base_lock);
 175 EXPORT_SYMBOL(dev_base_lock);
 176
 177 /* protects napi_hash addition/deletion and napi_gen_id */
 178 static DEFINE_SPINLOCK(napi_hash_lock);
 179
 180 static unsigned int napi_gen_id;
 181 static DEFINE_HASHTABLE(napi_hash, 8);
 182
 183 static seqcount_t devnet_rename_seq;
 184
 185 static inline void dev_base_seq_inc(struct net *net)
 186 {
 187         while (++net->dev_base_seq == 0);
 188 }
 189
 190 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 191 {
 192         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 193
 194         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 195 }
 196
 197 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 198 {
 199         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 200 }
 201
 202 static inline void rps_lock(struct softnet_data *sd)
 203 {
 204 #ifdef CONFIG_RPS
 205         spin_lock(&sd->input_pkt_queue.lock);
 206 #endif
 207 }
 208
 209 static inline void rps_unlock(struct softnet_data *sd)
 210 {
 211 #ifdef CONFIG_RPS
 212         spin_unlock(&sd->input_pkt_queue.lock);
 213 #endif
 214 }
 215
 216 /* Device list insertion */
 217 static void list_netdevice(struct net_device *dev)
 218 {
 219         struct net *net = dev_net(dev);
 220
 221         ASSERT_RTNL();
 222
 223         write_lock_bh(&dev_base_lock);
 224         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 225         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 226         hlist_add_head_rcu(&dev->index_hlist,
 227                            dev_index_hash(net, dev->ifindex));
 228         write_unlock_bh(&dev_base_lock);
 229
 230         dev_base_seq_inc(net);
 231 }
 232
 233 /* Device list removal
 234  * caller must respect a RCU grace period before freeing/reusing dev
 235  */
 236 static void unlist_netdevice(struct net_device *dev)
 237 {
 238         ASSERT_RTNL();
 239
 240         /* Unlink dev from the device chain */
 241         write_lock_bh(&dev_base_lock);
 242         list_del_rcu(&dev->dev_list);
 243         hlist_del_rcu(&dev->name_hlist);
 244         hlist_del_rcu(&dev->index_hlist);
 245         write_unlock_bh(&dev_base_lock);
 246
 247         dev_base_seq_inc(dev_net(dev));
 248 }
 249
 250 /*
 251  *      Our notifier list
 252  */
 253
 254 static RAW_NOTIFIER_HEAD(netdev_chain);
 255
 256 /*
 257  *      Device drivers call our routines to queue packets here. We empty the
 258  *      queue in the local softnet handler.
 259  */
 260
 261 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 262 EXPORT_PER_CPU_SYMBOL(softnet_data);
 263
 264 #ifdef CONFIG_LOCKDEP
 265 /*
 266  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 267  * according to dev->type
 268  */
 269 static const unsigned short netdev_lock_type[] =
 270         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 271          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 272          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 273          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 274          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 275          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 276          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 277          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 278          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 279          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 280          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 281          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 282          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 283          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 284          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 285
 286 static const char *const netdev_lock_name[] =
 287         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 288          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 289          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 290          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 291          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 292          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 293          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 294          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 295          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 296          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 297          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 298          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 299          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 300          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 301          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 302
 303 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 304 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 305
 306 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 307 {
 308         int i;
 309
 310         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 311                 if (netdev_lock_type[i] == dev_type)
 312                         return i;
 313         /* the last key is used by default */
 314         return ARRAY_SIZE(netdev_lock_type) - 1;
 315 }
 316
 317 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 318                                                  unsigned short dev_type)
 319 {
 320         int i;
 321
 322         i = netdev_lock_pos(dev_type);
 323         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326
 327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev->type);
 332         lockdep_set_class_and_name(&dev->addr_list_lock,
 333                                    &netdev_addr_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336 #else
 337 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338                                                  unsigned short dev_type)
 339 {
 340 }
 341 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 342 {
 343 }
 344 #endif
 345
 346 /*******************************************************************************
 347
 348                 Protocol management and registration routines
 349
 350 *******************************************************************************/
 351
 352 /*
 353  *      Add a protocol ID to the list. Now that the input handler is
 354  *      smarter we can dispense with all the messy stuff that used to be
 355  *      here.
 356  *
 357  *      BEWARE!!! Protocol handlers, mangling input packets,
 358  *      MUST BE last in hash buckets and checking protocol handlers
 359  *      MUST start from promiscuous ptype_all chain in net_bh.
 360  *      It is true now, do not change it.
 361  *      Explanation follows: if protocol handler, mangling packet, will
 362  *      be the first on list, it is not able to sense, that packet
 363  *      is cloned and should be copied-on-write, so that it will
 364  *      change it and subsequent readers will get broken packet.
 365  *                                                      --ANK (980803)
 366  */
 367
 368 static inline struct list_head *ptype_head(const struct packet_type *pt)
 369 {
 370         if (pt->type == htons(ETH_P_ALL))
 371                 return &ptype_all;
 372         else
 373                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 374 }
 375
 376 /**
 377  *      dev_add_pack - add packet handler
 378  *      @pt: packet type declaration
 379  *
 380  *      Add a protocol handler to the networking stack. The passed &packet_type
 381  *      is linked into kernel lists and may not be freed until it has been
 382  *      removed from the kernel lists.
 383  *
 384  *      This call does not sleep therefore it can not
 385  *      guarantee all CPU's that are in middle of receiving packets
 386  *      will see the new packet type (until the next received packet).
 387  */
 388
 389 void dev_add_pack(struct packet_type *pt)
 390 {
 391         struct list_head *head = ptype_head(pt);
 392
 393         spin_lock(&ptype_lock);
 394         list_add_rcu(&pt->list, head);
 395         spin_unlock(&ptype_lock);
 396 }
 397 EXPORT_SYMBOL(dev_add_pack);
 398
 399 /**
 400  *      __dev_remove_pack        - remove packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Remove a protocol handler that was previously added to the kernel
 404  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 405  *      from the kernel lists and can be freed or reused once this function
 406  *      returns.
 407  *
 408  *      The packet type might still be in use by receivers
 409  *      and must not be freed until after all the CPU's have gone
 410  *      through a quiescent state.
 411  */
 412 void __dev_remove_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415         struct packet_type *pt1;
 416
 417         spin_lock(&ptype_lock);
 418
 419         list_for_each_entry(pt1, head, list) {
 420                 if (pt == pt1) {
 421                         list_del_rcu(&pt->list);
 422                         goto out;
 423                 }
 424         }
 425
 426         pr_warn("dev_remove_pack: %p not found\n", pt);
 427 out:
 428         spin_unlock(&ptype_lock);
 429 }
 430 EXPORT_SYMBOL(__dev_remove_pack);
 431
 432 /**
 433  *      dev_remove_pack  - remove packet handler
 434  *      @pt: packet type declaration
 435  *
 436  *      Remove a protocol handler that was previously added to the kernel
 437  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 438  *      from the kernel lists and can be freed or reused once this function
 439  *      returns.
 440  *
 441  *      This call sleeps to guarantee that no CPU is looking at the packet
 442  *      type after return.
 443  */
 444 void dev_remove_pack(struct packet_type *pt)
 445 {
 446         __dev_remove_pack(pt);
 447
 448         synchronize_net();
 449 }
 450 EXPORT_SYMBOL(dev_remove_pack);
 451
 452
 453 /**
 454  *      dev_add_offload - register offload handlers
 455  *      @po: protocol offload declaration
 456  *
 457  *      Add protocol offload handlers to the networking stack. The passed
 458  *      &proto_offload is linked into kernel lists and may not be freed until
 459  *      it has been removed from the kernel lists.
 460  *
 461  *      This call does not sleep therefore it can not
 462  *      guarantee all CPU's that are in middle of receiving packets
 463  *      will see the new offload handlers (until the next received packet).
 464  */
 465 void dev_add_offload(struct packet_offload *po)
 466 {
 467         struct list_head *head = &offload_base;
 468
 469         spin_lock(&offload_lock);
 470         list_add_rcu(&po->list, head);
 471         spin_unlock(&offload_lock);
 472 }
 473 EXPORT_SYMBOL(dev_add_offload);
 474
 475 /**
 476  *      __dev_remove_offload     - remove offload handler
 477  *      @po: packet offload declaration
 478  *
 479  *      Remove a protocol offload handler that was previously added to the
 480  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 481  *      is removed from the kernel lists and can be freed or reused once this
 482  *      function returns.
 483  *
 484  *      The packet type might still be in use by receivers
 485  *      and must not be freed until after all the CPU's have gone
 486  *      through a quiescent state.
 487  */
 488 static void __dev_remove_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491         struct packet_offload *po1;
 492
 493         spin_lock(&offload_lock);
 494
 495         list_for_each_entry(po1, head, list) {
 496                 if (po == po1) {
 497                         list_del_rcu(&po->list);
 498                         goto out;
 499                 }
 500         }
 501
 502         pr_warn("dev_remove_offload: %p not found\n", po);
 503 out:
 504         spin_unlock(&offload_lock);
 505 }
 506
 507 /**
 508  *      dev_remove_offload       - remove packet offload handler
 509  *      @po: packet offload declaration
 510  *
 511  *      Remove a packet offload handler that was previously added to the kernel
 512  *      offload handlers by dev_add_offload(). The passed &offload_type is
 513  *      removed from the kernel lists and can be freed or reused once this
 514  *      function returns.
 515  *
 516  *      This call sleeps to guarantee that no CPU is looking at the packet
 517  *      type after return.
 518  */
 519 void dev_remove_offload(struct packet_offload *po)
 520 {
 521         __dev_remove_offload(po);
 522
 523         synchronize_net();
 524 }
 525 EXPORT_SYMBOL(dev_remove_offload);
 526
 527 /******************************************************************************
 528
 529                       Device Boot-time Settings Routines
 530
 531 *******************************************************************************/
 532
 533 /* Boot time configuration table */
 534 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 535
 536 /**
 537  *      netdev_boot_setup_add   - add new setup entry
 538  *      @name: name of the device
 539  *      @map: configured settings for the device
 540  *
 541  *      Adds new setup entry to the dev_boot_setup list.  The function
 542  *      returns 0 on error and 1 on success.  This is a generic routine to
 543  *      all netdevices.
 544  */
 545 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 546 {
 547         struct netdev_boot_setup *s;
 548         int i;
 549
 550         s = dev_boot_setup;
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 552                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 553                         memset(s[i].name, 0, sizeof(s[i].name));
 554                         strlcpy(s[i].name, name, IFNAMSIZ);
 555                         memcpy(&s[i].map, map, sizeof(s[i].map));
 556                         break;
 557                 }
 558         }
 559
 560         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 561 }
 562
 563 /**
 564  *      netdev_boot_setup_check - check boot time settings
 565  *      @dev: the netdevice
 566  *
 567  *      Check boot time settings for the device.
 568  *      The found settings are set for the device to be used
 569  *      later in the device probing.
 570  *      Returns 0 if no settings found, 1 if they are.
 571  */
 572 int netdev_boot_setup_check(struct net_device *dev)
 573 {
 574         struct netdev_boot_setup *s = dev_boot_setup;
 575         int i;
 576
 577         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 578                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 579                     !strcmp(dev->name, s[i].name)) {
 580                         dev->irq        = s[i].map.irq;
 581                         dev->base_addr  = s[i].map.base_addr;
 582                         dev->mem_start  = s[i].map.mem_start;
 583                         dev->mem_end    = s[i].map.mem_end;
 584                         return 1;
 585                 }
 586         }
 587         return 0;
 588 }
 589 EXPORT_SYMBOL(netdev_boot_setup_check);
 590
 591
 592 /**
 593  *      netdev_boot_base        - get address from boot time settings
 594  *      @prefix: prefix for network device
 595  *      @unit: id for network device
 596  *
 597  *      Check boot time settings for the base address of device.
 598  *      The found settings are set for the device to be used
 599  *      later in the device probing.
 600  *      Returns 0 if no settings found.
 601  */
 602 unsigned long netdev_boot_base(const char *prefix, int unit)
 603 {
 604         const struct netdev_boot_setup *s = dev_boot_setup;
 605         char name[IFNAMSIZ];
 606         int i;
 607
 608         sprintf(name, "%s%d", prefix, unit);
 609
 610         /*
 611          * If device already registered then return base of 1
 612          * to indicate not to probe for this interface
 613          */
 614         if (__dev_get_by_name(&init_net, name))
 615                 return 1;
 616
 617         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 618                 if (!strcmp(name, s[i].name))
 619                         return s[i].map.base_addr;
 620         return 0;
 621 }
 622
 623 /*
 624  * Saves at boot time configured settings for any netdevice.
 625  */
 626 int __init netdev_boot_setup(char *str)
 627 {
 628         int ints[5];
 629         struct ifmap map;
 630
 631         str = get_options(str, ARRAY_SIZE(ints), ints);
 632         if (!str || !*str)
 633                 return 0;
 634
 635         /* Save settings */
 636         memset(&map, 0, sizeof(map));
 637         if (ints[0] > 0)
 638                 map.irq = ints[1];
 639         if (ints[0] > 1)
 640                 map.base_addr = ints[2];
 641         if (ints[0] > 2)
 642                 map.mem_start = ints[3];
 643         if (ints[0] > 3)
 644                 map.mem_end = ints[4];
 645
 646         /* Add new entry to the list */
 647         return netdev_boot_setup_add(str, &map);
 648 }
 649
 650 __setup("netdev=", netdev_boot_setup);
 651
 652 /*******************************************************************************
 653
 654                             Device Interface Subroutines
 655
 656 *******************************************************************************/
 657
 658 /**
 659  *      __dev_get_by_name       - find a device by its name
 660  *      @net: the applicable net namespace
 661  *      @name: name to find
 662  *
 663  *      Find an interface by name. Must be called under RTNL semaphore
 664  *      or @dev_base_lock. If the name is found a pointer to the device
 665  *      is returned. If the name is not found then %NULL is returned. The
 666  *      reference counters are not incremented so the caller must be
 667  *      careful with locks.
 668  */
 669
 670 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 671 {
 672         struct net_device *dev;
 673         struct hlist_head *head = dev_name_hash(net, name);
 674
 675         hlist_for_each_entry(dev, head, name_hlist)
 676                 if (!strncmp(dev->name, name, IFNAMSIZ))
 677                         return dev;
 678
 679         return NULL;
 680 }
 681 EXPORT_SYMBOL(__dev_get_by_name);
 682
 683 /**
 684  *      dev_get_by_name_rcu     - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name.
 689  *      If the name is found a pointer to the device is returned.
 690  *      If the name is not found then %NULL is returned.
 691  *      The reference counters are not incremented so the caller must be
 692  *      careful with locks. The caller must hold RCU lock.
 693  */
 694
 695 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 696 {
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry_rcu(dev, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(dev_get_by_name_rcu);
 707
 708 /**
 709  *      dev_get_by_name         - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name. This can be called from any
 714  *      context and does its own locking. The returned handle has
 715  *      the usage count incremented and the caller must use dev_put() to
 716  *      release it when it is no longer needed. %NULL is returned if no
 717  *      matching device is found.
 718  */
 719
 720 struct net_device *dev_get_by_name(struct net *net, const char *name)
 721 {
 722         struct net_device *dev;
 723
 724         rcu_read_lock();
 725         dev = dev_get_by_name_rcu(net, name);
 726         if (dev)
 727                 dev_hold(dev);
 728         rcu_read_unlock();
 729         return dev;
 730 }
 731 EXPORT_SYMBOL(dev_get_by_name);
 732
 733 /**
 734  *      __dev_get_by_index - find a device by its ifindex
 735  *      @net: the applicable net namespace
 736  *      @ifindex: index of device
 737  *
 738  *      Search for an interface by index. Returns %NULL if the device
 739  *      is not found or a pointer to the device. The device has not
 740  *      had its reference counter increased so the caller must be careful
 741  *      about locking. The caller must hold either the RTNL semaphore
 742  *      or @dev_base_lock.
 743  */
 744
 745 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 746 {
 747         struct net_device *dev;
 748         struct hlist_head *head = dev_index_hash(net, ifindex);
 749
 750         hlist_for_each_entry(dev, head, index_hlist)
 751                 if (dev->ifindex == ifindex)
 752                         return dev;
 753
 754         return NULL;
 755 }
 756 EXPORT_SYMBOL(__dev_get_by_index);
 757
 758 /**
 759  *      dev_get_by_index_rcu - find a device by its ifindex
 760  *      @net: the applicable net namespace
 761  *      @ifindex: index of device
 762  *
 763  *      Search for an interface by index. Returns %NULL if the device
 764  *      is not found or a pointer to the device. The device has not
 765  *      had its reference counter increased so the caller must be careful
 766  *      about locking. The caller must hold RCU lock.
 767  */
 768
 769 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 770 {
 771         struct net_device *dev;
 772         struct hlist_head *head = dev_index_hash(net, ifindex);
 773
 774         hlist_for_each_entry_rcu(dev, head, index_hlist)
 775                 if (dev->ifindex == ifindex)
 776                         return dev;
 777
 778         return NULL;
 779 }
 780 EXPORT_SYMBOL(dev_get_by_index_rcu);
 781
 782
 783 /**
 784  *      dev_get_by_index - find a device by its ifindex
 785  *      @net: the applicable net namespace
 786  *      @ifindex: index of device
 787  *
 788  *      Search for an interface by index. Returns NULL if the device
 789  *      is not found or a pointer to the device. The device returned has
 790  *      had a reference added and the pointer is safe until the user calls
 791  *      dev_put to indicate they have finished with it.
 792  */
 793
 794 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 795 {
 796         struct net_device *dev;
 797
 798         rcu_read_lock();
 799         dev = dev_get_by_index_rcu(net, ifindex);
 800         if (dev)
 801                 dev_hold(dev);
 802         rcu_read_unlock();
 803         return dev;
 804 }
 805 EXPORT_SYMBOL(dev_get_by_index);
 806
 807 /**
 808  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 809  *      @net: network namespace
 810  *      @name: a pointer to the buffer where the name will be stored.
 811  *      @ifindex: the ifindex of the interface to get the name from.
 812  *
 813  *      The use of raw_seqcount_begin() and cond_resched() before
 814  *      retrying is required as we want to give the writers a chance
 815  *      to complete when CONFIG_PREEMPT is not set.
 816  */
 817 int netdev_get_name(struct net *net, char *name, int ifindex)
 818 {
 819         struct net_device *dev;
 820         unsigned int seq;
 821
 822 retry:
 823         seq = raw_seqcount_begin(&devnet_rename_seq);
 824         rcu_read_lock();
 825         dev = dev_get_by_index_rcu(net, ifindex);
 826         if (!dev) {
 827                 rcu_read_unlock();
 828                 return -ENODEV;
 829         }
 830
 831         strcpy(name, dev->name);
 832         rcu_read_unlock();
 833         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 834                 cond_resched();
 835                 goto retry;
 836         }
 837
 838         return 0;
 839 }
 840
 841 /**
 842  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 843  *      @net: the applicable net namespace
 844  *      @type: media type of device
 845  *      @ha: hardware address
 846  *
 847  *      Search for an interface by MAC address. Returns NULL if the device
 848  *      is not found or a pointer to the device.
 849  *      The caller must hold RCU or RTNL.
 850  *      The returned device has not had its ref count increased
 851  *      and the caller must therefore be careful about locking
 852  *
 853  */
 854
 855 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 856                                        const char *ha)
 857 {
 858         struct net_device *dev;
 859
 860         for_each_netdev_rcu(net, dev)
 861                 if (dev->type == type &&
 862                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 863                         return dev;
 864
 865         return NULL;
 866 }
 867 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 868
 869 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 870 {
 871         struct net_device *dev;
 872
 873         ASSERT_RTNL();
 874         for_each_netdev(net, dev)
 875                 if (dev->type == type)
 876                         return dev;
 877
 878         return NULL;
 879 }
 880 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 881
 882 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 883 {
 884         struct net_device *dev, *ret = NULL;
 885
 886         rcu_read_lock();
 887         for_each_netdev_rcu(net, dev)
 888                 if (dev->type == type) {
 889                         dev_hold(dev);
 890                         ret = dev;
 891                         break;
 892                 }
 893         rcu_read_unlock();
 894         return ret;
 895 }
 896 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 897
 898 /**
 899  *      dev_get_by_flags_rcu - find any device with given flags
 900  *      @net: the applicable net namespace
 901  *      @if_flags: IFF_* values
 902  *      @mask: bitmask of bits in if_flags to check
 903  *
 904  *      Search for any interface with the given flags. Returns NULL if a device
 905  *      is not found or a pointer to the device. Must be called inside
 906  *      rcu_read_lock(), and result refcount is unchanged.
 907  */
 908
 909 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 910                                     unsigned short mask)
 911 {
 912         struct net_device *dev, *ret;
 913
 914         ret = NULL;
 915         for_each_netdev_rcu(net, dev) {
 916                 if (((dev->flags ^ if_flags) & mask) == 0) {
 917                         ret = dev;
 918                         break;
 919                 }
 920         }
 921         return ret;
 922 }
 923 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 924
 925 /**
 926  *      dev_valid_name - check if name is okay for network device
 927  *      @name: name string
 928  *
 929  *      Network device names need to be valid file names to
 930  *      to allow sysfs to work.  We also disallow any kind of
 931  *      whitespace.
 932  */
 933 bool dev_valid_name(const char *name)
 934 {
 935         if (*name == '\0')
 936                 return false;
 937         if (strlen(name) >= IFNAMSIZ)
 938                 return false;
 939         if (!strcmp(name, ".") || !strcmp(name, ".."))
 940                 return false;
 941
 942         while (*name) {
 943                 if (*name == '/' || isspace(*name))
 944                         return false;
 945                 name++;
 946         }
 947         return true;
 948 }
 949 EXPORT_SYMBOL(dev_valid_name);
 950
 951 /**
 952  *      __dev_alloc_name - allocate a name for a device
 953  *      @net: network namespace to allocate the device name in
 954  *      @name: name format string
 955  *      @buf:  scratch buffer and result name string
 956  *
 957  *      Passed a format string - eg "lt%d" it will try and find a suitable
 958  *      id. It scans list of devices to build up a free map, then chooses
 959  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 960  *      while allocating the name and adding the device in order to avoid
 961  *      duplicates.
 962  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 963  *      Returns the number of the unit assigned or a negative errno code.
 964  */
 965
 966 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 967 {
 968         int i = 0;
 969         const char *p;
 970         const int max_netdevices = 8*PAGE_SIZE;
 971         unsigned long *inuse;
 972         struct net_device *d;
 973
 974         p = strnchr(name, IFNAMSIZ-1, '%');
 975         if (p) {
 976                 /*
 977                  * Verify the string as this thing may have come from
 978                  * the user.  There must be either one "%d" and no other "%"
 979                  * characters.
 980                  */
 981                 if (p[1] != 'd' || strchr(p + 2, '%'))
 982                         return -EINVAL;
 983
 984                 /* Use one page as a bit array of possible slots */
 985                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 986                 if (!inuse)
 987                         return -ENOMEM;
 988
 989                 for_each_netdev(net, d) {
 990                         if (!sscanf(d->name, name, &i))
 991                                 continue;
 992                         if (i < 0 || i >= max_netdevices)
 993                                 continue;
 994
 995                         /*  avoid cases where sscanf is not exact inverse of printf */
 996                         snprintf(buf, IFNAMSIZ, name, i);
 997                         if (!strncmp(buf, d->name, IFNAMSIZ))
 998                                 set_bit(i, inuse);
 999                 }
1000
1001                 i = find_first_zero_bit(inuse, max_netdevices);
1002                 free_page((unsigned long) inuse);
1003         }
1004
1005         if (buf != name)
1006                 snprintf(buf, IFNAMSIZ, name, i);
1007         if (!__dev_get_by_name(net, buf))
1008                 return i;
1009
1010         /* It is possible to run out of possible slots
1011          * when the name is long and there isn't enough space left
1012          * for the digits, or if all bits are used.
1013          */
1014         return -ENFILE;
1015 }
1016
1017 /**
1018  *      dev_alloc_name - allocate a name for a device
1019  *      @dev: device
1020  *      @name: name format string
1021  *
1022  *      Passed a format string - eg "lt%d" it will try and find a suitable
1023  *      id. It scans list of devices to build up a free map, then chooses
1024  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1025  *      while allocating the name and adding the device in order to avoid
1026  *      duplicates.
1027  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1028  *      Returns the number of the unit assigned or a negative errno code.
1029  */
1030
1031 int dev_alloc_name(struct net_device *dev, const char *name)
1032 {
1033         char buf[IFNAMSIZ];
1034         struct net *net;
1035         int ret;
1036
1037         BUG_ON(!dev_net(dev));
1038         net = dev_net(dev);
1039         ret = __dev_alloc_name(net, name, buf);
1040         if (ret >= 0)
1041                 strlcpy(dev->name, buf, IFNAMSIZ);
1042         return ret;
1043 }
1044 EXPORT_SYMBOL(dev_alloc_name);
1045
1046 static int dev_alloc_name_ns(struct net *net,
1047                              struct net_device *dev,
1048                              const char *name)
1049 {
1050         char buf[IFNAMSIZ];
1051         int ret;
1052
1053         ret = __dev_alloc_name(net, name, buf);
1054         if (ret >= 0)
1055                 strlcpy(dev->name, buf, IFNAMSIZ);
1056         return ret;
1057 }
1058
1059 static int dev_get_valid_name(struct net *net,
1060                               struct net_device *dev,
1061                               const char *name)
1062 {
1063         BUG_ON(!net);
1064
1065         if (!dev_valid_name(name))
1066                 return -EINVAL;
1067
1068         if (strchr(name, '%'))
1069                 return dev_alloc_name_ns(net, dev, name);
1070         else if (__dev_get_by_name(net, name))
1071                 return -EEXIST;
1072         else if (dev->name != name)
1073                 strlcpy(dev->name, name, IFNAMSIZ);
1074
1075         return 0;
1076 }
1077
1078 /**
1079  *      dev_change_name - change name of a device
1080  *      @dev: device
1081  *      @newname: name (or format string) must be at least IFNAMSIZ
1082  *
1083  *      Change name of a device, can pass format strings "eth%d".
1084  *      for wildcarding.
1085  */
1086 int dev_change_name(struct net_device *dev, const char *newname)
1087 {
1088         char oldname[IFNAMSIZ];
1089         int err = 0;
1090         int ret;
1091         struct net *net;
1092
1093         ASSERT_RTNL();
1094         BUG_ON(!dev_net(dev));
1095
1096         net = dev_net(dev);
1097         if (dev->flags & IFF_UP)
1098                 return -EBUSY;
1099
1100         write_seqcount_begin(&devnet_rename_seq);
1101
1102         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1103                 write_seqcount_end(&devnet_rename_seq);
1104                 return 0;
1105         }
1106
1107         memcpy(oldname, dev->name, IFNAMSIZ);
1108
1109         err = dev_get_valid_name(net, dev, newname);
1110         if (err < 0) {
1111                 write_seqcount_end(&devnet_rename_seq);
1112                 return err;
1113         }
1114
1115 rollback:
1116         ret = device_rename(&dev->dev, dev->name);
1117         if (ret) {
1118                 memcpy(dev->name, oldname, IFNAMSIZ);
1119                 write_seqcount_end(&devnet_rename_seq);
1120                 return ret;
1121         }
1122
1123         write_seqcount_end(&devnet_rename_seq);
1124
1125         netdev_adjacent_rename_links(dev, oldname);
1126
1127         write_lock_bh(&dev_base_lock);
1128         hlist_del_rcu(&dev->name_hlist);
1129         write_unlock_bh(&dev_base_lock);
1130
1131         synchronize_rcu();
1132
1133         write_lock_bh(&dev_base_lock);
1134         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1135         write_unlock_bh(&dev_base_lock);
1136
1137         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1138         ret = notifier_to_errno(ret);
1139
1140         if (ret) {
1141                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1142                 if (err >= 0) {
1143                         err = ret;
1144                         write_seqcount_begin(&devnet_rename_seq);
1145                         memcpy(dev->name, oldname, IFNAMSIZ);
1146                         memcpy(oldname, newname, IFNAMSIZ);
1147                         goto rollback;
1148                 } else {
1149                         pr_err("%s: name change rollback failed: %d\n",
1150                                dev->name, ret);
1151                 }
1152         }
1153
1154         return err;
1155 }
1156
1157 /**
1158  *      dev_set_alias - change ifalias of a device
1159  *      @dev: device
1160  *      @alias: name up to IFALIASZ
1161  *      @len: limit of bytes to copy from info
1162  *
1163  *      Set ifalias for a device,
1164  */
1165 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1166 {
1167         char *new_ifalias;
1168
1169         ASSERT_RTNL();
1170
1171         if (len >= IFALIASZ)
1172                 return -EINVAL;
1173
1174         if (!len) {
1175                 kfree(dev->ifalias);
1176                 dev->ifalias = NULL;
1177                 return 0;
1178         }
1179
1180         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1181         if (!new_ifalias)
1182                 return -ENOMEM;
1183         dev->ifalias = new_ifalias;
1184
1185         strlcpy(dev->ifalias, alias, len+1);
1186         return len;
1187 }
1188
1189
1190 /**
1191  *      netdev_features_change - device changes features
1192  *      @dev: device to cause notification
1193  *
1194  *      Called to indicate a device has changed features.
1195  */
1196 void netdev_features_change(struct net_device *dev)
1197 {
1198         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1199 }
1200 EXPORT_SYMBOL(netdev_features_change);
1201
1202 /**
1203  *      netdev_state_change - device changes state
1204  *      @dev: device to cause notification
1205  *
1206  *      Called to indicate a device has changed state. This function calls
1207  *      the notifier chains for netdev_chain and sends a NEWLINK message
1208  *      to the routing socket.
1209  */
1210 void netdev_state_change(struct net_device *dev)
1211 {
1212         if (dev->flags & IFF_UP) {
1213                 struct netdev_notifier_change_info change_info;
1214
1215                 change_info.flags_changed = 0;
1216                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1217                                               &change_info.info);
1218                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1219         }
1220 }
1221 EXPORT_SYMBOL(netdev_state_change);
1222
1223 /**
1224  *      netdev_notify_peers - notify network peers about existence of @dev
1225  *      @dev: network device
1226  *
1227  * Generate traffic such that interested network peers are aware of
1228  * @dev, such as by generating a gratuitous ARP. This may be used when
1229  * a device wants to inform the rest of the network about some sort of
1230  * reconfiguration such as a failover event or virtual machine
1231  * migration.
1232  */
1233 void netdev_notify_peers(struct net_device *dev)
1234 {
1235         rtnl_lock();
1236         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1237         rtnl_unlock();
1238 }
1239 EXPORT_SYMBOL(netdev_notify_peers);
1240
1241 static int __dev_open(struct net_device *dev)
1242 {
1243         const struct net_device_ops *ops = dev->netdev_ops;
1244         int ret;
1245
1246         ASSERT_RTNL();
1247
1248         if (!netif_device_present(dev))
1249                 return -ENODEV;
1250
1251         /* Block netpoll from trying to do any rx path servicing.
1252          * If we don't do this there is a chance ndo_poll_controller
1253          * or ndo_poll may be running while we open the device
1254          */
1255         netpoll_poll_disable(dev);
1256
1257         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1258         ret = notifier_to_errno(ret);
1259         if (ret)
1260                 return ret;
1261
1262         set_bit(__LINK_STATE_START, &dev->state);
1263
1264         if (ops->ndo_validate_addr)
1265                 ret = ops->ndo_validate_addr(dev);
1266
1267         if (!ret && ops->ndo_open)
1268                 ret = ops->ndo_open(dev);
1269
1270         netpoll_poll_enable(dev);
1271
1272         if (ret)
1273                 clear_bit(__LINK_STATE_START, &dev->state);
1274         else {
1275                 dev->flags |= IFF_UP;
1276                 net_dmaengine_get();
1277                 dev_set_rx_mode(dev);
1278                 dev_activate(dev);
1279                 add_device_randomness(dev->dev_addr, dev->addr_len);
1280         }
1281
1282         return ret;
1283 }
1284
1285 /**
1286  *      dev_open        - prepare an interface for use.
1287  *      @dev:   device to open
1288  *
1289  *      Takes a device from down to up state. The device's private open
1290  *      function is invoked and then the multicast lists are loaded. Finally
1291  *      the device is moved into the up state and a %NETDEV_UP message is
1292  *      sent to the netdev notifier chain.
1293  *
1294  *      Calling this function on an active interface is a nop. On a failure
1295  *      a negative errno code is returned.
1296  */
1297 int dev_open(struct net_device *dev)
1298 {
1299         int ret;
1300
1301         if (dev->flags & IFF_UP)
1302                 return 0;
1303
1304         ret = __dev_open(dev);
1305         if (ret < 0)
1306                 return ret;
1307
1308         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1309         call_netdevice_notifiers(NETDEV_UP, dev);
1310
1311         return ret;
1312 }
1313 EXPORT_SYMBOL(dev_open);
1314
1315 static int __dev_close_many(struct list_head *head)
1316 {
1317         struct net_device *dev;
1318
1319         ASSERT_RTNL();
1320         might_sleep();
1321
1322         list_for_each_entry(dev, head, close_list) {
1323                 /* Temporarily disable netpoll until the interface is down */
1324                 netpoll_poll_disable(dev);
1325
1326                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1327
1328                 clear_bit(__LINK_STATE_START, &dev->state);
1329
1330                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1331                  * can be even on different cpu. So just clear netif_running().
1332                  *
1333                  * dev->stop() will invoke napi_disable() on all of it's
1334                  * napi_struct instances on this device.
1335                  */
1336                 smp_mb__after_atomic(); /* Commit netif_running(). */
1337         }
1338
1339         dev_deactivate_many(head);
1340
1341         list_for_each_entry(dev, head, close_list) {
1342                 const struct net_device_ops *ops = dev->netdev_ops;
1343
1344                 /*
1345                  *      Call the device specific close. This cannot fail.
1346                  *      Only if device is UP
1347                  *
1348                  *      We allow it to be called even after a DETACH hot-plug
1349                  *      event.
1350                  */
1351                 if (ops->ndo_stop)
1352                         ops->ndo_stop(dev);
1353
1354                 dev->flags &= ~IFF_UP;
1355                 net_dmaengine_put();
1356                 netpoll_poll_enable(dev);
1357         }
1358
1359         return 0;
1360 }
1361
1362 static int __dev_close(struct net_device *dev)
1363 {
1364         int retval;
1365         LIST_HEAD(single);
1366
1367         list_add(&dev->close_list, &single);
1368         retval = __dev_close_many(&single);
1369         list_del(&single);
1370
1371         return retval;
1372 }
1373
1374 static int dev_close_many(struct list_head *head)
1375 {
1376         struct net_device *dev, *tmp;
1377
1378         /* Remove the devices that don't need to be closed */
1379         list_for_each_entry_safe(dev, tmp, head, close_list)
1380                 if (!(dev->flags & IFF_UP))
1381                         list_del_init(&dev->close_list);
1382
1383         __dev_close_many(head);
1384
1385         list_for_each_entry_safe(dev, tmp, head, close_list) {
1386                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1387                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1388                 list_del_init(&dev->close_list);
1389         }
1390
1391         return 0;
1392 }
1393
1394 /**
1395  *      dev_close - shutdown an interface.
1396  *      @dev: device to shutdown
1397  *
1398  *      This function moves an active device into down state. A
1399  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1400  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1401  *      chain.
1402  */
1403 int dev_close(struct net_device *dev)
1404 {
1405         if (dev->flags & IFF_UP) {
1406                 LIST_HEAD(single);
1407
1408                 list_add(&dev->close_list, &single);
1409                 dev_close_many(&single);
1410                 list_del(&single);
1411         }
1412         return 0;
1413 }
1414 EXPORT_SYMBOL(dev_close);
1415
1416
1417 /**
1418  *      dev_disable_lro - disable Large Receive Offload on a device
1419  *      @dev: device
1420  *
1421  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1422  *      called under RTNL.  This is needed if received packets may be
1423  *      forwarded to another interface.
1424  */
1425 void dev_disable_lro(struct net_device *dev)
1426 {
1427         /*
1428          * If we're trying to disable lro on a vlan device
1429          * use the underlying physical device instead
1430          */
1431         if (is_vlan_dev(dev))
1432                 dev = vlan_dev_real_dev(dev);
1433
1434         /* the same for macvlan devices */
1435         if (netif_is_macvlan(dev))
1436                 dev = macvlan_dev_real_dev(dev);
1437
1438         dev->wanted_features &= ~NETIF_F_LRO;
1439         netdev_update_features(dev);
1440
1441         if (unlikely(dev->features & NETIF_F_LRO))
1442                 netdev_WARN(dev, "failed to disable LRO!\n");
1443 }
1444 EXPORT_SYMBOL(dev_disable_lro);
1445
1446 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1447                                    struct net_device *dev)
1448 {
1449         struct netdev_notifier_info info;
1450
1451         netdev_notifier_info_init(&info, dev);
1452         return nb->notifier_call(nb, val, &info);
1453 }
1454
1455 static int dev_boot_phase = 1;
1456
1457 /**
1458  *      register_netdevice_notifier - register a network notifier block
1459  *      @nb: notifier
1460  *
1461  *      Register a notifier to be called when network device events occur.
1462  *      The notifier passed is linked into the kernel structures and must
1463  *      not be reused until it has been unregistered. A negative errno code
1464  *      is returned on a failure.
1465  *
1466  *      When registered all registration and up events are replayed
1467  *      to the new notifier to allow device to have a race free
1468  *      view of the network device list.
1469  */
1470
1471 int register_netdevice_notifier(struct notifier_block *nb)
1472 {
1473         struct net_device *dev;
1474         struct net_device *last;
1475         struct net *net;
1476         int err;
1477
1478         rtnl_lock();
1479         err = raw_notifier_chain_register(&netdev_chain, nb);
1480         if (err)
1481                 goto unlock;
1482         if (dev_boot_phase)
1483                 goto unlock;
1484         for_each_net(net) {
1485                 for_each_netdev(net, dev) {
1486                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1487                         err = notifier_to_errno(err);
1488                         if (err)
1489                                 goto rollback;
1490
1491                         if (!(dev->flags & IFF_UP))
1492                                 continue;
1493
1494                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1495                 }
1496         }
1497
1498 unlock:
1499         rtnl_unlock();
1500         return err;
1501
1502 rollback:
1503         last = dev;
1504         for_each_net(net) {
1505                 for_each_netdev(net, dev) {
1506                         if (dev == last)
1507                                 goto outroll;
1508
1509                         if (dev->flags & IFF_UP) {
1510                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1511                                                         dev);
1512                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1513                         }
1514                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1515                 }
1516         }
1517
1518 outroll:
1519         raw_notifier_chain_unregister(&netdev_chain, nb);
1520         goto unlock;
1521 }
1522 EXPORT_SYMBOL(register_netdevice_notifier);
1523
1524 /**
1525  *      unregister_netdevice_notifier - unregister a network notifier block
1526  *      @nb: notifier
1527  *
1528  *      Unregister a notifier previously registered by
1529  *      register_netdevice_notifier(). The notifier is unlinked into the
1530  *      kernel structures and may then be reused. A negative errno code
1531  *      is returned on a failure.
1532  *
1533  *      After unregistering unregister and down device events are synthesized
1534  *      for all devices on the device list to the removed notifier to remove
1535  *      the need for special case cleanup code.
1536  */
1537
1538 int unregister_netdevice_notifier(struct notifier_block *nb)
1539 {
1540         struct net_device *dev;
1541         struct net *net;
1542         int err;
1543
1544         rtnl_lock();
1545         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1546         if (err)
1547                 goto unlock;
1548
1549         for_each_net(net) {
1550                 for_each_netdev(net, dev) {
1551                         if (dev->flags & IFF_UP) {
1552                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1553                                                         dev);
1554                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1555                         }
1556                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1557                 }
1558         }
1559 unlock:
1560         rtnl_unlock();
1561         return err;
1562 }
1563 EXPORT_SYMBOL(unregister_netdevice_notifier);
1564
1565 /**
1566  *      call_netdevice_notifiers_info - call all network notifier blocks
1567  *      @val: value passed unmodified to notifier function
1568  *      @dev: net_device pointer passed unmodified to notifier function
1569  *      @info: notifier information data
1570  *
1571  *      Call all network notifier blocks.  Parameters and return value
1572  *      are as for raw_notifier_call_chain().
1573  */
1574
1575 static int call_netdevice_notifiers_info(unsigned long val,
1576                                          struct net_device *dev,
1577                                          struct netdev_notifier_info *info)
1578 {
1579         ASSERT_RTNL();
1580         netdev_notifier_info_init(info, dev);
1581         return raw_notifier_call_chain(&netdev_chain, val, info);
1582 }
1583
1584 /**
1585  *      call_netdevice_notifiers - call all network notifier blocks
1586  *      @val: value passed unmodified to notifier function
1587  *      @dev: net_device pointer passed unmodified to notifier function
1588  *
1589  *      Call all network notifier blocks.  Parameters and return value
1590  *      are as for raw_notifier_call_chain().
1591  */
1592
1593 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1594 {
1595         struct netdev_notifier_info info;
1596
1597         return call_netdevice_notifiers_info(val, dev, &info);
1598 }
1599 EXPORT_SYMBOL(call_netdevice_notifiers);
1600
1601 static struct static_key netstamp_needed __read_mostly;
1602 #ifdef HAVE_JUMP_LABEL
1603 /* We are not allowed to call static_key_slow_dec() from irq context
1604  * If net_disable_timestamp() is called from irq context, defer the
1605  * static_key_slow_dec() calls.
1606  */
1607 static atomic_t netstamp_needed_deferred;
1608 #endif
1609
1610 void net_enable_timestamp(void)
1611 {
1612 #ifdef HAVE_JUMP_LABEL
1613         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1614
1615         if (deferred) {
1616                 while (--deferred)
1617                         static_key_slow_dec(&netstamp_needed);
1618                 return;
1619         }
1620 #endif
1621         static_key_slow_inc(&netstamp_needed);
1622 }
1623 EXPORT_SYMBOL(net_enable_timestamp);
1624
1625 void net_disable_timestamp(void)
1626 {
1627 #ifdef HAVE_JUMP_LABEL
1628         if (in_interrupt()) {
1629                 atomic_inc(&netstamp_needed_deferred);
1630                 return;
1631         }
1632 #endif
1633         static_key_slow_dec(&netstamp_needed);
1634 }
1635 EXPORT_SYMBOL(net_disable_timestamp);
1636
1637 static inline void net_timestamp_set(struct sk_buff *skb)
1638 {
1639         skb->tstamp.tv64 = 0;
1640         if (static_key_false(&netstamp_needed))
1641                 __net_timestamp(skb);
1642 }
1643
1644 #define net_timestamp_check(COND, SKB)                  \
1645         if (static_key_false(&netstamp_needed)) {               \
1646                 if ((COND) && !(SKB)->tstamp.tv64)      \
1647                         __net_timestamp(SKB);           \
1648         }                                               \
1649
1650 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1651 {
1652         unsigned int len;
1653
1654         if (!(dev->flags & IFF_UP))
1655                 return false;
1656
1657         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1658         if (skb->len <= len)
1659                 return true;
1660
1661         /* if TSO is enabled, we don't care about the length as the packet
1662          * could be forwarded without being segmented before
1663          */
1664         if (skb_is_gso(skb))
1665                 return true;
1666
1667         return false;
1668 }
1669 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1670
1671 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1672 {
1673         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1674                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1675                         atomic_long_inc(&dev->rx_dropped);
1676                         kfree_skb(skb);
1677                         return NET_RX_DROP;
1678                 }
1679         }
1680
1681         if (unlikely(!is_skb_forwardable(dev, skb))) {
1682                 atomic_long_inc(&dev->rx_dropped);
1683                 kfree_skb(skb);
1684                 return NET_RX_DROP;
1685         }
1686
1687         skb_scrub_packet(skb, true);
1688         skb->protocol = eth_type_trans(skb, dev);
1689
1690         return 0;
1691 }
1692 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1693
1694 /**
1695  * dev_forward_skb - loopback an skb to another netif
1696  *
1697  * @dev: destination network device
1698  * @skb: buffer to forward
1699  *
1700  * return values:
1701  *      NET_RX_SUCCESS  (no congestion)
1702  *      NET_RX_DROP     (packet was dropped, but freed)
1703  *
1704  * dev_forward_skb can be used for injecting an skb from the
1705  * start_xmit function of one device into the receive queue
1706  * of another device.
1707  *
1708  * The receiving device may be in another namespace, so
1709  * we have to clear all information in the skb that could
1710  * impact namespace isolation.
1711  */
1712 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1713 {
1714         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1715 }
1716 EXPORT_SYMBOL_GPL(dev_forward_skb);
1717
1718 static inline int deliver_skb(struct sk_buff *skb,
1719                               struct packet_type *pt_prev,
1720                               struct net_device *orig_dev)
1721 {
1722         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1723                 return -ENOMEM;
1724         atomic_inc(&skb->users);
1725         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1726 }
1727
1728 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1729 {
1730         if (!ptype->af_packet_priv || !skb->sk)
1731                 return false;
1732
1733         if (ptype->id_match)
1734                 return ptype->id_match(ptype, skb->sk);
1735         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1736                 return true;
1737
1738         return false;
1739 }
1740
1741 /*
1742  *      Support routine. Sends outgoing frames to any network
1743  *      taps currently in use.
1744  */
1745
1746 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1747 {
1748         struct packet_type *ptype;
1749         struct sk_buff *skb2 = NULL;
1750         struct packet_type *pt_prev = NULL;
1751
1752         rcu_read_lock();
1753         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1754                 /* Never send packets back to the socket
1755                  * they originated from - MvS (miquels@drinkel.ow.org)
1756                  */
1757                 if ((ptype->dev == dev || !ptype->dev) &&
1758                     (!skb_loop_sk(ptype, skb))) {
1759                         if (pt_prev) {
1760                                 deliver_skb(skb2, pt_prev, skb->dev);
1761                                 pt_prev = ptype;
1762                                 continue;
1763                         }
1764
1765                         skb2 = skb_clone(skb, GFP_ATOMIC);
1766                         if (!skb2)
1767                                 break;
1768
1769                         net_timestamp_set(skb2);
1770
1771                         /* skb->nh should be correctly
1772                            set by sender, so that the second statement is
1773                            just protection against buggy protocols.
1774                          */
1775                         skb_reset_mac_header(skb2);
1776
1777                         if (skb_network_header(skb2) < skb2->data ||
1778                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1779                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1780                                                      ntohs(skb2->protocol),
1781                                                      dev->name);
1782                                 skb_reset_network_header(skb2);
1783                         }
1784
1785                         skb2->transport_header = skb2->network_header;
1786                         skb2->pkt_type = PACKET_OUTGOING;
1787                         pt_prev = ptype;
1788                 }
1789         }
1790         if (pt_prev)
1791                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1792         rcu_read_unlock();
1793 }
1794
1795 /**
1796  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1797  * @dev: Network device
1798  * @txq: number of queues available
1799  *
1800  * If real_num_tx_queues is changed the tc mappings may no longer be
1801  * valid. To resolve this verify the tc mapping remains valid and if
1802  * not NULL the mapping. With no priorities mapping to this
1803  * offset/count pair it will no longer be used. In the worst case TC0
1804  * is invalid nothing can be done so disable priority mappings. If is
1805  * expected that drivers will fix this mapping if they can before
1806  * calling netif_set_real_num_tx_queues.
1807  */
1808 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1809 {
1810         int i;
1811         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1812
1813         /* If TC0 is invalidated disable TC mapping */
1814         if (tc->offset + tc->count > txq) {
1815                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1816                 dev->num_tc = 0;
1817                 return;
1818         }
1819
1820         /* Invalidated prio to tc mappings set to TC0 */
1821         for (i = 1; i < TC_BITMASK + 1; i++) {
1822                 int q = netdev_get_prio_tc_map(dev, i);
1823
1824                 tc = &dev->tc_to_txq[q];
1825                 if (tc->offset + tc->count > txq) {
1826                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1827                                 i, q);
1828                         netdev_set_prio_tc_map(dev, i, 0);
1829                 }
1830         }
1831 }
1832
1833 #ifdef CONFIG_XPS
1834 static DEFINE_MUTEX(xps_map_mutex);
1835 #define xmap_dereference(P)             \
1836         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1837
1838 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1839                                         int cpu, u16 index)
1840 {
1841         struct xps_map *map = NULL;
1842         int pos;
1843
1844         if (dev_maps)
1845                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1846
1847         for (pos = 0; map && pos < map->len; pos++) {
1848                 if (map->queues[pos] == index) {
1849                         if (map->len > 1) {
1850                                 map->queues[pos] = map->queues[--map->len];
1851                         } else {
1852                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1853                                 kfree_rcu(map, rcu);
1854                                 map = NULL;
1855                         }
1856                         break;
1857                 }
1858         }
1859
1860         return map;
1861 }
1862
1863 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1864 {
1865         struct xps_dev_maps *dev_maps;
1866         int cpu, i;
1867         bool active = false;
1868
1869         mutex_lock(&xps_map_mutex);
1870         dev_maps = xmap_dereference(dev->xps_maps);
1871
1872         if (!dev_maps)
1873                 goto out_no_maps;
1874
1875         for_each_possible_cpu(cpu) {
1876                 for (i = index; i < dev->num_tx_queues; i++) {
1877                         if (!remove_xps_queue(dev_maps, cpu, i))
1878                                 break;
1879                 }
1880                 if (i == dev->num_tx_queues)
1881                         active = true;
1882         }
1883
1884         if (!active) {
1885                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1886                 kfree_rcu(dev_maps, rcu);
1887         }
1888
1889         for (i = index; i < dev->num_tx_queues; i++)
1890                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1891                                              NUMA_NO_NODE);
1892
1893 out_no_maps:
1894         mutex_unlock(&xps_map_mutex);
1895 }
1896
1897 static struct xps_map *expand_xps_map(struct xps_map *map,
1898                                       int cpu, u16 index)
1899 {
1900         struct xps_map *new_map;
1901         int alloc_len = XPS_MIN_MAP_ALLOC;
1902         int i, pos;
1903
1904         for (pos = 0; map && pos < map->len; pos++) {
1905                 if (map->queues[pos] != index)
1906                         continue;
1907                 return map;
1908         }
1909
1910         /* Need to add queue to this CPU's existing map */
1911         if (map) {
1912                 if (pos < map->alloc_len)
1913                         return map;
1914
1915                 alloc_len = map->alloc_len * 2;
1916         }
1917
1918         /* Need to allocate new map to store queue on this CPU's map */
1919         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1920                                cpu_to_node(cpu));
1921         if (!new_map)
1922                 return NULL;
1923
1924         for (i = 0; i < pos; i++)
1925                 new_map->queues[i] = map->queues[i];
1926         new_map->alloc_len = alloc_len;
1927         new_map->len = pos;
1928
1929         return new_map;
1930 }
1931
1932 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1933                         u16 index)
1934 {
1935         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1936         struct xps_map *map, *new_map;
1937         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1938         int cpu, numa_node_id = -2;
1939         bool active = false;
1940
1941         mutex_lock(&xps_map_mutex);
1942
1943         dev_maps = xmap_dereference(dev->xps_maps);
1944
1945         /* allocate memory for queue storage */
1946         for_each_online_cpu(cpu) {
1947                 if (!cpumask_test_cpu(cpu, mask))
1948                         continue;
1949
1950                 if (!new_dev_maps)
1951                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1952                 if (!new_dev_maps) {
1953                         mutex_unlock(&xps_map_mutex);
1954                         return -ENOMEM;
1955                 }
1956
1957                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1958                                  NULL;
1959
1960                 map = expand_xps_map(map, cpu, index);
1961                 if (!map)
1962                         goto error;
1963
1964                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1965         }
1966
1967         if (!new_dev_maps)
1968                 goto out_no_new_maps;
1969
1970         for_each_possible_cpu(cpu) {
1971                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1972                         /* add queue to CPU maps */
1973                         int pos = 0;
1974
1975                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1976                         while ((pos < map->len) && (map->queues[pos] != index))
1977                                 pos++;
1978
1979                         if (pos == map->len)
1980                                 map->queues[map->len++] = index;
1981 #ifdef CONFIG_NUMA
1982                         if (numa_node_id == -2)
1983                                 numa_node_id = cpu_to_node(cpu);
1984                         else if (numa_node_id != cpu_to_node(cpu))
1985                                 numa_node_id = -1;
1986 #endif
1987                 } else if (dev_maps) {
1988                         /* fill in the new device map from the old device map */
1989                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1990                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1991                 }
1992
1993         }
1994
1995         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1996
1997         /* Cleanup old maps */
1998         if (dev_maps) {
1999                 for_each_possible_cpu(cpu) {
2000                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2001                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2002                         if (map && map != new_map)
2003                                 kfree_rcu(map, rcu);
2004                 }
2005
2006                 kfree_rcu(dev_maps, rcu);
2007         }
2008
2009         dev_maps = new_dev_maps;
2010         active = true;
2011
2012 out_no_new_maps:
2013         /* update Tx queue numa node */
2014         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2015                                      (numa_node_id >= 0) ? numa_node_id :
2016                                      NUMA_NO_NODE);
2017
2018         if (!dev_maps)
2019                 goto out_no_maps;
2020
2021         /* removes queue from unused CPUs */
2022         for_each_possible_cpu(cpu) {
2023                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2024                         continue;
2025
2026                 if (remove_xps_queue(dev_maps, cpu, index))
2027                         active = true;
2028         }
2029
2030         /* free map if not active */
2031         if (!active) {
2032                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2033                 kfree_rcu(dev_maps, rcu);
2034         }
2035
2036 out_no_maps:
2037         mutex_unlock(&xps_map_mutex);
2038
2039         return 0;
2040 error:
2041         /* remove any maps that we added */
2042         for_each_possible_cpu(cpu) {
2043                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2044                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2045                                  NULL;
2046                 if (new_map && new_map != map)
2047                         kfree(new_map);
2048         }
2049
2050         mutex_unlock(&xps_map_mutex);
2051
2052         kfree(new_dev_maps);
2053         return -ENOMEM;
2054 }
2055 EXPORT_SYMBOL(netif_set_xps_queue);
2056
2057 #endif
2058 /*
2059  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2060  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2061  */
2062 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2063 {
2064         int rc;
2065
2066         if (txq < 1 || txq > dev->num_tx_queues)
2067                 return -EINVAL;
2068
2069         if (dev->reg_state == NETREG_REGISTERED ||
2070             dev->reg_state == NETREG_UNREGISTERING) {
2071                 ASSERT_RTNL();
2072
2073                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2074                                                   txq);
2075                 if (rc)
2076                         return rc;
2077
2078                 if (dev->num_tc)
2079                         netif_setup_tc(dev, txq);
2080
2081                 if (txq < dev->real_num_tx_queues) {
2082                         qdisc_reset_all_tx_gt(dev, txq);
2083 #ifdef CONFIG_XPS
2084                         netif_reset_xps_queues_gt(dev, txq);
2085 #endif
2086                 }
2087         }
2088
2089         dev->real_num_tx_queues = txq;
2090         return 0;
2091 }
2092 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2093
2094 #ifdef CONFIG_SYSFS
2095 /**
2096  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2097  *      @dev: Network device
2098  *      @rxq: Actual number of RX queues
2099  *
2100  *      This must be called either with the rtnl_lock held or before
2101  *      registration of the net device.  Returns 0 on success, or a
2102  *      negative error code.  If called before registration, it always
2103  *      succeeds.
2104  */
2105 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2106 {
2107         int rc;
2108
2109         if (rxq < 1 || rxq > dev->num_rx_queues)
2110                 return -EINVAL;
2111
2112         if (dev->reg_state == NETREG_REGISTERED) {
2113                 ASSERT_RTNL();
2114
2115                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2116                                                   rxq);
2117                 if (rc)
2118                         return rc;
2119         }
2120
2121         dev->real_num_rx_queues = rxq;
2122         return 0;
2123 }
2124 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2125 #endif
2126
2127 /**
2128  * netif_get_num_default_rss_queues - default number of RSS queues
2129  *
2130  * This routine should set an upper limit on the number of RSS queues
2131  * used by default by multiqueue devices.
2132  */
2133 int netif_get_num_default_rss_queues(void)
2134 {
2135         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2136 }
2137 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2138
2139 static inline void __netif_reschedule(struct Qdisc *q)
2140 {
2141         struct softnet_data *sd;
2142         unsigned long flags;
2143
2144         local_irq_save(flags);
2145         sd = &__get_cpu_var(softnet_data);
2146         q->next_sched = NULL;
2147         *sd->output_queue_tailp = q;
2148         sd->output_queue_tailp = &q->next_sched;
2149         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2150         local_irq_restore(flags);
2151 }
2152
2153 void __netif_schedule(struct Qdisc *q)
2154 {
2155         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2156                 __netif_reschedule(q);
2157 }
2158 EXPORT_SYMBOL(__netif_schedule);
2159
2160 struct dev_kfree_skb_cb {
2161         enum skb_free_reason reason;
2162 };
2163
2164 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2165 {
2166         return (struct dev_kfree_skb_cb *)skb->cb;
2167 }
2168
2169 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2170 {
2171         unsigned long flags;
2172
2173         if (likely(atomic_read(&skb->users) == 1)) {
2174                 smp_rmb();
2175                 atomic_set(&skb->users, 0);
2176         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2177                 return;
2178         }
2179         get_kfree_skb_cb(skb)->reason = reason;
2180         local_irq_save(flags);
2181         skb->next = __this_cpu_read(softnet_data.completion_queue);
2182         __this_cpu_write(softnet_data.completion_queue, skb);
2183         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2184         local_irq_restore(flags);
2185 }
2186 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2187
2188 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2189 {
2190         if (in_irq() || irqs_disabled())
2191                 __dev_kfree_skb_irq(skb, reason);
2192         else
2193                 dev_kfree_skb(skb);
2194 }
2195 EXPORT_SYMBOL(__dev_kfree_skb_any);
2196
2197
2198 /**
2199  * netif_device_detach - mark device as removed
2200  * @dev: network device
2201  *
2202  * Mark device as removed from system and therefore no longer available.
2203  */
2204 void netif_device_detach(struct net_device *dev)
2205 {
2206         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2207             netif_running(dev)) {
2208                 netif_tx_stop_all_queues(dev);
2209         }
2210 }
2211 EXPORT_SYMBOL(netif_device_detach);
2212
2213 /**
2214  * netif_device_attach - mark device as attached
2215  * @dev: network device
2216  *
2217  * Mark device as attached from system and restart if needed.
2218  */
2219 void netif_device_attach(struct net_device *dev)
2220 {
2221         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2222             netif_running(dev)) {
2223                 netif_tx_wake_all_queues(dev);
2224                 __netdev_watchdog_up(dev);
2225         }
2226 }
2227 EXPORT_SYMBOL(netif_device_attach);
2228
2229 static void skb_warn_bad_offload(const struct sk_buff *skb)
2230 {
2231         static const netdev_features_t null_features = 0;
2232         struct net_device *dev = skb->dev;
2233         const char *driver = "";
2234
2235         if (!net_ratelimit())
2236                 return;
2237
2238         if (dev && dev->dev.parent)
2239                 driver = dev_driver_string(dev->dev.parent);
2240
2241         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2242              "gso_type=%d ip_summed=%d\n",
2243              driver, dev ? &dev->features : &null_features,
2244              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2245              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2246              skb_shinfo(skb)->gso_type, skb->ip_summed);
2247 }
2248
2249 /*
2250  * Invalidate hardware checksum when packet is to be mangled, and
2251  * complete checksum manually on outgoing path.
2252  */
2253 int skb_checksum_help(struct sk_buff *skb)
2254 {
2255         __wsum csum;
2256         int ret = 0, offset;
2257
2258         if (skb->ip_summed == CHECKSUM_COMPLETE)
2259                 goto out_set_summed;
2260
2261         if (unlikely(skb_shinfo(skb)->gso_size)) {
2262                 skb_warn_bad_offload(skb);
2263                 return -EINVAL;
2264         }
2265
2266         /* Before computing a checksum, we should make sure no frag could
2267          * be modified by an external entity : checksum could be wrong.
2268          */
2269         if (skb_has_shared_frag(skb)) {
2270                 ret = __skb_linearize(skb);
2271                 if (ret)
2272                         goto out;
2273         }
2274
2275         offset = skb_checksum_start_offset(skb);
2276         BUG_ON(offset >= skb_headlen(skb));
2277         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2278
2279         offset += skb->csum_offset;
2280         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2281
2282         if (skb_cloned(skb) &&
2283             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2284                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2285                 if (ret)
2286                         goto out;
2287         }
2288
2289         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2290 out_set_summed:
2291         skb->ip_summed = CHECKSUM_NONE;
2292 out:
2293         return ret;
2294 }
2295 EXPORT_SYMBOL(skb_checksum_help);
2296
2297 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2298 {
2299         unsigned int vlan_depth = skb->mac_len;
2300         __be16 type = skb->protocol;
2301
2302         /* Tunnel gso handlers can set protocol to ethernet. */
2303         if (type == htons(ETH_P_TEB)) {
2304                 struct ethhdr *eth;
2305
2306                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2307                         return 0;
2308
2309                 eth = (struct ethhdr *)skb_mac_header(skb);
2310                 type = eth->h_proto;
2311         }
2312
2313         /* if skb->protocol is 802.1Q/AD then the header should already be
2314          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2315          * ETH_HLEN otherwise
2316          */
2317         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2318                 if (vlan_depth) {
2319                         if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
2320                                 return 0;
2321                         vlan_depth -= VLAN_HLEN;
2322                 } else {
2323                         vlan_depth = ETH_HLEN;
2324                 }
2325                 do {
2326                         struct vlan_hdr *vh;
2327
2328                         if (unlikely(!pskb_may_pull(skb,
2329                                                     vlan_depth + VLAN_HLEN)))
2330                                 return 0;
2331
2332                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2333                         type = vh->h_vlan_encapsulated_proto;
2334                         vlan_depth += VLAN_HLEN;
2335                 } while (type == htons(ETH_P_8021Q) ||
2336                          type == htons(ETH_P_8021AD));
2337         }
2338
2339         *depth = vlan_depth;
2340
2341         return type;
2342 }
2343
2344 /**
2345  *      skb_mac_gso_segment - mac layer segmentation handler.
2346  *      @skb: buffer to segment
2347  *      @features: features for the output path (see dev->features)
2348  */
2349 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2350                                     netdev_features_t features)
2351 {
2352         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2353         struct packet_offload *ptype;
2354         int vlan_depth = skb->mac_len;
2355         __be16 type = skb_network_protocol(skb, &vlan_depth);
2356
2357         if (unlikely(!type))
2358                 return ERR_PTR(-EINVAL);
2359
2360         __skb_pull(skb, vlan_depth);
2361
2362         rcu_read_lock();
2363         list_for_each_entry_rcu(ptype, &offload_base, list) {
2364                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2365                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2366                                 int err;
2367
2368                                 err = ptype->callbacks.gso_send_check(skb);
2369                                 segs = ERR_PTR(err);
2370                                 if (err || skb_gso_ok(skb, features))
2371                                         break;
2372                                 __skb_push(skb, (skb->data -
2373                                                  skb_network_header(skb)));
2374                         }
2375                         segs = ptype->callbacks.gso_segment(skb, features);
2376                         break;
2377                 }
2378         }
2379         rcu_read_unlock();
2380
2381         __skb_push(skb, skb->data - skb_mac_header(skb));
2382
2383         return segs;
2384 }
2385 EXPORT_SYMBOL(skb_mac_gso_segment);
2386
2387
2388 /* openvswitch calls this on rx path, so we need a different check.
2389  */
2390 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2391 {
2392         if (tx_path)
2393                 return skb->ip_summed != CHECKSUM_PARTIAL;
2394         else
2395                 return skb->ip_summed == CHECKSUM_NONE;
2396 }
2397
2398 /**
2399  *      __skb_gso_segment - Perform segmentation on skb.
2400  *      @skb: buffer to segment
2401  *      @features: features for the output path (see dev->features)
2402  *      @tx_path: whether it is called in TX path
2403  *
2404  *      This function segments the given skb and returns a list of segments.
2405  *
2406  *      It may return NULL if the skb requires no segmentation.  This is
2407  *      only possible when GSO is used for verifying header integrity.
2408  */
2409 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2410                                   netdev_features_t features, bool tx_path)
2411 {
2412         if (unlikely(skb_needs_check(skb, tx_path))) {
2413                 int err;
2414
2415                 skb_warn_bad_offload(skb);
2416
2417                 if (skb_header_cloned(skb) &&
2418                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2419                         return ERR_PTR(err);
2420         }
2421
2422         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2423         SKB_GSO_CB(skb)->encap_level = 0;
2424
2425         skb_reset_mac_header(skb);
2426         skb_reset_mac_len(skb);
2427
2428         return skb_mac_gso_segment(skb, features);
2429 }
2430 EXPORT_SYMBOL(__skb_gso_segment);
2431
2432 /* Take action when hardware reception checksum errors are detected. */
2433 #ifdef CONFIG_BUG
2434 void netdev_rx_csum_fault(struct net_device *dev)
2435 {
2436         if (net_ratelimit()) {
2437                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2438                 dump_stack();
2439         }
2440 }
2441 EXPORT_SYMBOL(netdev_rx_csum_fault);
2442 #endif
2443
2444 /* Actually, we should eliminate this check as soon as we know, that:
2445  * 1. IOMMU is present and allows to map all the memory.
2446  * 2. No high memory really exists on this machine.
2447  */
2448
2449 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2450 {
2451 #ifdef CONFIG_HIGHMEM
2452         int i;
2453         if (!(dev->features & NETIF_F_HIGHDMA)) {
2454                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2455                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2456                         if (PageHighMem(skb_frag_page(frag)))
2457                                 return 1;
2458                 }
2459         }
2460
2461         if (PCI_DMA_BUS_IS_PHYS) {
2462                 struct device *pdev = dev->dev.parent;
2463
2464                 if (!pdev)
2465                         return 0;
2466                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2467                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2468                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2469                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2470                                 return 1;
2471                 }
2472         }
2473 #endif
2474         return 0;
2475 }
2476
2477 struct dev_gso_cb {
2478         void (*destructor)(struct sk_buff *skb);
2479 };
2480
2481 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2482
2483 static void dev_gso_skb_destructor(struct sk_buff *skb)
2484 {
2485         struct dev_gso_cb *cb;
2486
2487         kfree_skb_list(skb->next);
2488         skb->next = NULL;
2489
2490         cb = DEV_GSO_CB(skb);
2491         if (cb->destructor)
2492                 cb->destructor(skb);
2493 }
2494
2495 /**
2496  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2497  *      @skb: buffer to segment
2498  *      @features: device features as applicable to this skb
2499  *
2500  *      This function segments the given skb and stores the list of segments
2501  *      in skb->next.
2502  */
2503 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2504 {
2505         struct sk_buff *segs;
2506
2507         segs = skb_gso_segment(skb, features);
2508
2509         /* Verifying header integrity only. */
2510         if (!segs)
2511                 return 0;
2512
2513         if (IS_ERR(segs))
2514                 return PTR_ERR(segs);
2515
2516         skb->next = segs;
2517         DEV_GSO_CB(skb)->destructor = skb->destructor;
2518         skb->destructor = dev_gso_skb_destructor;
2519
2520         return 0;
2521 }
2522
2523 /* If MPLS offload request, verify we are testing hardware MPLS features
2524  * instead of standard features for the netdev.
2525  */
2526 #ifdef CONFIG_NET_MPLS_GSO
2527 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2528                                            netdev_features_t features,
2529                                            __be16 type)
2530 {
2531         if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2532                 features &= skb->dev->mpls_features;
2533
2534         return features;
2535 }
2536 #else
2537 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538                                            netdev_features_t features,
2539                                            __be16 type)
2540 {
2541         return features;
2542 }
2543 #endif
2544
2545 static netdev_features_t harmonize_features(struct sk_buff *skb,
2546         netdev_features_t features)
2547 {
2548         int tmp;
2549         __be16 type;
2550
2551         type = skb_network_protocol(skb, &tmp);
2552         features = net_mpls_features(skb, features, type);
2553
2554         if (skb->ip_summed != CHECKSUM_NONE &&
2555             !can_checksum_protocol(features, type)) {
2556                 features &= ~NETIF_F_ALL_CSUM;
2557         } else if (illegal_highdma(skb->dev, skb)) {
2558                 features &= ~NETIF_F_SG;
2559         }
2560
2561         return features;
2562 }
2563
2564 netdev_features_t netif_skb_features(struct sk_buff *skb)
2565 {
2566         __be16 protocol = skb->protocol;
2567         netdev_features_t features = skb->dev->features;
2568
2569         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2570                 features &= ~NETIF_F_GSO_MASK;
2571
2572         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2573                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2574                 protocol = veh->h_vlan_encapsulated_proto;
2575         } else if (!vlan_tx_tag_present(skb)) {
2576                 return harmonize_features(skb, features);
2577         }
2578
2579         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2580                                                NETIF_F_HW_VLAN_STAG_TX);
2581
2582         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2583                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2584                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2585                                 NETIF_F_HW_VLAN_STAG_TX;
2586
2587         return harmonize_features(skb, features);
2588 }
2589 EXPORT_SYMBOL(netif_skb_features);
2590
2591 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2592                         struct netdev_queue *txq)
2593 {
2594         const struct net_device_ops *ops = dev->netdev_ops;
2595         int rc = NETDEV_TX_OK;
2596         unsigned int skb_len;
2597
2598         if (likely(!skb->next)) {
2599                 netdev_features_t features;
2600
2601                 /*
2602                  * If device doesn't need skb->dst, release it right now while
2603                  * its hot in this cpu cache
2604                  */
2605                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2606                         skb_dst_drop(skb);
2607
2608                 features = netif_skb_features(skb);
2609
2610                 if (vlan_tx_tag_present(skb) &&
2611                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2612                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2613                                              vlan_tx_tag_get(skb));
2614                         if (unlikely(!skb))
2615                                 goto out;
2616
2617                         skb->vlan_tci = 0;
2618                 }
2619
2620                 /* If encapsulation offload request, verify we are testing
2621                  * hardware encapsulation features instead of standard
2622                  * features for the netdev
2623                  */
2624                 if (skb->encapsulation)
2625                         features &= dev->hw_enc_features;
2626
2627                 if (netif_needs_gso(skb, features)) {
2628                         if (unlikely(dev_gso_segment(skb, features)))
2629                                 goto out_kfree_skb;
2630                         if (skb->next)
2631                                 goto gso;
2632                 } else {
2633                         if (skb_needs_linearize(skb, features) &&
2634                             __skb_linearize(skb))
2635                                 goto out_kfree_skb;
2636
2637                         /* If packet is not checksummed and device does not
2638                          * support checksumming for this protocol, complete
2639                          * checksumming here.
2640                          */
2641                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2642                                 if (skb->encapsulation)
2643                                         skb_set_inner_transport_header(skb,
2644                                                 skb_checksum_start_offset(skb));
2645                                 else
2646                                         skb_set_transport_header(skb,
2647                                                 skb_checksum_start_offset(skb));
2648                                 if (!(features & NETIF_F_ALL_CSUM) &&
2649                                      skb_checksum_help(skb))
2650                                         goto out_kfree_skb;
2651                         }
2652                 }
2653
2654                 if (!list_empty(&ptype_all))
2655                         dev_queue_xmit_nit(skb, dev);
2656
2657                 skb_len = skb->len;
2658                 trace_net_dev_start_xmit(skb, dev);
2659                 rc = ops->ndo_start_xmit(skb, dev);
2660                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2661                 if (rc == NETDEV_TX_OK)
2662                         txq_trans_update(txq);
2663                 return rc;
2664         }
2665
2666 gso:
2667         do {
2668                 struct sk_buff *nskb = skb->next;
2669
2670                 skb->next = nskb->next;
2671                 nskb->next = NULL;
2672
2673                 if (!list_empty(&ptype_all))
2674                         dev_queue_xmit_nit(nskb, dev);
2675
2676                 skb_len = nskb->len;
2677                 trace_net_dev_start_xmit(nskb, dev);
2678                 rc = ops->ndo_start_xmit(nskb, dev);
2679                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2680                 if (unlikely(rc != NETDEV_TX_OK)) {
2681                         if (rc & ~NETDEV_TX_MASK)
2682                                 goto out_kfree_gso_skb;
2683                         nskb->next = skb->next;
2684                         skb->next = nskb;
2685                         return rc;
2686                 }
2687                 txq_trans_update(txq);
2688                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2689                         return NETDEV_TX_BUSY;
2690         } while (skb->next);
2691
2692 out_kfree_gso_skb:
2693         if (likely(skb->next == NULL)) {
2694                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2695                 consume_skb(skb);
2696                 return rc;
2697         }
2698 out_kfree_skb:
2699         kfree_skb(skb);
2700 out:
2701         return rc;
2702 }
2703 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2704
2705 static void qdisc_pkt_len_init(struct sk_buff *skb)
2706 {
2707         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2708
2709         qdisc_skb_cb(skb)->pkt_len = skb->len;
2710
2711         /* To get more precise estimation of bytes sent on wire,
2712          * we add to pkt_len the headers size of all segments
2713          */
2714         if (shinfo->gso_size)  {
2715                 unsigned int hdr_len;
2716                 u16 gso_segs = shinfo->gso_segs;
2717
2718                 /* mac layer + network layer */
2719                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2720
2721                 /* + transport layer */
2722                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2723                         hdr_len += tcp_hdrlen(skb);
2724                 else
2725                         hdr_len += sizeof(struct udphdr);
2726
2727                 if (shinfo->gso_type & SKB_GSO_DODGY)
2728                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2729                                                 shinfo->gso_size);
2730
2731                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2732         }
2733 }
2734
2735 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2736                                  struct net_device *dev,
2737                                  struct netdev_queue *txq)
2738 {
2739         spinlock_t *root_lock = qdisc_lock(q);
2740         bool contended;
2741         int rc;
2742
2743         qdisc_pkt_len_init(skb);
2744         qdisc_calculate_pkt_len(skb, q);
2745         /*
2746          * Heuristic to force contended enqueues to serialize on a
2747          * separate lock before trying to get qdisc main lock.
2748          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2749          * and dequeue packets faster.
2750          */
2751         contended = qdisc_is_running(q);
2752         if (unlikely(contended))
2753                 spin_lock(&q->busylock);
2754
2755         spin_lock(root_lock);
2756         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2757                 kfree_skb(skb);
2758                 rc = NET_XMIT_DROP;
2759         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2760                    qdisc_run_begin(q)) {
2761                 /*
2762                  * This is a work-conserving queue; there are no old skbs
2763                  * waiting to be sent out; and the qdisc is not running -
2764                  * xmit the skb directly.
2765                  */
2766                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2767                         skb_dst_force(skb);
2768
2769                 qdisc_bstats_update(q, skb);
2770
2771                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2772                         if (unlikely(contended)) {
2773                                 spin_unlock(&q->busylock);
2774                                 contended = false;
2775                         }
2776                         __qdisc_run(q);
2777                 } else
2778                         qdisc_run_end(q);
2779
2780                 rc = NET_XMIT_SUCCESS;
2781         } else {
2782                 skb_dst_force(skb);
2783                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2784                 if (qdisc_run_begin(q)) {
2785                         if (unlikely(contended)) {
2786                                 spin_unlock(&q->busylock);
2787                                 contended = false;
2788                         }
2789                         __qdisc_run(q);
2790                 }
2791         }
2792         spin_unlock(root_lock);
2793         if (unlikely(contended))
2794                 spin_unlock(&q->busylock);
2795         return rc;
2796 }
2797
2798 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2799 static void skb_update_prio(struct sk_buff *skb)
2800 {
2801         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2802
2803         if (!skb->priority && skb->sk && map) {
2804                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2805
2806                 if (prioidx < map->priomap_len)
2807                         skb->priority = map->priomap[prioidx];
2808         }
2809 }
2810 #else
2811 #define skb_update_prio(skb)
2812 #endif
2813
2814 static DEFINE_PER_CPU(int, xmit_recursion);
2815 #define RECURSION_LIMIT 10
2816
2817 /**
2818  *      dev_loopback_xmit - loop back @skb
2819  *      @skb: buffer to transmit
2820  */
2821 int dev_loopback_xmit(struct sk_buff *skb)
2822 {
2823         skb_reset_mac_header(skb);
2824         __skb_pull(skb, skb_network_offset(skb));
2825         skb->pkt_type = PACKET_LOOPBACK;
2826         skb->ip_summed = CHECKSUM_UNNECESSARY;
2827         WARN_ON(!skb_dst(skb));
2828         skb_dst_force(skb);
2829         netif_rx_ni(skb);
2830         return 0;
2831 }
2832 EXPORT_SYMBOL(dev_loopback_xmit);
2833
2834 /**
2835  *      __dev_queue_xmit - transmit a buffer
2836  *      @skb: buffer to transmit
2837  *      @accel_priv: private data used for L2 forwarding offload
2838  *
2839  *      Queue a buffer for transmission to a network device. The caller must
2840  *      have set the device and priority and built the buffer before calling
2841  *      this function. The function can be called from an interrupt.
2842  *
2843  *      A negative errno code is returned on a failure. A success does not
2844  *      guarantee the frame will be transmitted as it may be dropped due
2845  *      to congestion or traffic shaping.
2846  *
2847  * -----------------------------------------------------------------------------------
2848  *      I notice this method can also return errors from the queue disciplines,
2849  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2850  *      be positive.
2851  *
2852  *      Regardless of the return value, the skb is consumed, so it is currently
2853  *      difficult to retry a send to this method.  (You can bump the ref count
2854  *      before sending to hold a reference for retry if you are careful.)
2855  *
2856  *      When calling this method, interrupts MUST be enabled.  This is because
2857  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2858  *          --BLG
2859  */
2860 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2861 {
2862         struct net_device *dev = skb->dev;
2863         struct netdev_queue *txq;
2864         struct Qdisc *q;
2865         int rc = -ENOMEM;
2866
2867         skb_reset_mac_header(skb);
2868
2869         /* Disable soft irqs for various locks below. Also
2870          * stops preemption for RCU.
2871          */
2872         rcu_read_lock_bh();
2873
2874         skb_update_prio(skb);
2875
2876         txq = netdev_pick_tx(dev, skb, accel_priv);
2877         q = rcu_dereference_bh(txq->qdisc);
2878
2879 #ifdef CONFIG_NET_CLS_ACT
2880         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2881 #endif
2882         trace_net_dev_queue(skb);
2883         if (q->enqueue) {
2884                 rc = __dev_xmit_skb(skb, q, dev, txq);
2885                 goto out;
2886         }
2887
2888         /* The device has no queue. Common case for software devices:
2889            loopback, all the sorts of tunnels...
2890
2891            Really, it is unlikely that netif_tx_lock protection is necessary
2892            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2893            counters.)
2894            However, it is possible, that they rely on protection
2895            made by us here.
2896
2897            Check this and shot the lock. It is not prone from deadlocks.
2898            Either shot noqueue qdisc, it is even simpler 8)
2899          */
2900         if (dev->flags & IFF_UP) {
2901                 int cpu = smp_processor_id(); /* ok because BHs are off */
2902
2903                 if (txq->xmit_lock_owner != cpu) {
2904
2905                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2906                                 goto recursion_alert;
2907
2908                         HARD_TX_LOCK(dev, txq, cpu);
2909
2910                         if (!netif_xmit_stopped(txq)) {
2911                                 __this_cpu_inc(xmit_recursion);
2912                                 rc = dev_hard_start_xmit(skb, dev, txq);
2913                                 __this_cpu_dec(xmit_recursion);
2914                                 if (dev_xmit_complete(rc)) {
2915                                         HARD_TX_UNLOCK(dev, txq);
2916                                         goto out;
2917                                 }
2918                         }
2919                         HARD_TX_UNLOCK(dev, txq);
2920                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2921                                              dev->name);
2922                 } else {
2923                         /* Recursion is detected! It is possible,
2924                          * unfortunately
2925                          */
2926 recursion_alert:
2927                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2928                                              dev->name);
2929                 }
2930         }
2931
2932         rc = -ENETDOWN;
2933         rcu_read_unlock_bh();
2934
2935         atomic_long_inc(&dev->tx_dropped);
2936         kfree_skb(skb);
2937         return rc;
2938 out:
2939         rcu_read_unlock_bh();
2940         return rc;
2941 }
2942
2943 int dev_queue_xmit(struct sk_buff *skb)
2944 {
2945         return __dev_queue_xmit(skb, NULL);
2946 }
2947 EXPORT_SYMBOL(dev_queue_xmit);
2948
2949 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2950 {
2951         return __dev_queue_xmit(skb, accel_priv);
2952 }
2953 EXPORT_SYMBOL(dev_queue_xmit_accel);
2954
2955
2956 /*=======================================================================
2957                         Receiver routines
2958   =======================================================================*/
2959
2960 int netdev_max_backlog __read_mostly = 1000;
2961 EXPORT_SYMBOL(netdev_max_backlog);
2962
2963 int netdev_tstamp_prequeue __read_mostly = 1;
2964 int netdev_budget __read_mostly = 300;
2965 int weight_p __read_mostly = 64;            /* old backlog weight */
2966
2967 /* Called with irq disabled */
2968 static inline void ____napi_schedule(struct softnet_data *sd,
2969                                      struct napi_struct *napi)
2970 {
2971         list_add_tail(&napi->poll_list, &sd->poll_list);
2972         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2973 }
2974
2975 #ifdef CONFIG_RPS
2976
2977 /* One global table that all flow-based protocols share. */
2978 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2979 EXPORT_SYMBOL(rps_sock_flow_table);
2980
2981 struct static_key rps_needed __read_mostly;
2982
2983 static struct rps_dev_flow *
2984 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2985             struct rps_dev_flow *rflow, u16 next_cpu)
2986 {
2987         if (next_cpu != RPS_NO_CPU) {
2988 #ifdef CONFIG_RFS_ACCEL
2989                 struct netdev_rx_queue *rxqueue;
2990                 struct rps_dev_flow_table *flow_table;
2991                 struct rps_dev_flow *old_rflow;
2992                 u32 flow_id;
2993                 u16 rxq_index;
2994                 int rc;
2995
2996                 /* Should we steer this flow to a different hardware queue? */
2997                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2998                     !(dev->features & NETIF_F_NTUPLE))
2999                         goto out;
3000                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3001                 if (rxq_index == skb_get_rx_queue(skb))
3002                         goto out;
3003
3004                 rxqueue = dev->_rx + rxq_index;
3005                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3006                 if (!flow_table)
3007                         goto out;
3008                 flow_id = skb_get_hash(skb) & flow_table->mask;
3009                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3010                                                         rxq_index, flow_id);
3011                 if (rc < 0)
3012                         goto out;
3013                 old_rflow = rflow;
3014                 rflow = &flow_table->flows[flow_id];
3015                 rflow->filter = rc;
3016                 if (old_rflow->filter == rflow->filter)
3017                         old_rflow->filter = RPS_NO_FILTER;
3018         out:
3019 #endif
3020                 rflow->last_qtail =
3021                         per_cpu(softnet_data, next_cpu).input_queue_head;
3022         }
3023
3024         rflow->cpu = next_cpu;
3025         return rflow;
3026 }
3027
3028 /*
3029  * get_rps_cpu is called from netif_receive_skb and returns the target
3030  * CPU from the RPS map of the receiving queue for a given skb.
3031  * rcu_read_lock must be held on entry.
3032  */
3033 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3034                        struct rps_dev_flow **rflowp)
3035 {
3036         struct netdev_rx_queue *rxqueue;
3037         struct rps_map *map;
3038         struct rps_dev_flow_table *flow_table;
3039         struct rps_sock_flow_table *sock_flow_table;
3040         int cpu = -1;
3041         u16 tcpu;
3042         u32 hash;
3043
3044         if (skb_rx_queue_recorded(skb)) {
3045                 u16 index = skb_get_rx_queue(skb);
3046                 if (unlikely(index >= dev->real_num_rx_queues)) {
3047                         WARN_ONCE(dev->real_num_rx_queues > 1,
3048                                   "%s received packet on queue %u, but number "
3049                                   "of RX queues is %u\n",
3050                                   dev->name, index, dev->real_num_rx_queues);
3051                         goto done;
3052                 }
3053                 rxqueue = dev->_rx + index;
3054         } else
3055                 rxqueue = dev->_rx;
3056
3057         map = rcu_dereference(rxqueue->rps_map);
3058         if (map) {
3059                 if (map->len == 1 &&
3060                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3061                         tcpu = map->cpus[0];
3062                         if (cpu_online(tcpu))
3063                                 cpu = tcpu;
3064                         goto done;
3065                 }
3066         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3067                 goto done;
3068         }
3069
3070         skb_reset_network_header(skb);
3071         hash = skb_get_hash(skb);
3072         if (!hash)
3073                 goto done;
3074
3075         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3076         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3077         if (flow_table && sock_flow_table) {
3078                 u16 next_cpu;
3079                 struct rps_dev_flow *rflow;
3080
3081                 rflow = &flow_table->flows[hash & flow_table->mask];
3082                 tcpu = rflow->cpu;
3083
3084                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3085
3086                 /*
3087                  * If the desired CPU (where last recvmsg was done) is
3088                  * different from current CPU (one in the rx-queue flow
3089                  * table entry), switch if one of the following holds:
3090                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3091                  *   - Current CPU is offline.
3092                  *   - The current CPU's queue tail has advanced beyond the
3093                  *     last packet that was enqueued using this table entry.
3094                  *     This guarantees that all previous packets for the flow
3095                  *     have been dequeued, thus preserving in order delivery.
3096                  */
3097                 if (unlikely(tcpu != next_cpu) &&
3098                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3099                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3100                       rflow->last_qtail)) >= 0)) {
3101                         tcpu = next_cpu;
3102                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3103                 }
3104
3105                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3106                         *rflowp = rflow;
3107                         cpu = tcpu;
3108                         goto done;
3109                 }
3110         }
3111
3112         if (map) {
3113                 tcpu = map->cpus[((u64) hash * map->len) >> 32];
3114
3115                 if (cpu_online(tcpu)) {
3116                         cpu = tcpu;
3117                         goto done;
3118                 }
3119         }
3120
3121 done:
3122         return cpu;
3123 }
3124
3125 #ifdef CONFIG_RFS_ACCEL
3126
3127 /**
3128  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3129  * @dev: Device on which the filter was set
3130  * @rxq_index: RX queue index
3131  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3132  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3133  *
3134  * Drivers that implement ndo_rx_flow_steer() should periodically call
3135  * this function for each installed filter and remove the filters for
3136  * which it returns %true.
3137  */
3138 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3139                          u32 flow_id, u16 filter_id)
3140 {
3141         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3142         struct rps_dev_flow_table *flow_table;
3143         struct rps_dev_flow *rflow;
3144         bool expire = true;
3145         int cpu;
3146
3147         rcu_read_lock();
3148         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3149         if (flow_table && flow_id <= flow_table->mask) {
3150                 rflow = &flow_table->flows[flow_id];
3151                 cpu = ACCESS_ONCE(rflow->cpu);
3152                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3153                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3154                            rflow->last_qtail) <
3155                      (int)(10 * flow_table->mask)))
3156                         expire = false;
3157         }
3158         rcu_read_unlock();
3159         return expire;
3160 }
3161 EXPORT_SYMBOL(rps_may_expire_flow);
3162
3163 #endif /* CONFIG_RFS_ACCEL */
3164
3165 /* Called from hardirq (IPI) context */
3166 static void rps_trigger_softirq(void *data)
3167 {
3168         struct softnet_data *sd = data;
3169
3170         ____napi_schedule(sd, &sd->backlog);
3171         sd->received_rps++;
3172 }
3173
3174 #endif /* CONFIG_RPS */
3175
3176 /*
3177  * Check if this softnet_data structure is another cpu one
3178  * If yes, queue it to our IPI list and return 1
3179  * If no, return 0
3180  */
3181 static int rps_ipi_queued(struct softnet_data *sd)
3182 {
3183 #ifdef CONFIG_RPS
3184         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3185
3186         if (sd != mysd) {
3187                 sd->rps_ipi_next = mysd->rps_ipi_list;
3188                 mysd->rps_ipi_list = sd;
3189
3190                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3191                 return 1;
3192         }
3193 #endif /* CONFIG_RPS */
3194         return 0;
3195 }
3196
3197 #ifdef CONFIG_NET_FLOW_LIMIT
3198 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3199 #endif
3200
3201 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3202 {
3203 #ifdef CONFIG_NET_FLOW_LIMIT
3204         struct sd_flow_limit *fl;
3205         struct softnet_data *sd;
3206         unsigned int old_flow, new_flow;
3207
3208         if (qlen < (netdev_max_backlog >> 1))
3209                 return false;
3210
3211         sd = &__get_cpu_var(softnet_data);
3212
3213         rcu_read_lock();
3214         fl = rcu_dereference(sd->flow_limit);
3215         if (fl) {
3216                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3217                 old_flow = fl->history[fl->history_head];
3218                 fl->history[fl->history_head] = new_flow;
3219
3220                 fl->history_head++;
3221                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3222
3223                 if (likely(fl->buckets[old_flow]))
3224                         fl->buckets[old_flow]--;
3225
3226                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3227                         fl->count++;
3228                         rcu_read_unlock();
3229                         return true;
3230                 }
3231         }
3232         rcu_read_unlock();
3233 #endif
3234         return false;
3235 }
3236
3237 /*
3238  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3239  * queue (may be a remote CPU queue).
3240  */
3241 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3242                               unsigned int *qtail)
3243 {
3244         struct softnet_data *sd;
3245         unsigned long flags;
3246         unsigned int qlen;
3247
3248         sd = &per_cpu(softnet_data, cpu);
3249
3250         local_irq_save(flags);
3251
3252         rps_lock(sd);
3253         qlen = skb_queue_len(&sd->input_pkt_queue);
3254         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3255                 if (skb_queue_len(&sd->input_pkt_queue)) {
3256 enqueue:
3257                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3258                         input_queue_tail_incr_save(sd, qtail);
3259                         rps_unlock(sd);
3260                         local_irq_restore(flags);
3261                         return NET_RX_SUCCESS;
3262                 }
3263
3264                 /* Schedule NAPI for backlog device
3265                  * We can use non atomic operation since we own the queue lock
3266                  */
3267                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3268                         if (!rps_ipi_queued(sd))
3269                                 ____napi_schedule(sd, &sd->backlog);
3270                 }
3271                 goto enqueue;
3272         }
3273
3274         sd->dropped++;
3275         rps_unlock(sd);
3276
3277         local_irq_restore(flags);
3278
3279         atomic_long_inc(&skb->dev->rx_dropped);
3280         kfree_skb(skb);
3281         return NET_RX_DROP;
3282 }
3283
3284 static int netif_rx_internal(struct sk_buff *skb)
3285 {
3286         int ret;
3287
3288         net_timestamp_check(netdev_tstamp_prequeue, skb);
3289
3290         trace_netif_rx(skb);
3291 #ifdef CONFIG_RPS
3292         if (static_key_false(&rps_needed)) {
3293                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3294                 int cpu;
3295
3296                 preempt_disable();
3297                 rcu_read_lock();
3298
3299                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3300                 if (cpu < 0)
3301                         cpu = smp_processor_id();
3302
3303                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3304
3305                 rcu_read_unlock();
3306                 preempt_enable();
3307         } else
3308 #endif
3309         {
3310                 unsigned int qtail;
3311                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3312                 put_cpu();
3313         }
3314         return ret;
3315 }
3316
3317 /**
3318  *      netif_rx        -       post buffer to the network code
3319  *      @skb: buffer to post
3320  *
3321  *      This function receives a packet from a device driver and queues it for
3322  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3323  *      may be dropped during processing for congestion control or by the
3324  *      protocol layers.
3325  *
3326  *      return values:
3327  *      NET_RX_SUCCESS  (no congestion)
3328  *      NET_RX_DROP     (packet was dropped)
3329  *
3330  */
3331
3332 int netif_rx(struct sk_buff *skb)
3333 {
3334         trace_netif_rx_entry(skb);
3335
3336         return netif_rx_internal(skb);
3337 }
3338 EXPORT_SYMBOL(netif_rx);
3339
3340 int netif_rx_ni(struct sk_buff *skb)
3341 {
3342         int err;
3343
3344         trace_netif_rx_ni_entry(skb);
3345
3346         preempt_disable();
3347         err = netif_rx_internal(skb);
3348         if (local_softirq_pending())
3349                 do_softirq();
3350         preempt_enable();
3351
3352         return err;
3353 }
3354 EXPORT_SYMBOL(netif_rx_ni);
3355
3356 static void net_tx_action(struct softirq_action *h)
3357 {
3358         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3359
3360         if (sd->completion_queue) {
3361                 struct sk_buff *clist;
3362
3363                 local_irq_disable();
3364                 clist = sd->completion_queue;
3365                 sd->completion_queue = NULL;
3366                 local_irq_enable();
3367
3368                 while (clist) {
3369                         struct sk_buff *skb = clist;
3370                         clist = clist->next;
3371
3372                         WARN_ON(atomic_read(&skb->users));
3373                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3374                                 trace_consume_skb(skb);
3375                         else
3376                                 trace_kfree_skb(skb, net_tx_action);
3377                         __kfree_skb(skb);
3378                 }
3379         }
3380
3381         if (sd->output_queue) {
3382                 struct Qdisc *head;
3383
3384                 local_irq_disable();
3385                 head = sd->output_queue;
3386                 sd->output_queue = NULL;
3387                 sd->output_queue_tailp = &sd->output_queue;
3388                 local_irq_enable();
3389
3390                 while (head) {
3391                         struct Qdisc *q = head;
3392                         spinlock_t *root_lock;
3393
3394                         head = head->next_sched;
3395
3396                         root_lock = qdisc_lock(q);
3397                         if (spin_trylock(root_lock)) {
3398                                 smp_mb__before_atomic();
3399                                 clear_bit(__QDISC_STATE_SCHED,
3400                                           &q->state);
3401                                 qdisc_run(q);
3402                                 spin_unlock(root_lock);
3403                         } else {
3404                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3405                                               &q->state)) {
3406                                         __netif_reschedule(q);
3407                                 } else {
3408                                         smp_mb__before_atomic();
3409                                         clear_bit(__QDISC_STATE_SCHED,
3410                                                   &q->state);
3411                                 }
3412                         }
3413                 }
3414         }
3415 }
3416
3417 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3418     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3419 /* This hook is defined here for ATM LANE */
3420 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3421                              unsigned char *addr) __read_mostly;
3422 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3423 #endif
3424
3425 #ifdef CONFIG_NET_CLS_ACT
3426 /* TODO: Maybe we should just force sch_ingress to be compiled in
3427  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3428  * a compare and 2 stores extra right now if we dont have it on
3429  * but have CONFIG_NET_CLS_ACT
3430  * NOTE: This doesn't stop any functionality; if you dont have
3431  * the ingress scheduler, you just can't add policies on ingress.
3432  *
3433  */
3434 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3435 {
3436         struct net_device *dev = skb->dev;
3437         u32 ttl = G_TC_RTTL(skb->tc_verd);
3438         int result = TC_ACT_OK;
3439         struct Qdisc *q;
3440
3441         if (unlikely(MAX_RED_LOOP < ttl++)) {
3442                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3443                                      skb->skb_iif, dev->ifindex);
3444                 return TC_ACT_SHOT;
3445         }
3446
3447         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3448         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3449
3450         q = rxq->qdisc;
3451         if (q != &noop_qdisc) {
3452                 spin_lock(qdisc_lock(q));
3453                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3454                         result = qdisc_enqueue_root(skb, q);
3455                 spin_unlock(qdisc_lock(q));
3456         }
3457
3458         return result;
3459 }
3460
3461 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3462                                          struct packet_type **pt_prev,
3463                                          int *ret, struct net_device *orig_dev)
3464 {
3465         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3466
3467         if (!rxq || rxq->qdisc == &noop_qdisc)
3468                 goto out;
3469
3470         if (*pt_prev) {
3471                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3472                 *pt_prev = NULL;
3473         }
3474
3475         switch (ing_filter(skb, rxq)) {
3476         case TC_ACT_SHOT:
3477         case TC_ACT_STOLEN:
3478                 kfree_skb(skb);
3479                 return NULL;
3480         }
3481
3482 out:
3483         skb->tc_verd = 0;
3484         return skb;
3485 }
3486 #endif
3487
3488 /**
3489  *      netdev_rx_handler_register - register receive handler
3490  *      @dev: device to register a handler for
3491  *      @rx_handler: receive handler to register
3492  *      @rx_handler_data: data pointer that is used by rx handler
3493  *
3494  *      Register a receive handler for a device. This handler will then be
3495  *      called from __netif_receive_skb. A negative errno code is returned
3496  *      on a failure.
3497  *
3498  *      The caller must hold the rtnl_mutex.
3499  *
3500  *      For a general description of rx_handler, see enum rx_handler_result.
3501  */
3502 int netdev_rx_handler_register(struct net_device *dev,
3503                                rx_handler_func_t *rx_handler,
3504                                void *rx_handler_data)
3505 {
3506         ASSERT_RTNL();
3507
3508         if (dev->rx_handler)
3509                 return -EBUSY;
3510
3511         /* Note: rx_handler_data must be set before rx_handler */
3512         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3513         rcu_assign_pointer(dev->rx_handler, rx_handler);
3514
3515         return 0;
3516 }
3517 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3518
3519 /**
3520  *      netdev_rx_handler_unregister - unregister receive handler
3521  *      @dev: device to unregister a handler from
3522  *
3523  *      Unregister a receive handler from a device.
3524  *
3525  *      The caller must hold the rtnl_mutex.
3526  */
3527 void netdev_rx_handler_unregister(struct net_device *dev)
3528 {
3529
3530         ASSERT_RTNL();
3531         RCU_INIT_POINTER(dev->rx_handler, NULL);
3532         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3533          * section has a guarantee to see a non NULL rx_handler_data
3534          * as well.
3535          */
3536         synchronize_net();
3537         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3538 }
3539 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3540
3541 /*
3542  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3543  * the special handling of PFMEMALLOC skbs.
3544  */
3545 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3546 {
3547         switch (skb->protocol) {
3548         case htons(ETH_P_ARP):
3549         case htons(ETH_P_IP):
3550         case htons(ETH_P_IPV6):
3551         case htons(ETH_P_8021Q):
3552         case htons(ETH_P_8021AD):
3553                 return true;
3554         default:
3555                 return false;
3556         }
3557 }
3558
3559 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3560 {
3561         struct packet_type *ptype, *pt_prev;
3562         rx_handler_func_t *rx_handler;
3563         struct net_device *orig_dev;
3564         struct net_device *null_or_dev;
3565         bool deliver_exact = false;
3566         int ret = NET_RX_DROP;
3567         __be16 type;
3568
3569         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3570
3571         trace_netif_receive_skb(skb);
3572
3573         orig_dev = skb->dev;
3574
3575         skb_reset_network_header(skb);
3576         if (!skb_transport_header_was_set(skb))
3577                 skb_reset_transport_header(skb);
3578         skb_reset_mac_len(skb);
3579
3580         pt_prev = NULL;
3581
3582         rcu_read_lock();
3583
3584 another_round:
3585         skb->skb_iif = skb->dev->ifindex;
3586
3587         __this_cpu_inc(softnet_data.processed);
3588
3589         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3590             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3591                 skb = vlan_untag(skb);
3592                 if (unlikely(!skb))
3593                         goto unlock;
3594         }
3595
3596 #ifdef CONFIG_NET_CLS_ACT
3597         if (skb->tc_verd & TC_NCLS) {
3598                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3599                 goto ncls;
3600         }
3601 #endif
3602
3603         if (pfmemalloc)
3604                 goto skip_taps;
3605
3606         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3607                 if (!ptype->dev || ptype->dev == skb->dev) {
3608                         if (pt_prev)
3609                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3610                         pt_prev = ptype;
3611                 }
3612         }
3613
3614 skip_taps:
3615 #ifdef CONFIG_NET_CLS_ACT
3616         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3617         if (!skb)
3618                 goto unlock;
3619 ncls:
3620 #endif
3621
3622         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3623                 goto drop;
3624
3625         if (vlan_tx_tag_present(skb)) {
3626                 if (pt_prev) {
3627                         ret = deliver_skb(skb, pt_prev, orig_dev);
3628                         pt_prev = NULL;
3629                 }
3630                 if (vlan_do_receive(&skb))
3631                         goto another_round;
3632                 else if (unlikely(!skb))
3633                         goto unlock;
3634         }
3635
3636         rx_handler = rcu_dereference(skb->dev->rx_handler);
3637         if (rx_handler) {
3638                 if (pt_prev) {
3639                         ret = deliver_skb(skb, pt_prev, orig_dev);
3640                         pt_prev = NULL;
3641                 }
3642                 switch (rx_handler(&skb)) {
3643                 case RX_HANDLER_CONSUMED:
3644                         ret = NET_RX_SUCCESS;
3645                         goto unlock;
3646                 case RX_HANDLER_ANOTHER:
3647                         goto another_round;
3648                 case RX_HANDLER_EXACT:
3649                         deliver_exact = true;
3650                 case RX_HANDLER_PASS:
3651                         break;
3652                 default:
3653                         BUG();
3654                 }
3655         }
3656
3657         if (unlikely(vlan_tx_tag_present(skb))) {
3658                 if (vlan_tx_tag_get_id(skb))
3659                         skb->pkt_type = PACKET_OTHERHOST;
3660                 /* Note: we might in the future use prio bits
3661                  * and set skb->priority like in vlan_do_receive()
3662                  * For the time being, just ignore Priority Code Point
3663                  */
3664                 skb->vlan_tci = 0;
3665         }
3666
3667         /* deliver only exact match when indicated */
3668         null_or_dev = deliver_exact ? skb->dev : NULL;
3669
3670         type = skb->protocol;
3671         list_for_each_entry_rcu(ptype,
3672                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3673                 if (ptype->type == type &&
3674                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3675                      ptype->dev == orig_dev)) {
3676                         if (pt_prev)
3677                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3678                         pt_prev = ptype;
3679                 }
3680         }
3681
3682         if (pt_prev) {
3683                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3684                         goto drop;
3685                 else
3686                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3687         } else {
3688 drop:
3689                 atomic_long_inc(&skb->dev->rx_dropped);
3690                 kfree_skb(skb);
3691                 /* Jamal, now you will not able to escape explaining
3692                  * me how you were going to use this. :-)
3693                  */
3694                 ret = NET_RX_DROP;
3695         }
3696
3697 unlock:
3698         rcu_read_unlock();
3699         return ret;
3700 }
3701
3702 static int __netif_receive_skb(struct sk_buff *skb)
3703 {
3704         int ret;
3705
3706         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3707                 unsigned long pflags = current->flags;
3708
3709                 /*
3710                  * PFMEMALLOC skbs are special, they should
3711                  * - be delivered to SOCK_MEMALLOC sockets only
3712                  * - stay away from userspace
3713                  * - have bounded memory usage
3714                  *
3715                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3716                  * context down to all allocation sites.
3717                  */
3718                 current->flags |= PF_MEMALLOC;
3719                 ret = __netif_receive_skb_core(skb, true);
3720                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3721         } else
3722                 ret = __netif_receive_skb_core(skb, false);
3723
3724         return ret;
3725 }
3726
3727 static int netif_receive_skb_internal(struct sk_buff *skb)
3728 {
3729         net_timestamp_check(netdev_tstamp_prequeue, skb);
3730
3731         if (skb_defer_rx_timestamp(skb))
3732                 return NET_RX_SUCCESS;
3733
3734 #ifdef CONFIG_RPS
3735         if (static_key_false(&rps_needed)) {
3736                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3737                 int cpu, ret;
3738
3739                 rcu_read_lock();
3740
3741                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3742
3743                 if (cpu >= 0) {
3744                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3745                         rcu_read_unlock();
3746                         return ret;
3747                 }
3748                 rcu_read_unlock();
3749         }
3750 #endif
3751         return __netif_receive_skb(skb);
3752 }
3753
3754 /**
3755  *      netif_receive_skb - process receive buffer from network
3756  *      @skb: buffer to process
3757  *
3758  *      netif_receive_skb() is the main receive data processing function.
3759  *      It always succeeds. The buffer may be dropped during processing
3760  *      for congestion control or by the protocol layers.
3761  *
3762  *      This function may only be called from softirq context and interrupts
3763  *      should be enabled.
3764  *
3765  *      Return values (usually ignored):
3766  *      NET_RX_SUCCESS: no congestion
3767  *      NET_RX_DROP: packet was dropped
3768  */
3769 int netif_receive_skb(struct sk_buff *skb)
3770 {
3771         trace_netif_receive_skb_entry(skb);
3772
3773         return netif_receive_skb_internal(skb);
3774 }
3775 EXPORT_SYMBOL(netif_receive_skb);
3776
3777 /* Network device is going away, flush any packets still pending
3778  * Called with irqs disabled.
3779  */
3780 static void flush_backlog(void *arg)
3781 {
3782         struct net_device *dev = arg;
3783         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3784         struct sk_buff *skb, *tmp;
3785
3786         rps_lock(sd);
3787         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3788                 if (skb->dev == dev) {
3789                         __skb_unlink(skb, &sd->input_pkt_queue);
3790                         kfree_skb(skb);
3791                         input_queue_head_incr(sd);
3792                 }
3793         }
3794         rps_unlock(sd);
3795
3796         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3797                 if (skb->dev == dev) {
3798                         __skb_unlink(skb, &sd->process_queue);
3799                         kfree_skb(skb);
3800                         input_queue_head_incr(sd);
3801                 }
3802         }
3803 }
3804
3805 static int napi_gro_complete(struct sk_buff *skb)
3806 {
3807         struct packet_offload *ptype;
3808         __be16 type = skb->protocol;
3809         struct list_head *head = &offload_base;
3810         int err = -ENOENT;
3811
3812         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3813
3814         if (NAPI_GRO_CB(skb)->count == 1) {
3815                 skb_shinfo(skb)->gso_size = 0;
3816                 goto out;
3817         }
3818
3819         rcu_read_lock();
3820         list_for_each_entry_rcu(ptype, head, list) {
3821                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3822                         continue;
3823
3824                 err = ptype->callbacks.gro_complete(skb, 0);
3825                 break;
3826         }
3827         rcu_read_unlock();
3828
3829         if (err) {
3830                 WARN_ON(&ptype->list == head);
3831                 kfree_skb(skb);
3832                 return NET_RX_SUCCESS;
3833         }
3834
3835 out:
3836         return netif_receive_skb_internal(skb);
3837 }
3838
3839 /* napi->gro_list contains packets ordered by age.
3840  * youngest packets at the head of it.
3841  * Complete skbs in reverse order to reduce latencies.
3842  */
3843 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3844 {
3845         struct sk_buff *skb, *prev = NULL;
3846
3847         /* scan list and build reverse chain */
3848         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3849                 skb->prev = prev;
3850                 prev = skb;
3851         }
3852
3853         for (skb = prev; skb; skb = prev) {
3854                 skb->next = NULL;
3855
3856                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3857                         return;
3858
3859                 prev = skb->prev;
3860                 napi_gro_complete(skb);
3861                 napi->gro_count--;
3862         }
3863
3864         napi->gro_list = NULL;
3865 }
3866 EXPORT_SYMBOL(napi_gro_flush);
3867
3868 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3869 {
3870         struct sk_buff *p;
3871         unsigned int maclen = skb->dev->hard_header_len;
3872         u32 hash = skb_get_hash_raw(skb);
3873
3874         for (p = napi->gro_list; p; p = p->next) {
3875                 unsigned long diffs;
3876
3877                 NAPI_GRO_CB(p)->flush = 0;
3878
3879                 if (hash != skb_get_hash_raw(p)) {
3880                         NAPI_GRO_CB(p)->same_flow = 0;
3881                         continue;
3882                 }
3883
3884                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3885                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3886                 if (maclen == ETH_HLEN)
3887                         diffs |= compare_ether_header(skb_mac_header(p),
3888                                                       skb_mac_header(skb));
3889                 else if (!diffs)
3890                         diffs = memcmp(skb_mac_header(p),
3891                                        skb_mac_header(skb),
3892                                        maclen);
3893                 NAPI_GRO_CB(p)->same_flow = !diffs;
3894         }
3895 }
3896
3897 static void skb_gro_reset_offset(struct sk_buff *skb)
3898 {
3899         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3900         const skb_frag_t *frag0 = &pinfo->frags[0];
3901
3902         NAPI_GRO_CB(skb)->data_offset = 0;
3903         NAPI_GRO_CB(skb)->frag0 = NULL;
3904         NAPI_GRO_CB(skb)->frag0_len = 0;
3905
3906         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3907             pinfo->nr_frags &&
3908             !PageHighMem(skb_frag_page(frag0))) {
3909                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3910                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3911         }
3912 }
3913
3914 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3915 {
3916         struct skb_shared_info *pinfo = skb_shinfo(skb);
3917
3918         BUG_ON(skb->end - skb->tail < grow);
3919
3920         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3921
3922         skb->data_len -= grow;
3923         skb->tail += grow;
3924
3925         pinfo->frags[0].page_offset += grow;
3926         skb_frag_size_sub(&pinfo->frags[0], grow);
3927
3928         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3929                 skb_frag_unref(skb, 0);
3930                 memmove(pinfo->frags, pinfo->frags + 1,
3931                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3932         }
3933 }
3934
3935 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3936 {
3937         struct sk_buff **pp = NULL;
3938         struct packet_offload *ptype;
3939         __be16 type = skb->protocol;
3940         struct list_head *head = &offload_base;
3941         int same_flow;
3942         enum gro_result ret;
3943         int grow;
3944
3945         if (!(skb->dev->features & NETIF_F_GRO))
3946                 goto normal;
3947
3948         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3949                 goto normal;
3950
3951         gro_list_prepare(napi, skb);
3952         NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3953
3954         rcu_read_lock();
3955         list_for_each_entry_rcu(ptype, head, list) {
3956                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3957                         continue;
3958
3959                 skb_set_network_header(skb, skb_gro_offset(skb));
3960                 skb_reset_mac_len(skb);
3961                 NAPI_GRO_CB(skb)->same_flow = 0;
3962                 NAPI_GRO_CB(skb)->flush = 0;
3963                 NAPI_GRO_CB(skb)->free = 0;
3964                 NAPI_GRO_CB(skb)->udp_mark = 0;
3965
3966                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3967                 break;
3968         }
3969         rcu_read_unlock();
3970
3971         if (&ptype->list == head)
3972                 goto normal;
3973
3974         same_flow = NAPI_GRO_CB(skb)->same_flow;
3975         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3976
3977         if (pp) {
3978                 struct sk_buff *nskb = *pp;
3979
3980                 *pp = nskb->next;
3981                 nskb->next = NULL;
3982                 napi_gro_complete(nskb);
3983                 napi->gro_count--;
3984         }
3985
3986         if (same_flow)
3987                 goto ok;
3988
3989         if (NAPI_GRO_CB(skb)->flush)
3990                 goto normal;
3991
3992         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
3993                 struct sk_buff *nskb = napi->gro_list;
3994
3995                 /* locate the end of the list to select the 'oldest' flow */
3996                 while (nskb->next) {
3997                         pp = &nskb->next;
3998                         nskb = *pp;
3999                 }
4000                 *pp = NULL;
4001                 nskb->next = NULL;
4002                 napi_gro_complete(nskb);
4003         } else {
4004                 napi->gro_count++;
4005         }
4006         NAPI_GRO_CB(skb)->count = 1;
4007         NAPI_GRO_CB(skb)->age = jiffies;
4008         NAPI_GRO_CB(skb)->last = skb;
4009         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4010         skb->next = napi->gro_list;
4011         napi->gro_list = skb;
4012         ret = GRO_HELD;
4013
4014 pull:
4015         grow = skb_gro_offset(skb) - skb_headlen(skb);
4016         if (grow > 0)
4017                 gro_pull_from_frag0(skb, grow);
4018 ok:
4019         return ret;
4020
4021 normal:
4022         ret = GRO_NORMAL;
4023         goto pull;
4024 }
4025
4026 struct packet_offload *gro_find_receive_by_type(__be16 type)
4027 {
4028         struct list_head *offload_head = &offload_base;
4029         struct packet_offload *ptype;
4030
4031         list_for_each_entry_rcu(ptype, offload_head, list) {
4032                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4033                         continue;
4034                 return ptype;
4035         }
4036         return NULL;
4037 }
4038 EXPORT_SYMBOL(gro_find_receive_by_type);
4039
4040 struct packet_offload *gro_find_complete_by_type(__be16 type)
4041 {
4042         struct list_head *offload_head = &offload_base;
4043         struct packet_offload *ptype;
4044
4045         list_for_each_entry_rcu(ptype, offload_head, list) {
4046                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4047                         continue;
4048                 return ptype;
4049         }
4050         return NULL;
4051 }
4052 EXPORT_SYMBOL(gro_find_complete_by_type);
4053
4054 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4055 {
4056         switch (ret) {
4057         case GRO_NORMAL:
4058                 if (netif_receive_skb_internal(skb))
4059                         ret = GRO_DROP;
4060                 break;
4061
4062         case GRO_DROP:
4063                 kfree_skb(skb);
4064                 break;
4065
4066         case GRO_MERGED_FREE:
4067                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4068                         kmem_cache_free(skbuff_head_cache, skb);
4069                 else
4070                         __kfree_skb(skb);
4071                 break;
4072
4073         case GRO_HELD:
4074         case GRO_MERGED:
4075                 break;
4076         }
4077
4078         return ret;
4079 }
4080
4081 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4082 {
4083         trace_napi_gro_receive_entry(skb);
4084
4085         skb_gro_reset_offset(skb);
4086
4087         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4088 }
4089 EXPORT_SYMBOL(napi_gro_receive);
4090
4091 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4092 {
4093         __skb_pull(skb, skb_headlen(skb));
4094         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4095         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4096         skb->vlan_tci = 0;
4097         skb->dev = napi->dev;
4098         skb->skb_iif = 0;
4099         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4100
4101         napi->skb = skb;
4102 }
4103
4104 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4105 {
4106         struct sk_buff *skb = napi->skb;
4107
4108         if (!skb) {
4109                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4110                 napi->skb = skb;
4111         }
4112         return skb;
4113 }
4114 EXPORT_SYMBOL(napi_get_frags);
4115
4116 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4117                                       struct sk_buff *skb,
4118                                       gro_result_t ret)
4119 {
4120         switch (ret) {
4121         case GRO_NORMAL:
4122         case GRO_HELD:
4123                 __skb_push(skb, ETH_HLEN);
4124                 skb->protocol = eth_type_trans(skb, skb->dev);
4125                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4126                         ret = GRO_DROP;
4127                 break;
4128
4129         case GRO_DROP:
4130         case GRO_MERGED_FREE:
4131                 napi_reuse_skb(napi, skb);
4132                 break;
4133
4134         case GRO_MERGED:
4135                 break;
4136         }
4137
4138         return ret;
4139 }
4140
4141 /* Upper GRO stack assumes network header starts at gro_offset=0
4142  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4143  * We copy ethernet header into skb->data to have a common layout.
4144  */
4145 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4146 {
4147         struct sk_buff *skb = napi->skb;
4148         const struct ethhdr *eth;
4149         unsigned int hlen = sizeof(*eth);
4150
4151         napi->skb = NULL;
4152
4153         skb_reset_mac_header(skb);
4154         skb_gro_reset_offset(skb);
4155
4156         eth = skb_gro_header_fast(skb, 0);
4157         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4158                 eth = skb_gro_header_slow(skb, hlen, 0);
4159                 if (unlikely(!eth)) {
4160                         napi_reuse_skb(napi, skb);
4161                         return NULL;
4162                 }
4163         } else {
4164                 gro_pull_from_frag0(skb, hlen);
4165                 NAPI_GRO_CB(skb)->frag0 += hlen;
4166                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4167         }
4168         __skb_pull(skb, hlen);
4169
4170         /*
4171          * This works because the only protocols we care about don't require
4172          * special handling.
4173          * We'll fix it up properly in napi_frags_finish()
4174          */
4175         skb->protocol = eth->h_proto;
4176
4177         return skb;
4178 }
4179
4180 gro_result_t napi_gro_frags(struct napi_struct *napi)
4181 {
4182         struct sk_buff *skb = napi_frags_skb(napi);
4183
4184         if (!skb)
4185                 return GRO_DROP;
4186
4187         trace_napi_gro_frags_entry(skb);
4188
4189         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4190 }
4191 EXPORT_SYMBOL(napi_gro_frags);
4192
4193 /*
4194  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4195  * Note: called with local irq disabled, but exits with local irq enabled.
4196  */
4197 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4198 {
4199 #ifdef CONFIG_RPS
4200         struct softnet_data *remsd = sd->rps_ipi_list;
4201
4202         if (remsd) {
4203                 sd->rps_ipi_list = NULL;
4204
4205                 local_irq_enable();
4206
4207                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4208                 while (remsd) {
4209                         struct softnet_data *next = remsd->rps_ipi_next;
4210
4211                         if (cpu_online(remsd->cpu))
4212                                 smp_call_function_single_async(remsd->cpu,
4213                                                            &remsd->csd);
4214                         remsd = next;
4215                 }
4216         } else
4217 #endif
4218                 local_irq_enable();
4219 }
4220
4221 static int process_backlog(struct napi_struct *napi, int quota)
4222 {
4223         int work = 0;
4224         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4225
4226 #ifdef CONFIG_RPS
4227         /* Check if we have pending ipi, its better to send them now,
4228          * not waiting net_rx_action() end.
4229          */
4230         if (sd->rps_ipi_list) {
4231                 local_irq_disable();
4232                 net_rps_action_and_irq_enable(sd);
4233         }
4234 #endif
4235         napi->weight = weight_p;
4236         local_irq_disable();
4237         while (1) {
4238                 struct sk_buff *skb;
4239
4240                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4241                         local_irq_enable();
4242                         __netif_receive_skb(skb);
4243                         local_irq_disable();
4244                         input_queue_head_incr(sd);
4245                         if (++work >= quota) {
4246                                 local_irq_enable();
4247                                 return work;
4248                         }
4249                 }
4250
4251                 rps_lock(sd);
4252                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4253                         /*
4254                          * Inline a custom version of __napi_complete().
4255                          * only current cpu owns and manipulates this napi,
4256                          * and NAPI_STATE_SCHED is the only possible flag set
4257                          * on backlog.
4258                          * We can use a plain write instead of clear_bit(),
4259                          * and we dont need an smp_mb() memory barrier.
4260                          */
4261                         list_del(&napi->poll_list);
4262                         napi->state = 0;
4263                         rps_unlock(sd);
4264
4265                         break;
4266                 }
4267
4268                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4269                                            &sd->process_queue);
4270                 rps_unlock(sd);
4271         }
4272         local_irq_enable();
4273
4274         return work;
4275 }
4276
4277 /**
4278  * __napi_schedule - schedule for receive
4279  * @n: entry to schedule
4280  *
4281  * The entry's receive function will be scheduled to run
4282  */
4283 void __napi_schedule(struct napi_struct *n)
4284 {
4285         unsigned long flags;
4286
4287         local_irq_save(flags);
4288         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4289         local_irq_restore(flags);
4290 }
4291 EXPORT_SYMBOL(__napi_schedule);
4292
4293 void __napi_complete(struct napi_struct *n)
4294 {
4295         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4296         BUG_ON(n->gro_list);
4297
4298         list_del(&n->poll_list);
4299         smp_mb__before_atomic();
4300         clear_bit(NAPI_STATE_SCHED, &n->state);
4301 }
4302 EXPORT_SYMBOL(__napi_complete);
4303
4304 void napi_complete(struct napi_struct *n)
4305 {
4306         unsigned long flags;
4307
4308         /*
4309          * don't let napi dequeue from the cpu poll list
4310          * just in case its running on a different cpu
4311          */
4312         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4313                 return;
4314
4315         napi_gro_flush(n, false);
4316         local_irq_save(flags);
4317         __napi_complete(n);
4318         local_irq_restore(flags);
4319 }
4320 EXPORT_SYMBOL(napi_complete);
4321
4322 /* must be called under rcu_read_lock(), as we dont take a reference */
4323 struct napi_struct *napi_by_id(unsigned int napi_id)
4324 {
4325         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4326         struct napi_struct *napi;
4327
4328         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4329                 if (napi->napi_id == napi_id)
4330                         return napi;
4331
4332         return NULL;
4333 }
4334 EXPORT_SYMBOL_GPL(napi_by_id);
4335
4336 void napi_hash_add(struct napi_struct *napi)
4337 {
4338         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4339
4340                 spin_lock(&napi_hash_lock);
4341
4342                 /* 0 is not a valid id, we also skip an id that is taken
4343                  * we expect both events to be extremely rare
4344                  */
4345                 napi->napi_id = 0;
4346                 while (!napi->napi_id) {
4347                         napi->napi_id = ++napi_gen_id;
4348                         if (napi_by_id(napi->napi_id))
4349                                 napi->napi_id = 0;
4350                 }
4351
4352                 hlist_add_head_rcu(&napi->napi_hash_node,
4353                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4354
4355                 spin_unlock(&napi_hash_lock);
4356         }
4357 }
4358 EXPORT_SYMBOL_GPL(napi_hash_add);
4359
4360 /* Warning : caller is responsible to make sure rcu grace period
4361  * is respected before freeing memory containing @napi
4362  */
4363 void napi_hash_del(struct napi_struct *napi)
4364 {
4365         spin_lock(&napi_hash_lock);
4366
4367         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4368                 hlist_del_rcu(&napi->napi_hash_node);
4369
4370         spin_unlock(&napi_hash_lock);
4371 }
4372 EXPORT_SYMBOL_GPL(napi_hash_del);
4373
4374 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4375                     int (*poll)(struct napi_struct *, int), int weight)
4376 {
4377         INIT_LIST_HEAD(&napi->poll_list);
4378         napi->gro_count = 0;
4379         napi->gro_list = NULL;
4380         napi->skb = NULL;
4381         napi->poll = poll;
4382         if (weight > NAPI_POLL_WEIGHT)
4383                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4384                             weight, dev->name);
4385         napi->weight = weight;
4386         list_add(&napi->dev_list, &dev->napi_list);
4387         napi->dev = dev;
4388 #ifdef CONFIG_NETPOLL
4389         spin_lock_init(&napi->poll_lock);
4390         napi->poll_owner = -1;
4391 #endif
4392         set_bit(NAPI_STATE_SCHED, &napi->state);
4393 }
4394 EXPORT_SYMBOL(netif_napi_add);
4395
4396 void netif_napi_del(struct napi_struct *napi)
4397 {
4398         list_del_init(&napi->dev_list);
4399         napi_free_frags(napi);
4400
4401         kfree_skb_list(napi->gro_list);
4402         napi->gro_list = NULL;
4403         napi->gro_count = 0;
4404 }
4405 EXPORT_SYMBOL(netif_napi_del);
4406
4407 static void net_rx_action(struct softirq_action *h)
4408 {
4409         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4410         unsigned long time_limit = jiffies + 2;
4411         int budget = netdev_budget;
4412         void *have;
4413
4414         local_irq_disable();
4415
4416         while (!list_empty(&sd->poll_list)) {
4417                 struct napi_struct *n;
4418                 int work, weight;
4419
4420                 /* If softirq window is exhuasted then punt.
4421                  * Allow this to run for 2 jiffies since which will allow
4422                  * an average latency of 1.5/HZ.
4423                  */
4424                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4425                         goto softnet_break;
4426
4427                 local_irq_enable();
4428
4429                 /* Even though interrupts have been re-enabled, this
4430                  * access is safe because interrupts can only add new
4431                  * entries to the tail of this list, and only ->poll()
4432                  * calls can remove this head entry from the list.
4433                  */
4434                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4435
4436                 have = netpoll_poll_lock(n);
4437
4438                 weight = n->weight;
4439
4440                 /* This NAPI_STATE_SCHED test is for avoiding a race
4441                  * with netpoll's poll_napi().  Only the entity which
4442                  * obtains the lock and sees NAPI_STATE_SCHED set will
4443                  * actually make the ->poll() call.  Therefore we avoid
4444                  * accidentally calling ->poll() when NAPI is not scheduled.
4445                  */
4446                 work = 0;
4447                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4448                         work = n->poll(n, weight);
4449                         trace_napi_poll(n);
4450                 }
4451
4452                 WARN_ON_ONCE(work > weight);
4453
4454                 budget -= work;
4455
4456                 local_irq_disable();
4457
4458                 /* Drivers must not modify the NAPI state if they
4459                  * consume the entire weight.  In such cases this code
4460                  * still "owns" the NAPI instance and therefore can
4461                  * move the instance around on the list at-will.
4462                  */
4463                 if (unlikely(work == weight)) {
4464                         if (unlikely(napi_disable_pending(n))) {
4465                                 local_irq_enable();
4466                                 napi_complete(n);
4467                                 local_irq_disable();
4468                         } else {
4469                                 if (n->gro_list) {
4470                                         /* flush too old packets
4471                                          * If HZ < 1000, flush all packets.
4472                                          */
4473                                         local_irq_enable();
4474                                         napi_gro_flush(n, HZ >= 1000);
4475                                         local_irq_disable();
4476                                 }
4477                                 list_move_tail(&n->poll_list, &sd->poll_list);
4478                         }
4479                 }
4480
4481                 netpoll_poll_unlock(have);
4482         }
4483 out:
4484         net_rps_action_and_irq_enable(sd);
4485
4486 #ifdef CONFIG_NET_DMA
4487         /*
4488          * There may not be any more sk_buffs coming right now, so push
4489          * any pending DMA copies to hardware
4490          */
4491         dma_issue_pending_all();
4492 #endif
4493
4494         return;
4495
4496 softnet_break:
4497         sd->time_squeeze++;
4498         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4499         goto out;
4500 }
4501
4502 struct netdev_adjacent {
4503         struct net_device *dev;
4504
4505         /* upper master flag, there can only be one master device per list */
4506         bool master;
4507
4508         /* counter for the number of times this device was added to us */
4509         u16 ref_nr;
4510
4511         /* private field for the users */
4512         void *private;
4513
4514         struct list_head list;
4515         struct rcu_head rcu;
4516 };
4517
4518 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4519                                                  struct net_device *adj_dev,
4520                                                  struct list_head *adj_list)
4521 {
4522         struct netdev_adjacent *adj;
4523
4524         list_for_each_entry(adj, adj_list, list) {
4525                 if (adj->dev == adj_dev)
4526                         return adj;
4527         }
4528         return NULL;
4529 }
4530
4531 /**
4532  * netdev_has_upper_dev - Check if device is linked to an upper device
4533  * @dev: device
4534  * @upper_dev: upper device to check
4535  *
4536  * Find out if a device is linked to specified upper device and return true
4537  * in case it is. Note that this checks only immediate upper device,
4538  * not through a complete stack of devices. The caller must hold the RTNL lock.
4539  */
4540 bool netdev_has_upper_dev(struct net_device *dev,
4541                           struct net_device *upper_dev)
4542 {
4543         ASSERT_RTNL();
4544
4545         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4546 }
4547 EXPORT_SYMBOL(netdev_has_upper_dev);
4548
4549 /**
4550  * netdev_has_any_upper_dev - Check if device is linked to some device
4551  * @dev: device
4552  *
4553  * Find out if a device is linked to an upper device and return true in case
4554  * it is. The caller must hold the RTNL lock.
4555  */
4556 static bool netdev_has_any_upper_dev(struct net_device *dev)
4557 {
4558         ASSERT_RTNL();
4559
4560         return !list_empty(&dev->all_adj_list.upper);
4561 }
4562
4563 /**
4564  * netdev_master_upper_dev_get - Get master upper device
4565  * @dev: device
4566  *
4567  * Find a master upper device and return pointer to it or NULL in case
4568  * it's not there. The caller must hold the RTNL lock.
4569  */
4570 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4571 {
4572         struct netdev_adjacent *upper;
4573
4574         ASSERT_RTNL();
4575
4576         if (list_empty(&dev->adj_list.upper))
4577                 return NULL;
4578
4579         upper = list_first_entry(&dev->adj_list.upper,
4580                                  struct netdev_adjacent, list);
4581         if (likely(upper->master))
4582                 return upper->dev;
4583         return NULL;
4584 }
4585 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4586
4587 void *netdev_adjacent_get_private(struct list_head *adj_list)
4588 {
4589         struct netdev_adjacent *adj;
4590
4591         adj = list_entry(adj_list, struct netdev_adjacent, list);
4592
4593         return adj->private;
4594 }
4595 EXPORT_SYMBOL(netdev_adjacent_get_private);
4596
4597 /**
4598  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4599  * @dev: device
4600  * @iter: list_head ** of the current position
4601  *
4602  * Gets the next device from the dev's upper list, starting from iter
4603  * position. The caller must hold RCU read lock.
4604  */
4605 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4606                                                  struct list_head **iter)
4607 {
4608         struct netdev_adjacent *upper;
4609
4610         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4611
4612         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4613
4614         if (&upper->list == &dev->adj_list.upper)
4615                 return NULL;
4616
4617         *iter = &upper->list;
4618
4619         return upper->dev;
4620 }
4621 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4622
4623 /**
4624  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4625  * @dev: device
4626  * @iter: list_head ** of the current position
4627  *
4628  * Gets the next device from the dev's upper list, starting from iter
4629  * position. The caller must hold RCU read lock.
4630  */
4631 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4632                                                      struct list_head **iter)
4633 {
4634         struct netdev_adjacent *upper;
4635
4636         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4637
4638         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4639
4640         if (&upper->list == &dev->all_adj_list.upper)
4641                 return NULL;
4642
4643         *iter = &upper->list;
4644
4645         return upper->dev;
4646 }
4647 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4648
4649 /**
4650  * netdev_lower_get_next_private - Get the next ->private from the
4651  *                                 lower neighbour list
4652  * @dev: device
4653  * @iter: list_head ** of the current position
4654  *
4655  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4656  * list, starting from iter position. The caller must hold either hold the
4657  * RTNL lock or its own locking that guarantees that the neighbour lower
4658  * list will remain unchainged.
4659  */
4660 void *netdev_lower_get_next_private(struct net_device *dev,
4661                                     struct list_head **iter)
4662 {
4663         struct netdev_adjacent *lower;
4664
4665         lower = list_entry(*iter, struct netdev_adjacent, list);
4666
4667         if (&lower->list == &dev->adj_list.lower)
4668                 return NULL;
4669
4670         *iter = lower->list.next;
4671
4672         return lower->private;
4673 }
4674 EXPORT_SYMBOL(netdev_lower_get_next_private);
4675
4676 /**
4677  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4678  *                                     lower neighbour list, RCU
4679  *                                     variant
4680  * @dev: device
4681  * @iter: list_head ** of the current position
4682  *
4683  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4684  * list, starting from iter position. The caller must hold RCU read lock.
4685  */
4686 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4687                                         struct list_head **iter)
4688 {
4689         struct netdev_adjacent *lower;
4690
4691         WARN_ON_ONCE(!rcu_read_lock_held());
4692
4693         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4694
4695         if (&lower->list == &dev->adj_list.lower)
4696                 return NULL;
4697
4698         *iter = &lower->list;
4699
4700         return lower->private;
4701 }
4702 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4703
4704 /**
4705  * netdev_lower_get_next - Get the next device from the lower neighbour
4706  *                         list
4707  * @dev: device
4708  * @iter: list_head ** of the current position
4709  *
4710  * Gets the next netdev_adjacent from the dev's lower neighbour
4711  * list, starting from iter position. The caller must hold RTNL lock or
4712  * its own locking that guarantees that the neighbour lower
4713  * list will remain unchainged.
4714  */
4715 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4716 {
4717         struct netdev_adjacent *lower;
4718
4719         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4720
4721         if (&lower->list == &dev->adj_list.lower)
4722                 return NULL;
4723
4724         *iter = &lower->list;
4725
4726         return lower->dev;
4727 }
4728 EXPORT_SYMBOL(netdev_lower_get_next);
4729
4730 /**
4731  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4732  *                                     lower neighbour list, RCU
4733  *                                     variant
4734  * @dev: device
4735  *
4736  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4737  * list. The caller must hold RCU read lock.
4738  */
4739 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4740 {
4741         struct netdev_adjacent *lower;
4742
4743         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4744                         struct netdev_adjacent, list);
4745         if (lower)
4746                 return lower->private;
4747         return NULL;
4748 }
4749 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4750
4751 /**
4752  * netdev_master_upper_dev_get_rcu - Get master upper device
4753  * @dev: device
4754  *
4755  * Find a master upper device and return pointer to it or NULL in case
4756  * it's not there. The caller must hold the RCU read lock.
4757  */
4758 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4759 {
4760         struct netdev_adjacent *upper;
4761
4762         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4763                                        struct netdev_adjacent, list);
4764         if (upper && likely(upper->master))
4765                 return upper->dev;
4766         return NULL;
4767 }
4768 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4769
4770 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4771                               struct net_device *adj_dev,
4772                               struct list_head *dev_list)
4773 {
4774         char linkname[IFNAMSIZ+7];
4775         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4776                 "upper_%s" : "lower_%s", adj_dev->name);
4777         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4778                                  linkname);
4779 }
4780 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4781                                char *name,
4782                                struct list_head *dev_list)
4783 {
4784         char linkname[IFNAMSIZ+7];
4785         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4786                 "upper_%s" : "lower_%s", name);
4787         sysfs_remove_link(&(dev->dev.kobj), linkname);
4788 }
4789
4790 #define netdev_adjacent_is_neigh_list(dev, dev_list) \
4791                 (dev_list == &dev->adj_list.upper || \
4792                  dev_list == &dev->adj_list.lower)
4793
4794 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4795                                         struct net_device *adj_dev,
4796                                         struct list_head *dev_list,
4797                                         void *private, bool master)
4798 {
4799         struct netdev_adjacent *adj;
4800         int ret;
4801
4802         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4803
4804         if (adj) {
4805                 adj->ref_nr++;
4806                 return 0;
4807         }
4808
4809         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4810         if (!adj)
4811                 return -ENOMEM;
4812
4813         adj->dev = adj_dev;
4814         adj->master = master;
4815         adj->ref_nr = 1;
4816         adj->private = private;
4817         dev_hold(adj_dev);
4818
4819         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4820                  adj_dev->name, dev->name, adj_dev->name);
4821
4822         if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4823                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4824                 if (ret)
4825                         goto free_adj;
4826         }
4827
4828         /* Ensure that master link is always the first item in list. */
4829         if (master) {
4830                 ret = sysfs_create_link(&(dev->dev.kobj),
4831                                         &(adj_dev->dev.kobj), "master");
4832                 if (ret)
4833                         goto remove_symlinks;
4834
4835                 list_add_rcu(&adj->list, dev_list);
4836         } else {
4837                 list_add_tail_rcu(&adj->list, dev_list);
4838         }
4839
4840         return 0;
4841
4842 remove_symlinks:
4843         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4844                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4845 free_adj:
4846         kfree(adj);
4847         dev_put(adj_dev);
4848
4849         return ret;
4850 }
4851
4852 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4853                                          struct net_device *adj_dev,
4854                                          struct list_head *dev_list)
4855 {
4856         struct netdev_adjacent *adj;
4857
4858         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4859
4860         if (!adj) {
4861                 pr_err("tried to remove device %s from %s\n",
4862                        dev->name, adj_dev->name);
4863                 BUG();
4864         }
4865
4866         if (adj->ref_nr > 1) {
4867                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4868                          adj->ref_nr-1);
4869                 adj->ref_nr--;
4870                 return;
4871         }
4872
4873         if (adj->master)
4874                 sysfs_remove_link(&(dev->dev.kobj), "master");
4875
4876         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4877                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4878
4879         list_del_rcu(&adj->list);
4880         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4881                  adj_dev->name, dev->name, adj_dev->name);
4882         dev_put(adj_dev);
4883         kfree_rcu(adj, rcu);
4884 }
4885
4886 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4887                                             struct net_device *upper_dev,
4888                                             struct list_head *up_list,
4889                                             struct list_head *down_list,
4890                                             void *private, bool master)
4891 {
4892         int ret;
4893
4894         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4895                                            master);
4896         if (ret)
4897                 return ret;
4898
4899         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4900                                            false);
4901         if (ret) {
4902                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4903                 return ret;
4904         }
4905
4906         return 0;
4907 }
4908
4909 static int __netdev_adjacent_dev_link(struct net_device *dev,
4910                                       struct net_device *upper_dev)
4911 {
4912         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4913                                                 &dev->all_adj_list.upper,
4914                                                 &upper_dev->all_adj_list.lower,
4915                                                 NULL, false);
4916 }
4917
4918 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4919                                                struct net_device *upper_dev,
4920                                                struct list_head *up_list,
4921                                                struct list_head *down_list)
4922 {
4923         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4924         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
4925 }
4926
4927 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4928                                          struct net_device *upper_dev)
4929 {
4930         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4931                                            &dev->all_adj_list.upper,
4932                                            &upper_dev->all_adj_list.lower);
4933 }
4934
4935 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4936                                                 struct net_device *upper_dev,
4937                                                 void *private, bool master)
4938 {
4939         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4940
4941         if (ret)
4942                 return ret;
4943
4944         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4945                                                &dev->adj_list.upper,
4946                                                &upper_dev->adj_list.lower,
4947                                                private, master);
4948         if (ret) {
4949                 __netdev_adjacent_dev_unlink(dev, upper_dev);
4950                 return ret;
4951         }
4952
4953         return 0;
4954 }
4955
4956 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4957                                                    struct net_device *upper_dev)
4958 {
4959         __netdev_adjacent_dev_unlink(dev, upper_dev);
4960         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4961                                            &dev->adj_list.upper,
4962                                            &upper_dev->adj_list.lower);
4963 }
4964
4965 static int __netdev_upper_dev_link(struct net_device *dev,
4966                                    struct net_device *upper_dev, bool master,
4967                                    void *private)
4968 {
4969         struct netdev_adjacent *i, *j, *to_i, *to_j;
4970         int ret = 0;
4971
4972         ASSERT_RTNL();
4973
4974         if (dev == upper_dev)
4975                 return -EBUSY;
4976
4977         /* To prevent loops, check if dev is not upper device to upper_dev. */
4978         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4979                 return -EBUSY;
4980
4981         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4982                 return -EEXIST;
4983
4984         if (master && netdev_master_upper_dev_get(dev))
4985                 return -EBUSY;
4986
4987         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4988                                                    master);
4989         if (ret)
4990                 return ret;
4991
4992         /* Now that we linked these devs, make all the upper_dev's
4993          * all_adj_list.upper visible to every dev's all_adj_list.lower an
4994          * versa, and don't forget the devices itself. All of these
4995          * links are non-neighbours.
4996          */
4997         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4998                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4999                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5000                                  i->dev->name, j->dev->name);
5001                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5002                         if (ret)
5003                                 goto rollback_mesh;
5004                 }
5005         }
5006
5007         /* add dev to every upper_dev's upper device */
5008         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5009                 pr_debug("linking %s's upper device %s with %s\n",
5010                          upper_dev->name, i->dev->name, dev->name);
5011                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5012                 if (ret)
5013                         goto rollback_upper_mesh;
5014         }
5015
5016         /* add upper_dev to every dev's lower device */
5017         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5018                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5019                          i->dev->name, upper_dev->name);
5020                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5021                 if (ret)
5022                         goto rollback_lower_mesh;
5023         }
5024
5025         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5026         return 0;
5027
5028 rollback_lower_mesh:
5029         to_i = i;
5030         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5031                 if (i == to_i)
5032                         break;
5033                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5034         }
5035
5036         i = NULL;
5037
5038 rollback_upper_mesh:
5039         to_i = i;
5040         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5041                 if (i == to_i)
5042                         break;
5043                 __netdev_adjacent_dev_unlink(dev, i->dev);
5044         }
5045
5046         i = j = NULL;
5047
5048 rollback_mesh:
5049         to_i = i;
5050         to_j = j;
5051         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5052                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5053                         if (i == to_i && j == to_j)
5054                                 break;
5055                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5056                 }
5057                 if (i == to_i)
5058                         break;
5059         }
5060
5061         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5062
5063         return ret;
5064 }
5065
5066 /**
5067  * netdev_upper_dev_link - Add a link to the upper device
5068  * @dev: device
5069  * @upper_dev: new upper device
5070  *
5071  * Adds a link to device which is upper to this one. The caller must hold
5072  * the RTNL lock. On a failure a negative errno code is returned.
5073  * On success the reference counts are adjusted and the function
5074  * returns zero.
5075  */
5076 int netdev_upper_dev_link(struct net_device *dev,
5077                           struct net_device *upper_dev)
5078 {
5079         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5080 }
5081 EXPORT_SYMBOL(netdev_upper_dev_link);
5082
5083 /**
5084  * netdev_master_upper_dev_link - Add a master link to the upper device
5085  * @dev: device
5086  * @upper_dev: new upper device
5087  *
5088  * Adds a link to device which is upper to this one. In this case, only
5089  * one master upper device can be linked, although other non-master devices
5090  * might be linked as well. The caller must hold the RTNL lock.
5091  * On a failure a negative errno code is returned. On success the reference
5092  * counts are adjusted and the function returns zero.
5093  */
5094 int netdev_master_upper_dev_link(struct net_device *dev,
5095                                  struct net_device *upper_dev)
5096 {
5097         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5098 }
5099 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5100
5101 int netdev_master_upper_dev_link_private(struct net_device *dev,
5102                                          struct net_device *upper_dev,
5103                                          void *private)
5104 {
5105         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5106 }
5107 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5108
5109 /**
5110  * netdev_upper_dev_unlink - Removes a link to upper device
5111  * @dev: device
5112  * @upper_dev: new upper device
5113  *
5114  * Removes a link to device which is upper to this one. The caller must hold
5115  * the RTNL lock.
5116  */
5117 void netdev_upper_dev_unlink(struct net_device *dev,
5118                              struct net_device *upper_dev)
5119 {
5120         struct netdev_adjacent *i, *j;
5121         ASSERT_RTNL();
5122
5123         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5124
5125         /* Here is the tricky part. We must remove all dev's lower
5126          * devices from all upper_dev's upper devices and vice
5127          * versa, to maintain the graph relationship.
5128          */
5129         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5130                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5131                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5132
5133         /* remove also the devices itself from lower/upper device
5134          * list
5135          */
5136         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5137                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5138
5139         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5140                 __netdev_adjacent_dev_unlink(dev, i->dev);
5141
5142         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5143 }
5144 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5145
5146 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5147 {
5148         struct netdev_adjacent *iter;
5149
5150         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5151                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5152                                           &iter->dev->adj_list.lower);
5153                 netdev_adjacent_sysfs_add(iter->dev, dev,
5154                                           &iter->dev->adj_list.lower);
5155         }
5156
5157         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5158                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5159                                           &iter->dev->adj_list.upper);
5160                 netdev_adjacent_sysfs_add(iter->dev, dev,
5161                                           &iter->dev->adj_list.upper);
5162         }
5163 }
5164
5165 void *netdev_lower_dev_get_private(struct net_device *dev,
5166                                    struct net_device *lower_dev)
5167 {
5168         struct netdev_adjacent *lower;
5169
5170         if (!lower_dev)
5171                 return NULL;
5172         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5173         if (!lower)
5174                 return NULL;
5175
5176         return lower->private;
5177 }
5178 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5179
5180
5181 int dev_get_nest_level(struct net_device *dev,
5182                        bool (*type_check)(struct net_device *dev))
5183 {
5184         struct net_device *lower = NULL;
5185         struct list_head *iter;
5186         int max_nest = -1;
5187         int nest;
5188
5189         ASSERT_RTNL();
5190
5191         netdev_for_each_lower_dev(dev, lower, iter) {
5192                 nest = dev_get_nest_level(lower, type_check);
5193                 if (max_nest < nest)
5194                         max_nest = nest;
5195         }
5196
5197         if (type_check(dev))
5198                 max_nest++;
5199
5200         return max_nest;
5201 }
5202 EXPORT_SYMBOL(dev_get_nest_level);
5203
5204 static void dev_change_rx_flags(struct net_device *dev, int flags)
5205 {
5206         const struct net_device_ops *ops = dev->netdev_ops;
5207
5208         if (ops->ndo_change_rx_flags)
5209                 ops->ndo_change_rx_flags(dev, flags);
5210 }
5211
5212 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5213 {
5214         unsigned int old_flags = dev->flags;
5215         kuid_t uid;
5216         kgid_t gid;
5217
5218         ASSERT_RTNL();
5219
5220         dev->flags |= IFF_PROMISC;
5221         dev->promiscuity += inc;
5222         if (dev->promiscuity == 0) {
5223                 /*
5224                  * Avoid overflow.
5225                  * If inc causes overflow, untouch promisc and return error.
5226                  */
5227                 if (inc < 0)
5228                         dev->flags &= ~IFF_PROMISC;
5229                 else {
5230                         dev->promiscuity -= inc;
5231                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5232                                 dev->name);
5233                         return -EOVERFLOW;
5234                 }
5235         }
5236         if (dev->flags != old_flags) {
5237                 pr_info("device %s %s promiscuous mode\n",
5238                         dev->name,
5239                         dev->flags & IFF_PROMISC ? "entered" : "left");
5240                 if (audit_enabled) {
5241                         current_uid_gid(&uid, &gid);
5242                         audit_log(current->audit_context, GFP_ATOMIC,
5243                                 AUDIT_ANOM_PROMISCUOUS,
5244                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5245                                 dev->name, (dev->flags & IFF_PROMISC),
5246                                 (old_flags & IFF_PROMISC),
5247                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5248                                 from_kuid(&init_user_ns, uid),
5249                                 from_kgid(&init_user_ns, gid),
5250                                 audit_get_sessionid(current));
5251                 }
5252
5253                 dev_change_rx_flags(dev, IFF_PROMISC);
5254         }
5255         if (notify)
5256                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5257         return 0;
5258 }
5259
5260 /**
5261  *      dev_set_promiscuity     - update promiscuity count on a device
5262  *      @dev: device
5263  *      @inc: modifier
5264  *
5265  *      Add or remove promiscuity from a device. While the count in the device
5266  *      remains above zero the interface remains promiscuous. Once it hits zero
5267  *      the device reverts back to normal filtering operation. A negative inc
5268  *      value is used to drop promiscuity on the device.
5269  *      Return 0 if successful or a negative errno code on error.
5270  */
5271 int dev_set_promiscuity(struct net_device *dev, int inc)
5272 {
5273         unsigned int old_flags = dev->flags;
5274         int err;
5275
5276         err = __dev_set_promiscuity(dev, inc, true);
5277         if (err < 0)
5278                 return err;
5279         if (dev->flags != old_flags)
5280                 dev_set_rx_mode(dev);
5281         return err;
5282 }
5283 EXPORT_SYMBOL(dev_set_promiscuity);
5284
5285 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5286 {
5287         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5288
5289         ASSERT_RTNL();
5290
5291         dev->flags |= IFF_ALLMULTI;
5292         dev->allmulti += inc;
5293         if (dev->allmulti == 0) {
5294                 /*
5295                  * Avoid overflow.
5296                  * If inc causes overflow, untouch allmulti and return error.
5297                  */
5298                 if (inc < 0)
5299                         dev->flags &= ~IFF_ALLMULTI;
5300                 else {
5301                         dev->allmulti -= inc;
5302                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5303                                 dev->name);
5304                         return -EOVERFLOW;
5305                 }
5306         }
5307         if (dev->flags ^ old_flags) {
5308                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5309                 dev_set_rx_mode(dev);
5310                 if (notify)
5311                         __dev_notify_flags(dev, old_flags,
5312                                            dev->gflags ^ old_gflags);
5313         }
5314         return 0;
5315 }
5316
5317 /**
5318  *      dev_set_allmulti        - update allmulti count on a device
5319  *      @dev: device
5320  *      @inc: modifier
5321  *
5322  *      Add or remove reception of all multicast frames to a device. While the
5323  *      count in the device remains above zero the interface remains listening
5324  *      to all interfaces. Once it hits zero the device reverts back to normal
5325  *      filtering operation. A negative @inc value is used to drop the counter
5326  *      when releasing a resource needing all multicasts.
5327  *      Return 0 if successful or a negative errno code on error.
5328  */
5329
5330 int dev_set_allmulti(struct net_device *dev, int inc)
5331 {
5332         return __dev_set_allmulti(dev, inc, true);
5333 }
5334 EXPORT_SYMBOL(dev_set_allmulti);
5335
5336 /*
5337  *      Upload unicast and multicast address lists to device and
5338  *      configure RX filtering. When the device doesn't support unicast
5339  *      filtering it is put in promiscuous mode while unicast addresses
5340  *      are present.
5341  */
5342 void __dev_set_rx_mode(struct net_device *dev)
5343 {
5344         const struct net_device_ops *ops = dev->netdev_ops;
5345
5346         /* dev_open will call this function so the list will stay sane. */
5347         if (!(dev->flags&IFF_UP))
5348                 return;
5349
5350         if (!netif_device_present(dev))
5351                 return;
5352
5353         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5354                 /* Unicast addresses changes may only happen under the rtnl,
5355                  * therefore calling __dev_set_promiscuity here is safe.
5356                  */
5357                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5358                         __dev_set_promiscuity(dev, 1, false);
5359                         dev->uc_promisc = true;
5360                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5361                         __dev_set_promiscuity(dev, -1, false);
5362                         dev->uc_promisc = false;
5363                 }
5364         }
5365
5366         if (ops->ndo_set_rx_mode)
5367                 ops->ndo_set_rx_mode(dev);
5368 }
5369
5370 void dev_set_rx_mode(struct net_device *dev)
5371 {
5372         netif_addr_lock_bh(dev);
5373         __dev_set_rx_mode(dev);
5374         netif_addr_unlock_bh(dev);
5375 }
5376
5377 /**
5378  *      dev_get_flags - get flags reported to userspace
5379  *      @dev: device
5380  *
5381  *      Get the combination of flag bits exported through APIs to userspace.
5382  */
5383 unsigned int dev_get_flags(const struct net_device *dev)
5384 {
5385         unsigned int flags;
5386
5387         flags = (dev->flags & ~(IFF_PROMISC |
5388                                 IFF_ALLMULTI |
5389                                 IFF_RUNNING |
5390                                 IFF_LOWER_UP |
5391                                 IFF_DORMANT)) |
5392                 (dev->gflags & (IFF_PROMISC |
5393                                 IFF_ALLMULTI));
5394
5395         if (netif_running(dev)) {
5396                 if (netif_oper_up(dev))
5397                         flags |= IFF_RUNNING;
5398                 if (netif_carrier_ok(dev))
5399                         flags |= IFF_LOWER_UP;
5400                 if (netif_dormant(dev))
5401                         flags |= IFF_DORMANT;
5402         }
5403
5404         return flags;
5405 }
5406 EXPORT_SYMBOL(dev_get_flags);
5407
5408 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5409 {
5410         unsigned int old_flags = dev->flags;
5411         int ret;
5412
5413         ASSERT_RTNL();
5414
5415         /*
5416          *      Set the flags on our device.
5417          */
5418
5419         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5420                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5421                                IFF_AUTOMEDIA)) |
5422                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5423                                     IFF_ALLMULTI));
5424
5425         /*
5426          *      Load in the correct multicast list now the flags have changed.
5427          */
5428
5429         if ((old_flags ^ flags) & IFF_MULTICAST)
5430                 dev_change_rx_flags(dev, IFF_MULTICAST);
5431
5432         dev_set_rx_mode(dev);
5433
5434         /*
5435          *      Have we downed the interface. We handle IFF_UP ourselves
5436          *      according to user attempts to set it, rather than blindly
5437          *      setting it.
5438          */
5439
5440         ret = 0;
5441         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5442                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5443
5444                 if (!ret)
5445                         dev_set_rx_mode(dev);
5446         }
5447
5448         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5449                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5450                 unsigned int old_flags = dev->flags;
5451
5452                 dev->gflags ^= IFF_PROMISC;
5453
5454                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5455                         if (dev->flags != old_flags)
5456                                 dev_set_rx_mode(dev);
5457         }
5458
5459         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5460            is important. Some (broken) drivers set IFF_PROMISC, when
5461            IFF_ALLMULTI is requested not asking us and not reporting.
5462          */
5463         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5464                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5465
5466                 dev->gflags ^= IFF_ALLMULTI;
5467                 __dev_set_allmulti(dev, inc, false);
5468         }
5469
5470         return ret;
5471 }
5472
5473 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5474                         unsigned int gchanges)
5475 {
5476         unsigned int changes = dev->flags ^ old_flags;
5477
5478         if (gchanges)
5479                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5480
5481         if (changes & IFF_UP) {
5482                 if (dev->flags & IFF_UP)
5483                         call_netdevice_notifiers(NETDEV_UP, dev);
5484                 else
5485                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5486         }
5487
5488         if (dev->flags & IFF_UP &&
5489             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5490                 struct netdev_notifier_change_info change_info;
5491
5492                 change_info.flags_changed = changes;
5493                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5494                                               &change_info.info);
5495         }
5496 }
5497
5498 /**
5499  *      dev_change_flags - change device settings
5500  *      @dev: device
5501  *      @flags: device state flags
5502  *
5503  *      Change settings on device based state flags. The flags are
5504  *      in the userspace exported format.
5505  */
5506 int dev_change_flags(struct net_device *dev, unsigned int flags)
5507 {
5508         int ret;
5509         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5510
5511         ret = __dev_change_flags(dev, flags);
5512         if (ret < 0)
5513                 return ret;
5514
5515         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5516         __dev_notify_flags(dev, old_flags, changes);
5517         return ret;
5518 }
5519 EXPORT_SYMBOL(dev_change_flags);
5520
5521 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5522 {
5523         const struct net_device_ops *ops = dev->netdev_ops;
5524
5525         if (ops->ndo_change_mtu)
5526                 return ops->ndo_change_mtu(dev, new_mtu);
5527
5528         dev->mtu = new_mtu;
5529         return 0;
5530 }
5531
5532 /**
5533  *      dev_set_mtu - Change maximum transfer unit
5534  *      @dev: device
5535  *      @new_mtu: new transfer unit
5536  *
5537  *      Change the maximum transfer size of the network device.
5538  */
5539 int dev_set_mtu(struct net_device *dev, int new_mtu)
5540 {
5541         int err, orig_mtu;
5542
5543         if (new_mtu == dev->mtu)
5544                 return 0;
5545
5546         /*      MTU must be positive.    */
5547         if (new_mtu < 0)
5548                 return -EINVAL;
5549
5550         if (!netif_device_present(dev))
5551                 return -ENODEV;
5552
5553         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5554         err = notifier_to_errno(err);
5555         if (err)
5556                 return err;
5557
5558         orig_mtu = dev->mtu;
5559         err = __dev_set_mtu(dev, new_mtu);
5560
5561         if (!err) {
5562                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5563                 err = notifier_to_errno(err);
5564                 if (err) {
5565                         /* setting mtu back and notifying everyone again,
5566                          * so that they have a chance to revert changes.
5567                          */
5568                         __dev_set_mtu(dev, orig_mtu);
5569                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5570                 }
5571         }
5572         return err;
5573 }
5574 EXPORT_SYMBOL(dev_set_mtu);
5575
5576 /**
5577  *      dev_set_group - Change group this device belongs to
5578  *      @dev: device
5579  *      @new_group: group this device should belong to
5580  */
5581 void dev_set_group(struct net_device *dev, int new_group)
5582 {
5583         dev->group = new_group;
5584 }
5585 EXPORT_SYMBOL(dev_set_group);
5586
5587 /**
5588  *      dev_set_mac_address - Change Media Access Control Address
5589  *      @dev: device
5590  *      @sa: new address
5591  *
5592  *      Change the hardware (MAC) address of the device
5593  */
5594 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5595 {
5596         const struct net_device_ops *ops = dev->netdev_ops;
5597         int err;
5598
5599         if (!ops->ndo_set_mac_address)
5600                 return -EOPNOTSUPP;
5601         if (sa->sa_family != dev->type)
5602                 return -EINVAL;
5603         if (!netif_device_present(dev))
5604                 return -ENODEV;
5605         err = ops->ndo_set_mac_address(dev, sa);
5606         if (err)
5607                 return err;
5608         dev->addr_assign_type = NET_ADDR_SET;
5609         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5610         add_device_randomness(dev->dev_addr, dev->addr_len);
5611         return 0;
5612 }
5613 EXPORT_SYMBOL(dev_set_mac_address);
5614
5615 /**
5616  *      dev_change_carrier - Change device carrier
5617  *      @dev: device
5618  *      @new_carrier: new value
5619  *
5620  *      Change device carrier
5621  */
5622 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5623 {
5624         const struct net_device_ops *ops = dev->netdev_ops;
5625
5626         if (!ops->ndo_change_carrier)
5627                 return -EOPNOTSUPP;
5628         if (!netif_device_present(dev))
5629                 return -ENODEV;
5630         return ops->ndo_change_carrier(dev, new_carrier);
5631 }
5632 EXPORT_SYMBOL(dev_change_carrier);
5633
5634 /**
5635  *      dev_get_phys_port_id - Get device physical port ID
5636  *      @dev: device
5637  *      @ppid: port ID
5638  *
5639  *      Get device physical port ID
5640  */
5641 int dev_get_phys_port_id(struct net_device *dev,
5642                          struct netdev_phys_port_id *ppid)
5643 {
5644         const struct net_device_ops *ops = dev->netdev_ops;
5645
5646         if (!ops->ndo_get_phys_port_id)
5647                 return -EOPNOTSUPP;
5648         return ops->ndo_get_phys_port_id(dev, ppid);
5649 }
5650 EXPORT_SYMBOL(dev_get_phys_port_id);
5651
5652 /**
5653  *      dev_new_index   -       allocate an ifindex
5654  *      @net: the applicable net namespace
5655  *
5656  *      Returns a suitable unique value for a new device interface
5657  *      number.  The caller must hold the rtnl semaphore or the
5658  *      dev_base_lock to be sure it remains unique.
5659  */
5660 static int dev_new_index(struct net *net)
5661 {
5662         int ifindex = net->ifindex;
5663         for (;;) {
5664                 if (++ifindex <= 0)
5665                         ifindex = 1;
5666                 if (!__dev_get_by_index(net, ifindex))
5667                         return net->ifindex = ifindex;
5668         }
5669 }
5670
5671 /* Delayed registration/unregisteration */
5672 static LIST_HEAD(net_todo_list);
5673 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5674
5675 static void net_set_todo(struct net_device *dev)
5676 {
5677         list_add_tail(&dev->todo_list, &net_todo_list);
5678         dev_net(dev)->dev_unreg_count++;
5679 }
5680
5681 static void rollback_registered_many(struct list_head *head)
5682 {
5683         struct net_device *dev, *tmp;
5684         LIST_HEAD(close_head);
5685
5686         BUG_ON(dev_boot_phase);
5687         ASSERT_RTNL();
5688
5689         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5690                 /* Some devices call without registering
5691                  * for initialization unwind. Remove those
5692                  * devices and proceed with the remaining.
5693                  */
5694                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5695                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5696                                  dev->name, dev);
5697
5698                         WARN_ON(1);
5699                         list_del(&dev->unreg_list);
5700                         continue;
5701                 }
5702                 dev->dismantle = true;
5703                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5704         }
5705
5706         /* If device is running, close it first. */
5707         list_for_each_entry(dev, head, unreg_list)
5708                 list_add_tail(&dev->close_list, &close_head);
5709         dev_close_many(&close_head);
5710
5711         list_for_each_entry(dev, head, unreg_list) {
5712                 /* And unlink it from device chain. */
5713                 unlist_netdevice(dev);
5714
5715                 dev->reg_state = NETREG_UNREGISTERING;
5716         }
5717
5718         synchronize_net();
5719
5720         list_for_each_entry(dev, head, unreg_list) {
5721                 /* Shutdown queueing discipline. */
5722                 dev_shutdown(dev);
5723
5724
5725                 /* Notify protocols, that we are about to destroy
5726                    this device. They should clean all the things.
5727                 */
5728                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5729
5730                 /*
5731                  *      Flush the unicast and multicast chains
5732                  */
5733                 dev_uc_flush(dev);
5734                 dev_mc_flush(dev);
5735
5736                 if (dev->netdev_ops->ndo_uninit)
5737                         dev->netdev_ops->ndo_uninit(dev);
5738
5739                 if (!dev->rtnl_link_ops ||
5740                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5741                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5742
5743                 /* Notifier chain MUST detach us all upper devices. */
5744                 WARN_ON(netdev_has_any_upper_dev(dev));
5745
5746                 /* Remove entries from kobject tree */
5747                 netdev_unregister_kobject(dev);
5748 #ifdef CONFIG_XPS
5749                 /* Remove XPS queueing entries */
5750                 netif_reset_xps_queues_gt(dev, 0);
5751 #endif
5752         }
5753
5754         synchronize_net();
5755
5756         list_for_each_entry(dev, head, unreg_list)
5757                 dev_put(dev);
5758 }
5759
5760 static void rollback_registered(struct net_device *dev)
5761 {
5762         LIST_HEAD(single);
5763
5764         list_add(&dev->unreg_list, &single);
5765         rollback_registered_many(&single);
5766         list_del(&single);
5767 }
5768
5769 static netdev_features_t netdev_fix_features(struct net_device *dev,
5770         netdev_features_t features)
5771 {
5772         /* Fix illegal checksum combinations */
5773         if ((features & NETIF_F_HW_CSUM) &&
5774             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5775                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5776                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5777         }
5778
5779         /* TSO requires that SG is present as well. */
5780         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5781                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5782                 features &= ~NETIF_F_ALL_TSO;
5783         }
5784
5785         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5786                                         !(features & NETIF_F_IP_CSUM)) {
5787                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5788                 features &= ~NETIF_F_TSO;
5789                 features &= ~NETIF_F_TSO_ECN;
5790         }
5791
5792         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5793                                          !(features & NETIF_F_IPV6_CSUM)) {
5794                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5795                 features &= ~NETIF_F_TSO6;
5796         }
5797
5798         /* TSO ECN requires that TSO is present as well. */
5799         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5800                 features &= ~NETIF_F_TSO_ECN;
5801
5802         /* Software GSO depends on SG. */
5803         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5804                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5805                 features &= ~NETIF_F_GSO;
5806         }
5807
5808         /* UFO needs SG and checksumming */
5809         if (features & NETIF_F_UFO) {
5810                 /* maybe split UFO into V4 and V6? */
5811                 if (!((features & NETIF_F_GEN_CSUM) ||
5812                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5813                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5814                         netdev_dbg(dev,
5815                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5816                         features &= ~NETIF_F_UFO;
5817                 }
5818
5819                 if (!(features & NETIF_F_SG)) {
5820                         netdev_dbg(dev,
5821                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5822                         features &= ~NETIF_F_UFO;
5823                 }
5824         }
5825
5826 #ifdef CONFIG_NET_RX_BUSY_POLL
5827         if (dev->netdev_ops->ndo_busy_poll)
5828                 features |= NETIF_F_BUSY_POLL;
5829         else
5830 #endif
5831                 features &= ~NETIF_F_BUSY_POLL;
5832
5833         return features;
5834 }
5835
5836 int __netdev_update_features(struct net_device *dev)
5837 {
5838         netdev_features_t features;
5839         int err = 0;
5840
5841         ASSERT_RTNL();
5842
5843         features = netdev_get_wanted_features(dev);
5844
5845         if (dev->netdev_ops->ndo_fix_features)
5846                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5847
5848         /* driver might be less strict about feature dependencies */
5849         features = netdev_fix_features(dev, features);
5850
5851         if (dev->features == features)
5852                 return 0;
5853
5854         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5855                 &dev->features, &features);
5856
5857         if (dev->netdev_ops->ndo_set_features)
5858                 err = dev->netdev_ops->ndo_set_features(dev, features);
5859
5860         if (unlikely(err < 0)) {
5861                 netdev_err(dev,
5862                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5863                         err, &features, &dev->features);
5864                 return -1;
5865         }
5866
5867         if (!err)
5868                 dev->features = features;
5869
5870         return 1;
5871 }
5872
5873 /**
5874  *      netdev_update_features - recalculate device features
5875  *      @dev: the device to check
5876  *
5877  *      Recalculate dev->features set and send notifications if it
5878  *      has changed. Should be called after driver or hardware dependent
5879  *      conditions might have changed that influence the features.
5880  */
5881 void netdev_update_features(struct net_device *dev)
5882 {
5883         if (__netdev_update_features(dev))
5884                 netdev_features_change(dev);
5885 }
5886 EXPORT_SYMBOL(netdev_update_features);
5887
5888 /**
5889  *      netdev_change_features - recalculate device features
5890  *      @dev: the device to check
5891  *
5892  *      Recalculate dev->features set and send notifications even
5893  *      if they have not changed. Should be called instead of
5894  *      netdev_update_features() if also dev->vlan_features might
5895  *      have changed to allow the changes to be propagated to stacked
5896  *      VLAN devices.
5897  */
5898 void netdev_change_features(struct net_device *dev)
5899 {
5900         __netdev_update_features(dev);
5901         netdev_features_change(dev);
5902 }
5903 EXPORT_SYMBOL(netdev_change_features);
5904
5905 /**
5906  *      netif_stacked_transfer_operstate -      transfer operstate
5907  *      @rootdev: the root or lower level device to transfer state from
5908  *      @dev: the device to transfer operstate to
5909  *
5910  *      Transfer operational state from root to device. This is normally
5911  *      called when a stacking relationship exists between the root
5912  *      device and the device(a leaf device).
5913  */
5914 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5915                                         struct net_device *dev)
5916 {
5917         if (rootdev->operstate == IF_OPER_DORMANT)
5918                 netif_dormant_on(dev);
5919         else
5920                 netif_dormant_off(dev);
5921
5922         if (netif_carrier_ok(rootdev)) {
5923                 if (!netif_carrier_ok(dev))
5924                         netif_carrier_on(dev);
5925         } else {
5926                 if (netif_carrier_ok(dev))
5927                         netif_carrier_off(dev);
5928         }
5929 }
5930 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5931
5932 #ifdef CONFIG_SYSFS
5933 static int netif_alloc_rx_queues(struct net_device *dev)
5934 {
5935         unsigned int i, count = dev->num_rx_queues;
5936         struct netdev_rx_queue *rx;
5937
5938         BUG_ON(count < 1);
5939
5940         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5941         if (!rx)
5942                 return -ENOMEM;
5943
5944         dev->_rx = rx;
5945
5946         for (i = 0; i < count; i++)
5947                 rx[i].dev = dev;
5948         return 0;
5949 }
5950 #endif
5951
5952 static void netdev_init_one_queue(struct net_device *dev,
5953                                   struct netdev_queue *queue, void *_unused)
5954 {
5955         /* Initialize queue lock */
5956         spin_lock_init(&queue->_xmit_lock);
5957         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5958         queue->xmit_lock_owner = -1;
5959         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5960         queue->dev = dev;
5961 #ifdef CONFIG_BQL
5962         dql_init(&queue->dql, HZ);
5963 #endif
5964 }
5965
5966 static void netif_free_tx_queues(struct net_device *dev)
5967 {
5968         kvfree(dev->_tx);
5969 }
5970
5971 static int netif_alloc_netdev_queues(struct net_device *dev)
5972 {
5973         unsigned int count = dev->num_tx_queues;
5974         struct netdev_queue *tx;
5975         size_t sz = count * sizeof(*tx);
5976
5977         BUG_ON(count < 1 || count > 0xffff);
5978
5979         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5980         if (!tx) {
5981                 tx = vzalloc(sz);
5982                 if (!tx)
5983                         return -ENOMEM;
5984         }
5985         dev->_tx = tx;
5986
5987         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5988         spin_lock_init(&dev->tx_global_lock);
5989
5990         return 0;
5991 }
5992
5993 /**
5994  *      register_netdevice      - register a network device
5995  *      @dev: device to register
5996  *
5997  *      Take a completed network device structure and add it to the kernel
5998  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5999  *      chain. 0 is returned on success. A negative errno code is returned
6000  *      on a failure to set up the device, or if the name is a duplicate.
6001  *
6002  *      Callers must hold the rtnl semaphore. You may want
6003  *      register_netdev() instead of this.
6004  *
6005  *      BUGS:
6006  *      The locking appears insufficient to guarantee two parallel registers
6007  *      will not get the same name.
6008  */
6009
6010 int register_netdevice(struct net_device *dev)
6011 {
6012         int ret;
6013         struct net *net = dev_net(dev);
6014
6015         BUG_ON(dev_boot_phase);
6016         ASSERT_RTNL();
6017
6018         might_sleep();
6019
6020         /* When net_device's are persistent, this will be fatal. */
6021         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6022         BUG_ON(!net);
6023
6024         spin_lock_init(&dev->addr_list_lock);
6025         netdev_set_addr_lockdep_class(dev);
6026
6027         dev->iflink = -1;
6028
6029         ret = dev_get_valid_name(net, dev, dev->name);
6030         if (ret < 0)
6031                 goto out;
6032
6033         /* Init, if this function is available */
6034         if (dev->netdev_ops->ndo_init) {
6035                 ret = dev->netdev_ops->ndo_init(dev);
6036                 if (ret) {
6037                         if (ret > 0)
6038                                 ret = -EIO;
6039                         goto out;
6040                 }
6041         }
6042
6043         if (((dev->hw_features | dev->features) &
6044              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6045             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6046              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6047                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6048                 ret = -EINVAL;
6049                 goto err_uninit;
6050         }
6051
6052         ret = -EBUSY;
6053         if (!dev->ifindex)
6054                 dev->ifindex = dev_new_index(net);
6055         else if (__dev_get_by_index(net, dev->ifindex))
6056                 goto err_uninit;
6057
6058         if (dev->iflink == -1)
6059                 dev->iflink = dev->ifindex;
6060
6061         /* Transfer changeable features to wanted_features and enable
6062          * software offloads (GSO and GRO).
6063          */
6064         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6065         dev->features |= NETIF_F_SOFT_FEATURES;
6066         dev->wanted_features = dev->features & dev->hw_features;
6067
6068         if (!(dev->flags & IFF_LOOPBACK)) {
6069                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6070         }
6071
6072         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6073          */
6074         dev->vlan_features |= NETIF_F_HIGHDMA;
6075
6076         /* Make NETIF_F_SG inheritable to tunnel devices.
6077          */
6078         dev->hw_enc_features |= NETIF_F_SG;
6079
6080         /* Make NETIF_F_SG inheritable to MPLS.
6081          */
6082         dev->mpls_features |= NETIF_F_SG;
6083
6084         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6085         ret = notifier_to_errno(ret);
6086         if (ret)
6087                 goto err_uninit;
6088
6089         ret = netdev_register_kobject(dev);
6090         if (ret)
6091                 goto err_uninit;
6092         dev->reg_state = NETREG_REGISTERED;
6093
6094         __netdev_update_features(dev);
6095
6096         /*
6097          *      Default initial state at registry is that the
6098          *      device is present.
6099          */
6100
6101         set_bit(__LINK_STATE_PRESENT, &dev->state);
6102
6103         linkwatch_init_dev(dev);
6104
6105         dev_init_scheduler(dev);
6106         dev_hold(dev);
6107         list_netdevice(dev);
6108         add_device_randomness(dev->dev_addr, dev->addr_len);
6109
6110         /* If the device has permanent device address, driver should
6111          * set dev_addr and also addr_assign_type should be set to
6112          * NET_ADDR_PERM (default value).
6113          */
6114         if (dev->addr_assign_type == NET_ADDR_PERM)
6115                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6116
6117         /* Notify protocols, that a new device appeared. */
6118         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6119         ret = notifier_to_errno(ret);
6120         if (ret) {
6121                 rollback_registered(dev);
6122                 dev->reg_state = NETREG_UNREGISTERED;
6123         }
6124         /*
6125          *      Prevent userspace races by waiting until the network
6126          *      device is fully setup before sending notifications.
6127          */
6128         if (!dev->rtnl_link_ops ||
6129             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6130                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6131
6132 out:
6133         return ret;
6134
6135 err_uninit:
6136         if (dev->netdev_ops->ndo_uninit)
6137                 dev->netdev_ops->ndo_uninit(dev);
6138         goto out;
6139 }
6140 EXPORT_SYMBOL(register_netdevice);
6141
6142 /**
6143  *      init_dummy_netdev       - init a dummy network device for NAPI
6144  *      @dev: device to init
6145  *
6146  *      This takes a network device structure and initialize the minimum
6147  *      amount of fields so it can be used to schedule NAPI polls without
6148  *      registering a full blown interface. This is to be used by drivers
6149  *      that need to tie several hardware interfaces to a single NAPI
6150  *      poll scheduler due to HW limitations.
6151  */
6152 int init_dummy_netdev(struct net_device *dev)
6153 {
6154         /* Clear everything. Note we don't initialize spinlocks
6155          * are they aren't supposed to be taken by any of the
6156          * NAPI code and this dummy netdev is supposed to be
6157          * only ever used for NAPI polls
6158          */
6159         memset(dev, 0, sizeof(struct net_device));
6160
6161         /* make sure we BUG if trying to hit standard
6162          * register/unregister code path
6163          */
6164         dev->reg_state = NETREG_DUMMY;
6165
6166         /* NAPI wants this */
6167         INIT_LIST_HEAD(&dev->napi_list);
6168
6169         /* a dummy interface is started by default */
6170         set_bit(__LINK_STATE_PRESENT, &dev->state);
6171         set_bit(__LINK_STATE_START, &dev->state);
6172
6173         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6174          * because users of this 'device' dont need to change
6175          * its refcount.
6176          */
6177
6178         return 0;
6179 }
6180 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6181
6182
6183 /**
6184  *      register_netdev - register a network device
6185  *      @dev: device to register
6186  *
6187  *      Take a completed network device structure and add it to the kernel
6188  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6189  *      chain. 0 is returned on success. A negative errno code is returned
6190  *      on a failure to set up the device, or if the name is a duplicate.
6191  *
6192  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6193  *      and expands the device name if you passed a format string to
6194  *      alloc_netdev.
6195  */
6196 int register_netdev(struct net_device *dev)
6197 {
6198         int err;
6199
6200         rtnl_lock();
6201         err = register_netdevice(dev);
6202         rtnl_unlock();
6203         return err;
6204 }
6205 EXPORT_SYMBOL(register_netdev);
6206
6207 int netdev_refcnt_read(const struct net_device *dev)
6208 {
6209         int i, refcnt = 0;
6210
6211         for_each_possible_cpu(i)
6212                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6213         return refcnt;
6214 }
6215 EXPORT_SYMBOL(netdev_refcnt_read);
6216
6217 /**
6218  * netdev_wait_allrefs - wait until all references are gone.
6219  * @dev: target net_device
6220  *
6221  * This is called when unregistering network devices.
6222  *
6223  * Any protocol or device that holds a reference should register
6224  * for netdevice notification, and cleanup and put back the
6225  * reference if they receive an UNREGISTER event.
6226  * We can get stuck here if buggy protocols don't correctly
6227  * call dev_put.
6228  */
6229 static void netdev_wait_allrefs(struct net_device *dev)
6230 {
6231         unsigned long rebroadcast_time, warning_time;
6232         int refcnt;
6233
6234         linkwatch_forget_dev(dev);
6235
6236         rebroadcast_time = warning_time = jiffies;
6237         refcnt = netdev_refcnt_read(dev);
6238
6239         while (refcnt != 0) {
6240                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6241                         rtnl_lock();
6242
6243                         /* Rebroadcast unregister notification */
6244                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6245
6246                         __rtnl_unlock();
6247                         rcu_barrier();
6248                         rtnl_lock();
6249
6250                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6251                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6252                                      &dev->state)) {
6253                                 /* We must not have linkwatch events
6254                                  * pending on unregister. If this
6255                                  * happens, we simply run the queue
6256                                  * unscheduled, resulting in a noop
6257                                  * for this device.
6258                                  */
6259                                 linkwatch_run_queue();
6260                         }
6261
6262                         __rtnl_unlock();
6263
6264                         rebroadcast_time = jiffies;
6265                 }
6266
6267                 msleep(250);
6268
6269                 refcnt = netdev_refcnt_read(dev);
6270
6271                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6272                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6273                                  dev->name, refcnt);
6274                         warning_time = jiffies;
6275                 }
6276         }
6277 }
6278
6279 /* The sequence is:
6280  *
6281  *      rtnl_lock();
6282  *      ...
6283  *      register_netdevice(x1);
6284  *      register_netdevice(x2);
6285  *      ...
6286  *      unregister_netdevice(y1);
6287  *      unregister_netdevice(y2);
6288  *      ...
6289  *      rtnl_unlock();
6290  *      free_netdev(y1);
6291  *      free_netdev(y2);
6292  *
6293  * We are invoked by rtnl_unlock().
6294  * This allows us to deal with problems:
6295  * 1) We can delete sysfs objects which invoke hotplug
6296  *    without deadlocking with linkwatch via keventd.
6297  * 2) Since we run with the RTNL semaphore not held, we can sleep
6298  *    safely in order to wait for the netdev refcnt to drop to zero.
6299  *
6300  * We must not return until all unregister events added during
6301  * the interval the lock was held have been completed.
6302  */
6303 void netdev_run_todo(void)
6304 {
6305         struct list_head list;
6306
6307         /* Snapshot list, allow later requests */
6308         list_replace_init(&net_todo_list, &list);
6309
6310         __rtnl_unlock();
6311
6312
6313         /* Wait for rcu callbacks to finish before next phase */
6314         if (!list_empty(&list))
6315                 rcu_barrier();
6316
6317         while (!list_empty(&list)) {
6318                 struct net_device *dev
6319                         = list_first_entry(&list, struct net_device, todo_list);
6320                 list_del(&dev->todo_list);
6321
6322                 rtnl_lock();
6323                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6324                 __rtnl_unlock();
6325
6326                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6327                         pr_err("network todo '%s' but state %d\n",
6328                                dev->name, dev->reg_state);
6329                         dump_stack();
6330                         continue;
6331                 }
6332
6333                 dev->reg_state = NETREG_UNREGISTERED;
6334
6335                 on_each_cpu(flush_backlog, dev, 1);
6336
6337                 netdev_wait_allrefs(dev);
6338
6339                 /* paranoia */
6340                 BUG_ON(netdev_refcnt_read(dev));
6341                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6342                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6343                 WARN_ON(dev->dn_ptr);
6344
6345                 if (dev->destructor)
6346                         dev->destructor(dev);
6347
6348                 /* Report a network device has been unregistered */
6349                 rtnl_lock();
6350                 dev_net(dev)->dev_unreg_count--;
6351                 __rtnl_unlock();
6352                 wake_up(&netdev_unregistering_wq);
6353
6354                 /* Free network device */
6355                 kobject_put(&dev->dev.kobj);
6356         }
6357 }
6358
6359 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6360  * fields in the same order, with only the type differing.
6361  */
6362 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6363                              const struct net_device_stats *netdev_stats)
6364 {
6365 #if BITS_PER_LONG == 64
6366         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6367         memcpy(stats64, netdev_stats, sizeof(*stats64));
6368 #else
6369         size_t i, n = sizeof(*stats64) / sizeof(u64);
6370         const unsigned long *src = (const unsigned long *)netdev_stats;
6371         u64 *dst = (u64 *)stats64;
6372
6373         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6374                      sizeof(*stats64) / sizeof(u64));
6375         for (i = 0; i < n; i++)
6376                 dst[i] = src[i];
6377 #endif
6378 }
6379 EXPORT_SYMBOL(netdev_stats_to_stats64);
6380
6381 /**
6382  *      dev_get_stats   - get network device statistics
6383  *      @dev: device to get statistics from
6384  *      @storage: place to store stats
6385  *
6386  *      Get network statistics from device. Return @storage.
6387  *      The device driver may provide its own method by setting
6388  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6389  *      otherwise the internal statistics structure is used.
6390  */
6391 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6392                                         struct rtnl_link_stats64 *storage)
6393 {
6394         const struct net_device_ops *ops = dev->netdev_ops;
6395
6396         if (ops->ndo_get_stats64) {
6397                 memset(storage, 0, sizeof(*storage));
6398                 ops->ndo_get_stats64(dev, storage);
6399         } else if (ops->ndo_get_stats) {
6400                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6401         } else {
6402                 netdev_stats_to_stats64(storage, &dev->stats);
6403         }
6404         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6405         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6406         return storage;
6407 }
6408 EXPORT_SYMBOL(dev_get_stats);
6409
6410 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6411 {
6412         struct netdev_queue *queue = dev_ingress_queue(dev);
6413
6414 #ifdef CONFIG_NET_CLS_ACT
6415         if (queue)
6416                 return queue;
6417         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6418         if (!queue)
6419                 return NULL;
6420         netdev_init_one_queue(dev, queue, NULL);
6421         queue->qdisc = &noop_qdisc;
6422         queue->qdisc_sleeping = &noop_qdisc;
6423         rcu_assign_pointer(dev->ingress_queue, queue);
6424 #endif
6425         return queue;
6426 }
6427
6428 static const struct ethtool_ops default_ethtool_ops;
6429
6430 void netdev_set_default_ethtool_ops(struct net_device *dev,
6431                                     const struct ethtool_ops *ops)
6432 {
6433         if (dev->ethtool_ops == &default_ethtool_ops)
6434                 dev->ethtool_ops = ops;
6435 }
6436 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6437
6438 void netdev_freemem(struct net_device *dev)
6439 {
6440         char *addr = (char *)dev - dev->padded;
6441
6442         kvfree(addr);
6443 }
6444
6445 /**
6446  *      alloc_netdev_mqs - allocate network device
6447  *      @sizeof_priv:   size of private data to allocate space for
6448  *      @name:          device name format string
6449  *      @setup:         callback to initialize device
6450  *      @txqs:          the number of TX subqueues to allocate
6451  *      @rxqs:          the number of RX subqueues to allocate
6452  *
6453  *      Allocates a struct net_device with private data area for driver use
6454  *      and performs basic initialization.  Also allocates subqueue structs
6455  *      for each queue on the device.
6456  */
6457 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6458                 void (*setup)(struct net_device *),
6459                 unsigned int txqs, unsigned int rxqs)
6460 {
6461         struct net_device *dev;
6462         size_t alloc_size;
6463         struct net_device *p;
6464
6465         BUG_ON(strlen(name) >= sizeof(dev->name));
6466
6467         if (txqs < 1) {
6468                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6469                 return NULL;
6470         }
6471
6472 #ifdef CONFIG_SYSFS
6473         if (rxqs < 1) {
6474                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6475                 return NULL;
6476         }
6477 #endif
6478
6479         alloc_size = sizeof(struct net_device);
6480         if (sizeof_priv) {
6481                 /* ensure 32-byte alignment of private area */
6482                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6483                 alloc_size += sizeof_priv;
6484         }
6485         /* ensure 32-byte alignment of whole construct */
6486         alloc_size += NETDEV_ALIGN - 1;
6487
6488         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6489         if (!p)
6490                 p = vzalloc(alloc_size);
6491         if (!p)
6492                 return NULL;
6493
6494         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6495         dev->padded = (char *)dev - (char *)p;
6496
6497         dev->pcpu_refcnt = alloc_percpu(int);
6498         if (!dev->pcpu_refcnt)
6499                 goto free_dev;
6500
6501         if (dev_addr_init(dev))
6502                 goto free_pcpu;
6503
6504         dev_mc_init(dev);
6505         dev_uc_init(dev);
6506
6507         dev_net_set(dev, &init_net);
6508
6509         dev->gso_max_size = GSO_MAX_SIZE;
6510         dev->gso_max_segs = GSO_MAX_SEGS;
6511
6512         INIT_LIST_HEAD(&dev->napi_list);
6513         INIT_LIST_HEAD(&dev->unreg_list);
6514         INIT_LIST_HEAD(&dev->close_list);
6515         INIT_LIST_HEAD(&dev->link_watch_list);
6516         INIT_LIST_HEAD(&dev->adj_list.upper);
6517         INIT_LIST_HEAD(&dev->adj_list.lower);
6518         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6519         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6520         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6521         setup(dev);
6522
6523         dev->num_tx_queues = txqs;
6524         dev->real_num_tx_queues = txqs;
6525         if (netif_alloc_netdev_queues(dev))
6526                 goto free_all;
6527
6528 #ifdef CONFIG_SYSFS
6529         dev->num_rx_queues = rxqs;
6530         dev->real_num_rx_queues = rxqs;
6531         if (netif_alloc_rx_queues(dev))
6532                 goto free_all;
6533 #endif
6534
6535         strcpy(dev->name, name);
6536         dev->group = INIT_NETDEV_GROUP;
6537         if (!dev->ethtool_ops)
6538                 dev->ethtool_ops = &default_ethtool_ops;
6539         return dev;
6540
6541 free_all:
6542         free_netdev(dev);
6543         return NULL;
6544
6545 free_pcpu:
6546         free_percpu(dev->pcpu_refcnt);
6547 free_dev:
6548         netdev_freemem(dev);
6549         return NULL;
6550 }
6551 EXPORT_SYMBOL(alloc_netdev_mqs);
6552
6553 /**
6554  *      free_netdev - free network device
6555  *      @dev: device
6556  *
6557  *      This function does the last stage of destroying an allocated device
6558  *      interface. The reference to the device object is released.
6559  *      If this is the last reference then it will be freed.
6560  */
6561 void free_netdev(struct net_device *dev)
6562 {
6563         struct napi_struct *p, *n;
6564
6565         release_net(dev_net(dev));
6566
6567         netif_free_tx_queues(dev);
6568 #ifdef CONFIG_SYSFS
6569         kfree(dev->_rx);
6570 #endif
6571
6572         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6573
6574         /* Flush device addresses */
6575         dev_addr_flush(dev);
6576
6577         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6578                 netif_napi_del(p);
6579
6580         free_percpu(dev->pcpu_refcnt);
6581         dev->pcpu_refcnt = NULL;
6582
6583         /*  Compatibility with error handling in drivers */
6584         if (dev->reg_state == NETREG_UNINITIALIZED) {
6585                 netdev_freemem(dev);
6586                 return;
6587         }
6588
6589         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6590         dev->reg_state = NETREG_RELEASED;
6591
6592         /* will free via device release */
6593         put_device(&dev->dev);
6594 }
6595 EXPORT_SYMBOL(free_netdev);
6596
6597 /**
6598  *      synchronize_net -  Synchronize with packet receive processing
6599  *
6600  *      Wait for packets currently being received to be done.
6601  *      Does not block later packets from starting.
6602  */
6603 void synchronize_net(void)
6604 {
6605         might_sleep();
6606         if (rtnl_is_locked())
6607                 synchronize_rcu_expedited();
6608         else
6609                 synchronize_rcu();
6610 }
6611 EXPORT_SYMBOL(synchronize_net);
6612
6613 /**
6614  *      unregister_netdevice_queue - remove device from the kernel
6615  *      @dev: device
6616  *      @head: list
6617  *
6618  *      This function shuts down a device interface and removes it
6619  *      from the kernel tables.
6620  *      If head not NULL, device is queued to be unregistered later.
6621  *
6622  *      Callers must hold the rtnl semaphore.  You may want
6623  *      unregister_netdev() instead of this.
6624  */
6625
6626 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6627 {
6628         ASSERT_RTNL();
6629
6630         if (head) {
6631                 list_move_tail(&dev->unreg_list, head);
6632         } else {
6633                 rollback_registered(dev);
6634                 /* Finish processing unregister after unlock */
6635                 net_set_todo(dev);
6636         }
6637 }
6638 EXPORT_SYMBOL(unregister_netdevice_queue);
6639
6640 /**
6641  *      unregister_netdevice_many - unregister many devices
6642  *      @head: list of devices
6643  *
6644  *  Note: As most callers use a stack allocated list_head,
6645  *  we force a list_del() to make sure stack wont be corrupted later.
6646  */
6647 void unregister_netdevice_many(struct list_head *head)
6648 {
6649         struct net_device *dev;
6650
6651         if (!list_empty(head)) {
6652                 rollback_registered_many(head);
6653                 list_for_each_entry(dev, head, unreg_list)
6654                         net_set_todo(dev);
6655                 list_del(head);
6656         }
6657 }
6658 EXPORT_SYMBOL(unregister_netdevice_many);
6659
6660 /**
6661  *      unregister_netdev - remove device from the kernel
6662  *      @dev: device
6663  *
6664  *      This function shuts down a device interface and removes it
6665  *      from the kernel tables.
6666  *
6667  *      This is just a wrapper for unregister_netdevice that takes
6668  *      the rtnl semaphore.  In general you want to use this and not
6669  *      unregister_netdevice.
6670  */
6671 void unregister_netdev(struct net_device *dev)
6672 {
6673         rtnl_lock();
6674         unregister_netdevice(dev);
6675         rtnl_unlock();
6676 }
6677 EXPORT_SYMBOL(unregister_netdev);
6678
6679 /**
6680  *      dev_change_net_namespace - move device to different nethost namespace
6681  *      @dev: device
6682  *      @net: network namespace
6683  *      @pat: If not NULL name pattern to try if the current device name
6684  *            is already taken in the destination network namespace.
6685  *
6686  *      This function shuts down a device interface and moves it
6687  *      to a new network namespace. On success 0 is returned, on
6688  *      a failure a netagive errno code is returned.
6689  *
6690  *      Callers must hold the rtnl semaphore.
6691  */
6692
6693 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6694 {
6695         int err;
6696
6697         ASSERT_RTNL();
6698
6699         /* Don't allow namespace local devices to be moved. */
6700         err = -EINVAL;
6701         if (dev->features & NETIF_F_NETNS_LOCAL)
6702                 goto out;
6703
6704         /* Ensure the device has been registrered */
6705         if (dev->reg_state != NETREG_REGISTERED)
6706                 goto out;
6707
6708         /* Get out if there is nothing todo */
6709         err = 0;
6710         if (net_eq(dev_net(dev), net))
6711                 goto out;
6712
6713         /* Pick the destination device name, and ensure
6714          * we can use it in the destination network namespace.
6715          */
6716         err = -EEXIST;
6717         if (__dev_get_by_name(net, dev->name)) {
6718                 /* We get here if we can't use the current device name */
6719                 if (!pat)
6720                         goto out;
6721                 if (dev_get_valid_name(net, dev, pat) < 0)
6722                         goto out;
6723         }
6724
6725         /*
6726          * And now a mini version of register_netdevice unregister_netdevice.
6727          */
6728
6729         /* If device is running close it first. */
6730         dev_close(dev);
6731
6732         /* And unlink it from device chain */
6733         err = -ENODEV;
6734         unlist_netdevice(dev);
6735
6736         synchronize_net();
6737
6738         /* Shutdown queueing discipline. */
6739         dev_shutdown(dev);
6740
6741         /* Notify protocols, that we are about to destroy
6742            this device. They should clean all the things.
6743
6744            Note that dev->reg_state stays at NETREG_REGISTERED.
6745            This is wanted because this way 8021q and macvlan know
6746            the device is just moving and can keep their slaves up.
6747         */
6748         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6749         rcu_barrier();
6750         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6751         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6752
6753         /*
6754          *      Flush the unicast and multicast chains
6755          */
6756         dev_uc_flush(dev);
6757         dev_mc_flush(dev);
6758
6759         /* Send a netdev-removed uevent to the old namespace */
6760         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6761
6762         /* Actually switch the network namespace */
6763         dev_net_set(dev, net);
6764
6765         /* If there is an ifindex conflict assign a new one */
6766         if (__dev_get_by_index(net, dev->ifindex)) {
6767                 int iflink = (dev->iflink == dev->ifindex);
6768                 dev->ifindex = dev_new_index(net);
6769                 if (iflink)
6770                         dev->iflink = dev->ifindex;
6771         }
6772
6773         /* Send a netdev-add uevent to the new namespace */
6774         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6775
6776         /* Fixup kobjects */
6777         err = device_rename(&dev->dev, dev->name);
6778         WARN_ON(err);
6779
6780         /* Add the device back in the hashes */
6781         list_netdevice(dev);
6782
6783         /* Notify protocols, that a new device appeared. */
6784         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6785
6786         /*
6787          *      Prevent userspace races by waiting until the network
6788          *      device is fully setup before sending notifications.
6789          */
6790         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6791
6792         synchronize_net();
6793         err = 0;
6794 out:
6795         return err;
6796 }
6797 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6798
6799 static int dev_cpu_callback(struct notifier_block *nfb,
6800                             unsigned long action,
6801                             void *ocpu)
6802 {
6803         struct sk_buff **list_skb;
6804         struct sk_buff *skb;
6805         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6806         struct softnet_data *sd, *oldsd;
6807
6808         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6809                 return NOTIFY_OK;
6810
6811         local_irq_disable();
6812         cpu = smp_processor_id();
6813         sd = &per_cpu(softnet_data, cpu);
6814         oldsd = &per_cpu(softnet_data, oldcpu);
6815
6816         /* Find end of our completion_queue. */
6817         list_skb = &sd->completion_queue;
6818         while (*list_skb)
6819                 list_skb = &(*list_skb)->next;
6820         /* Append completion queue from offline CPU. */
6821         *list_skb = oldsd->completion_queue;
6822         oldsd->completion_queue = NULL;
6823
6824         /* Append output queue from offline CPU. */
6825         if (oldsd->output_queue) {
6826                 *sd->output_queue_tailp = oldsd->output_queue;
6827                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6828                 oldsd->output_queue = NULL;
6829                 oldsd->output_queue_tailp = &oldsd->output_queue;
6830         }
6831         /* Append NAPI poll list from offline CPU. */
6832         if (!list_empty(&oldsd->poll_list)) {
6833                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6834                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6835         }
6836
6837         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6838         local_irq_enable();
6839
6840         /* Process offline CPU's input_pkt_queue */
6841         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6842                 netif_rx_internal(skb);
6843                 input_queue_head_incr(oldsd);
6844         }
6845         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6846                 netif_rx_internal(skb);
6847                 input_queue_head_incr(oldsd);
6848         }
6849
6850         return NOTIFY_OK;
6851 }
6852
6853
6854 /**
6855  *      netdev_increment_features - increment feature set by one
6856  *      @all: current feature set
6857  *      @one: new feature set
6858  *      @mask: mask feature set
6859  *
6860  *      Computes a new feature set after adding a device with feature set
6861  *      @one to the master device with current feature set @all.  Will not
6862  *      enable anything that is off in @mask. Returns the new feature set.
6863  */
6864 netdev_features_t netdev_increment_features(netdev_features_t all,
6865         netdev_features_t one, netdev_features_t mask)
6866 {
6867         if (mask & NETIF_F_GEN_CSUM)
6868                 mask |= NETIF_F_ALL_CSUM;
6869         mask |= NETIF_F_VLAN_CHALLENGED;
6870
6871         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6872         all &= one | ~NETIF_F_ALL_FOR_ALL;
6873
6874         /* If one device supports hw checksumming, set for all. */
6875         if (all & NETIF_F_GEN_CSUM)
6876                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6877
6878         return all;
6879 }
6880 EXPORT_SYMBOL(netdev_increment_features);
6881
6882 static struct hlist_head * __net_init netdev_create_hash(void)
6883 {
6884         int i;
6885         struct hlist_head *hash;
6886
6887         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6888         if (hash != NULL)
6889                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6890                         INIT_HLIST_HEAD(&hash[i]);
6891
6892         return hash;
6893 }
6894
6895 /* Initialize per network namespace state */
6896 static int __net_init netdev_init(struct net *net)
6897 {
6898         if (net != &init_net)
6899                 INIT_LIST_HEAD(&net->dev_base_head);
6900
6901         net->dev_name_head = netdev_create_hash();
6902         if (net->dev_name_head == NULL)
6903                 goto err_name;
6904
6905         net->dev_index_head = netdev_create_hash();
6906         if (net->dev_index_head == NULL)
6907                 goto err_idx;
6908
6909         return 0;
6910
6911 err_idx:
6912         kfree(net->dev_name_head);
6913 err_name:
6914         return -ENOMEM;
6915 }
6916
6917 /**
6918  *      netdev_drivername - network driver for the device
6919  *      @dev: network device
6920  *
6921  *      Determine network driver for device.
6922  */
6923 const char *netdev_drivername(const struct net_device *dev)
6924 {
6925         const struct device_driver *driver;
6926         const struct device *parent;
6927         const char *empty = "";
6928
6929         parent = dev->dev.parent;
6930         if (!parent)
6931                 return empty;
6932
6933         driver = parent->driver;
6934         if (driver && driver->name)
6935                 return driver->name;
6936         return empty;
6937 }
6938
6939 static int __netdev_printk(const char *level, const struct net_device *dev,
6940                            struct va_format *vaf)
6941 {
6942         int r;
6943
6944         if (dev && dev->dev.parent) {
6945                 r = dev_printk_emit(level[1] - '0',
6946                                     dev->dev.parent,
6947                                     "%s %s %s: %pV",
6948                                     dev_driver_string(dev->dev.parent),
6949                                     dev_name(dev->dev.parent),
6950                                     netdev_name(dev), vaf);
6951         } else if (dev) {
6952                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6953         } else {
6954                 r = printk("%s(NULL net_device): %pV", level, vaf);
6955         }
6956
6957         return r;
6958 }
6959
6960 int netdev_printk(const char *level, const struct net_device *dev,
6961                   const char *format, ...)
6962 {
6963         struct va_format vaf;
6964         va_list args;
6965         int r;
6966
6967         va_start(args, format);
6968
6969         vaf.fmt = format;
6970         vaf.va = &args;
6971
6972         r = __netdev_printk(level, dev, &vaf);
6973
6974         va_end(args);
6975
6976         return r;
6977 }
6978 EXPORT_SYMBOL(netdev_printk);
6979
6980 #define define_netdev_printk_level(func, level)                 \
6981 int func(const struct net_device *dev, const char *fmt, ...)    \
6982 {                                                               \
6983         int r;                                                  \
6984         struct va_format vaf;                                   \
6985         va_list args;                                           \
6986                                                                 \
6987         va_start(args, fmt);                                    \
6988                                                                 \
6989         vaf.fmt = fmt;                                          \
6990         vaf.va = &args;                                         \
6991                                                                 \
6992         r = __netdev_printk(level, dev, &vaf);                  \
6993                                                                 \
6994         va_end(args);                                           \
6995                                                                 \
6996         return r;                                               \
6997 }                                                               \
6998 EXPORT_SYMBOL(func);
6999
7000 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7001 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7002 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7003 define_netdev_printk_level(netdev_err, KERN_ERR);
7004 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7005 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7006 define_netdev_printk_level(netdev_info, KERN_INFO);
7007
7008 static void __net_exit netdev_exit(struct net *net)
7009 {
7010         kfree(net->dev_name_head);
7011         kfree(net->dev_index_head);
7012 }
7013
7014 static struct pernet_operations __net_initdata netdev_net_ops = {
7015         .init = netdev_init,
7016         .exit = netdev_exit,
7017 };
7018
7019 static void __net_exit default_device_exit(struct net *net)
7020 {
7021         struct net_device *dev, *aux;
7022         /*
7023          * Push all migratable network devices back to the
7024          * initial network namespace
7025          */
7026         rtnl_lock();
7027         for_each_netdev_safe(net, dev, aux) {
7028                 int err;
7029                 char fb_name[IFNAMSIZ];
7030
7031                 /* Ignore unmoveable devices (i.e. loopback) */
7032                 if (dev->features & NETIF_F_NETNS_LOCAL)
7033                         continue;
7034
7035                 /* Leave virtual devices for the generic cleanup */
7036                 if (dev->rtnl_link_ops)
7037                         continue;
7038
7039                 /* Push remaining network devices to init_net */
7040                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7041                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7042                 if (err) {
7043                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7044                                  __func__, dev->name, err);
7045                         BUG();
7046                 }
7047         }
7048         rtnl_unlock();
7049 }
7050
7051 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7052 {
7053         /* Return with the rtnl_lock held when there are no network
7054          * devices unregistering in any network namespace in net_list.
7055          */
7056         struct net *net;
7057         bool unregistering;
7058         DEFINE_WAIT(wait);
7059
7060         for (;;) {
7061                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7062                                 TASK_UNINTERRUPTIBLE);
7063                 unregistering = false;
7064                 rtnl_lock();
7065                 list_for_each_entry(net, net_list, exit_list) {
7066                         if (net->dev_unreg_count > 0) {
7067                                 unregistering = true;
7068                                 break;
7069                         }
7070                 }
7071                 if (!unregistering)
7072                         break;
7073                 __rtnl_unlock();
7074                 schedule();
7075         }
7076         finish_wait(&netdev_unregistering_wq, &wait);
7077 }
7078
7079 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7080 {
7081         /* At exit all network devices most be removed from a network
7082          * namespace.  Do this in the reverse order of registration.
7083          * Do this across as many network namespaces as possible to
7084          * improve batching efficiency.
7085          */
7086         struct net_device *dev;
7087         struct net *net;
7088         LIST_HEAD(dev_kill_list);
7089
7090         /* To prevent network device cleanup code from dereferencing
7091          * loopback devices or network devices that have been freed
7092          * wait here for all pending unregistrations to complete,
7093          * before unregistring the loopback device and allowing the
7094          * network namespace be freed.
7095          *
7096          * The netdev todo list containing all network devices
7097          * unregistrations that happen in default_device_exit_batch
7098          * will run in the rtnl_unlock() at the end of
7099          * default_device_exit_batch.
7100          */
7101         rtnl_lock_unregistering(net_list);
7102         list_for_each_entry(net, net_list, exit_list) {
7103                 for_each_netdev_reverse(net, dev) {
7104                         if (dev->rtnl_link_ops)
7105                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7106                         else
7107                                 unregister_netdevice_queue(dev, &dev_kill_list);
7108                 }
7109         }
7110         unregister_netdevice_many(&dev_kill_list);
7111         rtnl_unlock();
7112 }
7113
7114 static struct pernet_operations __net_initdata default_device_ops = {
7115         .exit = default_device_exit,
7116         .exit_batch = default_device_exit_batch,
7117 };
7118
7119 /*
7120  *      Initialize the DEV module. At boot time this walks the device list and
7121  *      unhooks any devices that fail to initialise (normally hardware not
7122  *      present) and leaves us with a valid list of present and active devices.
7123  *
7124  */
7125
7126 /*
7127  *       This is called single threaded during boot, so no need
7128  *       to take the rtnl semaphore.
7129  */
7130 static int __init net_dev_init(void)
7131 {
7132         int i, rc = -ENOMEM;
7133
7134         BUG_ON(!dev_boot_phase);
7135
7136         if (dev_proc_init())
7137                 goto out;
7138
7139         if (netdev_kobject_init())
7140                 goto out;
7141
7142         INIT_LIST_HEAD(&ptype_all);
7143         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7144                 INIT_LIST_HEAD(&ptype_base[i]);
7145
7146         INIT_LIST_HEAD(&offload_base);
7147
7148         if (register_pernet_subsys(&netdev_net_ops))
7149                 goto out;
7150
7151         /*
7152          *      Initialise the packet receive queues.
7153          */
7154
7155         for_each_possible_cpu(i) {
7156                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7157
7158                 skb_queue_head_init(&sd->input_pkt_queue);
7159                 skb_queue_head_init(&sd->process_queue);
7160                 INIT_LIST_HEAD(&sd->poll_list);
7161                 sd->output_queue_tailp = &sd->output_queue;
7162 #ifdef CONFIG_RPS
7163                 sd->csd.func = rps_trigger_softirq;
7164                 sd->csd.info = sd;
7165                 sd->cpu = i;
7166 #endif
7167
7168                 sd->backlog.poll = process_backlog;
7169                 sd->backlog.weight = weight_p;
7170         }
7171
7172         dev_boot_phase = 0;
7173
7174         /* The loopback device is special if any other network devices
7175          * is present in a network namespace the loopback device must
7176          * be present. Since we now dynamically allocate and free the
7177          * loopback device ensure this invariant is maintained by
7178          * keeping the loopback device as the first device on the
7179          * list of network devices.  Ensuring the loopback devices
7180          * is the first device that appears and the last network device
7181          * that disappears.
7182          */
7183         if (register_pernet_device(&loopback_net_ops))
7184                 goto out;
7185
7186         if (register_pernet_device(&default_device_ops))
7187                 goto out;
7188
7189         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7190         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7191
7192         hotcpu_notifier(dev_cpu_callback, 0);
7193         dst_init();
7194         rc = 0;
7195 out:
7196         return rc;
7197 }
7198
7199 subsys_initcall(net_dev_init);