net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138 #include <linux/netfilter_ingress.h>
 139
 140 #include "net-sysfs.h"
 141
 142 /* Instead of increasing this, you should create a hash table. */
 143 #define MAX_GRO_SKBS 8
 144
 145 /* This should be increased if a protocol with a bigger head is added. */
 146 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 147
 148 static DEFINE_SPINLOCK(ptype_lock);
 149 static DEFINE_SPINLOCK(offload_lock);
 150 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 151 struct list_head ptype_all __read_mostly;       /* Taps */
 152 static struct list_head offload_base __read_mostly;
 153
 154 static int netif_rx_internal(struct sk_buff *skb);
 155 static int call_netdevice_notifiers_info(unsigned long val,
 156                                          struct net_device *dev,
 157                                          struct netdev_notifier_info *info);
 158
 159 /*
 160  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 161  * semaphore.
 162  *
 163  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 164  *
 165  * Writers must hold the rtnl semaphore while they loop through the
 166  * dev_base_head list, and hold dev_base_lock for writing when they do the
 167  * actual updates.  This allows pure readers to access the list even
 168  * while a writer is preparing to update it.
 169  *
 170  * To put it another way, dev_base_lock is held for writing only to
 171  * protect against pure readers; the rtnl semaphore provides the
 172  * protection against other writers.
 173  *
 174  * See, for example usages, register_netdevice() and
 175  * unregister_netdevice(), which must be called with the rtnl
 176  * semaphore held.
 177  */
 178 DEFINE_RWLOCK(dev_base_lock);
 179 EXPORT_SYMBOL(dev_base_lock);
 180
 181 /* protects napi_hash addition/deletion and napi_gen_id */
 182 static DEFINE_SPINLOCK(napi_hash_lock);
 183
 184 static unsigned int napi_gen_id;
 185 static DEFINE_HASHTABLE(napi_hash, 8);
 186
 187 static seqcount_t devnet_rename_seq;
 188
 189 static inline void dev_base_seq_inc(struct net *net)
 190 {
 191         while (++net->dev_base_seq == 0);
 192 }
 193
 194 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 195 {
 196         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 197
 198         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 199 }
 200
 201 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 202 {
 203         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 204 }
 205
 206 static inline void rps_lock(struct softnet_data *sd)
 207 {
 208 #ifdef CONFIG_RPS
 209         spin_lock(&sd->input_pkt_queue.lock);
 210 #endif
 211 }
 212
 213 static inline void rps_unlock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_unlock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 /* Device list insertion */
 221 static void list_netdevice(struct net_device *dev)
 222 {
 223         struct net *net = dev_net(dev);
 224
 225         ASSERT_RTNL();
 226
 227         write_lock_bh(&dev_base_lock);
 228         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 229         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 230         hlist_add_head_rcu(&dev->index_hlist,
 231                            dev_index_hash(net, dev->ifindex));
 232         write_unlock_bh(&dev_base_lock);
 233
 234         dev_base_seq_inc(net);
 235 }
 236
 237 /* Device list removal
 238  * caller must respect a RCU grace period before freeing/reusing dev
 239  */
 240 static void unlist_netdevice(struct net_device *dev)
 241 {
 242         ASSERT_RTNL();
 243
 244         /* Unlink dev from the device chain */
 245         write_lock_bh(&dev_base_lock);
 246         list_del_rcu(&dev->dev_list);
 247         hlist_del_rcu(&dev->name_hlist);
 248         hlist_del_rcu(&dev->index_hlist);
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(dev_net(dev));
 252 }
 253
 254 /*
 255  *      Our notifier list
 256  */
 257
 258 static RAW_NOTIFIER_HEAD(netdev_chain);
 259
 260 /*
 261  *      Device drivers call our routines to queue packets here. We empty the
 262  *      queue in the local softnet handler.
 263  */
 264
 265 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 266 EXPORT_PER_CPU_SYMBOL(softnet_data);
 267
 268 #ifdef CONFIG_LOCKDEP
 269 /*
 270  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 271  * according to dev->type
 272  */
 273 static const unsigned short netdev_lock_type[] =
 274         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 275          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 276          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 277          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 278          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 279          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 280          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 281          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 282          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 283          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 284          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 285          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 286          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 287          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 288          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 289
 290 static const char *const netdev_lock_name[] =
 291         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 292          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 293          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 294          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 295          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 296          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 297          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 298          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 299          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 300          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 301          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 302          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 303          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 304          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 305          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 306
 307 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 309
 310 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 311 {
 312         int i;
 313
 314         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 315                 if (netdev_lock_type[i] == dev_type)
 316                         return i;
 317         /* the last key is used by default */
 318         return ARRAY_SIZE(netdev_lock_type) - 1;
 319 }
 320
 321 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 322                                                  unsigned short dev_type)
 323 {
 324         int i;
 325
 326         i = netdev_lock_pos(dev_type);
 327         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 328                                    netdev_lock_name[i]);
 329 }
 330
 331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 332 {
 333         int i;
 334
 335         i = netdev_lock_pos(dev->type);
 336         lockdep_set_class_and_name(&dev->addr_list_lock,
 337                                    &netdev_addr_lock_key[i],
 338                                    netdev_lock_name[i]);
 339 }
 340 #else
 341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                  unsigned short dev_type)
 343 {
 344 }
 345 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 346 {
 347 }
 348 #endif
 349
 350 /*******************************************************************************
 351
 352                 Protocol management and registration routines
 353
 354 *******************************************************************************/
 355
 356 /*
 357  *      Add a protocol ID to the list. Now that the input handler is
 358  *      smarter we can dispense with all the messy stuff that used to be
 359  *      here.
 360  *
 361  *      BEWARE!!! Protocol handlers, mangling input packets,
 362  *      MUST BE last in hash buckets and checking protocol handlers
 363  *      MUST start from promiscuous ptype_all chain in net_bh.
 364  *      It is true now, do not change it.
 365  *      Explanation follows: if protocol handler, mangling packet, will
 366  *      be the first on list, it is not able to sense, that packet
 367  *      is cloned and should be copied-on-write, so that it will
 368  *      change it and subsequent readers will get broken packet.
 369  *                                                      --ANK (980803)
 370  */
 371
 372 static inline struct list_head *ptype_head(const struct packet_type *pt)
 373 {
 374         if (pt->type == htons(ETH_P_ALL))
 375                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 376         else
 377                 return pt->dev ? &pt->dev->ptype_specific :
 378                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 379 }
 380
 381 /**
 382  *      dev_add_pack - add packet handler
 383  *      @pt: packet type declaration
 384  *
 385  *      Add a protocol handler to the networking stack. The passed &packet_type
 386  *      is linked into kernel lists and may not be freed until it has been
 387  *      removed from the kernel lists.
 388  *
 389  *      This call does not sleep therefore it can not
 390  *      guarantee all CPU's that are in middle of receiving packets
 391  *      will see the new packet type (until the next received packet).
 392  */
 393
 394 void dev_add_pack(struct packet_type *pt)
 395 {
 396         struct list_head *head = ptype_head(pt);
 397
 398         spin_lock(&ptype_lock);
 399         list_add_rcu(&pt->list, head);
 400         spin_unlock(&ptype_lock);
 401 }
 402 EXPORT_SYMBOL(dev_add_pack);
 403
 404 /**
 405  *      __dev_remove_pack        - remove packet handler
 406  *      @pt: packet type declaration
 407  *
 408  *      Remove a protocol handler that was previously added to the kernel
 409  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 410  *      from the kernel lists and can be freed or reused once this function
 411  *      returns.
 412  *
 413  *      The packet type might still be in use by receivers
 414  *      and must not be freed until after all the CPU's have gone
 415  *      through a quiescent state.
 416  */
 417 void __dev_remove_pack(struct packet_type *pt)
 418 {
 419         struct list_head *head = ptype_head(pt);
 420         struct packet_type *pt1;
 421
 422         spin_lock(&ptype_lock);
 423
 424         list_for_each_entry(pt1, head, list) {
 425                 if (pt == pt1) {
 426                         list_del_rcu(&pt->list);
 427                         goto out;
 428                 }
 429         }
 430
 431         pr_warn("dev_remove_pack: %p not found\n", pt);
 432 out:
 433         spin_unlock(&ptype_lock);
 434 }
 435 EXPORT_SYMBOL(__dev_remove_pack);
 436
 437 /**
 438  *      dev_remove_pack  - remove packet handler
 439  *      @pt: packet type declaration
 440  *
 441  *      Remove a protocol handler that was previously added to the kernel
 442  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 443  *      from the kernel lists and can be freed or reused once this function
 444  *      returns.
 445  *
 446  *      This call sleeps to guarantee that no CPU is looking at the packet
 447  *      type after return.
 448  */
 449 void dev_remove_pack(struct packet_type *pt)
 450 {
 451         __dev_remove_pack(pt);
 452
 453         synchronize_net();
 454 }
 455 EXPORT_SYMBOL(dev_remove_pack);
 456
 457
 458 /**
 459  *      dev_add_offload - register offload handlers
 460  *      @po: protocol offload declaration
 461  *
 462  *      Add protocol offload handlers to the networking stack. The passed
 463  *      &proto_offload is linked into kernel lists and may not be freed until
 464  *      it has been removed from the kernel lists.
 465  *
 466  *      This call does not sleep therefore it can not
 467  *      guarantee all CPU's that are in middle of receiving packets
 468  *      will see the new offload handlers (until the next received packet).
 469  */
 470 void dev_add_offload(struct packet_offload *po)
 471 {
 472         struct packet_offload *elem;
 473
 474         spin_lock(&offload_lock);
 475         list_for_each_entry(elem, &offload_base, list) {
 476                 if (po->priority < elem->priority)
 477                         break;
 478         }
 479         list_add_rcu(&po->list, elem->list.prev);
 480         spin_unlock(&offload_lock);
 481 }
 482 EXPORT_SYMBOL(dev_add_offload);
 483
 484 /**
 485  *      __dev_remove_offload     - remove offload handler
 486  *      @po: packet offload declaration
 487  *
 488  *      Remove a protocol offload handler that was previously added to the
 489  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 490  *      is removed from the kernel lists and can be freed or reused once this
 491  *      function returns.
 492  *
 493  *      The packet type might still be in use by receivers
 494  *      and must not be freed until after all the CPU's have gone
 495  *      through a quiescent state.
 496  */
 497 static void __dev_remove_offload(struct packet_offload *po)
 498 {
 499         struct list_head *head = &offload_base;
 500         struct packet_offload *po1;
 501
 502         spin_lock(&offload_lock);
 503
 504         list_for_each_entry(po1, head, list) {
 505                 if (po == po1) {
 506                         list_del_rcu(&po->list);
 507                         goto out;
 508                 }
 509         }
 510
 511         pr_warn("dev_remove_offload: %p not found\n", po);
 512 out:
 513         spin_unlock(&offload_lock);
 514 }
 515
 516 /**
 517  *      dev_remove_offload       - remove packet offload handler
 518  *      @po: packet offload declaration
 519  *
 520  *      Remove a packet offload handler that was previously added to the kernel
 521  *      offload handlers by dev_add_offload(). The passed &offload_type is
 522  *      removed from the kernel lists and can be freed or reused once this
 523  *      function returns.
 524  *
 525  *      This call sleeps to guarantee that no CPU is looking at the packet
 526  *      type after return.
 527  */
 528 void dev_remove_offload(struct packet_offload *po)
 529 {
 530         __dev_remove_offload(po);
 531
 532         synchronize_net();
 533 }
 534 EXPORT_SYMBOL(dev_remove_offload);
 535
 536 /******************************************************************************
 537
 538                       Device Boot-time Settings Routines
 539
 540 *******************************************************************************/
 541
 542 /* Boot time configuration table */
 543 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 544
 545 /**
 546  *      netdev_boot_setup_add   - add new setup entry
 547  *      @name: name of the device
 548  *      @map: configured settings for the device
 549  *
 550  *      Adds new setup entry to the dev_boot_setup list.  The function
 551  *      returns 0 on error and 1 on success.  This is a generic routine to
 552  *      all netdevices.
 553  */
 554 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 555 {
 556         struct netdev_boot_setup *s;
 557         int i;
 558
 559         s = dev_boot_setup;
 560         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 561                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 562                         memset(s[i].name, 0, sizeof(s[i].name));
 563                         strlcpy(s[i].name, name, IFNAMSIZ);
 564                         memcpy(&s[i].map, map, sizeof(s[i].map));
 565                         break;
 566                 }
 567         }
 568
 569         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 570 }
 571
 572 /**
 573  *      netdev_boot_setup_check - check boot time settings
 574  *      @dev: the netdevice
 575  *
 576  *      Check boot time settings for the device.
 577  *      The found settings are set for the device to be used
 578  *      later in the device probing.
 579  *      Returns 0 if no settings found, 1 if they are.
 580  */
 581 int netdev_boot_setup_check(struct net_device *dev)
 582 {
 583         struct netdev_boot_setup *s = dev_boot_setup;
 584         int i;
 585
 586         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 587                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 588                     !strcmp(dev->name, s[i].name)) {
 589                         dev->irq        = s[i].map.irq;
 590                         dev->base_addr  = s[i].map.base_addr;
 591                         dev->mem_start  = s[i].map.mem_start;
 592                         dev->mem_end    = s[i].map.mem_end;
 593                         return 1;
 594                 }
 595         }
 596         return 0;
 597 }
 598 EXPORT_SYMBOL(netdev_boot_setup_check);
 599
 600
 601 /**
 602  *      netdev_boot_base        - get address from boot time settings
 603  *      @prefix: prefix for network device
 604  *      @unit: id for network device
 605  *
 606  *      Check boot time settings for the base address of device.
 607  *      The found settings are set for the device to be used
 608  *      later in the device probing.
 609  *      Returns 0 if no settings found.
 610  */
 611 unsigned long netdev_boot_base(const char *prefix, int unit)
 612 {
 613         const struct netdev_boot_setup *s = dev_boot_setup;
 614         char name[IFNAMSIZ];
 615         int i;
 616
 617         sprintf(name, "%s%d", prefix, unit);
 618
 619         /*
 620          * If device already registered then return base of 1
 621          * to indicate not to probe for this interface
 622          */
 623         if (__dev_get_by_name(&init_net, name))
 624                 return 1;
 625
 626         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 627                 if (!strcmp(name, s[i].name))
 628                         return s[i].map.base_addr;
 629         return 0;
 630 }
 631
 632 /*
 633  * Saves at boot time configured settings for any netdevice.
 634  */
 635 int __init netdev_boot_setup(char *str)
 636 {
 637         int ints[5];
 638         struct ifmap map;
 639
 640         str = get_options(str, ARRAY_SIZE(ints), ints);
 641         if (!str || !*str)
 642                 return 0;
 643
 644         /* Save settings */
 645         memset(&map, 0, sizeof(map));
 646         if (ints[0] > 0)
 647                 map.irq = ints[1];
 648         if (ints[0] > 1)
 649                 map.base_addr = ints[2];
 650         if (ints[0] > 2)
 651                 map.mem_start = ints[3];
 652         if (ints[0] > 3)
 653                 map.mem_end = ints[4];
 654
 655         /* Add new entry to the list */
 656         return netdev_boot_setup_add(str, &map);
 657 }
 658
 659 __setup("netdev=", netdev_boot_setup);
 660
 661 /*******************************************************************************
 662
 663                             Device Interface Subroutines
 664
 665 *******************************************************************************/
 666
 667 /**
 668  *      dev_get_iflink  - get 'iflink' value of a interface
 669  *      @dev: targeted interface
 670  *
 671  *      Indicates the ifindex the interface is linked to.
 672  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 673  */
 674
 675 int dev_get_iflink(const struct net_device *dev)
 676 {
 677         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 678                 return dev->netdev_ops->ndo_get_iflink(dev);
 679
 680         /* If dev->rtnl_link_ops is set, it's a virtual interface. */
 681         if (dev->rtnl_link_ops)
 682                 return 0;
 683
 684         return dev->ifindex;
 685 }
 686 EXPORT_SYMBOL(dev_get_iflink);
 687
 688 /**
 689  *      __dev_get_by_name       - find a device by its name
 690  *      @net: the applicable net namespace
 691  *      @name: name to find
 692  *
 693  *      Find an interface by name. Must be called under RTNL semaphore
 694  *      or @dev_base_lock. If the name is found a pointer to the device
 695  *      is returned. If the name is not found then %NULL is returned. The
 696  *      reference counters are not incremented so the caller must be
 697  *      careful with locks.
 698  */
 699
 700 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 701 {
 702         struct net_device *dev;
 703         struct hlist_head *head = dev_name_hash(net, name);
 704
 705         hlist_for_each_entry(dev, head, name_hlist)
 706                 if (!strncmp(dev->name, name, IFNAMSIZ))
 707                         return dev;
 708
 709         return NULL;
 710 }
 711 EXPORT_SYMBOL(__dev_get_by_name);
 712
 713 /**
 714  *      dev_get_by_name_rcu     - find a device by its name
 715  *      @net: the applicable net namespace
 716  *      @name: name to find
 717  *
 718  *      Find an interface by name.
 719  *      If the name is found a pointer to the device is returned.
 720  *      If the name is not found then %NULL is returned.
 721  *      The reference counters are not incremented so the caller must be
 722  *      careful with locks. The caller must hold RCU lock.
 723  */
 724
 725 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 726 {
 727         struct net_device *dev;
 728         struct hlist_head *head = dev_name_hash(net, name);
 729
 730         hlist_for_each_entry_rcu(dev, head, name_hlist)
 731                 if (!strncmp(dev->name, name, IFNAMSIZ))
 732                         return dev;
 733
 734         return NULL;
 735 }
 736 EXPORT_SYMBOL(dev_get_by_name_rcu);
 737
 738 /**
 739  *      dev_get_by_name         - find a device by its name
 740  *      @net: the applicable net namespace
 741  *      @name: name to find
 742  *
 743  *      Find an interface by name. This can be called from any
 744  *      context and does its own locking. The returned handle has
 745  *      the usage count incremented and the caller must use dev_put() to
 746  *      release it when it is no longer needed. %NULL is returned if no
 747  *      matching device is found.
 748  */
 749
 750 struct net_device *dev_get_by_name(struct net *net, const char *name)
 751 {
 752         struct net_device *dev;
 753
 754         rcu_read_lock();
 755         dev = dev_get_by_name_rcu(net, name);
 756         if (dev)
 757                 dev_hold(dev);
 758         rcu_read_unlock();
 759         return dev;
 760 }
 761 EXPORT_SYMBOL(dev_get_by_name);
 762
 763 /**
 764  *      __dev_get_by_index - find a device by its ifindex
 765  *      @net: the applicable net namespace
 766  *      @ifindex: index of device
 767  *
 768  *      Search for an interface by index. Returns %NULL if the device
 769  *      is not found or a pointer to the device. The device has not
 770  *      had its reference counter increased so the caller must be careful
 771  *      about locking. The caller must hold either the RTNL semaphore
 772  *      or @dev_base_lock.
 773  */
 774
 775 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 776 {
 777         struct net_device *dev;
 778         struct hlist_head *head = dev_index_hash(net, ifindex);
 779
 780         hlist_for_each_entry(dev, head, index_hlist)
 781                 if (dev->ifindex == ifindex)
 782                         return dev;
 783
 784         return NULL;
 785 }
 786 EXPORT_SYMBOL(__dev_get_by_index);
 787
 788 /**
 789  *      dev_get_by_index_rcu - find a device by its ifindex
 790  *      @net: the applicable net namespace
 791  *      @ifindex: index of device
 792  *
 793  *      Search for an interface by index. Returns %NULL if the device
 794  *      is not found or a pointer to the device. The device has not
 795  *      had its reference counter increased so the caller must be careful
 796  *      about locking. The caller must hold RCU lock.
 797  */
 798
 799 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 800 {
 801         struct net_device *dev;
 802         struct hlist_head *head = dev_index_hash(net, ifindex);
 803
 804         hlist_for_each_entry_rcu(dev, head, index_hlist)
 805                 if (dev->ifindex == ifindex)
 806                         return dev;
 807
 808         return NULL;
 809 }
 810 EXPORT_SYMBOL(dev_get_by_index_rcu);
 811
 812
 813 /**
 814  *      dev_get_by_index - find a device by its ifindex
 815  *      @net: the applicable net namespace
 816  *      @ifindex: index of device
 817  *
 818  *      Search for an interface by index. Returns NULL if the device
 819  *      is not found or a pointer to the device. The device returned has
 820  *      had a reference added and the pointer is safe until the user calls
 821  *      dev_put to indicate they have finished with it.
 822  */
 823
 824 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 825 {
 826         struct net_device *dev;
 827
 828         rcu_read_lock();
 829         dev = dev_get_by_index_rcu(net, ifindex);
 830         if (dev)
 831                 dev_hold(dev);
 832         rcu_read_unlock();
 833         return dev;
 834 }
 835 EXPORT_SYMBOL(dev_get_by_index);
 836
 837 /**
 838  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 839  *      @net: network namespace
 840  *      @name: a pointer to the buffer where the name will be stored.
 841  *      @ifindex: the ifindex of the interface to get the name from.
 842  *
 843  *      The use of raw_seqcount_begin() and cond_resched() before
 844  *      retrying is required as we want to give the writers a chance
 845  *      to complete when CONFIG_PREEMPT is not set.
 846  */
 847 int netdev_get_name(struct net *net, char *name, int ifindex)
 848 {
 849         struct net_device *dev;
 850         unsigned int seq;
 851
 852 retry:
 853         seq = raw_seqcount_begin(&devnet_rename_seq);
 854         rcu_read_lock();
 855         dev = dev_get_by_index_rcu(net, ifindex);
 856         if (!dev) {
 857                 rcu_read_unlock();
 858                 return -ENODEV;
 859         }
 860
 861         strcpy(name, dev->name);
 862         rcu_read_unlock();
 863         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 864                 cond_resched();
 865                 goto retry;
 866         }
 867
 868         return 0;
 869 }
 870
 871 /**
 872  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 873  *      @net: the applicable net namespace
 874  *      @type: media type of device
 875  *      @ha: hardware address
 876  *
 877  *      Search for an interface by MAC address. Returns NULL if the device
 878  *      is not found or a pointer to the device.
 879  *      The caller must hold RCU or RTNL.
 880  *      The returned device has not had its ref count increased
 881  *      and the caller must therefore be careful about locking
 882  *
 883  */
 884
 885 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 886                                        const char *ha)
 887 {
 888         struct net_device *dev;
 889
 890         for_each_netdev_rcu(net, dev)
 891                 if (dev->type == type &&
 892                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 893                         return dev;
 894
 895         return NULL;
 896 }
 897 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 898
 899 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 900 {
 901         struct net_device *dev;
 902
 903         ASSERT_RTNL();
 904         for_each_netdev(net, dev)
 905                 if (dev->type == type)
 906                         return dev;
 907
 908         return NULL;
 909 }
 910 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 911
 912 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 913 {
 914         struct net_device *dev, *ret = NULL;
 915
 916         rcu_read_lock();
 917         for_each_netdev_rcu(net, dev)
 918                 if (dev->type == type) {
 919                         dev_hold(dev);
 920                         ret = dev;
 921                         break;
 922                 }
 923         rcu_read_unlock();
 924         return ret;
 925 }
 926 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 927
 928 /**
 929  *      __dev_get_by_flags - find any device with given flags
 930  *      @net: the applicable net namespace
 931  *      @if_flags: IFF_* values
 932  *      @mask: bitmask of bits in if_flags to check
 933  *
 934  *      Search for any interface with the given flags. Returns NULL if a device
 935  *      is not found or a pointer to the device. Must be called inside
 936  *      rtnl_lock(), and result refcount is unchanged.
 937  */
 938
 939 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 940                                       unsigned short mask)
 941 {
 942         struct net_device *dev, *ret;
 943
 944         ASSERT_RTNL();
 945
 946         ret = NULL;
 947         for_each_netdev(net, dev) {
 948                 if (((dev->flags ^ if_flags) & mask) == 0) {
 949                         ret = dev;
 950                         break;
 951                 }
 952         }
 953         return ret;
 954 }
 955 EXPORT_SYMBOL(__dev_get_by_flags);
 956
 957 /**
 958  *      dev_valid_name - check if name is okay for network device
 959  *      @name: name string
 960  *
 961  *      Network device names need to be valid file names to
 962  *      to allow sysfs to work.  We also disallow any kind of
 963  *      whitespace.
 964  */
 965 bool dev_valid_name(const char *name)
 966 {
 967         if (*name == '\0')
 968                 return false;
 969         if (strlen(name) >= IFNAMSIZ)
 970                 return false;
 971         if (!strcmp(name, ".") || !strcmp(name, ".."))
 972                 return false;
 973
 974         while (*name) {
 975                 if (*name == '/' || *name == ':' || isspace(*name))
 976                         return false;
 977                 name++;
 978         }
 979         return true;
 980 }
 981 EXPORT_SYMBOL(dev_valid_name);
 982
 983 /**
 984  *      __dev_alloc_name - allocate a name for a device
 985  *      @net: network namespace to allocate the device name in
 986  *      @name: name format string
 987  *      @buf:  scratch buffer and result name string
 988  *
 989  *      Passed a format string - eg "lt%d" it will try and find a suitable
 990  *      id. It scans list of devices to build up a free map, then chooses
 991  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 992  *      while allocating the name and adding the device in order to avoid
 993  *      duplicates.
 994  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 995  *      Returns the number of the unit assigned or a negative errno code.
 996  */
 997
 998 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 999 {
1000         int i = 0;
1001         const char *p;
1002         const int max_netdevices = 8*PAGE_SIZE;
1003         unsigned long *inuse;
1004         struct net_device *d;
1005
1006         p = strnchr(name, IFNAMSIZ-1, '%');
1007         if (p) {
1008                 /*
1009                  * Verify the string as this thing may have come from
1010                  * the user.  There must be either one "%d" and no other "%"
1011                  * characters.
1012                  */
1013                 if (p[1] != 'd' || strchr(p + 2, '%'))
1014                         return -EINVAL;
1015
1016                 /* Use one page as a bit array of possible slots */
1017                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1018                 if (!inuse)
1019                         return -ENOMEM;
1020
1021                 for_each_netdev(net, d) {
1022                         if (!sscanf(d->name, name, &i))
1023                                 continue;
1024                         if (i < 0 || i >= max_netdevices)
1025                                 continue;
1026
1027                         /*  avoid cases where sscanf is not exact inverse of printf */
1028                         snprintf(buf, IFNAMSIZ, name, i);
1029                         if (!strncmp(buf, d->name, IFNAMSIZ))
1030                                 set_bit(i, inuse);
1031                 }
1032
1033                 i = find_first_zero_bit(inuse, max_netdevices);
1034                 free_page((unsigned long) inuse);
1035         }
1036
1037         if (buf != name)
1038                 snprintf(buf, IFNAMSIZ, name, i);
1039         if (!__dev_get_by_name(net, buf))
1040                 return i;
1041
1042         /* It is possible to run out of possible slots
1043          * when the name is long and there isn't enough space left
1044          * for the digits, or if all bits are used.
1045          */
1046         return -ENFILE;
1047 }
1048
1049 /**
1050  *      dev_alloc_name - allocate a name for a device
1051  *      @dev: device
1052  *      @name: name format string
1053  *
1054  *      Passed a format string - eg "lt%d" it will try and find a suitable
1055  *      id. It scans list of devices to build up a free map, then chooses
1056  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1057  *      while allocating the name and adding the device in order to avoid
1058  *      duplicates.
1059  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1060  *      Returns the number of the unit assigned or a negative errno code.
1061  */
1062
1063 int dev_alloc_name(struct net_device *dev, const char *name)
1064 {
1065         char buf[IFNAMSIZ];
1066         struct net *net;
1067         int ret;
1068
1069         BUG_ON(!dev_net(dev));
1070         net = dev_net(dev);
1071         ret = __dev_alloc_name(net, name, buf);
1072         if (ret >= 0)
1073                 strlcpy(dev->name, buf, IFNAMSIZ);
1074         return ret;
1075 }
1076 EXPORT_SYMBOL(dev_alloc_name);
1077
1078 static int dev_alloc_name_ns(struct net *net,
1079                              struct net_device *dev,
1080                              const char *name)
1081 {
1082         char buf[IFNAMSIZ];
1083         int ret;
1084
1085         ret = __dev_alloc_name(net, name, buf);
1086         if (ret >= 0)
1087                 strlcpy(dev->name, buf, IFNAMSIZ);
1088         return ret;
1089 }
1090
1091 static int dev_get_valid_name(struct net *net,
1092                               struct net_device *dev,
1093                               const char *name)
1094 {
1095         BUG_ON(!net);
1096
1097         if (!dev_valid_name(name))
1098                 return -EINVAL;
1099
1100         if (strchr(name, '%'))
1101                 return dev_alloc_name_ns(net, dev, name);
1102         else if (__dev_get_by_name(net, name))
1103                 return -EEXIST;
1104         else if (dev->name != name)
1105                 strlcpy(dev->name, name, IFNAMSIZ);
1106
1107         return 0;
1108 }
1109
1110 /**
1111  *      dev_change_name - change name of a device
1112  *      @dev: device
1113  *      @newname: name (or format string) must be at least IFNAMSIZ
1114  *
1115  *      Change name of a device, can pass format strings "eth%d".
1116  *      for wildcarding.
1117  */
1118 int dev_change_name(struct net_device *dev, const char *newname)
1119 {
1120         unsigned char old_assign_type;
1121         char oldname[IFNAMSIZ];
1122         int err = 0;
1123         int ret;
1124         struct net *net;
1125
1126         ASSERT_RTNL();
1127         BUG_ON(!dev_net(dev));
1128
1129         net = dev_net(dev);
1130         if (dev->flags & IFF_UP)
1131                 return -EBUSY;
1132
1133         write_seqcount_begin(&devnet_rename_seq);
1134
1135         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1136                 write_seqcount_end(&devnet_rename_seq);
1137                 return 0;
1138         }
1139
1140         memcpy(oldname, dev->name, IFNAMSIZ);
1141
1142         err = dev_get_valid_name(net, dev, newname);
1143         if (err < 0) {
1144                 write_seqcount_end(&devnet_rename_seq);
1145                 return err;
1146         }
1147
1148         if (oldname[0] && !strchr(oldname, '%'))
1149                 netdev_info(dev, "renamed from %s\n", oldname);
1150
1151         old_assign_type = dev->name_assign_type;
1152         dev->name_assign_type = NET_NAME_RENAMED;
1153
1154 rollback:
1155         ret = device_rename(&dev->dev, dev->name);
1156         if (ret) {
1157                 memcpy(dev->name, oldname, IFNAMSIZ);
1158                 dev->name_assign_type = old_assign_type;
1159                 write_seqcount_end(&devnet_rename_seq);
1160                 return ret;
1161         }
1162
1163         write_seqcount_end(&devnet_rename_seq);
1164
1165         netdev_adjacent_rename_links(dev, oldname);
1166
1167         write_lock_bh(&dev_base_lock);
1168         hlist_del_rcu(&dev->name_hlist);
1169         write_unlock_bh(&dev_base_lock);
1170
1171         synchronize_rcu();
1172
1173         write_lock_bh(&dev_base_lock);
1174         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1175         write_unlock_bh(&dev_base_lock);
1176
1177         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1178         ret = notifier_to_errno(ret);
1179
1180         if (ret) {
1181                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1182                 if (err >= 0) {
1183                         err = ret;
1184                         write_seqcount_begin(&devnet_rename_seq);
1185                         memcpy(dev->name, oldname, IFNAMSIZ);
1186                         memcpy(oldname, newname, IFNAMSIZ);
1187                         dev->name_assign_type = old_assign_type;
1188                         old_assign_type = NET_NAME_RENAMED;
1189                         goto rollback;
1190                 } else {
1191                         pr_err("%s: name change rollback failed: %d\n",
1192                                dev->name, ret);
1193                 }
1194         }
1195
1196         return err;
1197 }
1198
1199 /**
1200  *      dev_set_alias - change ifalias of a device
1201  *      @dev: device
1202  *      @alias: name up to IFALIASZ
1203  *      @len: limit of bytes to copy from info
1204  *
1205  *      Set ifalias for a device,
1206  */
1207 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1208 {
1209         char *new_ifalias;
1210
1211         ASSERT_RTNL();
1212
1213         if (len >= IFALIASZ)
1214                 return -EINVAL;
1215
1216         if (!len) {
1217                 kfree(dev->ifalias);
1218                 dev->ifalias = NULL;
1219                 return 0;
1220         }
1221
1222         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1223         if (!new_ifalias)
1224                 return -ENOMEM;
1225         dev->ifalias = new_ifalias;
1226
1227         strlcpy(dev->ifalias, alias, len+1);
1228         return len;
1229 }
1230
1231
1232 /**
1233  *      netdev_features_change - device changes features
1234  *      @dev: device to cause notification
1235  *
1236  *      Called to indicate a device has changed features.
1237  */
1238 void netdev_features_change(struct net_device *dev)
1239 {
1240         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1241 }
1242 EXPORT_SYMBOL(netdev_features_change);
1243
1244 /**
1245  *      netdev_state_change - device changes state
1246  *      @dev: device to cause notification
1247  *
1248  *      Called to indicate a device has changed state. This function calls
1249  *      the notifier chains for netdev_chain and sends a NEWLINK message
1250  *      to the routing socket.
1251  */
1252 void netdev_state_change(struct net_device *dev)
1253 {
1254         if (dev->flags & IFF_UP) {
1255                 struct netdev_notifier_change_info change_info;
1256
1257                 change_info.flags_changed = 0;
1258                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1259                                               &change_info.info);
1260                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1261         }
1262 }
1263 EXPORT_SYMBOL(netdev_state_change);
1264
1265 /**
1266  *      netdev_notify_peers - notify network peers about existence of @dev
1267  *      @dev: network device
1268  *
1269  * Generate traffic such that interested network peers are aware of
1270  * @dev, such as by generating a gratuitous ARP. This may be used when
1271  * a device wants to inform the rest of the network about some sort of
1272  * reconfiguration such as a failover event or virtual machine
1273  * migration.
1274  */
1275 void netdev_notify_peers(struct net_device *dev)
1276 {
1277         rtnl_lock();
1278         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1279         rtnl_unlock();
1280 }
1281 EXPORT_SYMBOL(netdev_notify_peers);
1282
1283 static int __dev_open(struct net_device *dev)
1284 {
1285         const struct net_device_ops *ops = dev->netdev_ops;
1286         int ret;
1287
1288         ASSERT_RTNL();
1289
1290         if (!netif_device_present(dev))
1291                 return -ENODEV;
1292
1293         /* Block netpoll from trying to do any rx path servicing.
1294          * If we don't do this there is a chance ndo_poll_controller
1295          * or ndo_poll may be running while we open the device
1296          */
1297         netpoll_poll_disable(dev);
1298
1299         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1300         ret = notifier_to_errno(ret);
1301         if (ret)
1302                 return ret;
1303
1304         set_bit(__LINK_STATE_START, &dev->state);
1305
1306         if (ops->ndo_validate_addr)
1307                 ret = ops->ndo_validate_addr(dev);
1308
1309         if (!ret && ops->ndo_open)
1310                 ret = ops->ndo_open(dev);
1311
1312         netpoll_poll_enable(dev);
1313
1314         if (ret)
1315                 clear_bit(__LINK_STATE_START, &dev->state);
1316         else {
1317                 dev->flags |= IFF_UP;
1318                 dev_set_rx_mode(dev);
1319                 dev_activate(dev);
1320                 add_device_randomness(dev->dev_addr, dev->addr_len);
1321         }
1322
1323         return ret;
1324 }
1325
1326 /**
1327  *      dev_open        - prepare an interface for use.
1328  *      @dev:   device to open
1329  *
1330  *      Takes a device from down to up state. The device's private open
1331  *      function is invoked and then the multicast lists are loaded. Finally
1332  *      the device is moved into the up state and a %NETDEV_UP message is
1333  *      sent to the netdev notifier chain.
1334  *
1335  *      Calling this function on an active interface is a nop. On a failure
1336  *      a negative errno code is returned.
1337  */
1338 int dev_open(struct net_device *dev)
1339 {
1340         int ret;
1341
1342         if (dev->flags & IFF_UP)
1343                 return 0;
1344
1345         ret = __dev_open(dev);
1346         if (ret < 0)
1347                 return ret;
1348
1349         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1350         call_netdevice_notifiers(NETDEV_UP, dev);
1351
1352         return ret;
1353 }
1354 EXPORT_SYMBOL(dev_open);
1355
1356 static int __dev_close_many(struct list_head *head)
1357 {
1358         struct net_device *dev;
1359
1360         ASSERT_RTNL();
1361         might_sleep();
1362
1363         list_for_each_entry(dev, head, close_list) {
1364                 /* Temporarily disable netpoll until the interface is down */
1365                 netpoll_poll_disable(dev);
1366
1367                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1368
1369                 clear_bit(__LINK_STATE_START, &dev->state);
1370
1371                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1372                  * can be even on different cpu. So just clear netif_running().
1373                  *
1374                  * dev->stop() will invoke napi_disable() on all of it's
1375                  * napi_struct instances on this device.
1376                  */
1377                 smp_mb__after_atomic(); /* Commit netif_running(). */
1378         }
1379
1380         dev_deactivate_many(head);
1381
1382         list_for_each_entry(dev, head, close_list) {
1383                 const struct net_device_ops *ops = dev->netdev_ops;
1384
1385                 /*
1386                  *      Call the device specific close. This cannot fail.
1387                  *      Only if device is UP
1388                  *
1389                  *      We allow it to be called even after a DETACH hot-plug
1390                  *      event.
1391                  */
1392                 if (ops->ndo_stop)
1393                         ops->ndo_stop(dev);
1394
1395                 dev->flags &= ~IFF_UP;
1396                 netpoll_poll_enable(dev);
1397         }
1398
1399         return 0;
1400 }
1401
1402 static int __dev_close(struct net_device *dev)
1403 {
1404         int retval;
1405         LIST_HEAD(single);
1406
1407         list_add(&dev->close_list, &single);
1408         retval = __dev_close_many(&single);
1409         list_del(&single);
1410
1411         return retval;
1412 }
1413
1414 int dev_close_many(struct list_head *head, bool unlink)
1415 {
1416         struct net_device *dev, *tmp;
1417
1418         /* Remove the devices that don't need to be closed */
1419         list_for_each_entry_safe(dev, tmp, head, close_list)
1420                 if (!(dev->flags & IFF_UP))
1421                         list_del_init(&dev->close_list);
1422
1423         __dev_close_many(head);
1424
1425         list_for_each_entry_safe(dev, tmp, head, close_list) {
1426                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1427                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1428                 if (unlink)
1429                         list_del_init(&dev->close_list);
1430         }
1431
1432         return 0;
1433 }
1434 EXPORT_SYMBOL(dev_close_many);
1435
1436 /**
1437  *      dev_close - shutdown an interface.
1438  *      @dev: device to shutdown
1439  *
1440  *      This function moves an active device into down state. A
1441  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1442  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1443  *      chain.
1444  */
1445 int dev_close(struct net_device *dev)
1446 {
1447         if (dev->flags & IFF_UP) {
1448                 LIST_HEAD(single);
1449
1450                 list_add(&dev->close_list, &single);
1451                 dev_close_many(&single, true);
1452                 list_del(&single);
1453         }
1454         return 0;
1455 }
1456 EXPORT_SYMBOL(dev_close);
1457
1458
1459 /**
1460  *      dev_disable_lro - disable Large Receive Offload on a device
1461  *      @dev: device
1462  *
1463  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1464  *      called under RTNL.  This is needed if received packets may be
1465  *      forwarded to another interface.
1466  */
1467 void dev_disable_lro(struct net_device *dev)
1468 {
1469         struct net_device *lower_dev;
1470         struct list_head *iter;
1471
1472         dev->wanted_features &= ~NETIF_F_LRO;
1473         netdev_update_features(dev);
1474
1475         if (unlikely(dev->features & NETIF_F_LRO))
1476                 netdev_WARN(dev, "failed to disable LRO!\n");
1477
1478         netdev_for_each_lower_dev(dev, lower_dev, iter)
1479                 dev_disable_lro(lower_dev);
1480 }
1481 EXPORT_SYMBOL(dev_disable_lro);
1482
1483 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1484                                    struct net_device *dev)
1485 {
1486         struct netdev_notifier_info info;
1487
1488         netdev_notifier_info_init(&info, dev);
1489         return nb->notifier_call(nb, val, &info);
1490 }
1491
1492 static int dev_boot_phase = 1;
1493
1494 /**
1495  *      register_netdevice_notifier - register a network notifier block
1496  *      @nb: notifier
1497  *
1498  *      Register a notifier to be called when network device events occur.
1499  *      The notifier passed is linked into the kernel structures and must
1500  *      not be reused until it has been unregistered. A negative errno code
1501  *      is returned on a failure.
1502  *
1503  *      When registered all registration and up events are replayed
1504  *      to the new notifier to allow device to have a race free
1505  *      view of the network device list.
1506  */
1507
1508 int register_netdevice_notifier(struct notifier_block *nb)
1509 {
1510         struct net_device *dev;
1511         struct net_device *last;
1512         struct net *net;
1513         int err;
1514
1515         rtnl_lock();
1516         err = raw_notifier_chain_register(&netdev_chain, nb);
1517         if (err)
1518                 goto unlock;
1519         if (dev_boot_phase)
1520                 goto unlock;
1521         for_each_net(net) {
1522                 for_each_netdev(net, dev) {
1523                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1524                         err = notifier_to_errno(err);
1525                         if (err)
1526                                 goto rollback;
1527
1528                         if (!(dev->flags & IFF_UP))
1529                                 continue;
1530
1531                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1532                 }
1533         }
1534
1535 unlock:
1536         rtnl_unlock();
1537         return err;
1538
1539 rollback:
1540         last = dev;
1541         for_each_net(net) {
1542                 for_each_netdev(net, dev) {
1543                         if (dev == last)
1544                                 goto outroll;
1545
1546                         if (dev->flags & IFF_UP) {
1547                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1548                                                         dev);
1549                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1550                         }
1551                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1552                 }
1553         }
1554
1555 outroll:
1556         raw_notifier_chain_unregister(&netdev_chain, nb);
1557         goto unlock;
1558 }
1559 EXPORT_SYMBOL(register_netdevice_notifier);
1560
1561 /**
1562  *      unregister_netdevice_notifier - unregister a network notifier block
1563  *      @nb: notifier
1564  *
1565  *      Unregister a notifier previously registered by
1566  *      register_netdevice_notifier(). The notifier is unlinked into the
1567  *      kernel structures and may then be reused. A negative errno code
1568  *      is returned on a failure.
1569  *
1570  *      After unregistering unregister and down device events are synthesized
1571  *      for all devices on the device list to the removed notifier to remove
1572  *      the need for special case cleanup code.
1573  */
1574
1575 int unregister_netdevice_notifier(struct notifier_block *nb)
1576 {
1577         struct net_device *dev;
1578         struct net *net;
1579         int err;
1580
1581         rtnl_lock();
1582         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1583         if (err)
1584                 goto unlock;
1585
1586         for_each_net(net) {
1587                 for_each_netdev(net, dev) {
1588                         if (dev->flags & IFF_UP) {
1589                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1590                                                         dev);
1591                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1592                         }
1593                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1594                 }
1595         }
1596 unlock:
1597         rtnl_unlock();
1598         return err;
1599 }
1600 EXPORT_SYMBOL(unregister_netdevice_notifier);
1601
1602 /**
1603  *      call_netdevice_notifiers_info - call all network notifier blocks
1604  *      @val: value passed unmodified to notifier function
1605  *      @dev: net_device pointer passed unmodified to notifier function
1606  *      @info: notifier information data
1607  *
1608  *      Call all network notifier blocks.  Parameters and return value
1609  *      are as for raw_notifier_call_chain().
1610  */
1611
1612 static int call_netdevice_notifiers_info(unsigned long val,
1613                                          struct net_device *dev,
1614                                          struct netdev_notifier_info *info)
1615 {
1616         ASSERT_RTNL();
1617         netdev_notifier_info_init(info, dev);
1618         return raw_notifier_call_chain(&netdev_chain, val, info);
1619 }
1620
1621 /**
1622  *      call_netdevice_notifiers - call all network notifier blocks
1623  *      @val: value passed unmodified to notifier function
1624  *      @dev: net_device pointer passed unmodified to notifier function
1625  *
1626  *      Call all network notifier blocks.  Parameters and return value
1627  *      are as for raw_notifier_call_chain().
1628  */
1629
1630 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1631 {
1632         struct netdev_notifier_info info;
1633
1634         return call_netdevice_notifiers_info(val, dev, &info);
1635 }
1636 EXPORT_SYMBOL(call_netdevice_notifiers);
1637
1638 #ifdef CONFIG_NET_INGRESS
1639 static struct static_key ingress_needed __read_mostly;
1640
1641 void net_inc_ingress_queue(void)
1642 {
1643         static_key_slow_inc(&ingress_needed);
1644 }
1645 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1646
1647 void net_dec_ingress_queue(void)
1648 {
1649         static_key_slow_dec(&ingress_needed);
1650 }
1651 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1652 #endif
1653
1654 static struct static_key netstamp_needed __read_mostly;
1655 #ifdef HAVE_JUMP_LABEL
1656 /* We are not allowed to call static_key_slow_dec() from irq context
1657  * If net_disable_timestamp() is called from irq context, defer the
1658  * static_key_slow_dec() calls.
1659  */
1660 static atomic_t netstamp_needed_deferred;
1661 #endif
1662
1663 void net_enable_timestamp(void)
1664 {
1665 #ifdef HAVE_JUMP_LABEL
1666         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1667
1668         if (deferred) {
1669                 while (--deferred)
1670                         static_key_slow_dec(&netstamp_needed);
1671                 return;
1672         }
1673 #endif
1674         static_key_slow_inc(&netstamp_needed);
1675 }
1676 EXPORT_SYMBOL(net_enable_timestamp);
1677
1678 void net_disable_timestamp(void)
1679 {
1680 #ifdef HAVE_JUMP_LABEL
1681         if (in_interrupt()) {
1682                 atomic_inc(&netstamp_needed_deferred);
1683                 return;
1684         }
1685 #endif
1686         static_key_slow_dec(&netstamp_needed);
1687 }
1688 EXPORT_SYMBOL(net_disable_timestamp);
1689
1690 static inline void net_timestamp_set(struct sk_buff *skb)
1691 {
1692         skb->tstamp.tv64 = 0;
1693         if (static_key_false(&netstamp_needed))
1694                 __net_timestamp(skb);
1695 }
1696
1697 #define net_timestamp_check(COND, SKB)                  \
1698         if (static_key_false(&netstamp_needed)) {               \
1699                 if ((COND) && !(SKB)->tstamp.tv64)      \
1700                         __net_timestamp(SKB);           \
1701         }                                               \
1702
1703 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1704 {
1705         unsigned int len;
1706
1707         if (!(dev->flags & IFF_UP))
1708                 return false;
1709
1710         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1711         if (skb->len <= len)
1712                 return true;
1713
1714         /* if TSO is enabled, we don't care about the length as the packet
1715          * could be forwarded without being segmented before
1716          */
1717         if (skb_is_gso(skb))
1718                 return true;
1719
1720         return false;
1721 }
1722 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1723
1724 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1725 {
1726         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1727             unlikely(!is_skb_forwardable(dev, skb))) {
1728                 atomic_long_inc(&dev->rx_dropped);
1729                 kfree_skb(skb);
1730                 return NET_RX_DROP;
1731         }
1732
1733         skb_scrub_packet(skb, true);
1734         skb->priority = 0;
1735         skb->protocol = eth_type_trans(skb, dev);
1736         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1737
1738         return 0;
1739 }
1740 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1741
1742 /**
1743  * dev_forward_skb - loopback an skb to another netif
1744  *
1745  * @dev: destination network device
1746  * @skb: buffer to forward
1747  *
1748  * return values:
1749  *      NET_RX_SUCCESS  (no congestion)
1750  *      NET_RX_DROP     (packet was dropped, but freed)
1751  *
1752  * dev_forward_skb can be used for injecting an skb from the
1753  * start_xmit function of one device into the receive queue
1754  * of another device.
1755  *
1756  * The receiving device may be in another namespace, so
1757  * we have to clear all information in the skb that could
1758  * impact namespace isolation.
1759  */
1760 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1761 {
1762         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1763 }
1764 EXPORT_SYMBOL_GPL(dev_forward_skb);
1765
1766 static inline int deliver_skb(struct sk_buff *skb,
1767                               struct packet_type *pt_prev,
1768                               struct net_device *orig_dev)
1769 {
1770         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1771                 return -ENOMEM;
1772         atomic_inc(&skb->users);
1773         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1774 }
1775
1776 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1777                                           struct packet_type **pt,
1778                                           struct net_device *orig_dev,
1779                                           __be16 type,
1780                                           struct list_head *ptype_list)
1781 {
1782         struct packet_type *ptype, *pt_prev = *pt;
1783
1784         list_for_each_entry_rcu(ptype, ptype_list, list) {
1785                 if (ptype->type != type)
1786                         continue;
1787                 if (pt_prev)
1788                         deliver_skb(skb, pt_prev, orig_dev);
1789                 pt_prev = ptype;
1790         }
1791         *pt = pt_prev;
1792 }
1793
1794 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1795 {
1796         if (!ptype->af_packet_priv || !skb->sk)
1797                 return false;
1798
1799         if (ptype->id_match)
1800                 return ptype->id_match(ptype, skb->sk);
1801         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1802                 return true;
1803
1804         return false;
1805 }
1806
1807 /*
1808  *      Support routine. Sends outgoing frames to any network
1809  *      taps currently in use.
1810  */
1811
1812 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1813 {
1814         struct packet_type *ptype;
1815         struct sk_buff *skb2 = NULL;
1816         struct packet_type *pt_prev = NULL;
1817         struct list_head *ptype_list = &ptype_all;
1818
1819         rcu_read_lock();
1820 again:
1821         list_for_each_entry_rcu(ptype, ptype_list, list) {
1822                 /* Never send packets back to the socket
1823                  * they originated from - MvS (miquels@drinkel.ow.org)
1824                  */
1825                 if (skb_loop_sk(ptype, skb))
1826                         continue;
1827
1828                 if (pt_prev) {
1829                         deliver_skb(skb2, pt_prev, skb->dev);
1830                         pt_prev = ptype;
1831                         continue;
1832                 }
1833
1834                 /* need to clone skb, done only once */
1835                 skb2 = skb_clone(skb, GFP_ATOMIC);
1836                 if (!skb2)
1837                         goto out_unlock;
1838
1839                 net_timestamp_set(skb2);
1840
1841                 /* skb->nh should be correctly
1842                  * set by sender, so that the second statement is
1843                  * just protection against buggy protocols.
1844                  */
1845                 skb_reset_mac_header(skb2);
1846
1847                 if (skb_network_header(skb2) < skb2->data ||
1848                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1849                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1850                                              ntohs(skb2->protocol),
1851                                              dev->name);
1852                         skb_reset_network_header(skb2);
1853                 }
1854
1855                 skb2->transport_header = skb2->network_header;
1856                 skb2->pkt_type = PACKET_OUTGOING;
1857                 pt_prev = ptype;
1858         }
1859
1860         if (ptype_list == &ptype_all) {
1861                 ptype_list = &dev->ptype_all;
1862                 goto again;
1863         }
1864 out_unlock:
1865         if (pt_prev)
1866                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1867         rcu_read_unlock();
1868 }
1869
1870 /**
1871  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1872  * @dev: Network device
1873  * @txq: number of queues available
1874  *
1875  * If real_num_tx_queues is changed the tc mappings may no longer be
1876  * valid. To resolve this verify the tc mapping remains valid and if
1877  * not NULL the mapping. With no priorities mapping to this
1878  * offset/count pair it will no longer be used. In the worst case TC0
1879  * is invalid nothing can be done so disable priority mappings. If is
1880  * expected that drivers will fix this mapping if they can before
1881  * calling netif_set_real_num_tx_queues.
1882  */
1883 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1884 {
1885         int i;
1886         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1887
1888         /* If TC0 is invalidated disable TC mapping */
1889         if (tc->offset + tc->count > txq) {
1890                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1891                 dev->num_tc = 0;
1892                 return;
1893         }
1894
1895         /* Invalidated prio to tc mappings set to TC0 */
1896         for (i = 1; i < TC_BITMASK + 1; i++) {
1897                 int q = netdev_get_prio_tc_map(dev, i);
1898
1899                 tc = &dev->tc_to_txq[q];
1900                 if (tc->offset + tc->count > txq) {
1901                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1902                                 i, q);
1903                         netdev_set_prio_tc_map(dev, i, 0);
1904                 }
1905         }
1906 }
1907
1908 #ifdef CONFIG_XPS
1909 static DEFINE_MUTEX(xps_map_mutex);
1910 #define xmap_dereference(P)             \
1911         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1912
1913 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1914                                         int cpu, u16 index)
1915 {
1916         struct xps_map *map = NULL;
1917         int pos;
1918
1919         if (dev_maps)
1920                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1921
1922         for (pos = 0; map && pos < map->len; pos++) {
1923                 if (map->queues[pos] == index) {
1924                         if (map->len > 1) {
1925                                 map->queues[pos] = map->queues[--map->len];
1926                         } else {
1927                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1928                                 kfree_rcu(map, rcu);
1929                                 map = NULL;
1930                         }
1931                         break;
1932                 }
1933         }
1934
1935         return map;
1936 }
1937
1938 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1939 {
1940         struct xps_dev_maps *dev_maps;
1941         int cpu, i;
1942         bool active = false;
1943
1944         mutex_lock(&xps_map_mutex);
1945         dev_maps = xmap_dereference(dev->xps_maps);
1946
1947         if (!dev_maps)
1948                 goto out_no_maps;
1949
1950         for_each_possible_cpu(cpu) {
1951                 for (i = index; i < dev->num_tx_queues; i++) {
1952                         if (!remove_xps_queue(dev_maps, cpu, i))
1953                                 break;
1954                 }
1955                 if (i == dev->num_tx_queues)
1956                         active = true;
1957         }
1958
1959         if (!active) {
1960                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1961                 kfree_rcu(dev_maps, rcu);
1962         }
1963
1964         for (i = index; i < dev->num_tx_queues; i++)
1965                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1966                                              NUMA_NO_NODE);
1967
1968 out_no_maps:
1969         mutex_unlock(&xps_map_mutex);
1970 }
1971
1972 static struct xps_map *expand_xps_map(struct xps_map *map,
1973                                       int cpu, u16 index)
1974 {
1975         struct xps_map *new_map;
1976         int alloc_len = XPS_MIN_MAP_ALLOC;
1977         int i, pos;
1978
1979         for (pos = 0; map && pos < map->len; pos++) {
1980                 if (map->queues[pos] != index)
1981                         continue;
1982                 return map;
1983         }
1984
1985         /* Need to add queue to this CPU's existing map */
1986         if (map) {
1987                 if (pos < map->alloc_len)
1988                         return map;
1989
1990                 alloc_len = map->alloc_len * 2;
1991         }
1992
1993         /* Need to allocate new map to store queue on this CPU's map */
1994         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1995                                cpu_to_node(cpu));
1996         if (!new_map)
1997                 return NULL;
1998
1999         for (i = 0; i < pos; i++)
2000                 new_map->queues[i] = map->queues[i];
2001         new_map->alloc_len = alloc_len;
2002         new_map->len = pos;
2003
2004         return new_map;
2005 }
2006
2007 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2008                         u16 index)
2009 {
2010         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2011         struct xps_map *map, *new_map;
2012         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2013         int cpu, numa_node_id = -2;
2014         bool active = false;
2015
2016         mutex_lock(&xps_map_mutex);
2017
2018         dev_maps = xmap_dereference(dev->xps_maps);
2019
2020         /* allocate memory for queue storage */
2021         for_each_online_cpu(cpu) {
2022                 if (!cpumask_test_cpu(cpu, mask))
2023                         continue;
2024
2025                 if (!new_dev_maps)
2026                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2027                 if (!new_dev_maps) {
2028                         mutex_unlock(&xps_map_mutex);
2029                         return -ENOMEM;
2030                 }
2031
2032                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2033                                  NULL;
2034
2035                 map = expand_xps_map(map, cpu, index);
2036                 if (!map)
2037                         goto error;
2038
2039                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2040         }
2041
2042         if (!new_dev_maps)
2043                 goto out_no_new_maps;
2044
2045         for_each_possible_cpu(cpu) {
2046                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2047                         /* add queue to CPU maps */
2048                         int pos = 0;
2049
2050                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2051                         while ((pos < map->len) && (map->queues[pos] != index))
2052                                 pos++;
2053
2054                         if (pos == map->len)
2055                                 map->queues[map->len++] = index;
2056 #ifdef CONFIG_NUMA
2057                         if (numa_node_id == -2)
2058                                 numa_node_id = cpu_to_node(cpu);
2059                         else if (numa_node_id != cpu_to_node(cpu))
2060                                 numa_node_id = -1;
2061 #endif
2062                 } else if (dev_maps) {
2063                         /* fill in the new device map from the old device map */
2064                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2065                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2066                 }
2067
2068         }
2069
2070         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2071
2072         /* Cleanup old maps */
2073         if (dev_maps) {
2074                 for_each_possible_cpu(cpu) {
2075                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2076                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2077                         if (map && map != new_map)
2078                                 kfree_rcu(map, rcu);
2079                 }
2080
2081                 kfree_rcu(dev_maps, rcu);
2082         }
2083
2084         dev_maps = new_dev_maps;
2085         active = true;
2086
2087 out_no_new_maps:
2088         /* update Tx queue numa node */
2089         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2090                                      (numa_node_id >= 0) ? numa_node_id :
2091                                      NUMA_NO_NODE);
2092
2093         if (!dev_maps)
2094                 goto out_no_maps;
2095
2096         /* removes queue from unused CPUs */
2097         for_each_possible_cpu(cpu) {
2098                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2099                         continue;
2100
2101                 if (remove_xps_queue(dev_maps, cpu, index))
2102                         active = true;
2103         }
2104
2105         /* free map if not active */
2106         if (!active) {
2107                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2108                 kfree_rcu(dev_maps, rcu);
2109         }
2110
2111 out_no_maps:
2112         mutex_unlock(&xps_map_mutex);
2113
2114         return 0;
2115 error:
2116         /* remove any maps that we added */
2117         for_each_possible_cpu(cpu) {
2118                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2119                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2120                                  NULL;
2121                 if (new_map && new_map != map)
2122                         kfree(new_map);
2123         }
2124
2125         mutex_unlock(&xps_map_mutex);
2126
2127         kfree(new_dev_maps);
2128         return -ENOMEM;
2129 }
2130 EXPORT_SYMBOL(netif_set_xps_queue);
2131
2132 #endif
2133 /*
2134  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2135  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2136  */
2137 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2138 {
2139         int rc;
2140
2141         if (txq < 1 || txq > dev->num_tx_queues)
2142                 return -EINVAL;
2143
2144         if (dev->reg_state == NETREG_REGISTERED ||
2145             dev->reg_state == NETREG_UNREGISTERING) {
2146                 ASSERT_RTNL();
2147
2148                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2149                                                   txq);
2150                 if (rc)
2151                         return rc;
2152
2153                 if (dev->num_tc)
2154                         netif_setup_tc(dev, txq);
2155
2156                 if (txq < dev->real_num_tx_queues) {
2157                         qdisc_reset_all_tx_gt(dev, txq);
2158 #ifdef CONFIG_XPS
2159                         netif_reset_xps_queues_gt(dev, txq);
2160 #endif
2161                 }
2162         }
2163
2164         dev->real_num_tx_queues = txq;
2165         return 0;
2166 }
2167 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2168
2169 #ifdef CONFIG_SYSFS
2170 /**
2171  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2172  *      @dev: Network device
2173  *      @rxq: Actual number of RX queues
2174  *
2175  *      This must be called either with the rtnl_lock held or before
2176  *      registration of the net device.  Returns 0 on success, or a
2177  *      negative error code.  If called before registration, it always
2178  *      succeeds.
2179  */
2180 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2181 {
2182         int rc;
2183
2184         if (rxq < 1 || rxq > dev->num_rx_queues)
2185                 return -EINVAL;
2186
2187         if (dev->reg_state == NETREG_REGISTERED) {
2188                 ASSERT_RTNL();
2189
2190                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2191                                                   rxq);
2192                 if (rc)
2193                         return rc;
2194         }
2195
2196         dev->real_num_rx_queues = rxq;
2197         return 0;
2198 }
2199 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2200 #endif
2201
2202 /**
2203  * netif_get_num_default_rss_queues - default number of RSS queues
2204  *
2205  * This routine should set an upper limit on the number of RSS queues
2206  * used by default by multiqueue devices.
2207  */
2208 int netif_get_num_default_rss_queues(void)
2209 {
2210         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2211 }
2212 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2213
2214 static inline void __netif_reschedule(struct Qdisc *q)
2215 {
2216         struct softnet_data *sd;
2217         unsigned long flags;
2218
2219         local_irq_save(flags);
2220         sd = this_cpu_ptr(&softnet_data);
2221         q->next_sched = NULL;
2222         *sd->output_queue_tailp = q;
2223         sd->output_queue_tailp = &q->next_sched;
2224         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2225         local_irq_restore(flags);
2226 }
2227
2228 void __netif_schedule(struct Qdisc *q)
2229 {
2230         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2231                 __netif_reschedule(q);
2232 }
2233 EXPORT_SYMBOL(__netif_schedule);
2234
2235 struct dev_kfree_skb_cb {
2236         enum skb_free_reason reason;
2237 };
2238
2239 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2240 {
2241         return (struct dev_kfree_skb_cb *)skb->cb;
2242 }
2243
2244 void netif_schedule_queue(struct netdev_queue *txq)
2245 {
2246         rcu_read_lock();
2247         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2248                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2249
2250                 __netif_schedule(q);
2251         }
2252         rcu_read_unlock();
2253 }
2254 EXPORT_SYMBOL(netif_schedule_queue);
2255
2256 /**
2257  *      netif_wake_subqueue - allow sending packets on subqueue
2258  *      @dev: network device
2259  *      @queue_index: sub queue index
2260  *
2261  * Resume individual transmit queue of a device with multiple transmit queues.
2262  */
2263 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2264 {
2265         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2266
2267         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2268                 struct Qdisc *q;
2269
2270                 rcu_read_lock();
2271                 q = rcu_dereference(txq->qdisc);
2272                 __netif_schedule(q);
2273                 rcu_read_unlock();
2274         }
2275 }
2276 EXPORT_SYMBOL(netif_wake_subqueue);
2277
2278 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2279 {
2280         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2281                 struct Qdisc *q;
2282
2283                 rcu_read_lock();
2284                 q = rcu_dereference(dev_queue->qdisc);
2285                 __netif_schedule(q);
2286                 rcu_read_unlock();
2287         }
2288 }
2289 EXPORT_SYMBOL(netif_tx_wake_queue);
2290
2291 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2292 {
2293         unsigned long flags;
2294
2295         if (likely(atomic_read(&skb->users) == 1)) {
2296                 smp_rmb();
2297                 atomic_set(&skb->users, 0);
2298         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2299                 return;
2300         }
2301         get_kfree_skb_cb(skb)->reason = reason;
2302         local_irq_save(flags);
2303         skb->next = __this_cpu_read(softnet_data.completion_queue);
2304         __this_cpu_write(softnet_data.completion_queue, skb);
2305         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2306         local_irq_restore(flags);
2307 }
2308 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2309
2310 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2311 {
2312         if (in_irq() || irqs_disabled())
2313                 __dev_kfree_skb_irq(skb, reason);
2314         else
2315                 dev_kfree_skb(skb);
2316 }
2317 EXPORT_SYMBOL(__dev_kfree_skb_any);
2318
2319
2320 /**
2321  * netif_device_detach - mark device as removed
2322  * @dev: network device
2323  *
2324  * Mark device as removed from system and therefore no longer available.
2325  */
2326 void netif_device_detach(struct net_device *dev)
2327 {
2328         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2329             netif_running(dev)) {
2330                 netif_tx_stop_all_queues(dev);
2331         }
2332 }
2333 EXPORT_SYMBOL(netif_device_detach);
2334
2335 /**
2336  * netif_device_attach - mark device as attached
2337  * @dev: network device
2338  *
2339  * Mark device as attached from system and restart if needed.
2340  */
2341 void netif_device_attach(struct net_device *dev)
2342 {
2343         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2344             netif_running(dev)) {
2345                 netif_tx_wake_all_queues(dev);
2346                 __netdev_watchdog_up(dev);
2347         }
2348 }
2349 EXPORT_SYMBOL(netif_device_attach);
2350
2351 /*
2352  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2353  * to be used as a distribution range.
2354  */
2355 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2356                   unsigned int num_tx_queues)
2357 {
2358         u32 hash;
2359         u16 qoffset = 0;
2360         u16 qcount = num_tx_queues;
2361
2362         if (skb_rx_queue_recorded(skb)) {
2363                 hash = skb_get_rx_queue(skb);
2364                 while (unlikely(hash >= num_tx_queues))
2365                         hash -= num_tx_queues;
2366                 return hash;
2367         }
2368
2369         if (dev->num_tc) {
2370                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2371                 qoffset = dev->tc_to_txq[tc].offset;
2372                 qcount = dev->tc_to_txq[tc].count;
2373         }
2374
2375         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2376 }
2377 EXPORT_SYMBOL(__skb_tx_hash);
2378
2379 static void skb_warn_bad_offload(const struct sk_buff *skb)
2380 {
2381         static const netdev_features_t null_features = 0;
2382         struct net_device *dev = skb->dev;
2383         const char *driver = "";
2384
2385         if (!net_ratelimit())
2386                 return;
2387
2388         if (dev && dev->dev.parent)
2389                 driver = dev_driver_string(dev->dev.parent);
2390
2391         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2392              "gso_type=%d ip_summed=%d\n",
2393              driver, dev ? &dev->features : &null_features,
2394              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2395              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2396              skb_shinfo(skb)->gso_type, skb->ip_summed);
2397 }
2398
2399 /*
2400  * Invalidate hardware checksum when packet is to be mangled, and
2401  * complete checksum manually on outgoing path.
2402  */
2403 int skb_checksum_help(struct sk_buff *skb)
2404 {
2405         __wsum csum;
2406         int ret = 0, offset;
2407
2408         if (skb->ip_summed == CHECKSUM_COMPLETE)
2409                 goto out_set_summed;
2410
2411         if (unlikely(skb_shinfo(skb)->gso_size)) {
2412                 skb_warn_bad_offload(skb);
2413                 return -EINVAL;
2414         }
2415
2416         /* Before computing a checksum, we should make sure no frag could
2417          * be modified by an external entity : checksum could be wrong.
2418          */
2419         if (skb_has_shared_frag(skb)) {
2420                 ret = __skb_linearize(skb);
2421                 if (ret)
2422                         goto out;
2423         }
2424
2425         offset = skb_checksum_start_offset(skb);
2426         BUG_ON(offset >= skb_headlen(skb));
2427         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2428
2429         offset += skb->csum_offset;
2430         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2431
2432         if (skb_cloned(skb) &&
2433             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2434                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2435                 if (ret)
2436                         goto out;
2437         }
2438
2439         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2440 out_set_summed:
2441         skb->ip_summed = CHECKSUM_NONE;
2442 out:
2443         return ret;
2444 }
2445 EXPORT_SYMBOL(skb_checksum_help);
2446
2447 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2448 {
2449         __be16 type = skb->protocol;
2450
2451         /* Tunnel gso handlers can set protocol to ethernet. */
2452         if (type == htons(ETH_P_TEB)) {
2453                 struct ethhdr *eth;
2454
2455                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2456                         return 0;
2457
2458                 eth = (struct ethhdr *)skb_mac_header(skb);
2459                 type = eth->h_proto;
2460         }
2461
2462         return __vlan_get_protocol(skb, type, depth);
2463 }
2464
2465 /**
2466  *      skb_mac_gso_segment - mac layer segmentation handler.
2467  *      @skb: buffer to segment
2468  *      @features: features for the output path (see dev->features)
2469  */
2470 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2471                                     netdev_features_t features)
2472 {
2473         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2474         struct packet_offload *ptype;
2475         int vlan_depth = skb->mac_len;
2476         __be16 type = skb_network_protocol(skb, &vlan_depth);
2477
2478         if (unlikely(!type))
2479                 return ERR_PTR(-EINVAL);
2480
2481         __skb_pull(skb, vlan_depth);
2482
2483         rcu_read_lock();
2484         list_for_each_entry_rcu(ptype, &offload_base, list) {
2485                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2486                         segs = ptype->callbacks.gso_segment(skb, features);
2487                         break;
2488                 }
2489         }
2490         rcu_read_unlock();
2491
2492         __skb_push(skb, skb->data - skb_mac_header(skb));
2493
2494         return segs;
2495 }
2496 EXPORT_SYMBOL(skb_mac_gso_segment);
2497
2498
2499 /* openvswitch calls this on rx path, so we need a different check.
2500  */
2501 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2502 {
2503         if (tx_path)
2504                 return skb->ip_summed != CHECKSUM_PARTIAL;
2505         else
2506                 return skb->ip_summed == CHECKSUM_NONE;
2507 }
2508
2509 /**
2510  *      __skb_gso_segment - Perform segmentation on skb.
2511  *      @skb: buffer to segment
2512  *      @features: features for the output path (see dev->features)
2513  *      @tx_path: whether it is called in TX path
2514  *
2515  *      This function segments the given skb and returns a list of segments.
2516  *
2517  *      It may return NULL if the skb requires no segmentation.  This is
2518  *      only possible when GSO is used for verifying header integrity.
2519  */
2520 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2521                                   netdev_features_t features, bool tx_path)
2522 {
2523         if (unlikely(skb_needs_check(skb, tx_path))) {
2524                 int err;
2525
2526                 skb_warn_bad_offload(skb);
2527
2528                 err = skb_cow_head(skb, 0);
2529                 if (err < 0)
2530                         return ERR_PTR(err);
2531         }
2532
2533         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2534         SKB_GSO_CB(skb)->encap_level = 0;
2535
2536         skb_reset_mac_header(skb);
2537         skb_reset_mac_len(skb);
2538
2539         return skb_mac_gso_segment(skb, features);
2540 }
2541 EXPORT_SYMBOL(__skb_gso_segment);
2542
2543 /* Take action when hardware reception checksum errors are detected. */
2544 #ifdef CONFIG_BUG
2545 void netdev_rx_csum_fault(struct net_device *dev)
2546 {
2547         if (net_ratelimit()) {
2548                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2549                 dump_stack();
2550         }
2551 }
2552 EXPORT_SYMBOL(netdev_rx_csum_fault);
2553 #endif
2554
2555 /* Actually, we should eliminate this check as soon as we know, that:
2556  * 1. IOMMU is present and allows to map all the memory.
2557  * 2. No high memory really exists on this machine.
2558  */
2559
2560 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2561 {
2562 #ifdef CONFIG_HIGHMEM
2563         int i;
2564         if (!(dev->features & NETIF_F_HIGHDMA)) {
2565                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2566                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2567                         if (PageHighMem(skb_frag_page(frag)))
2568                                 return 1;
2569                 }
2570         }
2571
2572         if (PCI_DMA_BUS_IS_PHYS) {
2573                 struct device *pdev = dev->dev.parent;
2574
2575                 if (!pdev)
2576                         return 0;
2577                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2578                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2579                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2580                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2581                                 return 1;
2582                 }
2583         }
2584 #endif
2585         return 0;
2586 }
2587
2588 /* If MPLS offload request, verify we are testing hardware MPLS features
2589  * instead of standard features for the netdev.
2590  */
2591 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2592 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2593                                            netdev_features_t features,
2594                                            __be16 type)
2595 {
2596         if (eth_p_mpls(type))
2597                 features &= skb->dev->mpls_features;
2598
2599         return features;
2600 }
2601 #else
2602 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2603                                            netdev_features_t features,
2604                                            __be16 type)
2605 {
2606         return features;
2607 }
2608 #endif
2609
2610 static netdev_features_t harmonize_features(struct sk_buff *skb,
2611         netdev_features_t features)
2612 {
2613         int tmp;
2614         __be16 type;
2615
2616         type = skb_network_protocol(skb, &tmp);
2617         features = net_mpls_features(skb, features, type);
2618
2619         if (skb->ip_summed != CHECKSUM_NONE &&
2620             !can_checksum_protocol(features, type)) {
2621                 features &= ~NETIF_F_ALL_CSUM;
2622         } else if (illegal_highdma(skb->dev, skb)) {
2623                 features &= ~NETIF_F_SG;
2624         }
2625
2626         return features;
2627 }
2628
2629 netdev_features_t passthru_features_check(struct sk_buff *skb,
2630                                           struct net_device *dev,
2631                                           netdev_features_t features)
2632 {
2633         return features;
2634 }
2635 EXPORT_SYMBOL(passthru_features_check);
2636
2637 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2638                                              struct net_device *dev,
2639                                              netdev_features_t features)
2640 {
2641         return vlan_features_check(skb, features);
2642 }
2643
2644 netdev_features_t netif_skb_features(struct sk_buff *skb)
2645 {
2646         struct net_device *dev = skb->dev;
2647         netdev_features_t features = dev->features;
2648         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2649
2650         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2651                 features &= ~NETIF_F_GSO_MASK;
2652
2653         /* If encapsulation offload request, verify we are testing
2654          * hardware encapsulation features instead of standard
2655          * features for the netdev
2656          */
2657         if (skb->encapsulation)
2658                 features &= dev->hw_enc_features;
2659
2660         if (skb_vlan_tagged(skb))
2661                 features = netdev_intersect_features(features,
2662                                                      dev->vlan_features |
2663                                                      NETIF_F_HW_VLAN_CTAG_TX |
2664                                                      NETIF_F_HW_VLAN_STAG_TX);
2665
2666         if (dev->netdev_ops->ndo_features_check)
2667                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2668                                                                 features);
2669         else
2670                 features &= dflt_features_check(skb, dev, features);
2671
2672         return harmonize_features(skb, features);
2673 }
2674 EXPORT_SYMBOL(netif_skb_features);
2675
2676 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2677                     struct netdev_queue *txq, bool more)
2678 {
2679         unsigned int len;
2680         int rc;
2681
2682         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2683                 dev_queue_xmit_nit(skb, dev);
2684
2685         len = skb->len;
2686         trace_net_dev_start_xmit(skb, dev);
2687         rc = netdev_start_xmit(skb, dev, txq, more);
2688         trace_net_dev_xmit(skb, rc, dev, len);
2689
2690         return rc;
2691 }
2692
2693 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2694                                     struct netdev_queue *txq, int *ret)
2695 {
2696         struct sk_buff *skb = first;
2697         int rc = NETDEV_TX_OK;
2698
2699         while (skb) {
2700                 struct sk_buff *next = skb->next;
2701
2702                 skb->next = NULL;
2703                 rc = xmit_one(skb, dev, txq, next != NULL);
2704                 if (unlikely(!dev_xmit_complete(rc))) {
2705                         skb->next = next;
2706                         goto out;
2707                 }
2708
2709                 skb = next;
2710                 if (netif_xmit_stopped(txq) && skb) {
2711                         rc = NETDEV_TX_BUSY;
2712                         break;
2713                 }
2714         }
2715
2716 out:
2717         *ret = rc;
2718         return skb;
2719 }
2720
2721 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2722                                           netdev_features_t features)
2723 {
2724         if (skb_vlan_tag_present(skb) &&
2725             !vlan_hw_offload_capable(features, skb->vlan_proto))
2726                 skb = __vlan_hwaccel_push_inside(skb);
2727         return skb;
2728 }
2729
2730 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2731 {
2732         netdev_features_t features;
2733
2734         if (skb->next)
2735                 return skb;
2736
2737         features = netif_skb_features(skb);
2738         skb = validate_xmit_vlan(skb, features);
2739         if (unlikely(!skb))
2740                 goto out_null;
2741
2742         if (netif_needs_gso(skb, features)) {
2743                 struct sk_buff *segs;
2744
2745                 segs = skb_gso_segment(skb, features);
2746                 if (IS_ERR(segs)) {
2747                         goto out_kfree_skb;
2748                 } else if (segs) {
2749                         consume_skb(skb);
2750                         skb = segs;
2751                 }
2752         } else {
2753                 if (skb_needs_linearize(skb, features) &&
2754                     __skb_linearize(skb))
2755                         goto out_kfree_skb;
2756
2757                 /* If packet is not checksummed and device does not
2758                  * support checksumming for this protocol, complete
2759                  * checksumming here.
2760                  */
2761                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2762                         if (skb->encapsulation)
2763                                 skb_set_inner_transport_header(skb,
2764                                                                skb_checksum_start_offset(skb));
2765                         else
2766                                 skb_set_transport_header(skb,
2767                                                          skb_checksum_start_offset(skb));
2768                         if (!(features & NETIF_F_ALL_CSUM) &&
2769                             skb_checksum_help(skb))
2770                                 goto out_kfree_skb;
2771                 }
2772         }
2773
2774         return skb;
2775
2776 out_kfree_skb:
2777         kfree_skb(skb);
2778 out_null:
2779         return NULL;
2780 }
2781
2782 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2783 {
2784         struct sk_buff *next, *head = NULL, *tail;
2785
2786         for (; skb != NULL; skb = next) {
2787                 next = skb->next;
2788                 skb->next = NULL;
2789
2790                 /* in case skb wont be segmented, point to itself */
2791                 skb->prev = skb;
2792
2793                 skb = validate_xmit_skb(skb, dev);
2794                 if (!skb)
2795                         continue;
2796
2797                 if (!head)
2798                         head = skb;
2799                 else
2800                         tail->next = skb;
2801                 /* If skb was segmented, skb->prev points to
2802                  * the last segment. If not, it still contains skb.
2803                  */
2804                 tail = skb->prev;
2805         }
2806         return head;
2807 }
2808
2809 static void qdisc_pkt_len_init(struct sk_buff *skb)
2810 {
2811         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2812
2813         qdisc_skb_cb(skb)->pkt_len = skb->len;
2814
2815         /* To get more precise estimation of bytes sent on wire,
2816          * we add to pkt_len the headers size of all segments
2817          */
2818         if (shinfo->gso_size)  {
2819                 unsigned int hdr_len;
2820                 u16 gso_segs = shinfo->gso_segs;
2821
2822                 /* mac layer + network layer */
2823                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2824
2825                 /* + transport layer */
2826                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2827                         hdr_len += tcp_hdrlen(skb);
2828                 else
2829                         hdr_len += sizeof(struct udphdr);
2830
2831                 if (shinfo->gso_type & SKB_GSO_DODGY)
2832                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2833                                                 shinfo->gso_size);
2834
2835                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2836         }
2837 }
2838
2839 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2840                                  struct net_device *dev,
2841                                  struct netdev_queue *txq)
2842 {
2843         spinlock_t *root_lock = qdisc_lock(q);
2844         bool contended;
2845         int rc;
2846
2847         qdisc_pkt_len_init(skb);
2848         qdisc_calculate_pkt_len(skb, q);
2849         /*
2850          * Heuristic to force contended enqueues to serialize on a
2851          * separate lock before trying to get qdisc main lock.
2852          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2853          * often and dequeue packets faster.
2854          */
2855         contended = qdisc_is_running(q);
2856         if (unlikely(contended))
2857                 spin_lock(&q->busylock);
2858
2859         spin_lock(root_lock);
2860         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2861                 kfree_skb(skb);
2862                 rc = NET_XMIT_DROP;
2863         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2864                    qdisc_run_begin(q)) {
2865                 /*
2866                  * This is a work-conserving queue; there are no old skbs
2867                  * waiting to be sent out; and the qdisc is not running -
2868                  * xmit the skb directly.
2869                  */
2870
2871                 qdisc_bstats_update(q, skb);
2872
2873                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2874                         if (unlikely(contended)) {
2875                                 spin_unlock(&q->busylock);
2876                                 contended = false;
2877                         }
2878                         __qdisc_run(q);
2879                 } else
2880                         qdisc_run_end(q);
2881
2882                 rc = NET_XMIT_SUCCESS;
2883         } else {
2884                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2885                 if (qdisc_run_begin(q)) {
2886                         if (unlikely(contended)) {
2887                                 spin_unlock(&q->busylock);
2888                                 contended = false;
2889                         }
2890                         __qdisc_run(q);
2891                 }
2892         }
2893         spin_unlock(root_lock);
2894         if (unlikely(contended))
2895                 spin_unlock(&q->busylock);
2896         return rc;
2897 }
2898
2899 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2900 static void skb_update_prio(struct sk_buff *skb)
2901 {
2902         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2903
2904         if (!skb->priority && skb->sk && map) {
2905                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2906
2907                 if (prioidx < map->priomap_len)
2908                         skb->priority = map->priomap[prioidx];
2909         }
2910 }
2911 #else
2912 #define skb_update_prio(skb)
2913 #endif
2914
2915 DEFINE_PER_CPU(int, xmit_recursion);
2916 EXPORT_SYMBOL(xmit_recursion);
2917
2918 #define RECURSION_LIMIT 10
2919
2920 /**
2921  *      dev_loopback_xmit - loop back @skb
2922  *      @skb: buffer to transmit
2923  */
2924 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2925 {
2926         skb_reset_mac_header(skb);
2927         __skb_pull(skb, skb_network_offset(skb));
2928         skb->pkt_type = PACKET_LOOPBACK;
2929         skb->ip_summed = CHECKSUM_UNNECESSARY;
2930         WARN_ON(!skb_dst(skb));
2931         skb_dst_force(skb);
2932         netif_rx_ni(skb);
2933         return 0;
2934 }
2935 EXPORT_SYMBOL(dev_loopback_xmit);
2936
2937 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2938 {
2939 #ifdef CONFIG_XPS
2940         struct xps_dev_maps *dev_maps;
2941         struct xps_map *map;
2942         int queue_index = -1;
2943
2944         rcu_read_lock();
2945         dev_maps = rcu_dereference(dev->xps_maps);
2946         if (dev_maps) {
2947                 map = rcu_dereference(
2948                     dev_maps->cpu_map[skb->sender_cpu - 1]);
2949                 if (map) {
2950                         if (map->len == 1)
2951                                 queue_index = map->queues[0];
2952                         else
2953                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2954                                                                            map->len)];
2955                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2956                                 queue_index = -1;
2957                 }
2958         }
2959         rcu_read_unlock();
2960
2961         return queue_index;
2962 #else
2963         return -1;
2964 #endif
2965 }
2966
2967 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2968 {
2969         struct sock *sk = skb->sk;
2970         int queue_index = sk_tx_queue_get(sk);
2971
2972         if (queue_index < 0 || skb->ooo_okay ||
2973             queue_index >= dev->real_num_tx_queues) {
2974                 int new_index = get_xps_queue(dev, skb);
2975                 if (new_index < 0)
2976                         new_index = skb_tx_hash(dev, skb);
2977
2978                 if (queue_index != new_index && sk &&
2979                     rcu_access_pointer(sk->sk_dst_cache))
2980                         sk_tx_queue_set(sk, new_index);
2981
2982                 queue_index = new_index;
2983         }
2984
2985         return queue_index;
2986 }
2987
2988 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2989                                     struct sk_buff *skb,
2990                                     void *accel_priv)
2991 {
2992         int queue_index = 0;
2993
2994 #ifdef CONFIG_XPS
2995         if (skb->sender_cpu == 0)
2996                 skb->sender_cpu = raw_smp_processor_id() + 1;
2997 #endif
2998
2999         if (dev->real_num_tx_queues != 1) {
3000                 const struct net_device_ops *ops = dev->netdev_ops;
3001                 if (ops->ndo_select_queue)
3002                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3003                                                             __netdev_pick_tx);
3004                 else
3005                         queue_index = __netdev_pick_tx(dev, skb);
3006
3007                 if (!accel_priv)
3008                         queue_index = netdev_cap_txqueue(dev, queue_index);
3009         }
3010
3011         skb_set_queue_mapping(skb, queue_index);
3012         return netdev_get_tx_queue(dev, queue_index);
3013 }
3014
3015 /**
3016  *      __dev_queue_xmit - transmit a buffer
3017  *      @skb: buffer to transmit
3018  *      @accel_priv: private data used for L2 forwarding offload
3019  *
3020  *      Queue a buffer for transmission to a network device. The caller must
3021  *      have set the device and priority and built the buffer before calling
3022  *      this function. The function can be called from an interrupt.
3023  *
3024  *      A negative errno code is returned on a failure. A success does not
3025  *      guarantee the frame will be transmitted as it may be dropped due
3026  *      to congestion or traffic shaping.
3027  *
3028  * -----------------------------------------------------------------------------------
3029  *      I notice this method can also return errors from the queue disciplines,
3030  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3031  *      be positive.
3032  *
3033  *      Regardless of the return value, the skb is consumed, so it is currently
3034  *      difficult to retry a send to this method.  (You can bump the ref count
3035  *      before sending to hold a reference for retry if you are careful.)
3036  *
3037  *      When calling this method, interrupts MUST be enabled.  This is because
3038  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3039  *          --BLG
3040  */
3041 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3042 {
3043         struct net_device *dev = skb->dev;
3044         struct netdev_queue *txq;
3045         struct Qdisc *q;
3046         int rc = -ENOMEM;
3047
3048         skb_reset_mac_header(skb);
3049
3050         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3051                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3052
3053         /* Disable soft irqs for various locks below. Also
3054          * stops preemption for RCU.
3055          */
3056         rcu_read_lock_bh();
3057
3058         skb_update_prio(skb);
3059
3060         /* If device/qdisc don't need skb->dst, release it right now while
3061          * its hot in this cpu cache.
3062          */
3063         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3064                 skb_dst_drop(skb);
3065         else
3066                 skb_dst_force(skb);
3067
3068         txq = netdev_pick_tx(dev, skb, accel_priv);
3069         q = rcu_dereference_bh(txq->qdisc);
3070
3071 #ifdef CONFIG_NET_CLS_ACT
3072         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3073 #endif
3074         trace_net_dev_queue(skb);
3075         if (q->enqueue) {
3076                 rc = __dev_xmit_skb(skb, q, dev, txq);
3077                 goto out;
3078         }
3079
3080         /* The device has no queue. Common case for software devices:
3081            loopback, all the sorts of tunnels...
3082
3083            Really, it is unlikely that netif_tx_lock protection is necessary
3084            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3085            counters.)
3086            However, it is possible, that they rely on protection
3087            made by us here.
3088
3089            Check this and shot the lock. It is not prone from deadlocks.
3090            Either shot noqueue qdisc, it is even simpler 8)
3091          */
3092         if (dev->flags & IFF_UP) {
3093                 int cpu = smp_processor_id(); /* ok because BHs are off */
3094
3095                 if (txq->xmit_lock_owner != cpu) {
3096
3097                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3098                                 goto recursion_alert;
3099
3100                         skb = validate_xmit_skb(skb, dev);
3101                         if (!skb)
3102                                 goto drop;
3103
3104                         HARD_TX_LOCK(dev, txq, cpu);
3105
3106                         if (!netif_xmit_stopped(txq)) {
3107                                 __this_cpu_inc(xmit_recursion);
3108                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3109                                 __this_cpu_dec(xmit_recursion);
3110                                 if (dev_xmit_complete(rc)) {
3111                                         HARD_TX_UNLOCK(dev, txq);
3112                                         goto out;
3113                                 }
3114                         }
3115                         HARD_TX_UNLOCK(dev, txq);
3116                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3117                                              dev->name);
3118                 } else {
3119                         /* Recursion is detected! It is possible,
3120                          * unfortunately
3121                          */
3122 recursion_alert:
3123                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3124                                              dev->name);
3125                 }
3126         }
3127
3128         rc = -ENETDOWN;
3129 drop:
3130         rcu_read_unlock_bh();
3131
3132         atomic_long_inc(&dev->tx_dropped);
3133         kfree_skb_list(skb);
3134         return rc;
3135 out:
3136         rcu_read_unlock_bh();
3137         return rc;
3138 }
3139
3140 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3141 {
3142         return __dev_queue_xmit(skb, NULL);
3143 }
3144 EXPORT_SYMBOL(dev_queue_xmit_sk);
3145
3146 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3147 {
3148         return __dev_queue_xmit(skb, accel_priv);
3149 }
3150 EXPORT_SYMBOL(dev_queue_xmit_accel);
3151
3152
3153 /*=======================================================================
3154                         Receiver routines
3155   =======================================================================*/
3156
3157 int netdev_max_backlog __read_mostly = 1000;
3158 EXPORT_SYMBOL(netdev_max_backlog);
3159
3160 int netdev_tstamp_prequeue __read_mostly = 1;
3161 int netdev_budget __read_mostly = 300;
3162 int weight_p __read_mostly = 64;            /* old backlog weight */
3163
3164 /* Called with irq disabled */
3165 static inline void ____napi_schedule(struct softnet_data *sd,
3166                                      struct napi_struct *napi)
3167 {
3168         list_add_tail(&napi->poll_list, &sd->poll_list);
3169         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3170 }
3171
3172 #ifdef CONFIG_RPS
3173
3174 /* One global table that all flow-based protocols share. */
3175 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3176 EXPORT_SYMBOL(rps_sock_flow_table);
3177 u32 rps_cpu_mask __read_mostly;
3178 EXPORT_SYMBOL(rps_cpu_mask);
3179
3180 struct static_key rps_needed __read_mostly;
3181
3182 static struct rps_dev_flow *
3183 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3184             struct rps_dev_flow *rflow, u16 next_cpu)
3185 {
3186         if (next_cpu < nr_cpu_ids) {
3187 #ifdef CONFIG_RFS_ACCEL
3188                 struct netdev_rx_queue *rxqueue;
3189                 struct rps_dev_flow_table *flow_table;
3190                 struct rps_dev_flow *old_rflow;
3191                 u32 flow_id;
3192                 u16 rxq_index;
3193                 int rc;
3194
3195                 /* Should we steer this flow to a different hardware queue? */
3196                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3197                     !(dev->features & NETIF_F_NTUPLE))
3198                         goto out;
3199                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3200                 if (rxq_index == skb_get_rx_queue(skb))
3201                         goto out;
3202
3203                 rxqueue = dev->_rx + rxq_index;
3204                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3205                 if (!flow_table)
3206                         goto out;
3207                 flow_id = skb_get_hash(skb) & flow_table->mask;
3208                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3209                                                         rxq_index, flow_id);
3210                 if (rc < 0)
3211                         goto out;
3212                 old_rflow = rflow;
3213                 rflow = &flow_table->flows[flow_id];
3214                 rflow->filter = rc;
3215                 if (old_rflow->filter == rflow->filter)
3216                         old_rflow->filter = RPS_NO_FILTER;
3217         out:
3218 #endif
3219                 rflow->last_qtail =
3220                         per_cpu(softnet_data, next_cpu).input_queue_head;
3221         }
3222
3223         rflow->cpu = next_cpu;
3224         return rflow;
3225 }
3226
3227 /*
3228  * get_rps_cpu is called from netif_receive_skb and returns the target
3229  * CPU from the RPS map of the receiving queue for a given skb.
3230  * rcu_read_lock must be held on entry.
3231  */
3232 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3233                        struct rps_dev_flow **rflowp)
3234 {
3235         const struct rps_sock_flow_table *sock_flow_table;
3236         struct netdev_rx_queue *rxqueue = dev->_rx;
3237         struct rps_dev_flow_table *flow_table;
3238         struct rps_map *map;
3239         int cpu = -1;
3240         u32 tcpu;
3241         u32 hash;
3242
3243         if (skb_rx_queue_recorded(skb)) {
3244                 u16 index = skb_get_rx_queue(skb);
3245
3246                 if (unlikely(index >= dev->real_num_rx_queues)) {
3247                         WARN_ONCE(dev->real_num_rx_queues > 1,
3248                                   "%s received packet on queue %u, but number "
3249                                   "of RX queues is %u\n",
3250                                   dev->name, index, dev->real_num_rx_queues);
3251                         goto done;
3252                 }
3253                 rxqueue += index;
3254         }
3255
3256         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3257
3258         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3259         map = rcu_dereference(rxqueue->rps_map);
3260         if (!flow_table && !map)
3261                 goto done;
3262
3263         skb_reset_network_header(skb);
3264         hash = skb_get_hash(skb);
3265         if (!hash)
3266                 goto done;
3267
3268         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3269         if (flow_table && sock_flow_table) {
3270                 struct rps_dev_flow *rflow;
3271                 u32 next_cpu;
3272                 u32 ident;
3273
3274                 /* First check into global flow table if there is a match */
3275                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3276                 if ((ident ^ hash) & ~rps_cpu_mask)
3277                         goto try_rps;
3278
3279                 next_cpu = ident & rps_cpu_mask;
3280
3281                 /* OK, now we know there is a match,
3282                  * we can look at the local (per receive queue) flow table
3283                  */
3284                 rflow = &flow_table->flows[hash & flow_table->mask];
3285                 tcpu = rflow->cpu;
3286
3287                 /*
3288                  * If the desired CPU (where last recvmsg was done) is
3289                  * different from current CPU (one in the rx-queue flow
3290                  * table entry), switch if one of the following holds:
3291                  *   - Current CPU is unset (>= nr_cpu_ids).
3292                  *   - Current CPU is offline.
3293                  *   - The current CPU's queue tail has advanced beyond the
3294                  *     last packet that was enqueued using this table entry.
3295                  *     This guarantees that all previous packets for the flow
3296                  *     have been dequeued, thus preserving in order delivery.
3297                  */
3298                 if (unlikely(tcpu != next_cpu) &&
3299                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3300                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3301                       rflow->last_qtail)) >= 0)) {
3302                         tcpu = next_cpu;
3303                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3304                 }
3305
3306                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3307                         *rflowp = rflow;
3308                         cpu = tcpu;
3309                         goto done;
3310                 }
3311         }
3312
3313 try_rps:
3314
3315         if (map) {
3316                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3317                 if (cpu_online(tcpu)) {
3318                         cpu = tcpu;
3319                         goto done;
3320                 }
3321         }
3322
3323 done:
3324         return cpu;
3325 }
3326
3327 #ifdef CONFIG_RFS_ACCEL
3328
3329 /**
3330  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3331  * @dev: Device on which the filter was set
3332  * @rxq_index: RX queue index
3333  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3334  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3335  *
3336  * Drivers that implement ndo_rx_flow_steer() should periodically call
3337  * this function for each installed filter and remove the filters for
3338  * which it returns %true.
3339  */
3340 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3341                          u32 flow_id, u16 filter_id)
3342 {
3343         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3344         struct rps_dev_flow_table *flow_table;
3345         struct rps_dev_flow *rflow;
3346         bool expire = true;
3347         unsigned int cpu;
3348
3349         rcu_read_lock();
3350         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3351         if (flow_table && flow_id <= flow_table->mask) {
3352                 rflow = &flow_table->flows[flow_id];
3353                 cpu = ACCESS_ONCE(rflow->cpu);
3354                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3355                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3356                            rflow->last_qtail) <
3357                      (int)(10 * flow_table->mask)))
3358                         expire = false;
3359         }
3360         rcu_read_unlock();
3361         return expire;
3362 }
3363 EXPORT_SYMBOL(rps_may_expire_flow);
3364
3365 #endif /* CONFIG_RFS_ACCEL */
3366
3367 /* Called from hardirq (IPI) context */
3368 static void rps_trigger_softirq(void *data)
3369 {
3370         struct softnet_data *sd = data;
3371
3372         ____napi_schedule(sd, &sd->backlog);
3373         sd->received_rps++;
3374 }
3375
3376 #endif /* CONFIG_RPS */
3377
3378 /*
3379  * Check if this softnet_data structure is another cpu one
3380  * If yes, queue it to our IPI list and return 1
3381  * If no, return 0
3382  */
3383 static int rps_ipi_queued(struct softnet_data *sd)
3384 {
3385 #ifdef CONFIG_RPS
3386         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3387
3388         if (sd != mysd) {
3389                 sd->rps_ipi_next = mysd->rps_ipi_list;
3390                 mysd->rps_ipi_list = sd;
3391
3392                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3393                 return 1;
3394         }
3395 #endif /* CONFIG_RPS */
3396         return 0;
3397 }
3398
3399 #ifdef CONFIG_NET_FLOW_LIMIT
3400 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3401 #endif
3402
3403 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3404 {
3405 #ifdef CONFIG_NET_FLOW_LIMIT
3406         struct sd_flow_limit *fl;
3407         struct softnet_data *sd;
3408         unsigned int old_flow, new_flow;
3409
3410         if (qlen < (netdev_max_backlog >> 1))
3411                 return false;
3412
3413         sd = this_cpu_ptr(&softnet_data);
3414
3415         rcu_read_lock();
3416         fl = rcu_dereference(sd->flow_limit);
3417         if (fl) {
3418                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3419                 old_flow = fl->history[fl->history_head];
3420                 fl->history[fl->history_head] = new_flow;
3421
3422                 fl->history_head++;
3423                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3424
3425                 if (likely(fl->buckets[old_flow]))
3426                         fl->buckets[old_flow]--;
3427
3428                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3429                         fl->count++;
3430                         rcu_read_unlock();
3431                         return true;
3432                 }
3433         }
3434         rcu_read_unlock();
3435 #endif
3436         return false;
3437 }
3438
3439 /*
3440  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3441  * queue (may be a remote CPU queue).
3442  */
3443 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3444                               unsigned int *qtail)
3445 {
3446         struct softnet_data *sd;
3447         unsigned long flags;
3448         unsigned int qlen;
3449
3450         sd = &per_cpu(softnet_data, cpu);
3451
3452         local_irq_save(flags);
3453
3454         rps_lock(sd);
3455         qlen = skb_queue_len(&sd->input_pkt_queue);
3456         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3457                 if (qlen) {
3458 enqueue:
3459                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3460                         input_queue_tail_incr_save(sd, qtail);
3461                         rps_unlock(sd);
3462                         local_irq_restore(flags);
3463                         return NET_RX_SUCCESS;
3464                 }
3465
3466                 /* Schedule NAPI for backlog device
3467                  * We can use non atomic operation since we own the queue lock
3468                  */
3469                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3470                         if (!rps_ipi_queued(sd))
3471                                 ____napi_schedule(sd, &sd->backlog);
3472                 }
3473                 goto enqueue;
3474         }
3475
3476         sd->dropped++;
3477         rps_unlock(sd);
3478
3479         local_irq_restore(flags);
3480
3481         atomic_long_inc(&skb->dev->rx_dropped);
3482         kfree_skb(skb);
3483         return NET_RX_DROP;
3484 }
3485
3486 static int netif_rx_internal(struct sk_buff *skb)
3487 {
3488         int ret;
3489
3490         net_timestamp_check(netdev_tstamp_prequeue, skb);
3491
3492         trace_netif_rx(skb);
3493 #ifdef CONFIG_RPS
3494         if (static_key_false(&rps_needed)) {
3495                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3496                 int cpu;
3497
3498                 preempt_disable();
3499                 rcu_read_lock();
3500
3501                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3502                 if (cpu < 0)
3503                         cpu = smp_processor_id();
3504
3505                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3506
3507                 rcu_read_unlock();
3508                 preempt_enable();
3509         } else
3510 #endif
3511         {
3512                 unsigned int qtail;
3513                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3514                 put_cpu();
3515         }
3516         return ret;
3517 }
3518
3519 /**
3520  *      netif_rx        -       post buffer to the network code
3521  *      @skb: buffer to post
3522  *
3523  *      This function receives a packet from a device driver and queues it for
3524  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3525  *      may be dropped during processing for congestion control or by the
3526  *      protocol layers.
3527  *
3528  *      return values:
3529  *      NET_RX_SUCCESS  (no congestion)
3530  *      NET_RX_DROP     (packet was dropped)
3531  *
3532  */
3533
3534 int netif_rx(struct sk_buff *skb)
3535 {
3536         trace_netif_rx_entry(skb);
3537
3538         return netif_rx_internal(skb);
3539 }
3540 EXPORT_SYMBOL(netif_rx);
3541
3542 int netif_rx_ni(struct sk_buff *skb)
3543 {
3544         int err;
3545
3546         trace_netif_rx_ni_entry(skb);
3547
3548         preempt_disable();
3549         err = netif_rx_internal(skb);
3550         if (local_softirq_pending())
3551                 do_softirq();
3552         preempt_enable();
3553
3554         return err;
3555 }
3556 EXPORT_SYMBOL(netif_rx_ni);
3557
3558 static void net_tx_action(struct softirq_action *h)
3559 {
3560         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3561
3562         if (sd->completion_queue) {
3563                 struct sk_buff *clist;
3564
3565                 local_irq_disable();
3566                 clist = sd->completion_queue;
3567                 sd->completion_queue = NULL;
3568                 local_irq_enable();
3569
3570                 while (clist) {
3571                         struct sk_buff *skb = clist;
3572                         clist = clist->next;
3573
3574                         WARN_ON(atomic_read(&skb->users));
3575                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3576                                 trace_consume_skb(skb);
3577                         else
3578                                 trace_kfree_skb(skb, net_tx_action);
3579                         __kfree_skb(skb);
3580                 }
3581         }
3582
3583         if (sd->output_queue) {
3584                 struct Qdisc *head;
3585
3586                 local_irq_disable();
3587                 head = sd->output_queue;
3588                 sd->output_queue = NULL;
3589                 sd->output_queue_tailp = &sd->output_queue;
3590                 local_irq_enable();
3591
3592                 while (head) {
3593                         struct Qdisc *q = head;
3594                         spinlock_t *root_lock;
3595
3596                         head = head->next_sched;
3597
3598                         root_lock = qdisc_lock(q);
3599                         if (spin_trylock(root_lock)) {
3600                                 smp_mb__before_atomic();
3601                                 clear_bit(__QDISC_STATE_SCHED,
3602                                           &q->state);
3603                                 qdisc_run(q);
3604                                 spin_unlock(root_lock);
3605                         } else {
3606                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3607                                               &q->state)) {
3608                                         __netif_reschedule(q);
3609                                 } else {
3610                                         smp_mb__before_atomic();
3611                                         clear_bit(__QDISC_STATE_SCHED,
3612                                                   &q->state);
3613                                 }
3614                         }
3615                 }
3616         }
3617 }
3618
3619 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3620     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3621 /* This hook is defined here for ATM LANE */
3622 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3623                              unsigned char *addr) __read_mostly;
3624 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3625 #endif
3626
3627 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3628                                          struct packet_type **pt_prev,
3629                                          int *ret, struct net_device *orig_dev)
3630 {
3631 #ifdef CONFIG_NET_CLS_ACT
3632         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3633         struct tcf_result cl_res;
3634
3635         /* If there's at least one ingress present somewhere (so
3636          * we get here via enabled static key), remaining devices
3637          * that are not configured with an ingress qdisc will bail
3638          * out here.
3639          */
3640         if (!cl)
3641                 return skb;
3642         if (*pt_prev) {
3643                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3644                 *pt_prev = NULL;
3645         }
3646
3647         qdisc_skb_cb(skb)->pkt_len = skb->len;
3648         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3649         qdisc_bstats_update_cpu(cl->q, skb);
3650
3651         switch (tc_classify(skb, cl, &cl_res)) {
3652         case TC_ACT_OK:
3653         case TC_ACT_RECLASSIFY:
3654                 skb->tc_index = TC_H_MIN(cl_res.classid);
3655                 break;
3656         case TC_ACT_SHOT:
3657                 qdisc_qstats_drop_cpu(cl->q);
3658         case TC_ACT_STOLEN:
3659         case TC_ACT_QUEUED:
3660                 kfree_skb(skb);
3661                 return NULL;
3662         default:
3663                 break;
3664         }
3665 #endif /* CONFIG_NET_CLS_ACT */
3666         return skb;
3667 }
3668
3669 /**
3670  *      netdev_rx_handler_register - register receive handler
3671  *      @dev: device to register a handler for
3672  *      @rx_handler: receive handler to register
3673  *      @rx_handler_data: data pointer that is used by rx handler
3674  *
3675  *      Register a receive handler for a device. This handler will then be
3676  *      called from __netif_receive_skb. A negative errno code is returned
3677  *      on a failure.
3678  *
3679  *      The caller must hold the rtnl_mutex.
3680  *
3681  *      For a general description of rx_handler, see enum rx_handler_result.
3682  */
3683 int netdev_rx_handler_register(struct net_device *dev,
3684                                rx_handler_func_t *rx_handler,
3685                                void *rx_handler_data)
3686 {
3687         ASSERT_RTNL();
3688
3689         if (dev->rx_handler)
3690                 return -EBUSY;
3691
3692         /* Note: rx_handler_data must be set before rx_handler */
3693         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3694         rcu_assign_pointer(dev->rx_handler, rx_handler);
3695
3696         return 0;
3697 }
3698 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3699
3700 /**
3701  *      netdev_rx_handler_unregister - unregister receive handler
3702  *      @dev: device to unregister a handler from
3703  *
3704  *      Unregister a receive handler from a device.
3705  *
3706  *      The caller must hold the rtnl_mutex.
3707  */
3708 void netdev_rx_handler_unregister(struct net_device *dev)
3709 {
3710
3711         ASSERT_RTNL();
3712         RCU_INIT_POINTER(dev->rx_handler, NULL);
3713         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3714          * section has a guarantee to see a non NULL rx_handler_data
3715          * as well.
3716          */
3717         synchronize_net();
3718         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3719 }
3720 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3721
3722 /*
3723  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3724  * the special handling of PFMEMALLOC skbs.
3725  */
3726 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3727 {
3728         switch (skb->protocol) {
3729         case htons(ETH_P_ARP):
3730         case htons(ETH_P_IP):
3731         case htons(ETH_P_IPV6):
3732         case htons(ETH_P_8021Q):
3733         case htons(ETH_P_8021AD):
3734                 return true;
3735         default:
3736                 return false;
3737         }
3738 }
3739
3740 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3741                              int *ret, struct net_device *orig_dev)
3742 {
3743 #ifdef CONFIG_NETFILTER_INGRESS
3744         if (nf_hook_ingress_active(skb)) {
3745                 if (*pt_prev) {
3746                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3747                         *pt_prev = NULL;
3748                 }
3749
3750                 return nf_hook_ingress(skb);
3751         }
3752 #endif /* CONFIG_NETFILTER_INGRESS */
3753         return 0;
3754 }
3755
3756 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3757 {
3758         struct packet_type *ptype, *pt_prev;
3759         rx_handler_func_t *rx_handler;
3760         struct net_device *orig_dev;
3761         bool deliver_exact = false;
3762         int ret = NET_RX_DROP;
3763         __be16 type;
3764
3765         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3766
3767         trace_netif_receive_skb(skb);
3768
3769         orig_dev = skb->dev;
3770
3771         skb_reset_network_header(skb);
3772         if (!skb_transport_header_was_set(skb))
3773                 skb_reset_transport_header(skb);
3774         skb_reset_mac_len(skb);
3775
3776         pt_prev = NULL;
3777
3778         rcu_read_lock();
3779
3780 another_round:
3781         skb->skb_iif = skb->dev->ifindex;
3782
3783         __this_cpu_inc(softnet_data.processed);
3784
3785         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3786             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3787                 skb = skb_vlan_untag(skb);
3788                 if (unlikely(!skb))
3789                         goto unlock;
3790         }
3791
3792 #ifdef CONFIG_NET_CLS_ACT
3793         if (skb->tc_verd & TC_NCLS) {
3794                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3795                 goto ncls;
3796         }
3797 #endif
3798
3799         if (pfmemalloc)
3800                 goto skip_taps;
3801
3802         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3803                 if (pt_prev)
3804                         ret = deliver_skb(skb, pt_prev, orig_dev);
3805                 pt_prev = ptype;
3806         }
3807
3808         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3809                 if (pt_prev)
3810                         ret = deliver_skb(skb, pt_prev, orig_dev);
3811                 pt_prev = ptype;
3812         }
3813
3814 skip_taps:
3815 #ifdef CONFIG_NET_INGRESS
3816         if (static_key_false(&ingress_needed)) {
3817                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3818                 if (!skb)
3819                         goto unlock;
3820
3821                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3822                         goto unlock;
3823         }
3824 #endif
3825 #ifdef CONFIG_NET_CLS_ACT
3826         skb->tc_verd = 0;
3827 ncls:
3828 #endif
3829         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3830                 goto drop;
3831
3832         if (skb_vlan_tag_present(skb)) {
3833                 if (pt_prev) {
3834                         ret = deliver_skb(skb, pt_prev, orig_dev);
3835                         pt_prev = NULL;
3836                 }
3837                 if (vlan_do_receive(&skb))
3838                         goto another_round;
3839                 else if (unlikely(!skb))
3840                         goto unlock;
3841         }
3842
3843         rx_handler = rcu_dereference(skb->dev->rx_handler);
3844         if (rx_handler) {
3845                 if (pt_prev) {
3846                         ret = deliver_skb(skb, pt_prev, orig_dev);
3847                         pt_prev = NULL;
3848                 }
3849                 switch (rx_handler(&skb)) {
3850                 case RX_HANDLER_CONSUMED:
3851                         ret = NET_RX_SUCCESS;
3852                         goto unlock;
3853                 case RX_HANDLER_ANOTHER:
3854                         goto another_round;
3855                 case RX_HANDLER_EXACT:
3856                         deliver_exact = true;
3857                 case RX_HANDLER_PASS:
3858                         break;
3859                 default:
3860                         BUG();
3861                 }
3862         }
3863
3864         if (unlikely(skb_vlan_tag_present(skb))) {
3865                 if (skb_vlan_tag_get_id(skb))
3866                         skb->pkt_type = PACKET_OTHERHOST;
3867                 /* Note: we might in the future use prio bits
3868                  * and set skb->priority like in vlan_do_receive()
3869                  * For the time being, just ignore Priority Code Point
3870                  */
3871                 skb->vlan_tci = 0;
3872         }
3873
3874         type = skb->protocol;
3875
3876         /* deliver only exact match when indicated */
3877         if (likely(!deliver_exact)) {
3878                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3879                                        &ptype_base[ntohs(type) &
3880                                                    PTYPE_HASH_MASK]);
3881         }
3882
3883         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3884                                &orig_dev->ptype_specific);
3885
3886         if (unlikely(skb->dev != orig_dev)) {
3887                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3888                                        &skb->dev->ptype_specific);
3889         }
3890
3891         if (pt_prev) {
3892                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3893                         goto drop;
3894                 else
3895                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3896         } else {
3897 drop:
3898                 atomic_long_inc(&skb->dev->rx_dropped);
3899                 kfree_skb(skb);
3900                 /* Jamal, now you will not able to escape explaining
3901                  * me how you were going to use this. :-)
3902                  */
3903                 ret = NET_RX_DROP;
3904         }
3905
3906 unlock:
3907         rcu_read_unlock();
3908         return ret;
3909 }
3910
3911 static int __netif_receive_skb(struct sk_buff *skb)
3912 {
3913         int ret;
3914
3915         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3916                 unsigned long pflags = current->flags;
3917
3918                 /*
3919                  * PFMEMALLOC skbs are special, they should
3920                  * - be delivered to SOCK_MEMALLOC sockets only
3921                  * - stay away from userspace
3922                  * - have bounded memory usage
3923                  *
3924                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3925                  * context down to all allocation sites.
3926                  */
3927                 current->flags |= PF_MEMALLOC;
3928                 ret = __netif_receive_skb_core(skb, true);
3929                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3930         } else
3931                 ret = __netif_receive_skb_core(skb, false);
3932
3933         return ret;
3934 }
3935
3936 static int netif_receive_skb_internal(struct sk_buff *skb)
3937 {
3938         net_timestamp_check(netdev_tstamp_prequeue, skb);
3939
3940         if (skb_defer_rx_timestamp(skb))
3941                 return NET_RX_SUCCESS;
3942
3943 #ifdef CONFIG_RPS
3944         if (static_key_false(&rps_needed)) {
3945                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3946                 int cpu, ret;
3947
3948                 rcu_read_lock();
3949
3950                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3951
3952                 if (cpu >= 0) {
3953                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3954                         rcu_read_unlock();
3955                         return ret;
3956                 }
3957                 rcu_read_unlock();
3958         }
3959 #endif
3960         return __netif_receive_skb(skb);
3961 }
3962
3963 /**
3964  *      netif_receive_skb - process receive buffer from network
3965  *      @skb: buffer to process
3966  *
3967  *      netif_receive_skb() is the main receive data processing function.
3968  *      It always succeeds. The buffer may be dropped during processing
3969  *      for congestion control or by the protocol layers.
3970  *
3971  *      This function may only be called from softirq context and interrupts
3972  *      should be enabled.
3973  *
3974  *      Return values (usually ignored):
3975  *      NET_RX_SUCCESS: no congestion
3976  *      NET_RX_DROP: packet was dropped
3977  */
3978 int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
3979 {
3980         trace_netif_receive_skb_entry(skb);
3981
3982         return netif_receive_skb_internal(skb);
3983 }
3984 EXPORT_SYMBOL(netif_receive_skb_sk);
3985
3986 /* Network device is going away, flush any packets still pending
3987  * Called with irqs disabled.
3988  */
3989 static void flush_backlog(void *arg)
3990 {
3991         struct net_device *dev = arg;
3992         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3993         struct sk_buff *skb, *tmp;
3994
3995         rps_lock(sd);
3996         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3997                 if (skb->dev == dev) {
3998                         __skb_unlink(skb, &sd->input_pkt_queue);
3999                         kfree_skb(skb);
4000                         input_queue_head_incr(sd);
4001                 }
4002         }
4003         rps_unlock(sd);
4004
4005         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4006                 if (skb->dev == dev) {
4007                         __skb_unlink(skb, &sd->process_queue);
4008                         kfree_skb(skb);
4009                         input_queue_head_incr(sd);
4010                 }
4011         }
4012 }
4013
4014 static int napi_gro_complete(struct sk_buff *skb)
4015 {
4016         struct packet_offload *ptype;
4017         __be16 type = skb->protocol;
4018         struct list_head *head = &offload_base;
4019         int err = -ENOENT;
4020
4021         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4022
4023         if (NAPI_GRO_CB(skb)->count == 1) {
4024                 skb_shinfo(skb)->gso_size = 0;
4025                 goto out;
4026         }
4027
4028         rcu_read_lock();
4029         list_for_each_entry_rcu(ptype, head, list) {
4030                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4031                         continue;
4032
4033                 err = ptype->callbacks.gro_complete(skb, 0);
4034                 break;
4035         }
4036         rcu_read_unlock();
4037
4038         if (err) {
4039                 WARN_ON(&ptype->list == head);
4040                 kfree_skb(skb);
4041                 return NET_RX_SUCCESS;
4042         }
4043
4044 out:
4045         return netif_receive_skb_internal(skb);
4046 }
4047
4048 /* napi->gro_list contains packets ordered by age.
4049  * youngest packets at the head of it.
4050  * Complete skbs in reverse order to reduce latencies.
4051  */
4052 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4053 {
4054         struct sk_buff *skb, *prev = NULL;
4055
4056         /* scan list and build reverse chain */
4057         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4058                 skb->prev = prev;
4059                 prev = skb;
4060         }
4061
4062         for (skb = prev; skb; skb = prev) {
4063                 skb->next = NULL;
4064
4065                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4066                         return;
4067
4068                 prev = skb->prev;
4069                 napi_gro_complete(skb);
4070                 napi->gro_count--;
4071         }
4072
4073         napi->gro_list = NULL;
4074 }
4075 EXPORT_SYMBOL(napi_gro_flush);
4076
4077 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4078 {
4079         struct sk_buff *p;
4080         unsigned int maclen = skb->dev->hard_header_len;
4081         u32 hash = skb_get_hash_raw(skb);
4082
4083         for (p = napi->gro_list; p; p = p->next) {
4084                 unsigned long diffs;
4085
4086                 NAPI_GRO_CB(p)->flush = 0;
4087
4088                 if (hash != skb_get_hash_raw(p)) {
4089                         NAPI_GRO_CB(p)->same_flow = 0;
4090                         continue;
4091                 }
4092
4093                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4094                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4095                 if (maclen == ETH_HLEN)
4096                         diffs |= compare_ether_header(skb_mac_header(p),
4097                                                       skb_mac_header(skb));
4098                 else if (!diffs)
4099                         diffs = memcmp(skb_mac_header(p),
4100                                        skb_mac_header(skb),
4101                                        maclen);
4102                 NAPI_GRO_CB(p)->same_flow = !diffs;
4103         }
4104 }
4105
4106 static void skb_gro_reset_offset(struct sk_buff *skb)
4107 {
4108         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4109         const skb_frag_t *frag0 = &pinfo->frags[0];
4110
4111         NAPI_GRO_CB(skb)->data_offset = 0;
4112         NAPI_GRO_CB(skb)->frag0 = NULL;
4113         NAPI_GRO_CB(skb)->frag0_len = 0;
4114
4115         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4116             pinfo->nr_frags &&
4117             !PageHighMem(skb_frag_page(frag0))) {
4118                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4119                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4120         }
4121 }
4122
4123 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4124 {
4125         struct skb_shared_info *pinfo = skb_shinfo(skb);
4126
4127         BUG_ON(skb->end - skb->tail < grow);
4128
4129         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4130
4131         skb->data_len -= grow;
4132         skb->tail += grow;
4133
4134         pinfo->frags[0].page_offset += grow;
4135         skb_frag_size_sub(&pinfo->frags[0], grow);
4136
4137         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4138                 skb_frag_unref(skb, 0);
4139                 memmove(pinfo->frags, pinfo->frags + 1,
4140                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4141         }
4142 }
4143
4144 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4145 {
4146         struct sk_buff **pp = NULL;
4147         struct packet_offload *ptype;
4148         __be16 type = skb->protocol;
4149         struct list_head *head = &offload_base;
4150         int same_flow;
4151         enum gro_result ret;
4152         int grow;
4153
4154         if (!(skb->dev->features & NETIF_F_GRO))
4155                 goto normal;
4156
4157         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4158                 goto normal;
4159
4160         gro_list_prepare(napi, skb);
4161
4162         rcu_read_lock();
4163         list_for_each_entry_rcu(ptype, head, list) {
4164                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4165                         continue;
4166
4167                 skb_set_network_header(skb, skb_gro_offset(skb));
4168                 skb_reset_mac_len(skb);
4169                 NAPI_GRO_CB(skb)->same_flow = 0;
4170                 NAPI_GRO_CB(skb)->flush = 0;
4171                 NAPI_GRO_CB(skb)->free = 0;
4172                 NAPI_GRO_CB(skb)->udp_mark = 0;
4173                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4174
4175                 /* Setup for GRO checksum validation */
4176                 switch (skb->ip_summed) {
4177                 case CHECKSUM_COMPLETE:
4178                         NAPI_GRO_CB(skb)->csum = skb->csum;
4179                         NAPI_GRO_CB(skb)->csum_valid = 1;
4180                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4181                         break;
4182                 case CHECKSUM_UNNECESSARY:
4183                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4184                         NAPI_GRO_CB(skb)->csum_valid = 0;
4185                         break;
4186                 default:
4187                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4188                         NAPI_GRO_CB(skb)->csum_valid = 0;
4189                 }
4190
4191                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4192                 break;
4193         }
4194         rcu_read_unlock();
4195
4196         if (&ptype->list == head)
4197                 goto normal;
4198
4199         same_flow = NAPI_GRO_CB(skb)->same_flow;
4200         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4201
4202         if (pp) {
4203                 struct sk_buff *nskb = *pp;
4204
4205                 *pp = nskb->next;
4206                 nskb->next = NULL;
4207                 napi_gro_complete(nskb);
4208                 napi->gro_count--;
4209         }
4210
4211         if (same_flow)
4212                 goto ok;
4213
4214         if (NAPI_GRO_CB(skb)->flush)
4215                 goto normal;
4216
4217         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4218                 struct sk_buff *nskb = napi->gro_list;
4219
4220                 /* locate the end of the list to select the 'oldest' flow */
4221                 while (nskb->next) {
4222                         pp = &nskb->next;
4223                         nskb = *pp;
4224                 }
4225                 *pp = NULL;
4226                 nskb->next = NULL;
4227                 napi_gro_complete(nskb);
4228         } else {
4229                 napi->gro_count++;
4230         }
4231         NAPI_GRO_CB(skb)->count = 1;
4232         NAPI_GRO_CB(skb)->age = jiffies;
4233         NAPI_GRO_CB(skb)->last = skb;
4234         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4235         skb->next = napi->gro_list;
4236         napi->gro_list = skb;
4237         ret = GRO_HELD;
4238
4239 pull:
4240         grow = skb_gro_offset(skb) - skb_headlen(skb);
4241         if (grow > 0)
4242                 gro_pull_from_frag0(skb, grow);
4243 ok:
4244         return ret;
4245
4246 normal:
4247         ret = GRO_NORMAL;
4248         goto pull;
4249 }
4250
4251 struct packet_offload *gro_find_receive_by_type(__be16 type)
4252 {
4253         struct list_head *offload_head = &offload_base;
4254         struct packet_offload *ptype;
4255
4256         list_for_each_entry_rcu(ptype, offload_head, list) {
4257                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4258                         continue;
4259                 return ptype;
4260         }
4261         return NULL;
4262 }
4263 EXPORT_SYMBOL(gro_find_receive_by_type);
4264
4265 struct packet_offload *gro_find_complete_by_type(__be16 type)
4266 {
4267         struct list_head *offload_head = &offload_base;
4268         struct packet_offload *ptype;
4269
4270         list_for_each_entry_rcu(ptype, offload_head, list) {
4271                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4272                         continue;
4273                 return ptype;
4274         }
4275         return NULL;
4276 }
4277 EXPORT_SYMBOL(gro_find_complete_by_type);
4278
4279 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4280 {
4281         switch (ret) {
4282         case GRO_NORMAL:
4283                 if (netif_receive_skb_internal(skb))
4284                         ret = GRO_DROP;
4285                 break;
4286
4287         case GRO_DROP:
4288                 kfree_skb(skb);
4289                 break;
4290
4291         case GRO_MERGED_FREE:
4292                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4293                         kmem_cache_free(skbuff_head_cache, skb);
4294                 else
4295                         __kfree_skb(skb);
4296                 break;
4297
4298         case GRO_HELD:
4299         case GRO_MERGED:
4300                 break;
4301         }
4302
4303         return ret;
4304 }
4305
4306 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4307 {
4308         trace_napi_gro_receive_entry(skb);
4309
4310         skb_gro_reset_offset(skb);
4311
4312         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4313 }
4314 EXPORT_SYMBOL(napi_gro_receive);
4315
4316 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4317 {
4318         if (unlikely(skb->pfmemalloc)) {
4319                 consume_skb(skb);
4320                 return;
4321         }
4322         __skb_pull(skb, skb_headlen(skb));
4323         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4324         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4325         skb->vlan_tci = 0;
4326         skb->dev = napi->dev;
4327         skb->skb_iif = 0;
4328         skb->encapsulation = 0;
4329         skb_shinfo(skb)->gso_type = 0;
4330         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4331
4332         napi->skb = skb;
4333 }
4334
4335 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4336 {
4337         struct sk_buff *skb = napi->skb;
4338
4339         if (!skb) {
4340                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4341                 napi->skb = skb;
4342         }
4343         return skb;
4344 }
4345 EXPORT_SYMBOL(napi_get_frags);
4346
4347 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4348                                       struct sk_buff *skb,
4349                                       gro_result_t ret)
4350 {
4351         switch (ret) {
4352         case GRO_NORMAL:
4353         case GRO_HELD:
4354                 __skb_push(skb, ETH_HLEN);
4355                 skb->protocol = eth_type_trans(skb, skb->dev);
4356                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4357                         ret = GRO_DROP;
4358                 break;
4359
4360         case GRO_DROP:
4361         case GRO_MERGED_FREE:
4362                 napi_reuse_skb(napi, skb);
4363                 break;
4364
4365         case GRO_MERGED:
4366                 break;
4367         }
4368
4369         return ret;
4370 }
4371
4372 /* Upper GRO stack assumes network header starts at gro_offset=0
4373  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4374  * We copy ethernet header into skb->data to have a common layout.
4375  */
4376 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4377 {
4378         struct sk_buff *skb = napi->skb;
4379         const struct ethhdr *eth;
4380         unsigned int hlen = sizeof(*eth);
4381
4382         napi->skb = NULL;
4383
4384         skb_reset_mac_header(skb);
4385         skb_gro_reset_offset(skb);
4386
4387         eth = skb_gro_header_fast(skb, 0);
4388         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4389                 eth = skb_gro_header_slow(skb, hlen, 0);
4390                 if (unlikely(!eth)) {
4391                         napi_reuse_skb(napi, skb);
4392                         return NULL;
4393                 }
4394         } else {
4395                 gro_pull_from_frag0(skb, hlen);
4396                 NAPI_GRO_CB(skb)->frag0 += hlen;
4397                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4398         }
4399         __skb_pull(skb, hlen);
4400
4401         /*
4402          * This works because the only protocols we care about don't require
4403          * special handling.
4404          * We'll fix it up properly in napi_frags_finish()
4405          */
4406         skb->protocol = eth->h_proto;
4407
4408         return skb;
4409 }
4410
4411 gro_result_t napi_gro_frags(struct napi_struct *napi)
4412 {
4413         struct sk_buff *skb = napi_frags_skb(napi);
4414
4415         if (!skb)
4416                 return GRO_DROP;
4417
4418         trace_napi_gro_frags_entry(skb);
4419
4420         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4421 }
4422 EXPORT_SYMBOL(napi_gro_frags);
4423
4424 /* Compute the checksum from gro_offset and return the folded value
4425  * after adding in any pseudo checksum.
4426  */
4427 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4428 {
4429         __wsum wsum;
4430         __sum16 sum;
4431
4432         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4433
4434         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4435         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4436         if (likely(!sum)) {
4437                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4438                     !skb->csum_complete_sw)
4439                         netdev_rx_csum_fault(skb->dev);
4440         }
4441
4442         NAPI_GRO_CB(skb)->csum = wsum;
4443         NAPI_GRO_CB(skb)->csum_valid = 1;
4444
4445         return sum;
4446 }
4447 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4448
4449 /*
4450  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4451  * Note: called with local irq disabled, but exits with local irq enabled.
4452  */
4453 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4454 {
4455 #ifdef CONFIG_RPS
4456         struct softnet_data *remsd = sd->rps_ipi_list;
4457
4458         if (remsd) {
4459                 sd->rps_ipi_list = NULL;
4460
4461                 local_irq_enable();
4462
4463                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4464                 while (remsd) {
4465                         struct softnet_data *next = remsd->rps_ipi_next;
4466
4467                         if (cpu_online(remsd->cpu))
4468                                 smp_call_function_single_async(remsd->cpu,
4469                                                            &remsd->csd);
4470                         remsd = next;
4471                 }
4472         } else
4473 #endif
4474                 local_irq_enable();
4475 }
4476
4477 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4478 {
4479 #ifdef CONFIG_RPS
4480         return sd->rps_ipi_list != NULL;
4481 #else
4482         return false;
4483 #endif
4484 }
4485
4486 static int process_backlog(struct napi_struct *napi, int quota)
4487 {
4488         int work = 0;
4489         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4490
4491         /* Check if we have pending ipi, its better to send them now,
4492          * not waiting net_rx_action() end.
4493          */
4494         if (sd_has_rps_ipi_waiting(sd)) {
4495                 local_irq_disable();
4496                 net_rps_action_and_irq_enable(sd);
4497         }
4498
4499         napi->weight = weight_p;
4500         local_irq_disable();
4501         while (1) {
4502                 struct sk_buff *skb;
4503
4504                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4505                         local_irq_enable();
4506                         __netif_receive_skb(skb);
4507                         local_irq_disable();
4508                         input_queue_head_incr(sd);
4509                         if (++work >= quota) {
4510                                 local_irq_enable();
4511                                 return work;
4512                         }
4513                 }
4514
4515                 rps_lock(sd);
4516                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4517                         /*
4518                          * Inline a custom version of __napi_complete().
4519                          * only current cpu owns and manipulates this napi,
4520                          * and NAPI_STATE_SCHED is the only possible flag set
4521                          * on backlog.
4522                          * We can use a plain write instead of clear_bit(),
4523                          * and we dont need an smp_mb() memory barrier.
4524                          */
4525                         napi->state = 0;
4526                         rps_unlock(sd);
4527
4528                         break;
4529                 }
4530
4531                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4532                                            &sd->process_queue);
4533                 rps_unlock(sd);
4534         }
4535         local_irq_enable();
4536
4537         return work;
4538 }
4539
4540 /**
4541  * __napi_schedule - schedule for receive
4542  * @n: entry to schedule
4543  *
4544  * The entry's receive function will be scheduled to run.
4545  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4546  */
4547 void __napi_schedule(struct napi_struct *n)
4548 {
4549         unsigned long flags;
4550
4551         local_irq_save(flags);
4552         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4553         local_irq_restore(flags);
4554 }
4555 EXPORT_SYMBOL(__napi_schedule);
4556
4557 /**
4558  * __napi_schedule_irqoff - schedule for receive
4559  * @n: entry to schedule
4560  *
4561  * Variant of __napi_schedule() assuming hard irqs are masked
4562  */
4563 void __napi_schedule_irqoff(struct napi_struct *n)
4564 {
4565         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4566 }
4567 EXPORT_SYMBOL(__napi_schedule_irqoff);
4568
4569 void __napi_complete(struct napi_struct *n)
4570 {
4571         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4572
4573         list_del_init(&n->poll_list);
4574         smp_mb__before_atomic();
4575         clear_bit(NAPI_STATE_SCHED, &n->state);
4576 }
4577 EXPORT_SYMBOL(__napi_complete);
4578
4579 void napi_complete_done(struct napi_struct *n, int work_done)
4580 {
4581         unsigned long flags;
4582
4583         /*
4584          * don't let napi dequeue from the cpu poll list
4585          * just in case its running on a different cpu
4586          */
4587         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4588                 return;
4589
4590         if (n->gro_list) {
4591                 unsigned long timeout = 0;
4592
4593                 if (work_done)
4594                         timeout = n->dev->gro_flush_timeout;
4595
4596                 if (timeout)
4597                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4598                                       HRTIMER_MODE_REL_PINNED);
4599                 else
4600                         napi_gro_flush(n, false);
4601         }
4602         if (likely(list_empty(&n->poll_list))) {
4603                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4604         } else {
4605                 /* If n->poll_list is not empty, we need to mask irqs */
4606                 local_irq_save(flags);
4607                 __napi_complete(n);
4608                 local_irq_restore(flags);
4609         }
4610 }
4611 EXPORT_SYMBOL(napi_complete_done);
4612
4613 /* must be called under rcu_read_lock(), as we dont take a reference */
4614 struct napi_struct *napi_by_id(unsigned int napi_id)
4615 {
4616         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4617         struct napi_struct *napi;
4618
4619         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4620                 if (napi->napi_id == napi_id)
4621                         return napi;
4622
4623         return NULL;
4624 }
4625 EXPORT_SYMBOL_GPL(napi_by_id);
4626
4627 void napi_hash_add(struct napi_struct *napi)
4628 {
4629         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4630
4631                 spin_lock(&napi_hash_lock);
4632
4633                 /* 0 is not a valid id, we also skip an id that is taken
4634                  * we expect both events to be extremely rare
4635                  */
4636                 napi->napi_id = 0;
4637                 while (!napi->napi_id) {
4638                         napi->napi_id = ++napi_gen_id;
4639                         if (napi_by_id(napi->napi_id))
4640                                 napi->napi_id = 0;
4641                 }
4642
4643                 hlist_add_head_rcu(&napi->napi_hash_node,
4644                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4645
4646                 spin_unlock(&napi_hash_lock);
4647         }
4648 }
4649 EXPORT_SYMBOL_GPL(napi_hash_add);
4650
4651 /* Warning : caller is responsible to make sure rcu grace period
4652  * is respected before freeing memory containing @napi
4653  */
4654 void napi_hash_del(struct napi_struct *napi)
4655 {
4656         spin_lock(&napi_hash_lock);
4657
4658         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4659                 hlist_del_rcu(&napi->napi_hash_node);
4660
4661         spin_unlock(&napi_hash_lock);
4662 }
4663 EXPORT_SYMBOL_GPL(napi_hash_del);
4664
4665 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4666 {
4667         struct napi_struct *napi;
4668
4669         napi = container_of(timer, struct napi_struct, timer);
4670         if (napi->gro_list)
4671                 napi_schedule(napi);
4672
4673         return HRTIMER_NORESTART;
4674 }
4675
4676 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4677                     int (*poll)(struct napi_struct *, int), int weight)
4678 {
4679         INIT_LIST_HEAD(&napi->poll_list);
4680         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4681         napi->timer.function = napi_watchdog;
4682         napi->gro_count = 0;
4683         napi->gro_list = NULL;
4684         napi->skb = NULL;
4685         napi->poll = poll;
4686         if (weight > NAPI_POLL_WEIGHT)
4687                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4688                             weight, dev->name);
4689         napi->weight = weight;
4690         list_add(&napi->dev_list, &dev->napi_list);
4691         napi->dev = dev;
4692 #ifdef CONFIG_NETPOLL
4693         spin_lock_init(&napi->poll_lock);
4694         napi->poll_owner = -1;
4695 #endif
4696         set_bit(NAPI_STATE_SCHED, &napi->state);
4697 }
4698 EXPORT_SYMBOL(netif_napi_add);
4699
4700 void napi_disable(struct napi_struct *n)
4701 {
4702         might_sleep();
4703         set_bit(NAPI_STATE_DISABLE, &n->state);
4704
4705         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4706                 msleep(1);
4707
4708         hrtimer_cancel(&n->timer);
4709
4710         clear_bit(NAPI_STATE_DISABLE, &n->state);
4711 }
4712 EXPORT_SYMBOL(napi_disable);
4713
4714 void netif_napi_del(struct napi_struct *napi)
4715 {
4716         list_del_init(&napi->dev_list);
4717         napi_free_frags(napi);
4718
4719         kfree_skb_list(napi->gro_list);
4720         napi->gro_list = NULL;
4721         napi->gro_count = 0;
4722 }
4723 EXPORT_SYMBOL(netif_napi_del);
4724
4725 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4726 {
4727         void *have;
4728         int work, weight;
4729
4730         list_del_init(&n->poll_list);
4731
4732         have = netpoll_poll_lock(n);
4733
4734         weight = n->weight;
4735
4736         /* This NAPI_STATE_SCHED test is for avoiding a race
4737          * with netpoll's poll_napi().  Only the entity which
4738          * obtains the lock and sees NAPI_STATE_SCHED set will
4739          * actually make the ->poll() call.  Therefore we avoid
4740          * accidentally calling ->poll() when NAPI is not scheduled.
4741          */
4742         work = 0;
4743         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4744                 work = n->poll(n, weight);
4745                 trace_napi_poll(n);
4746         }
4747
4748         WARN_ON_ONCE(work > weight);
4749
4750         if (likely(work < weight))
4751                 goto out_unlock;
4752
4753         /* Drivers must not modify the NAPI state if they
4754          * consume the entire weight.  In such cases this code
4755          * still "owns" the NAPI instance and therefore can
4756          * move the instance around on the list at-will.
4757          */
4758         if (unlikely(napi_disable_pending(n))) {
4759                 napi_complete(n);
4760                 goto out_unlock;
4761         }
4762
4763         if (n->gro_list) {
4764                 /* flush too old packets
4765                  * If HZ < 1000, flush all packets.
4766                  */
4767                 napi_gro_flush(n, HZ >= 1000);
4768         }
4769
4770         /* Some drivers may have called napi_schedule
4771          * prior to exhausting their budget.
4772          */
4773         if (unlikely(!list_empty(&n->poll_list))) {
4774                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4775                              n->dev ? n->dev->name : "backlog");
4776                 goto out_unlock;
4777         }
4778
4779         list_add_tail(&n->poll_list, repoll);
4780
4781 out_unlock:
4782         netpoll_poll_unlock(have);
4783
4784         return work;
4785 }
4786
4787 static void net_rx_action(struct softirq_action *h)
4788 {
4789         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4790         unsigned long time_limit = jiffies + 2;
4791         int budget = netdev_budget;
4792         LIST_HEAD(list);
4793         LIST_HEAD(repoll);
4794
4795         local_irq_disable();
4796         list_splice_init(&sd->poll_list, &list);
4797         local_irq_enable();
4798
4799         for (;;) {
4800                 struct napi_struct *n;
4801
4802                 if (list_empty(&list)) {
4803                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4804                                 return;
4805                         break;
4806                 }
4807
4808                 n = list_first_entry(&list, struct napi_struct, poll_list);
4809                 budget -= napi_poll(n, &repoll);
4810
4811                 /* If softirq window is exhausted then punt.
4812                  * Allow this to run for 2 jiffies since which will allow
4813                  * an average latency of 1.5/HZ.
4814                  */
4815                 if (unlikely(budget <= 0 ||
4816                              time_after_eq(jiffies, time_limit))) {
4817                         sd->time_squeeze++;
4818                         break;
4819                 }
4820         }
4821
4822         local_irq_disable();
4823
4824         list_splice_tail_init(&sd->poll_list, &list);
4825         list_splice_tail(&repoll, &list);
4826         list_splice(&list, &sd->poll_list);
4827         if (!list_empty(&sd->poll_list))
4828                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4829
4830         net_rps_action_and_irq_enable(sd);
4831 }
4832
4833 struct netdev_adjacent {
4834         struct net_device *dev;
4835
4836         /* upper master flag, there can only be one master device per list */
4837         bool master;
4838
4839         /* counter for the number of times this device was added to us */
4840         u16 ref_nr;
4841
4842         /* private field for the users */
4843         void *private;
4844
4845         struct list_head list;
4846         struct rcu_head rcu;
4847 };
4848
4849 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4850                                                  struct net_device *adj_dev,
4851                                                  struct list_head *adj_list)
4852 {
4853         struct netdev_adjacent *adj;
4854
4855         list_for_each_entry(adj, adj_list, list) {
4856                 if (adj->dev == adj_dev)
4857                         return adj;
4858         }
4859         return NULL;
4860 }
4861
4862 /**
4863  * netdev_has_upper_dev - Check if device is linked to an upper device
4864  * @dev: device
4865  * @upper_dev: upper device to check
4866  *
4867  * Find out if a device is linked to specified upper device and return true
4868  * in case it is. Note that this checks only immediate upper device,
4869  * not through a complete stack of devices. The caller must hold the RTNL lock.
4870  */
4871 bool netdev_has_upper_dev(struct net_device *dev,
4872                           struct net_device *upper_dev)
4873 {
4874         ASSERT_RTNL();
4875
4876         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4877 }
4878 EXPORT_SYMBOL(netdev_has_upper_dev);
4879
4880 /**
4881  * netdev_has_any_upper_dev - Check if device is linked to some device
4882  * @dev: device
4883  *
4884  * Find out if a device is linked to an upper device and return true in case
4885  * it is. The caller must hold the RTNL lock.
4886  */
4887 static bool netdev_has_any_upper_dev(struct net_device *dev)
4888 {
4889         ASSERT_RTNL();
4890
4891         return !list_empty(&dev->all_adj_list.upper);
4892 }
4893
4894 /**
4895  * netdev_master_upper_dev_get - Get master upper device
4896  * @dev: device
4897  *
4898  * Find a master upper device and return pointer to it or NULL in case
4899  * it's not there. The caller must hold the RTNL lock.
4900  */
4901 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4902 {
4903         struct netdev_adjacent *upper;
4904
4905         ASSERT_RTNL();
4906
4907         if (list_empty(&dev->adj_list.upper))
4908                 return NULL;
4909
4910         upper = list_first_entry(&dev->adj_list.upper,
4911                                  struct netdev_adjacent, list);
4912         if (likely(upper->master))
4913                 return upper->dev;
4914         return NULL;
4915 }
4916 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4917
4918 void *netdev_adjacent_get_private(struct list_head *adj_list)
4919 {
4920         struct netdev_adjacent *adj;
4921
4922         adj = list_entry(adj_list, struct netdev_adjacent, list);
4923
4924         return adj->private;
4925 }
4926 EXPORT_SYMBOL(netdev_adjacent_get_private);
4927
4928 /**
4929  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4930  * @dev: device
4931  * @iter: list_head ** of the current position
4932  *
4933  * Gets the next device from the dev's upper list, starting from iter
4934  * position. The caller must hold RCU read lock.
4935  */
4936 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4937                                                  struct list_head **iter)
4938 {
4939         struct netdev_adjacent *upper;
4940
4941         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4942
4943         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4944
4945         if (&upper->list == &dev->adj_list.upper)
4946                 return NULL;
4947
4948         *iter = &upper->list;
4949
4950         return upper->dev;
4951 }
4952 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4953
4954 /**
4955  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4956  * @dev: device
4957  * @iter: list_head ** of the current position
4958  *
4959  * Gets the next device from the dev's upper list, starting from iter
4960  * position. The caller must hold RCU read lock.
4961  */
4962 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4963                                                      struct list_head **iter)
4964 {
4965         struct netdev_adjacent *upper;
4966
4967         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4968
4969         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4970
4971         if (&upper->list == &dev->all_adj_list.upper)
4972                 return NULL;
4973
4974         *iter = &upper->list;
4975
4976         return upper->dev;
4977 }
4978 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4979
4980 /**
4981  * netdev_lower_get_next_private - Get the next ->private from the
4982  *                                 lower neighbour list
4983  * @dev: device
4984  * @iter: list_head ** of the current position
4985  *
4986  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4987  * list, starting from iter position. The caller must hold either hold the
4988  * RTNL lock or its own locking that guarantees that the neighbour lower
4989  * list will remain unchainged.
4990  */
4991 void *netdev_lower_get_next_private(struct net_device *dev,
4992                                     struct list_head **iter)
4993 {
4994         struct netdev_adjacent *lower;
4995
4996         lower = list_entry(*iter, struct netdev_adjacent, list);
4997
4998         if (&lower->list == &dev->adj_list.lower)
4999                 return NULL;
5000
5001         *iter = lower->list.next;
5002
5003         return lower->private;
5004 }
5005 EXPORT_SYMBOL(netdev_lower_get_next_private);
5006
5007 /**
5008  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5009  *                                     lower neighbour list, RCU
5010  *                                     variant
5011  * @dev: device
5012  * @iter: list_head ** of the current position
5013  *
5014  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5015  * list, starting from iter position. The caller must hold RCU read lock.
5016  */
5017 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5018                                         struct list_head **iter)
5019 {
5020         struct netdev_adjacent *lower;
5021
5022         WARN_ON_ONCE(!rcu_read_lock_held());
5023
5024         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5025
5026         if (&lower->list == &dev->adj_list.lower)
5027                 return NULL;
5028
5029         *iter = &lower->list;
5030
5031         return lower->private;
5032 }
5033 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5034
5035 /**
5036  * netdev_lower_get_next - Get the next device from the lower neighbour
5037  *                         list
5038  * @dev: device
5039  * @iter: list_head ** of the current position
5040  *
5041  * Gets the next netdev_adjacent from the dev's lower neighbour
5042  * list, starting from iter position. The caller must hold RTNL lock or
5043  * its own locking that guarantees that the neighbour lower
5044  * list will remain unchainged.
5045  */
5046 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5047 {
5048         struct netdev_adjacent *lower;
5049
5050         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5051
5052         if (&lower->list == &dev->adj_list.lower)
5053                 return NULL;
5054
5055         *iter = &lower->list;
5056
5057         return lower->dev;
5058 }
5059 EXPORT_SYMBOL(netdev_lower_get_next);
5060
5061 /**
5062  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5063  *                                     lower neighbour list, RCU
5064  *                                     variant
5065  * @dev: device
5066  *
5067  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5068  * list. The caller must hold RCU read lock.
5069  */
5070 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5071 {
5072         struct netdev_adjacent *lower;
5073
5074         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5075                         struct netdev_adjacent, list);
5076         if (lower)
5077                 return lower->private;
5078         return NULL;
5079 }
5080 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5081
5082 /**
5083  * netdev_master_upper_dev_get_rcu - Get master upper device
5084  * @dev: device
5085  *
5086  * Find a master upper device and return pointer to it or NULL in case
5087  * it's not there. The caller must hold the RCU read lock.
5088  */
5089 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5090 {
5091         struct netdev_adjacent *upper;
5092
5093         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5094                                        struct netdev_adjacent, list);
5095         if (upper && likely(upper->master))
5096                 return upper->dev;
5097         return NULL;
5098 }
5099 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5100
5101 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5102                               struct net_device *adj_dev,
5103                               struct list_head *dev_list)
5104 {
5105         char linkname[IFNAMSIZ+7];
5106         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5107                 "upper_%s" : "lower_%s", adj_dev->name);
5108         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5109                                  linkname);
5110 }
5111 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5112                                char *name,
5113                                struct list_head *dev_list)
5114 {
5115         char linkname[IFNAMSIZ+7];
5116         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5117                 "upper_%s" : "lower_%s", name);
5118         sysfs_remove_link(&(dev->dev.kobj), linkname);
5119 }
5120
5121 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5122                                                  struct net_device *adj_dev,
5123                                                  struct list_head *dev_list)
5124 {
5125         return (dev_list == &dev->adj_list.upper ||
5126                 dev_list == &dev->adj_list.lower) &&
5127                 net_eq(dev_net(dev), dev_net(adj_dev));
5128 }
5129
5130 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5131                                         struct net_device *adj_dev,
5132                                         struct list_head *dev_list,
5133                                         void *private, bool master)
5134 {
5135         struct netdev_adjacent *adj;
5136         int ret;
5137
5138         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5139
5140         if (adj) {
5141                 adj->ref_nr++;
5142                 return 0;
5143         }
5144
5145         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5146         if (!adj)
5147                 return -ENOMEM;
5148
5149         adj->dev = adj_dev;
5150         adj->master = master;
5151         adj->ref_nr = 1;
5152         adj->private = private;
5153         dev_hold(adj_dev);
5154
5155         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5156                  adj_dev->name, dev->name, adj_dev->name);
5157
5158         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5159                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5160                 if (ret)
5161                         goto free_adj;
5162         }
5163
5164         /* Ensure that master link is always the first item in list. */
5165         if (master) {
5166                 ret = sysfs_create_link(&(dev->dev.kobj),
5167                                         &(adj_dev->dev.kobj), "master");
5168                 if (ret)
5169                         goto remove_symlinks;
5170
5171                 list_add_rcu(&adj->list, dev_list);
5172         } else {
5173                 list_add_tail_rcu(&adj->list, dev_list);
5174         }
5175
5176         return 0;
5177
5178 remove_symlinks:
5179         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5180                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5181 free_adj:
5182         kfree(adj);
5183         dev_put(adj_dev);
5184
5185         return ret;
5186 }
5187
5188 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5189                                          struct net_device *adj_dev,
5190                                          struct list_head *dev_list)
5191 {
5192         struct netdev_adjacent *adj;
5193
5194         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5195
5196         if (!adj) {
5197                 pr_err("tried to remove device %s from %s\n",
5198                        dev->name, adj_dev->name);
5199                 BUG();
5200         }
5201
5202         if (adj->ref_nr > 1) {
5203                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5204                          adj->ref_nr-1);
5205                 adj->ref_nr--;
5206                 return;
5207         }
5208
5209         if (adj->master)
5210                 sysfs_remove_link(&(dev->dev.kobj), "master");
5211
5212         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5213                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5214
5215         list_del_rcu(&adj->list);
5216         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5217                  adj_dev->name, dev->name, adj_dev->name);
5218         dev_put(adj_dev);
5219         kfree_rcu(adj, rcu);
5220 }
5221
5222 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5223                                             struct net_device *upper_dev,
5224                                             struct list_head *up_list,
5225                                             struct list_head *down_list,
5226                                             void *private, bool master)
5227 {
5228         int ret;
5229
5230         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5231                                            master);
5232         if (ret)
5233                 return ret;
5234
5235         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5236                                            false);
5237         if (ret) {
5238                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5239                 return ret;
5240         }
5241
5242         return 0;
5243 }
5244
5245 static int __netdev_adjacent_dev_link(struct net_device *dev,
5246                                       struct net_device *upper_dev)
5247 {
5248         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5249                                                 &dev->all_adj_list.upper,
5250                                                 &upper_dev->all_adj_list.lower,
5251                                                 NULL, false);
5252 }
5253
5254 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5255                                                struct net_device *upper_dev,
5256                                                struct list_head *up_list,
5257                                                struct list_head *down_list)
5258 {
5259         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5260         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5261 }
5262
5263 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5264                                          struct net_device *upper_dev)
5265 {
5266         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5267                                            &dev->all_adj_list.upper,
5268                                            &upper_dev->all_adj_list.lower);
5269 }
5270
5271 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5272                                                 struct net_device *upper_dev,
5273                                                 void *private, bool master)
5274 {
5275         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5276
5277         if (ret)
5278                 return ret;
5279
5280         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5281                                                &dev->adj_list.upper,
5282                                                &upper_dev->adj_list.lower,
5283                                                private, master);
5284         if (ret) {
5285                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5286                 return ret;
5287         }
5288
5289         return 0;
5290 }
5291
5292 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5293                                                    struct net_device *upper_dev)
5294 {
5295         __netdev_adjacent_dev_unlink(dev, upper_dev);
5296         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5297                                            &dev->adj_list.upper,
5298                                            &upper_dev->adj_list.lower);
5299 }
5300
5301 static int __netdev_upper_dev_link(struct net_device *dev,
5302                                    struct net_device *upper_dev, bool master,
5303                                    void *private)
5304 {
5305         struct netdev_adjacent *i, *j, *to_i, *to_j;
5306         int ret = 0;
5307
5308         ASSERT_RTNL();
5309
5310         if (dev == upper_dev)
5311                 return -EBUSY;
5312
5313         /* To prevent loops, check if dev is not upper device to upper_dev. */
5314         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5315                 return -EBUSY;
5316
5317         if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5318                 return -EEXIST;
5319
5320         if (master && netdev_master_upper_dev_get(dev))
5321                 return -EBUSY;
5322
5323         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5324                                                    master);
5325         if (ret)
5326                 return ret;
5327
5328         /* Now that we linked these devs, make all the upper_dev's
5329          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5330          * versa, and don't forget the devices itself. All of these
5331          * links are non-neighbours.
5332          */
5333         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5334                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5335                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5336                                  i->dev->name, j->dev->name);
5337                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5338                         if (ret)
5339                                 goto rollback_mesh;
5340                 }
5341         }
5342
5343         /* add dev to every upper_dev's upper device */
5344         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5345                 pr_debug("linking %s's upper device %s with %s\n",
5346                          upper_dev->name, i->dev->name, dev->name);
5347                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5348                 if (ret)
5349                         goto rollback_upper_mesh;
5350         }
5351
5352         /* add upper_dev to every dev's lower device */
5353         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5354                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5355                          i->dev->name, upper_dev->name);
5356                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5357                 if (ret)
5358                         goto rollback_lower_mesh;
5359         }
5360
5361         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5362         return 0;
5363
5364 rollback_lower_mesh:
5365         to_i = i;
5366         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5367                 if (i == to_i)
5368                         break;
5369                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5370         }
5371
5372         i = NULL;
5373
5374 rollback_upper_mesh:
5375         to_i = i;
5376         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5377                 if (i == to_i)
5378                         break;
5379                 __netdev_adjacent_dev_unlink(dev, i->dev);
5380         }
5381
5382         i = j = NULL;
5383
5384 rollback_mesh:
5385         to_i = i;
5386         to_j = j;
5387         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5388                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5389                         if (i == to_i && j == to_j)
5390                                 break;
5391                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5392                 }
5393                 if (i == to_i)
5394                         break;
5395         }
5396
5397         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5398
5399         return ret;
5400 }
5401
5402 /**
5403  * netdev_upper_dev_link - Add a link to the upper device
5404  * @dev: device
5405  * @upper_dev: new upper device
5406  *
5407  * Adds a link to device which is upper to this one. The caller must hold
5408  * the RTNL lock. On a failure a negative errno code is returned.
5409  * On success the reference counts are adjusted and the function
5410  * returns zero.
5411  */
5412 int netdev_upper_dev_link(struct net_device *dev,
5413                           struct net_device *upper_dev)
5414 {
5415         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5416 }
5417 EXPORT_SYMBOL(netdev_upper_dev_link);
5418
5419 /**
5420  * netdev_master_upper_dev_link - Add a master link to the upper device
5421  * @dev: device
5422  * @upper_dev: new upper device
5423  *
5424  * Adds a link to device which is upper to this one. In this case, only
5425  * one master upper device can be linked, although other non-master devices
5426  * might be linked as well. The caller must hold the RTNL lock.
5427  * On a failure a negative errno code is returned. On success the reference
5428  * counts are adjusted and the function returns zero.
5429  */
5430 int netdev_master_upper_dev_link(struct net_device *dev,
5431                                  struct net_device *upper_dev)
5432 {
5433         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5434 }
5435 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5436
5437 int netdev_master_upper_dev_link_private(struct net_device *dev,
5438                                          struct net_device *upper_dev,
5439                                          void *private)
5440 {
5441         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5442 }
5443 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5444
5445 /**
5446  * netdev_upper_dev_unlink - Removes a link to upper device
5447  * @dev: device
5448  * @upper_dev: new upper device
5449  *
5450  * Removes a link to device which is upper to this one. The caller must hold
5451  * the RTNL lock.
5452  */
5453 void netdev_upper_dev_unlink(struct net_device *dev,
5454                              struct net_device *upper_dev)
5455 {
5456         struct netdev_adjacent *i, *j;
5457         ASSERT_RTNL();
5458
5459         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5460
5461         /* Here is the tricky part. We must remove all dev's lower
5462          * devices from all upper_dev's upper devices and vice
5463          * versa, to maintain the graph relationship.
5464          */
5465         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5466                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5467                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5468
5469         /* remove also the devices itself from lower/upper device
5470          * list
5471          */
5472         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5473                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5474
5475         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5476                 __netdev_adjacent_dev_unlink(dev, i->dev);
5477
5478         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5479 }
5480 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5481
5482 /**
5483  * netdev_bonding_info_change - Dispatch event about slave change
5484  * @dev: device
5485  * @bonding_info: info to dispatch
5486  *
5487  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5488  * The caller must hold the RTNL lock.
5489  */
5490 void netdev_bonding_info_change(struct net_device *dev,
5491                                 struct netdev_bonding_info *bonding_info)
5492 {
5493         struct netdev_notifier_bonding_info     info;
5494
5495         memcpy(&info.bonding_info, bonding_info,
5496                sizeof(struct netdev_bonding_info));
5497         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5498                                       &info.info);
5499 }
5500 EXPORT_SYMBOL(netdev_bonding_info_change);
5501
5502 static void netdev_adjacent_add_links(struct net_device *dev)
5503 {
5504         struct netdev_adjacent *iter;
5505
5506         struct net *net = dev_net(dev);
5507
5508         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5509                 if (!net_eq(net,dev_net(iter->dev)))
5510                         continue;
5511                 netdev_adjacent_sysfs_add(iter->dev, dev,
5512                                           &iter->dev->adj_list.lower);
5513                 netdev_adjacent_sysfs_add(dev, iter->dev,
5514                                           &dev->adj_list.upper);
5515         }
5516
5517         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5518                 if (!net_eq(net,dev_net(iter->dev)))
5519                         continue;
5520                 netdev_adjacent_sysfs_add(iter->dev, dev,
5521                                           &iter->dev->adj_list.upper);
5522                 netdev_adjacent_sysfs_add(dev, iter->dev,
5523                                           &dev->adj_list.lower);
5524         }
5525 }
5526
5527 static void netdev_adjacent_del_links(struct net_device *dev)
5528 {
5529         struct netdev_adjacent *iter;
5530
5531         struct net *net = dev_net(dev);
5532
5533         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5534                 if (!net_eq(net,dev_net(iter->dev)))
5535                         continue;
5536                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5537                                           &iter->dev->adj_list.lower);
5538                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5539                                           &dev->adj_list.upper);
5540         }
5541
5542         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5543                 if (!net_eq(net,dev_net(iter->dev)))
5544                         continue;
5545                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5546                                           &iter->dev->adj_list.upper);
5547                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5548                                           &dev->adj_list.lower);
5549         }
5550 }
5551
5552 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5553 {
5554         struct netdev_adjacent *iter;
5555
5556         struct net *net = dev_net(dev);
5557
5558         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5559                 if (!net_eq(net,dev_net(iter->dev)))
5560                         continue;
5561                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5562                                           &iter->dev->adj_list.lower);
5563                 netdev_adjacent_sysfs_add(iter->dev, dev,
5564                                           &iter->dev->adj_list.lower);
5565         }
5566
5567         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5568                 if (!net_eq(net,dev_net(iter->dev)))
5569                         continue;
5570                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5571                                           &iter->dev->adj_list.upper);
5572                 netdev_adjacent_sysfs_add(iter->dev, dev,
5573                                           &iter->dev->adj_list.upper);
5574         }
5575 }
5576
5577 void *netdev_lower_dev_get_private(struct net_device *dev,
5578                                    struct net_device *lower_dev)
5579 {
5580         struct netdev_adjacent *lower;
5581
5582         if (!lower_dev)
5583                 return NULL;
5584         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5585         if (!lower)
5586                 return NULL;
5587
5588         return lower->private;
5589 }
5590 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5591
5592
5593 int dev_get_nest_level(struct net_device *dev,
5594                        bool (*type_check)(struct net_device *dev))
5595 {
5596         struct net_device *lower = NULL;
5597         struct list_head *iter;
5598         int max_nest = -1;
5599         int nest;
5600
5601         ASSERT_RTNL();
5602
5603         netdev_for_each_lower_dev(dev, lower, iter) {
5604                 nest = dev_get_nest_level(lower, type_check);
5605                 if (max_nest < nest)
5606                         max_nest = nest;
5607         }
5608
5609         if (type_check(dev))
5610                 max_nest++;
5611
5612         return max_nest;
5613 }
5614 EXPORT_SYMBOL(dev_get_nest_level);
5615
5616 static void dev_change_rx_flags(struct net_device *dev, int flags)
5617 {
5618         const struct net_device_ops *ops = dev->netdev_ops;
5619
5620         if (ops->ndo_change_rx_flags)
5621                 ops->ndo_change_rx_flags(dev, flags);
5622 }
5623
5624 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5625 {
5626         unsigned int old_flags = dev->flags;
5627         kuid_t uid;
5628         kgid_t gid;
5629
5630         ASSERT_RTNL();
5631
5632         dev->flags |= IFF_PROMISC;
5633         dev->promiscuity += inc;
5634         if (dev->promiscuity == 0) {
5635                 /*
5636                  * Avoid overflow.
5637                  * If inc causes overflow, untouch promisc and return error.
5638                  */
5639                 if (inc < 0)
5640                         dev->flags &= ~IFF_PROMISC;
5641                 else {
5642                         dev->promiscuity -= inc;
5643                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5644                                 dev->name);
5645                         return -EOVERFLOW;
5646                 }
5647         }
5648         if (dev->flags != old_flags) {
5649                 pr_info("device %s %s promiscuous mode\n",
5650                         dev->name,
5651                         dev->flags & IFF_PROMISC ? "entered" : "left");
5652                 if (audit_enabled) {
5653                         current_uid_gid(&uid, &gid);
5654                         audit_log(current->audit_context, GFP_ATOMIC,
5655                                 AUDIT_ANOM_PROMISCUOUS,
5656                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5657                                 dev->name, (dev->flags & IFF_PROMISC),
5658                                 (old_flags & IFF_PROMISC),
5659                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5660                                 from_kuid(&init_user_ns, uid),
5661                                 from_kgid(&init_user_ns, gid),
5662                                 audit_get_sessionid(current));
5663                 }
5664
5665                 dev_change_rx_flags(dev, IFF_PROMISC);
5666         }
5667         if (notify)
5668                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5669         return 0;
5670 }
5671
5672 /**
5673  *      dev_set_promiscuity     - update promiscuity count on a device
5674  *      @dev: device
5675  *      @inc: modifier
5676  *
5677  *      Add or remove promiscuity from a device. While the count in the device
5678  *      remains above zero the interface remains promiscuous. Once it hits zero
5679  *      the device reverts back to normal filtering operation. A negative inc
5680  *      value is used to drop promiscuity on the device.
5681  *      Return 0 if successful or a negative errno code on error.
5682  */
5683 int dev_set_promiscuity(struct net_device *dev, int inc)
5684 {
5685         unsigned int old_flags = dev->flags;
5686         int err;
5687
5688         err = __dev_set_promiscuity(dev, inc, true);
5689         if (err < 0)
5690                 return err;
5691         if (dev->flags != old_flags)
5692                 dev_set_rx_mode(dev);
5693         return err;
5694 }
5695 EXPORT_SYMBOL(dev_set_promiscuity);
5696
5697 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5698 {
5699         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5700
5701         ASSERT_RTNL();
5702
5703         dev->flags |= IFF_ALLMULTI;
5704         dev->allmulti += inc;
5705         if (dev->allmulti == 0) {
5706                 /*
5707                  * Avoid overflow.
5708                  * If inc causes overflow, untouch allmulti and return error.
5709                  */
5710                 if (inc < 0)
5711                         dev->flags &= ~IFF_ALLMULTI;
5712                 else {
5713                         dev->allmulti -= inc;
5714                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5715                                 dev->name);
5716                         return -EOVERFLOW;
5717                 }
5718         }
5719         if (dev->flags ^ old_flags) {
5720                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5721                 dev_set_rx_mode(dev);
5722                 if (notify)
5723                         __dev_notify_flags(dev, old_flags,
5724                                            dev->gflags ^ old_gflags);
5725         }
5726         return 0;
5727 }
5728
5729 /**
5730  *      dev_set_allmulti        - update allmulti count on a device
5731  *      @dev: device
5732  *      @inc: modifier
5733  *
5734  *      Add or remove reception of all multicast frames to a device. While the
5735  *      count in the device remains above zero the interface remains listening
5736  *      to all interfaces. Once it hits zero the device reverts back to normal
5737  *      filtering operation. A negative @inc value is used to drop the counter
5738  *      when releasing a resource needing all multicasts.
5739  *      Return 0 if successful or a negative errno code on error.
5740  */
5741
5742 int dev_set_allmulti(struct net_device *dev, int inc)
5743 {
5744         return __dev_set_allmulti(dev, inc, true);
5745 }
5746 EXPORT_SYMBOL(dev_set_allmulti);
5747
5748 /*
5749  *      Upload unicast and multicast address lists to device and
5750  *      configure RX filtering. When the device doesn't support unicast
5751  *      filtering it is put in promiscuous mode while unicast addresses
5752  *      are present.
5753  */
5754 void __dev_set_rx_mode(struct net_device *dev)
5755 {
5756         const struct net_device_ops *ops = dev->netdev_ops;
5757
5758         /* dev_open will call this function so the list will stay sane. */
5759         if (!(dev->flags&IFF_UP))
5760                 return;
5761
5762         if (!netif_device_present(dev))
5763                 return;
5764
5765         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5766                 /* Unicast addresses changes may only happen under the rtnl,
5767                  * therefore calling __dev_set_promiscuity here is safe.
5768                  */
5769                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5770                         __dev_set_promiscuity(dev, 1, false);
5771                         dev->uc_promisc = true;
5772                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5773                         __dev_set_promiscuity(dev, -1, false);
5774                         dev->uc_promisc = false;
5775                 }
5776         }
5777
5778         if (ops->ndo_set_rx_mode)
5779                 ops->ndo_set_rx_mode(dev);
5780 }
5781
5782 void dev_set_rx_mode(struct net_device *dev)
5783 {
5784         netif_addr_lock_bh(dev);
5785         __dev_set_rx_mode(dev);
5786         netif_addr_unlock_bh(dev);
5787 }
5788
5789 /**
5790  *      dev_get_flags - get flags reported to userspace
5791  *      @dev: device
5792  *
5793  *      Get the combination of flag bits exported through APIs to userspace.
5794  */
5795 unsigned int dev_get_flags(const struct net_device *dev)
5796 {
5797         unsigned int flags;
5798
5799         flags = (dev->flags & ~(IFF_PROMISC |
5800                                 IFF_ALLMULTI |
5801                                 IFF_RUNNING |
5802                                 IFF_LOWER_UP |
5803                                 IFF_DORMANT)) |
5804                 (dev->gflags & (IFF_PROMISC |
5805                                 IFF_ALLMULTI));
5806
5807         if (netif_running(dev)) {
5808                 if (netif_oper_up(dev))
5809                         flags |= IFF_RUNNING;
5810                 if (netif_carrier_ok(dev))
5811                         flags |= IFF_LOWER_UP;
5812                 if (netif_dormant(dev))
5813                         flags |= IFF_DORMANT;
5814         }
5815
5816         return flags;
5817 }
5818 EXPORT_SYMBOL(dev_get_flags);
5819
5820 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5821 {
5822         unsigned int old_flags = dev->flags;
5823         int ret;
5824
5825         ASSERT_RTNL();
5826
5827         /*
5828          *      Set the flags on our device.
5829          */
5830
5831         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5832                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5833                                IFF_AUTOMEDIA)) |
5834                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5835                                     IFF_ALLMULTI));
5836
5837         /*
5838          *      Load in the correct multicast list now the flags have changed.
5839          */
5840
5841         if ((old_flags ^ flags) & IFF_MULTICAST)
5842                 dev_change_rx_flags(dev, IFF_MULTICAST);
5843
5844         dev_set_rx_mode(dev);
5845
5846         /*
5847          *      Have we downed the interface. We handle IFF_UP ourselves
5848          *      according to user attempts to set it, rather than blindly
5849          *      setting it.
5850          */
5851
5852         ret = 0;
5853         if ((old_flags ^ flags) & IFF_UP)
5854                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5855
5856         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5857                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5858                 unsigned int old_flags = dev->flags;
5859
5860                 dev->gflags ^= IFF_PROMISC;
5861
5862                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5863                         if (dev->flags != old_flags)
5864                                 dev_set_rx_mode(dev);
5865         }
5866
5867         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5868            is important. Some (broken) drivers set IFF_PROMISC, when
5869            IFF_ALLMULTI is requested not asking us and not reporting.
5870          */
5871         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5872                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5873
5874                 dev->gflags ^= IFF_ALLMULTI;
5875                 __dev_set_allmulti(dev, inc, false);
5876         }
5877
5878         return ret;
5879 }
5880
5881 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5882                         unsigned int gchanges)
5883 {
5884         unsigned int changes = dev->flags ^ old_flags;
5885
5886         if (gchanges)
5887                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5888
5889         if (changes & IFF_UP) {
5890                 if (dev->flags & IFF_UP)
5891                         call_netdevice_notifiers(NETDEV_UP, dev);
5892                 else
5893                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5894         }
5895
5896         if (dev->flags & IFF_UP &&
5897             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5898                 struct netdev_notifier_change_info change_info;
5899
5900                 change_info.flags_changed = changes;
5901                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5902                                               &change_info.info);
5903         }
5904 }
5905
5906 /**
5907  *      dev_change_flags - change device settings
5908  *      @dev: device
5909  *      @flags: device state flags
5910  *
5911  *      Change settings on device based state flags. The flags are
5912  *      in the userspace exported format.
5913  */
5914 int dev_change_flags(struct net_device *dev, unsigned int flags)
5915 {
5916         int ret;
5917         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5918
5919         ret = __dev_change_flags(dev, flags);
5920         if (ret < 0)
5921                 return ret;
5922
5923         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5924         __dev_notify_flags(dev, old_flags, changes);
5925         return ret;
5926 }
5927 EXPORT_SYMBOL(dev_change_flags);
5928
5929 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5930 {
5931         const struct net_device_ops *ops = dev->netdev_ops;
5932
5933         if (ops->ndo_change_mtu)
5934                 return ops->ndo_change_mtu(dev, new_mtu);
5935
5936         dev->mtu = new_mtu;
5937         return 0;
5938 }
5939
5940 /**
5941  *      dev_set_mtu - Change maximum transfer unit
5942  *      @dev: device
5943  *      @new_mtu: new transfer unit
5944  *
5945  *      Change the maximum transfer size of the network device.
5946  */
5947 int dev_set_mtu(struct net_device *dev, int new_mtu)
5948 {
5949         int err, orig_mtu;
5950
5951         if (new_mtu == dev->mtu)
5952                 return 0;
5953
5954         /*      MTU must be positive.    */
5955         if (new_mtu < 0)
5956                 return -EINVAL;
5957
5958         if (!netif_device_present(dev))
5959                 return -ENODEV;
5960
5961         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5962         err = notifier_to_errno(err);
5963         if (err)
5964                 return err;
5965
5966         orig_mtu = dev->mtu;
5967         err = __dev_set_mtu(dev, new_mtu);
5968
5969         if (!err) {
5970                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5971                 err = notifier_to_errno(err);
5972                 if (err) {
5973                         /* setting mtu back and notifying everyone again,
5974                          * so that they have a chance to revert changes.
5975                          */
5976                         __dev_set_mtu(dev, orig_mtu);
5977                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5978                 }
5979         }
5980         return err;
5981 }
5982 EXPORT_SYMBOL(dev_set_mtu);
5983
5984 /**
5985  *      dev_set_group - Change group this device belongs to
5986  *      @dev: device
5987  *      @new_group: group this device should belong to
5988  */
5989 void dev_set_group(struct net_device *dev, int new_group)
5990 {
5991         dev->group = new_group;
5992 }
5993 EXPORT_SYMBOL(dev_set_group);
5994
5995 /**
5996  *      dev_set_mac_address - Change Media Access Control Address
5997  *      @dev: device
5998  *      @sa: new address
5999  *
6000  *      Change the hardware (MAC) address of the device
6001  */
6002 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6003 {
6004         const struct net_device_ops *ops = dev->netdev_ops;
6005         int err;
6006
6007         if (!ops->ndo_set_mac_address)
6008                 return -EOPNOTSUPP;
6009         if (sa->sa_family != dev->type)
6010                 return -EINVAL;
6011         if (!netif_device_present(dev))
6012                 return -ENODEV;
6013         err = ops->ndo_set_mac_address(dev, sa);
6014         if (err)
6015                 return err;
6016         dev->addr_assign_type = NET_ADDR_SET;
6017         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6018         add_device_randomness(dev->dev_addr, dev->addr_len);
6019         return 0;
6020 }
6021 EXPORT_SYMBOL(dev_set_mac_address);
6022
6023 /**
6024  *      dev_change_carrier - Change device carrier
6025  *      @dev: device
6026  *      @new_carrier: new value
6027  *
6028  *      Change device carrier
6029  */
6030 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6031 {
6032         const struct net_device_ops *ops = dev->netdev_ops;
6033
6034         if (!ops->ndo_change_carrier)
6035                 return -EOPNOTSUPP;
6036         if (!netif_device_present(dev))
6037                 return -ENODEV;
6038         return ops->ndo_change_carrier(dev, new_carrier);
6039 }
6040 EXPORT_SYMBOL(dev_change_carrier);
6041
6042 /**
6043  *      dev_get_phys_port_id - Get device physical port ID
6044  *      @dev: device
6045  *      @ppid: port ID
6046  *
6047  *      Get device physical port ID
6048  */
6049 int dev_get_phys_port_id(struct net_device *dev,
6050                          struct netdev_phys_item_id *ppid)
6051 {
6052         const struct net_device_ops *ops = dev->netdev_ops;
6053
6054         if (!ops->ndo_get_phys_port_id)
6055                 return -EOPNOTSUPP;
6056         return ops->ndo_get_phys_port_id(dev, ppid);
6057 }
6058 EXPORT_SYMBOL(dev_get_phys_port_id);
6059
6060 /**
6061  *      dev_get_phys_port_name - Get device physical port name
6062  *      @dev: device
6063  *      @name: port name
6064  *
6065  *      Get device physical port name
6066  */
6067 int dev_get_phys_port_name(struct net_device *dev,
6068                            char *name, size_t len)
6069 {
6070         const struct net_device_ops *ops = dev->netdev_ops;
6071
6072         if (!ops->ndo_get_phys_port_name)
6073                 return -EOPNOTSUPP;
6074         return ops->ndo_get_phys_port_name(dev, name, len);
6075 }
6076 EXPORT_SYMBOL(dev_get_phys_port_name);
6077
6078 /**
6079  *      dev_new_index   -       allocate an ifindex
6080  *      @net: the applicable net namespace
6081  *
6082  *      Returns a suitable unique value for a new device interface
6083  *      number.  The caller must hold the rtnl semaphore or the
6084  *      dev_base_lock to be sure it remains unique.
6085  */
6086 static int dev_new_index(struct net *net)
6087 {
6088         int ifindex = net->ifindex;
6089         for (;;) {
6090                 if (++ifindex <= 0)
6091                         ifindex = 1;
6092                 if (!__dev_get_by_index(net, ifindex))
6093                         return net->ifindex = ifindex;
6094         }
6095 }
6096
6097 /* Delayed registration/unregisteration */
6098 static LIST_HEAD(net_todo_list);
6099 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6100
6101 static void net_set_todo(struct net_device *dev)
6102 {
6103         list_add_tail(&dev->todo_list, &net_todo_list);
6104         dev_net(dev)->dev_unreg_count++;
6105 }
6106
6107 static void rollback_registered_many(struct list_head *head)
6108 {
6109         struct net_device *dev, *tmp;
6110         LIST_HEAD(close_head);
6111
6112         BUG_ON(dev_boot_phase);
6113         ASSERT_RTNL();
6114
6115         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6116                 /* Some devices call without registering
6117                  * for initialization unwind. Remove those
6118                  * devices and proceed with the remaining.
6119                  */
6120                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6121                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6122                                  dev->name, dev);
6123
6124                         WARN_ON(1);
6125                         list_del(&dev->unreg_list);
6126                         continue;
6127                 }
6128                 dev->dismantle = true;
6129                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6130         }
6131
6132         /* If device is running, close it first. */
6133         list_for_each_entry(dev, head, unreg_list)
6134                 list_add_tail(&dev->close_list, &close_head);
6135         dev_close_many(&close_head, true);
6136
6137         list_for_each_entry(dev, head, unreg_list) {
6138                 /* And unlink it from device chain. */
6139                 unlist_netdevice(dev);
6140
6141                 dev->reg_state = NETREG_UNREGISTERING;
6142         }
6143
6144         synchronize_net();
6145
6146         list_for_each_entry(dev, head, unreg_list) {
6147                 struct sk_buff *skb = NULL;
6148
6149                 /* Shutdown queueing discipline. */
6150                 dev_shutdown(dev);
6151
6152
6153                 /* Notify protocols, that we are about to destroy
6154                    this device. They should clean all the things.
6155                 */
6156                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6157
6158                 if (!dev->rtnl_link_ops ||
6159                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6160                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6161                                                      GFP_KERNEL);
6162
6163                 /*
6164                  *      Flush the unicast and multicast chains
6165                  */
6166                 dev_uc_flush(dev);
6167                 dev_mc_flush(dev);
6168
6169                 if (dev->netdev_ops->ndo_uninit)
6170                         dev->netdev_ops->ndo_uninit(dev);
6171
6172                 if (skb)
6173                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6174
6175                 /* Notifier chain MUST detach us all upper devices. */
6176                 WARN_ON(netdev_has_any_upper_dev(dev));
6177
6178                 /* Remove entries from kobject tree */
6179                 netdev_unregister_kobject(dev);
6180 #ifdef CONFIG_XPS
6181                 /* Remove XPS queueing entries */
6182                 netif_reset_xps_queues_gt(dev, 0);
6183 #endif
6184         }
6185
6186         synchronize_net();
6187
6188         list_for_each_entry(dev, head, unreg_list)
6189                 dev_put(dev);
6190 }
6191
6192 static void rollback_registered(struct net_device *dev)
6193 {
6194         LIST_HEAD(single);
6195
6196         list_add(&dev->unreg_list, &single);
6197         rollback_registered_many(&single);
6198         list_del(&single);
6199 }
6200
6201 static netdev_features_t netdev_fix_features(struct net_device *dev,
6202         netdev_features_t features)
6203 {
6204         /* Fix illegal checksum combinations */
6205         if ((features & NETIF_F_HW_CSUM) &&
6206             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6207                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6208                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6209         }
6210
6211         /* TSO requires that SG is present as well. */
6212         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6213                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6214                 features &= ~NETIF_F_ALL_TSO;
6215         }
6216
6217         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6218                                         !(features & NETIF_F_IP_CSUM)) {
6219                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6220                 features &= ~NETIF_F_TSO;
6221                 features &= ~NETIF_F_TSO_ECN;
6222         }
6223
6224         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6225                                          !(features & NETIF_F_IPV6_CSUM)) {
6226                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6227                 features &= ~NETIF_F_TSO6;
6228         }
6229
6230         /* TSO ECN requires that TSO is present as well. */
6231         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6232                 features &= ~NETIF_F_TSO_ECN;
6233
6234         /* Software GSO depends on SG. */
6235         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6236                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6237                 features &= ~NETIF_F_GSO;
6238         }
6239
6240         /* UFO needs SG and checksumming */
6241         if (features & NETIF_F_UFO) {
6242                 /* maybe split UFO into V4 and V6? */
6243                 if (!((features & NETIF_F_GEN_CSUM) ||
6244                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6245                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6246                         netdev_dbg(dev,
6247                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6248                         features &= ~NETIF_F_UFO;
6249                 }
6250
6251                 if (!(features & NETIF_F_SG)) {
6252                         netdev_dbg(dev,
6253                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6254                         features &= ~NETIF_F_UFO;
6255                 }
6256         }
6257
6258 #ifdef CONFIG_NET_RX_BUSY_POLL
6259         if (dev->netdev_ops->ndo_busy_poll)
6260                 features |= NETIF_F_BUSY_POLL;
6261         else
6262 #endif
6263                 features &= ~NETIF_F_BUSY_POLL;
6264
6265         return features;
6266 }
6267
6268 int __netdev_update_features(struct net_device *dev)
6269 {
6270         netdev_features_t features;
6271         int err = 0;
6272
6273         ASSERT_RTNL();
6274
6275         features = netdev_get_wanted_features(dev);
6276
6277         if (dev->netdev_ops->ndo_fix_features)
6278                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6279
6280         /* driver might be less strict about feature dependencies */
6281         features = netdev_fix_features(dev, features);
6282
6283         if (dev->features == features)
6284                 return 0;
6285
6286         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6287                 &dev->features, &features);
6288
6289         if (dev->netdev_ops->ndo_set_features)
6290                 err = dev->netdev_ops->ndo_set_features(dev, features);
6291
6292         if (unlikely(err < 0)) {
6293                 netdev_err(dev,
6294                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6295                         err, &features, &dev->features);
6296                 return -1;
6297         }
6298
6299         if (!err)
6300                 dev->features = features;
6301
6302         return 1;
6303 }
6304
6305 /**
6306  *      netdev_update_features - recalculate device features
6307  *      @dev: the device to check
6308  *
6309  *      Recalculate dev->features set and send notifications if it
6310  *      has changed. Should be called after driver or hardware dependent
6311  *      conditions might have changed that influence the features.
6312  */
6313 void netdev_update_features(struct net_device *dev)
6314 {
6315         if (__netdev_update_features(dev))
6316                 netdev_features_change(dev);
6317 }
6318 EXPORT_SYMBOL(netdev_update_features);
6319
6320 /**
6321  *      netdev_change_features - recalculate device features
6322  *      @dev: the device to check
6323  *
6324  *      Recalculate dev->features set and send notifications even
6325  *      if they have not changed. Should be called instead of
6326  *      netdev_update_features() if also dev->vlan_features might
6327  *      have changed to allow the changes to be propagated to stacked
6328  *      VLAN devices.
6329  */
6330 void netdev_change_features(struct net_device *dev)
6331 {
6332         __netdev_update_features(dev);
6333         netdev_features_change(dev);
6334 }
6335 EXPORT_SYMBOL(netdev_change_features);
6336
6337 /**
6338  *      netif_stacked_transfer_operstate -      transfer operstate
6339  *      @rootdev: the root or lower level device to transfer state from
6340  *      @dev: the device to transfer operstate to
6341  *
6342  *      Transfer operational state from root to device. This is normally
6343  *      called when a stacking relationship exists between the root
6344  *      device and the device(a leaf device).
6345  */
6346 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6347                                         struct net_device *dev)
6348 {
6349         if (rootdev->operstate == IF_OPER_DORMANT)
6350                 netif_dormant_on(dev);
6351         else
6352                 netif_dormant_off(dev);
6353
6354         if (netif_carrier_ok(rootdev)) {
6355                 if (!netif_carrier_ok(dev))
6356                         netif_carrier_on(dev);
6357         } else {
6358                 if (netif_carrier_ok(dev))
6359                         netif_carrier_off(dev);
6360         }
6361 }
6362 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6363
6364 #ifdef CONFIG_SYSFS
6365 static int netif_alloc_rx_queues(struct net_device *dev)
6366 {
6367         unsigned int i, count = dev->num_rx_queues;
6368         struct netdev_rx_queue *rx;
6369         size_t sz = count * sizeof(*rx);
6370
6371         BUG_ON(count < 1);
6372
6373         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6374         if (!rx) {
6375                 rx = vzalloc(sz);
6376                 if (!rx)
6377                         return -ENOMEM;
6378         }
6379         dev->_rx = rx;
6380
6381         for (i = 0; i < count; i++)
6382                 rx[i].dev = dev;
6383         return 0;
6384 }
6385 #endif
6386
6387 static void netdev_init_one_queue(struct net_device *dev,
6388                                   struct netdev_queue *queue, void *_unused)
6389 {
6390         /* Initialize queue lock */
6391         spin_lock_init(&queue->_xmit_lock);
6392         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6393         queue->xmit_lock_owner = -1;
6394         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6395         queue->dev = dev;
6396 #ifdef CONFIG_BQL
6397         dql_init(&queue->dql, HZ);
6398 #endif
6399 }
6400
6401 static void netif_free_tx_queues(struct net_device *dev)
6402 {
6403         kvfree(dev->_tx);
6404 }
6405
6406 static int netif_alloc_netdev_queues(struct net_device *dev)
6407 {
6408         unsigned int count = dev->num_tx_queues;
6409         struct netdev_queue *tx;
6410         size_t sz = count * sizeof(*tx);
6411
6412         BUG_ON(count < 1 || count > 0xffff);
6413
6414         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6415         if (!tx) {
6416                 tx = vzalloc(sz);
6417                 if (!tx)
6418                         return -ENOMEM;
6419         }
6420         dev->_tx = tx;
6421
6422         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6423         spin_lock_init(&dev->tx_global_lock);
6424
6425         return 0;
6426 }
6427
6428 void netif_tx_stop_all_queues(struct net_device *dev)
6429 {
6430         unsigned int i;
6431
6432         for (i = 0; i < dev->num_tx_queues; i++) {
6433                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6434                 netif_tx_stop_queue(txq);
6435         }
6436 }
6437 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6438
6439 /**
6440  *      register_netdevice      - register a network device
6441  *      @dev: device to register
6442  *
6443  *      Take a completed network device structure and add it to the kernel
6444  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6445  *      chain. 0 is returned on success. A negative errno code is returned
6446  *      on a failure to set up the device, or if the name is a duplicate.
6447  *
6448  *      Callers must hold the rtnl semaphore. You may want
6449  *      register_netdev() instead of this.
6450  *
6451  *      BUGS:
6452  *      The locking appears insufficient to guarantee two parallel registers
6453  *      will not get the same name.
6454  */
6455
6456 int register_netdevice(struct net_device *dev)
6457 {
6458         int ret;
6459         struct net *net = dev_net(dev);
6460
6461         BUG_ON(dev_boot_phase);
6462         ASSERT_RTNL();
6463
6464         might_sleep();
6465
6466         /* When net_device's are persistent, this will be fatal. */
6467         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6468         BUG_ON(!net);
6469
6470         spin_lock_init(&dev->addr_list_lock);
6471         netdev_set_addr_lockdep_class(dev);
6472
6473         ret = dev_get_valid_name(net, dev, dev->name);
6474         if (ret < 0)
6475                 goto out;
6476
6477         /* Init, if this function is available */
6478         if (dev->netdev_ops->ndo_init) {
6479                 ret = dev->netdev_ops->ndo_init(dev);
6480                 if (ret) {
6481                         if (ret > 0)
6482                                 ret = -EIO;
6483                         goto out;
6484                 }
6485         }
6486
6487         if (((dev->hw_features | dev->features) &
6488              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6489             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6490              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6491                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6492                 ret = -EINVAL;
6493                 goto err_uninit;
6494         }
6495
6496         ret = -EBUSY;
6497         if (!dev->ifindex)
6498                 dev->ifindex = dev_new_index(net);
6499         else if (__dev_get_by_index(net, dev->ifindex))
6500                 goto err_uninit;
6501
6502         /* Transfer changeable features to wanted_features and enable
6503          * software offloads (GSO and GRO).
6504          */
6505         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6506         dev->features |= NETIF_F_SOFT_FEATURES;
6507         dev->wanted_features = dev->features & dev->hw_features;
6508
6509         if (!(dev->flags & IFF_LOOPBACK)) {
6510                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6511         }
6512
6513         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6514          */
6515         dev->vlan_features |= NETIF_F_HIGHDMA;
6516
6517         /* Make NETIF_F_SG inheritable to tunnel devices.
6518          */
6519         dev->hw_enc_features |= NETIF_F_SG;
6520
6521         /* Make NETIF_F_SG inheritable to MPLS.
6522          */
6523         dev->mpls_features |= NETIF_F_SG;
6524
6525         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6526         ret = notifier_to_errno(ret);
6527         if (ret)
6528                 goto err_uninit;
6529
6530         ret = netdev_register_kobject(dev);
6531         if (ret)
6532                 goto err_uninit;
6533         dev->reg_state = NETREG_REGISTERED;
6534
6535         __netdev_update_features(dev);
6536
6537         /*
6538          *      Default initial state at registry is that the
6539          *      device is present.
6540          */
6541
6542         set_bit(__LINK_STATE_PRESENT, &dev->state);
6543
6544         linkwatch_init_dev(dev);
6545
6546         dev_init_scheduler(dev);
6547         dev_hold(dev);
6548         list_netdevice(dev);
6549         add_device_randomness(dev->dev_addr, dev->addr_len);
6550
6551         /* If the device has permanent device address, driver should
6552          * set dev_addr and also addr_assign_type should be set to
6553          * NET_ADDR_PERM (default value).
6554          */
6555         if (dev->addr_assign_type == NET_ADDR_PERM)
6556                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6557
6558         /* Notify protocols, that a new device appeared. */
6559         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6560         ret = notifier_to_errno(ret);
6561         if (ret) {
6562                 rollback_registered(dev);
6563                 dev->reg_state = NETREG_UNREGISTERED;
6564         }
6565         /*
6566          *      Prevent userspace races by waiting until the network
6567          *      device is fully setup before sending notifications.
6568          */
6569         if (!dev->rtnl_link_ops ||
6570             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6571                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6572
6573 out:
6574         return ret;
6575
6576 err_uninit:
6577         if (dev->netdev_ops->ndo_uninit)
6578                 dev->netdev_ops->ndo_uninit(dev);
6579         goto out;
6580 }
6581 EXPORT_SYMBOL(register_netdevice);
6582
6583 /**
6584  *      init_dummy_netdev       - init a dummy network device for NAPI
6585  *      @dev: device to init
6586  *
6587  *      This takes a network device structure and initialize the minimum
6588  *      amount of fields so it can be used to schedule NAPI polls without
6589  *      registering a full blown interface. This is to be used by drivers
6590  *      that need to tie several hardware interfaces to a single NAPI
6591  *      poll scheduler due to HW limitations.
6592  */
6593 int init_dummy_netdev(struct net_device *dev)
6594 {
6595         /* Clear everything. Note we don't initialize spinlocks
6596          * are they aren't supposed to be taken by any of the
6597          * NAPI code and this dummy netdev is supposed to be
6598          * only ever used for NAPI polls
6599          */
6600         memset(dev, 0, sizeof(struct net_device));
6601
6602         /* make sure we BUG if trying to hit standard
6603          * register/unregister code path
6604          */
6605         dev->reg_state = NETREG_DUMMY;
6606
6607         /* NAPI wants this */
6608         INIT_LIST_HEAD(&dev->napi_list);
6609
6610         /* a dummy interface is started by default */
6611         set_bit(__LINK_STATE_PRESENT, &dev->state);
6612         set_bit(__LINK_STATE_START, &dev->state);
6613
6614         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6615          * because users of this 'device' dont need to change
6616          * its refcount.
6617          */
6618
6619         return 0;
6620 }
6621 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6622
6623
6624 /**
6625  *      register_netdev - register a network device
6626  *      @dev: device to register
6627  *
6628  *      Take a completed network device structure and add it to the kernel
6629  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6630  *      chain. 0 is returned on success. A negative errno code is returned
6631  *      on a failure to set up the device, or if the name is a duplicate.
6632  *
6633  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6634  *      and expands the device name if you passed a format string to
6635  *      alloc_netdev.
6636  */
6637 int register_netdev(struct net_device *dev)
6638 {
6639         int err;
6640
6641         rtnl_lock();
6642         err = register_netdevice(dev);
6643         rtnl_unlock();
6644         return err;
6645 }
6646 EXPORT_SYMBOL(register_netdev);
6647
6648 int netdev_refcnt_read(const struct net_device *dev)
6649 {
6650         int i, refcnt = 0;
6651
6652         for_each_possible_cpu(i)
6653                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6654         return refcnt;
6655 }
6656 EXPORT_SYMBOL(netdev_refcnt_read);
6657
6658 /**
6659  * netdev_wait_allrefs - wait until all references are gone.
6660  * @dev: target net_device
6661  *
6662  * This is called when unregistering network devices.
6663  *
6664  * Any protocol or device that holds a reference should register
6665  * for netdevice notification, and cleanup and put back the
6666  * reference if they receive an UNREGISTER event.
6667  * We can get stuck here if buggy protocols don't correctly
6668  * call dev_put.
6669  */
6670 static void netdev_wait_allrefs(struct net_device *dev)
6671 {
6672         unsigned long rebroadcast_time, warning_time;
6673         int refcnt;
6674
6675         linkwatch_forget_dev(dev);
6676
6677         rebroadcast_time = warning_time = jiffies;
6678         refcnt = netdev_refcnt_read(dev);
6679
6680         while (refcnt != 0) {
6681                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6682                         rtnl_lock();
6683
6684                         /* Rebroadcast unregister notification */
6685                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6686
6687                         __rtnl_unlock();
6688                         rcu_barrier();
6689                         rtnl_lock();
6690
6691                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6692                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6693                                      &dev->state)) {
6694                                 /* We must not have linkwatch events
6695                                  * pending on unregister. If this
6696                                  * happens, we simply run the queue
6697                                  * unscheduled, resulting in a noop
6698                                  * for this device.
6699                                  */
6700                                 linkwatch_run_queue();
6701                         }
6702
6703                         __rtnl_unlock();
6704
6705                         rebroadcast_time = jiffies;
6706                 }
6707
6708                 msleep(250);
6709
6710                 refcnt = netdev_refcnt_read(dev);
6711
6712                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6713                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6714                                  dev->name, refcnt);
6715                         warning_time = jiffies;
6716                 }
6717         }
6718 }
6719
6720 /* The sequence is:
6721  *
6722  *      rtnl_lock();
6723  *      ...
6724  *      register_netdevice(x1);
6725  *      register_netdevice(x2);
6726  *      ...
6727  *      unregister_netdevice(y1);
6728  *      unregister_netdevice(y2);
6729  *      ...
6730  *      rtnl_unlock();
6731  *      free_netdev(y1);
6732  *      free_netdev(y2);
6733  *
6734  * We are invoked by rtnl_unlock().
6735  * This allows us to deal with problems:
6736  * 1) We can delete sysfs objects which invoke hotplug
6737  *    without deadlocking with linkwatch via keventd.
6738  * 2) Since we run with the RTNL semaphore not held, we can sleep
6739  *    safely in order to wait for the netdev refcnt to drop to zero.
6740  *
6741  * We must not return until all unregister events added during
6742  * the interval the lock was held have been completed.
6743  */
6744 void netdev_run_todo(void)
6745 {
6746         struct list_head list;
6747
6748         /* Snapshot list, allow later requests */
6749         list_replace_init(&net_todo_list, &list);
6750
6751         __rtnl_unlock();
6752
6753
6754         /* Wait for rcu callbacks to finish before next phase */
6755         if (!list_empty(&list))
6756                 rcu_barrier();
6757
6758         while (!list_empty(&list)) {
6759                 struct net_device *dev
6760                         = list_first_entry(&list, struct net_device, todo_list);
6761                 list_del(&dev->todo_list);
6762
6763                 rtnl_lock();
6764                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6765                 __rtnl_unlock();
6766
6767                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6768                         pr_err("network todo '%s' but state %d\n",
6769                                dev->name, dev->reg_state);
6770                         dump_stack();
6771                         continue;
6772                 }
6773
6774                 dev->reg_state = NETREG_UNREGISTERED;
6775
6776                 on_each_cpu(flush_backlog, dev, 1);
6777
6778                 netdev_wait_allrefs(dev);
6779
6780                 /* paranoia */
6781                 BUG_ON(netdev_refcnt_read(dev));
6782                 BUG_ON(!list_empty(&dev->ptype_all));
6783                 BUG_ON(!list_empty(&dev->ptype_specific));
6784                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6785                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6786                 WARN_ON(dev->dn_ptr);
6787
6788                 if (dev->destructor)
6789                         dev->destructor(dev);
6790
6791                 /* Report a network device has been unregistered */
6792                 rtnl_lock();
6793                 dev_net(dev)->dev_unreg_count--;
6794                 __rtnl_unlock();
6795                 wake_up(&netdev_unregistering_wq);
6796
6797                 /* Free network device */
6798                 kobject_put(&dev->dev.kobj);
6799         }
6800 }
6801
6802 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6803  * fields in the same order, with only the type differing.
6804  */
6805 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6806                              const struct net_device_stats *netdev_stats)
6807 {
6808 #if BITS_PER_LONG == 64
6809         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6810         memcpy(stats64, netdev_stats, sizeof(*stats64));
6811 #else
6812         size_t i, n = sizeof(*stats64) / sizeof(u64);
6813         const unsigned long *src = (const unsigned long *)netdev_stats;
6814         u64 *dst = (u64 *)stats64;
6815
6816         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6817                      sizeof(*stats64) / sizeof(u64));
6818         for (i = 0; i < n; i++)
6819                 dst[i] = src[i];
6820 #endif
6821 }
6822 EXPORT_SYMBOL(netdev_stats_to_stats64);
6823
6824 /**
6825  *      dev_get_stats   - get network device statistics
6826  *      @dev: device to get statistics from
6827  *      @storage: place to store stats
6828  *
6829  *      Get network statistics from device. Return @storage.
6830  *      The device driver may provide its own method by setting
6831  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6832  *      otherwise the internal statistics structure is used.
6833  */
6834 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6835                                         struct rtnl_link_stats64 *storage)
6836 {
6837         const struct net_device_ops *ops = dev->netdev_ops;
6838
6839         if (ops->ndo_get_stats64) {
6840                 memset(storage, 0, sizeof(*storage));
6841                 ops->ndo_get_stats64(dev, storage);
6842         } else if (ops->ndo_get_stats) {
6843                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6844         } else {
6845                 netdev_stats_to_stats64(storage, &dev->stats);
6846         }
6847         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6848         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6849         return storage;
6850 }
6851 EXPORT_SYMBOL(dev_get_stats);
6852
6853 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6854 {
6855         struct netdev_queue *queue = dev_ingress_queue(dev);
6856
6857 #ifdef CONFIG_NET_CLS_ACT
6858         if (queue)
6859                 return queue;
6860         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6861         if (!queue)
6862                 return NULL;
6863         netdev_init_one_queue(dev, queue, NULL);
6864         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6865         queue->qdisc_sleeping = &noop_qdisc;
6866         rcu_assign_pointer(dev->ingress_queue, queue);
6867 #endif
6868         return queue;
6869 }
6870
6871 static const struct ethtool_ops default_ethtool_ops;
6872
6873 void netdev_set_default_ethtool_ops(struct net_device *dev,
6874                                     const struct ethtool_ops *ops)
6875 {
6876         if (dev->ethtool_ops == &default_ethtool_ops)
6877                 dev->ethtool_ops = ops;
6878 }
6879 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6880
6881 void netdev_freemem(struct net_device *dev)
6882 {
6883         char *addr = (char *)dev - dev->padded;
6884
6885         kvfree(addr);
6886 }
6887
6888 /**
6889  *      alloc_netdev_mqs - allocate network device
6890  *      @sizeof_priv:           size of private data to allocate space for
6891  *      @name:                  device name format string
6892  *      @name_assign_type:      origin of device name
6893  *      @setup:                 callback to initialize device
6894  *      @txqs:                  the number of TX subqueues to allocate
6895  *      @rxqs:                  the number of RX subqueues to allocate
6896  *
6897  *      Allocates a struct net_device with private data area for driver use
6898  *      and performs basic initialization.  Also allocates subqueue structs
6899  *      for each queue on the device.
6900  */
6901 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6902                 unsigned char name_assign_type,
6903                 void (*setup)(struct net_device *),
6904                 unsigned int txqs, unsigned int rxqs)
6905 {
6906         struct net_device *dev;
6907         size_t alloc_size;
6908         struct net_device *p;
6909
6910         BUG_ON(strlen(name) >= sizeof(dev->name));
6911
6912         if (txqs < 1) {
6913                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6914                 return NULL;
6915         }
6916
6917 #ifdef CONFIG_SYSFS
6918         if (rxqs < 1) {
6919                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6920                 return NULL;
6921         }
6922 #endif
6923
6924         alloc_size = sizeof(struct net_device);
6925         if (sizeof_priv) {
6926                 /* ensure 32-byte alignment of private area */
6927                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6928                 alloc_size += sizeof_priv;
6929         }
6930         /* ensure 32-byte alignment of whole construct */
6931         alloc_size += NETDEV_ALIGN - 1;
6932
6933         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6934         if (!p)
6935                 p = vzalloc(alloc_size);
6936         if (!p)
6937                 return NULL;
6938
6939         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6940         dev->padded = (char *)dev - (char *)p;
6941
6942         dev->pcpu_refcnt = alloc_percpu(int);
6943         if (!dev->pcpu_refcnt)
6944                 goto free_dev;
6945
6946         if (dev_addr_init(dev))
6947                 goto free_pcpu;
6948
6949         dev_mc_init(dev);
6950         dev_uc_init(dev);
6951
6952         dev_net_set(dev, &init_net);
6953
6954         dev->gso_max_size = GSO_MAX_SIZE;
6955         dev->gso_max_segs = GSO_MAX_SEGS;
6956         dev->gso_min_segs = 0;
6957
6958         INIT_LIST_HEAD(&dev->napi_list);
6959         INIT_LIST_HEAD(&dev->unreg_list);
6960         INIT_LIST_HEAD(&dev->close_list);
6961         INIT_LIST_HEAD(&dev->link_watch_list);
6962         INIT_LIST_HEAD(&dev->adj_list.upper);
6963         INIT_LIST_HEAD(&dev->adj_list.lower);
6964         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6965         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6966         INIT_LIST_HEAD(&dev->ptype_all);
6967         INIT_LIST_HEAD(&dev->ptype_specific);
6968         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6969         setup(dev);
6970
6971         dev->num_tx_queues = txqs;
6972         dev->real_num_tx_queues = txqs;
6973         if (netif_alloc_netdev_queues(dev))
6974                 goto free_all;
6975
6976 #ifdef CONFIG_SYSFS
6977         dev->num_rx_queues = rxqs;
6978         dev->real_num_rx_queues = rxqs;
6979         if (netif_alloc_rx_queues(dev))
6980                 goto free_all;
6981 #endif
6982
6983         strcpy(dev->name, name);
6984         dev->name_assign_type = name_assign_type;
6985         dev->group = INIT_NETDEV_GROUP;
6986         if (!dev->ethtool_ops)
6987                 dev->ethtool_ops = &default_ethtool_ops;
6988
6989         nf_hook_ingress_init(dev);
6990
6991         return dev;
6992
6993 free_all:
6994         free_netdev(dev);
6995         return NULL;
6996
6997 free_pcpu:
6998         free_percpu(dev->pcpu_refcnt);
6999 free_dev:
7000         netdev_freemem(dev);
7001         return NULL;
7002 }
7003 EXPORT_SYMBOL(alloc_netdev_mqs);
7004
7005 /**
7006  *      free_netdev - free network device
7007  *      @dev: device
7008  *
7009  *      This function does the last stage of destroying an allocated device
7010  *      interface. The reference to the device object is released.
7011  *      If this is the last reference then it will be freed.
7012  */
7013 void free_netdev(struct net_device *dev)
7014 {
7015         struct napi_struct *p, *n;
7016
7017         netif_free_tx_queues(dev);
7018 #ifdef CONFIG_SYSFS
7019         kvfree(dev->_rx);
7020 #endif
7021
7022         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7023
7024         /* Flush device addresses */
7025         dev_addr_flush(dev);
7026
7027         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7028                 netif_napi_del(p);
7029
7030         free_percpu(dev->pcpu_refcnt);
7031         dev->pcpu_refcnt = NULL;
7032
7033         /*  Compatibility with error handling in drivers */
7034         if (dev->reg_state == NETREG_UNINITIALIZED) {
7035                 netdev_freemem(dev);
7036                 return;
7037         }
7038
7039         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7040         dev->reg_state = NETREG_RELEASED;
7041
7042         /* will free via device release */
7043         put_device(&dev->dev);
7044 }
7045 EXPORT_SYMBOL(free_netdev);
7046
7047 /**
7048  *      synchronize_net -  Synchronize with packet receive processing
7049  *
7050  *      Wait for packets currently being received to be done.
7051  *      Does not block later packets from starting.
7052  */
7053 void synchronize_net(void)
7054 {
7055         might_sleep();
7056         if (rtnl_is_locked())
7057                 synchronize_rcu_expedited();
7058         else
7059                 synchronize_rcu();
7060 }
7061 EXPORT_SYMBOL(synchronize_net);
7062
7063 /**
7064  *      unregister_netdevice_queue - remove device from the kernel
7065  *      @dev: device
7066  *      @head: list
7067  *
7068  *      This function shuts down a device interface and removes it
7069  *      from the kernel tables.
7070  *      If head not NULL, device is queued to be unregistered later.
7071  *
7072  *      Callers must hold the rtnl semaphore.  You may want
7073  *      unregister_netdev() instead of this.
7074  */
7075
7076 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7077 {
7078         ASSERT_RTNL();
7079
7080         if (head) {
7081                 list_move_tail(&dev->unreg_list, head);
7082         } else {
7083                 rollback_registered(dev);
7084                 /* Finish processing unregister after unlock */
7085                 net_set_todo(dev);
7086         }
7087 }
7088 EXPORT_SYMBOL(unregister_netdevice_queue);
7089
7090 /**
7091  *      unregister_netdevice_many - unregister many devices
7092  *      @head: list of devices
7093  *
7094  *  Note: As most callers use a stack allocated list_head,
7095  *  we force a list_del() to make sure stack wont be corrupted later.
7096  */
7097 void unregister_netdevice_many(struct list_head *head)
7098 {
7099         struct net_device *dev;
7100
7101         if (!list_empty(head)) {
7102                 rollback_registered_many(head);
7103                 list_for_each_entry(dev, head, unreg_list)
7104                         net_set_todo(dev);
7105                 list_del(head);
7106         }
7107 }
7108 EXPORT_SYMBOL(unregister_netdevice_many);
7109
7110 /**
7111  *      unregister_netdev - remove device from the kernel
7112  *      @dev: device
7113  *
7114  *      This function shuts down a device interface and removes it
7115  *      from the kernel tables.
7116  *
7117  *      This is just a wrapper for unregister_netdevice that takes
7118  *      the rtnl semaphore.  In general you want to use this and not
7119  *      unregister_netdevice.
7120  */
7121 void unregister_netdev(struct net_device *dev)
7122 {
7123         rtnl_lock();
7124         unregister_netdevice(dev);
7125         rtnl_unlock();
7126 }
7127 EXPORT_SYMBOL(unregister_netdev);
7128
7129 /**
7130  *      dev_change_net_namespace - move device to different nethost namespace
7131  *      @dev: device
7132  *      @net: network namespace
7133  *      @pat: If not NULL name pattern to try if the current device name
7134  *            is already taken in the destination network namespace.
7135  *
7136  *      This function shuts down a device interface and moves it
7137  *      to a new network namespace. On success 0 is returned, on
7138  *      a failure a netagive errno code is returned.
7139  *
7140  *      Callers must hold the rtnl semaphore.
7141  */
7142
7143 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7144 {
7145         int err;
7146
7147         ASSERT_RTNL();
7148
7149         /* Don't allow namespace local devices to be moved. */
7150         err = -EINVAL;
7151         if (dev->features & NETIF_F_NETNS_LOCAL)
7152                 goto out;
7153
7154         /* Ensure the device has been registrered */
7155         if (dev->reg_state != NETREG_REGISTERED)
7156                 goto out;
7157
7158         /* Get out if there is nothing todo */
7159         err = 0;
7160         if (net_eq(dev_net(dev), net))
7161                 goto out;
7162
7163         /* Pick the destination device name, and ensure
7164          * we can use it in the destination network namespace.
7165          */
7166         err = -EEXIST;
7167         if (__dev_get_by_name(net, dev->name)) {
7168                 /* We get here if we can't use the current device name */
7169                 if (!pat)
7170                         goto out;
7171                 if (dev_get_valid_name(net, dev, pat) < 0)
7172                         goto out;
7173         }
7174
7175         /*
7176          * And now a mini version of register_netdevice unregister_netdevice.
7177          */
7178
7179         /* If device is running close it first. */
7180         dev_close(dev);
7181
7182         /* And unlink it from device chain */
7183         err = -ENODEV;
7184         unlist_netdevice(dev);
7185
7186         synchronize_net();
7187
7188         /* Shutdown queueing discipline. */
7189         dev_shutdown(dev);
7190
7191         /* Notify protocols, that we are about to destroy
7192            this device. They should clean all the things.
7193
7194            Note that dev->reg_state stays at NETREG_REGISTERED.
7195            This is wanted because this way 8021q and macvlan know
7196            the device is just moving and can keep their slaves up.
7197         */
7198         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7199         rcu_barrier();
7200         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7201         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7202
7203         /*
7204          *      Flush the unicast and multicast chains
7205          */
7206         dev_uc_flush(dev);
7207         dev_mc_flush(dev);
7208
7209         /* Send a netdev-removed uevent to the old namespace */
7210         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7211         netdev_adjacent_del_links(dev);
7212
7213         /* Actually switch the network namespace */
7214         dev_net_set(dev, net);
7215
7216         /* If there is an ifindex conflict assign a new one */
7217         if (__dev_get_by_index(net, dev->ifindex))
7218                 dev->ifindex = dev_new_index(net);
7219
7220         /* Send a netdev-add uevent to the new namespace */
7221         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7222         netdev_adjacent_add_links(dev);
7223
7224         /* Fixup kobjects */
7225         err = device_rename(&dev->dev, dev->name);
7226         WARN_ON(err);
7227
7228         /* Add the device back in the hashes */
7229         list_netdevice(dev);
7230
7231         /* Notify protocols, that a new device appeared. */
7232         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7233
7234         /*
7235          *      Prevent userspace races by waiting until the network
7236          *      device is fully setup before sending notifications.
7237          */
7238         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7239
7240         synchronize_net();
7241         err = 0;
7242 out:
7243         return err;
7244 }
7245 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7246
7247 static int dev_cpu_callback(struct notifier_block *nfb,
7248                             unsigned long action,
7249                             void *ocpu)
7250 {
7251         struct sk_buff **list_skb;
7252         struct sk_buff *skb;
7253         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7254         struct softnet_data *sd, *oldsd;
7255
7256         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7257                 return NOTIFY_OK;
7258
7259         local_irq_disable();
7260         cpu = smp_processor_id();
7261         sd = &per_cpu(softnet_data, cpu);
7262         oldsd = &per_cpu(softnet_data, oldcpu);
7263
7264         /* Find end of our completion_queue. */
7265         list_skb = &sd->completion_queue;
7266         while (*list_skb)
7267                 list_skb = &(*list_skb)->next;
7268         /* Append completion queue from offline CPU. */
7269         *list_skb = oldsd->completion_queue;
7270         oldsd->completion_queue = NULL;
7271
7272         /* Append output queue from offline CPU. */
7273         if (oldsd->output_queue) {
7274                 *sd->output_queue_tailp = oldsd->output_queue;
7275                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7276                 oldsd->output_queue = NULL;
7277                 oldsd->output_queue_tailp = &oldsd->output_queue;
7278         }
7279         /* Append NAPI poll list from offline CPU, with one exception :
7280          * process_backlog() must be called by cpu owning percpu backlog.
7281          * We properly handle process_queue & input_pkt_queue later.
7282          */
7283         while (!list_empty(&oldsd->poll_list)) {
7284                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7285                                                             struct napi_struct,
7286                                                             poll_list);
7287
7288                 list_del_init(&napi->poll_list);
7289                 if (napi->poll == process_backlog)
7290                         napi->state = 0;
7291                 else
7292                         ____napi_schedule(sd, napi);
7293         }
7294
7295         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7296         local_irq_enable();
7297
7298         /* Process offline CPU's input_pkt_queue */
7299         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7300                 netif_rx_ni(skb);
7301                 input_queue_head_incr(oldsd);
7302         }
7303         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7304                 netif_rx_ni(skb);
7305                 input_queue_head_incr(oldsd);
7306         }
7307
7308         return NOTIFY_OK;
7309 }
7310
7311
7312 /**
7313  *      netdev_increment_features - increment feature set by one
7314  *      @all: current feature set
7315  *      @one: new feature set
7316  *      @mask: mask feature set
7317  *
7318  *      Computes a new feature set after adding a device with feature set
7319  *      @one to the master device with current feature set @all.  Will not
7320  *      enable anything that is off in @mask. Returns the new feature set.
7321  */
7322 netdev_features_t netdev_increment_features(netdev_features_t all,
7323         netdev_features_t one, netdev_features_t mask)
7324 {
7325         if (mask & NETIF_F_GEN_CSUM)
7326                 mask |= NETIF_F_ALL_CSUM;
7327         mask |= NETIF_F_VLAN_CHALLENGED;
7328
7329         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7330         all &= one | ~NETIF_F_ALL_FOR_ALL;
7331
7332         /* If one device supports hw checksumming, set for all. */
7333         if (all & NETIF_F_GEN_CSUM)
7334                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7335
7336         return all;
7337 }
7338 EXPORT_SYMBOL(netdev_increment_features);
7339
7340 static struct hlist_head * __net_init netdev_create_hash(void)
7341 {
7342         int i;
7343         struct hlist_head *hash;
7344
7345         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7346         if (hash != NULL)
7347                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7348                         INIT_HLIST_HEAD(&hash[i]);
7349
7350         return hash;
7351 }
7352
7353 /* Initialize per network namespace state */
7354 static int __net_init netdev_init(struct net *net)
7355 {
7356         if (net != &init_net)
7357                 INIT_LIST_HEAD(&net->dev_base_head);
7358
7359         net->dev_name_head = netdev_create_hash();
7360         if (net->dev_name_head == NULL)
7361                 goto err_name;
7362
7363         net->dev_index_head = netdev_create_hash();
7364         if (net->dev_index_head == NULL)
7365                 goto err_idx;
7366
7367         return 0;
7368
7369 err_idx:
7370         kfree(net->dev_name_head);
7371 err_name:
7372         return -ENOMEM;
7373 }
7374
7375 /**
7376  *      netdev_drivername - network driver for the device
7377  *      @dev: network device
7378  *
7379  *      Determine network driver for device.
7380  */
7381 const char *netdev_drivername(const struct net_device *dev)
7382 {
7383         const struct device_driver *driver;
7384         const struct device *parent;
7385         const char *empty = "";
7386
7387         parent = dev->dev.parent;
7388         if (!parent)
7389                 return empty;
7390
7391         driver = parent->driver;
7392         if (driver && driver->name)
7393                 return driver->name;
7394         return empty;
7395 }
7396
7397 static void __netdev_printk(const char *level, const struct net_device *dev,
7398                             struct va_format *vaf)
7399 {
7400         if (dev && dev->dev.parent) {
7401                 dev_printk_emit(level[1] - '0',
7402                                 dev->dev.parent,
7403                                 "%s %s %s%s: %pV",
7404                                 dev_driver_string(dev->dev.parent),
7405                                 dev_name(dev->dev.parent),
7406                                 netdev_name(dev), netdev_reg_state(dev),
7407                                 vaf);
7408         } else if (dev) {
7409                 printk("%s%s%s: %pV",
7410                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7411         } else {
7412                 printk("%s(NULL net_device): %pV", level, vaf);
7413         }
7414 }
7415
7416 void netdev_printk(const char *level, const struct net_device *dev,
7417                    const char *format, ...)
7418 {
7419         struct va_format vaf;
7420         va_list args;
7421
7422         va_start(args, format);
7423
7424         vaf.fmt = format;
7425         vaf.va = &args;
7426
7427         __netdev_printk(level, dev, &vaf);
7428
7429         va_end(args);
7430 }
7431 EXPORT_SYMBOL(netdev_printk);
7432
7433 #define define_netdev_printk_level(func, level)                 \
7434 void func(const struct net_device *dev, const char *fmt, ...)   \
7435 {                                                               \
7436         struct va_format vaf;                                   \
7437         va_list args;                                           \
7438                                                                 \
7439         va_start(args, fmt);                                    \
7440                                                                 \
7441         vaf.fmt = fmt;                                          \
7442         vaf.va = &args;                                         \
7443                                                                 \
7444         __netdev_printk(level, dev, &vaf);                      \
7445                                                                 \
7446         va_end(args);                                           \
7447 }                                                               \
7448 EXPORT_SYMBOL(func);
7449
7450 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7451 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7452 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7453 define_netdev_printk_level(netdev_err, KERN_ERR);
7454 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7455 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7456 define_netdev_printk_level(netdev_info, KERN_INFO);
7457
7458 static void __net_exit netdev_exit(struct net *net)
7459 {
7460         kfree(net->dev_name_head);
7461         kfree(net->dev_index_head);
7462 }
7463
7464 static struct pernet_operations __net_initdata netdev_net_ops = {
7465         .init = netdev_init,
7466         .exit = netdev_exit,
7467 };
7468
7469 static void __net_exit default_device_exit(struct net *net)
7470 {
7471         struct net_device *dev, *aux;
7472         /*
7473          * Push all migratable network devices back to the
7474          * initial network namespace
7475          */
7476         rtnl_lock();
7477         for_each_netdev_safe(net, dev, aux) {
7478                 int err;
7479                 char fb_name[IFNAMSIZ];
7480
7481                 /* Ignore unmoveable devices (i.e. loopback) */
7482                 if (dev->features & NETIF_F_NETNS_LOCAL)
7483                         continue;
7484
7485                 /* Leave virtual devices for the generic cleanup */
7486                 if (dev->rtnl_link_ops)
7487                         continue;
7488
7489                 /* Push remaining network devices to init_net */
7490                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7491                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7492                 if (err) {
7493                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7494                                  __func__, dev->name, err);
7495                         BUG();
7496                 }
7497         }
7498         rtnl_unlock();
7499 }
7500
7501 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7502 {
7503         /* Return with the rtnl_lock held when there are no network
7504          * devices unregistering in any network namespace in net_list.
7505          */
7506         struct net *net;
7507         bool unregistering;
7508         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7509
7510         add_wait_queue(&netdev_unregistering_wq, &wait);
7511         for (;;) {
7512                 unregistering = false;
7513                 rtnl_lock();
7514                 list_for_each_entry(net, net_list, exit_list) {
7515                         if (net->dev_unreg_count > 0) {
7516                                 unregistering = true;
7517                                 break;
7518                         }
7519                 }
7520                 if (!unregistering)
7521                         break;
7522                 __rtnl_unlock();
7523
7524                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7525         }
7526         remove_wait_queue(&netdev_unregistering_wq, &wait);
7527 }
7528
7529 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7530 {
7531         /* At exit all network devices most be removed from a network
7532          * namespace.  Do this in the reverse order of registration.
7533          * Do this across as many network namespaces as possible to
7534          * improve batching efficiency.
7535          */
7536         struct net_device *dev;
7537         struct net *net;
7538         LIST_HEAD(dev_kill_list);
7539
7540         /* To prevent network device cleanup code from dereferencing
7541          * loopback devices or network devices that have been freed
7542          * wait here for all pending unregistrations to complete,
7543          * before unregistring the loopback device and allowing the
7544          * network namespace be freed.
7545          *
7546          * The netdev todo list containing all network devices
7547          * unregistrations that happen in default_device_exit_batch
7548          * will run in the rtnl_unlock() at the end of
7549          * default_device_exit_batch.
7550          */
7551         rtnl_lock_unregistering(net_list);
7552         list_for_each_entry(net, net_list, exit_list) {
7553                 for_each_netdev_reverse(net, dev) {
7554                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7555                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7556                         else
7557                                 unregister_netdevice_queue(dev, &dev_kill_list);
7558                 }
7559         }
7560         unregister_netdevice_many(&dev_kill_list);
7561         rtnl_unlock();
7562 }
7563
7564 static struct pernet_operations __net_initdata default_device_ops = {
7565         .exit = default_device_exit,
7566         .exit_batch = default_device_exit_batch,
7567 };
7568
7569 /*
7570  *      Initialize the DEV module. At boot time this walks the device list and
7571  *      unhooks any devices that fail to initialise (normally hardware not
7572  *      present) and leaves us with a valid list of present and active devices.
7573  *
7574  */
7575
7576 /*
7577  *       This is called single threaded during boot, so no need
7578  *       to take the rtnl semaphore.
7579  */
7580 static int __init net_dev_init(void)
7581 {
7582         int i, rc = -ENOMEM;
7583
7584         BUG_ON(!dev_boot_phase);
7585
7586         if (dev_proc_init())
7587                 goto out;
7588
7589         if (netdev_kobject_init())
7590                 goto out;
7591
7592         INIT_LIST_HEAD(&ptype_all);
7593         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7594                 INIT_LIST_HEAD(&ptype_base[i]);
7595
7596         INIT_LIST_HEAD(&offload_base);
7597
7598         if (register_pernet_subsys(&netdev_net_ops))
7599                 goto out;
7600
7601         /*
7602          *      Initialise the packet receive queues.
7603          */
7604
7605         for_each_possible_cpu(i) {
7606                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7607
7608                 skb_queue_head_init(&sd->input_pkt_queue);
7609                 skb_queue_head_init(&sd->process_queue);
7610                 INIT_LIST_HEAD(&sd->poll_list);
7611                 sd->output_queue_tailp = &sd->output_queue;
7612 #ifdef CONFIG_RPS
7613                 sd->csd.func = rps_trigger_softirq;
7614                 sd->csd.info = sd;
7615                 sd->cpu = i;
7616 #endif
7617
7618                 sd->backlog.poll = process_backlog;
7619                 sd->backlog.weight = weight_p;
7620         }
7621
7622         dev_boot_phase = 0;
7623
7624         /* The loopback device is special if any other network devices
7625          * is present in a network namespace the loopback device must
7626          * be present. Since we now dynamically allocate and free the
7627          * loopback device ensure this invariant is maintained by
7628          * keeping the loopback device as the first device on the
7629          * list of network devices.  Ensuring the loopback devices
7630          * is the first device that appears and the last network device
7631          * that disappears.
7632          */
7633         if (register_pernet_device(&loopback_net_ops))
7634                 goto out;
7635
7636         if (register_pernet_device(&default_device_ops))
7637                 goto out;
7638
7639         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7640         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7641
7642         hotcpu_notifier(dev_cpu_callback, 0);
7643         dst_init();
7644         rc = 0;
7645 out:
7646         return rc;
7647 }
7648
7649 subsys_initcall(net_dev_init);