1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <linux/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/crash_dump.h>
143
144 #include "net-sysfs.h"
145
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly; /* Taps */
156 static struct list_head offload_base __read_mostly;
157
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160 struct net_device *dev,
161 struct netdev_notifier_info *info);
162
163 /*
164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165 * semaphore.
166 *
167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168 *
169 * Writers must hold the rtnl semaphore while they loop through the
170 * dev_base_head list, and hold dev_base_lock for writing when they do the
171 * actual updates. This allows pure readers to access the list even
172 * while a writer is preparing to update it.
173 *
174 * To put it another way, dev_base_lock is held for writing only to
175 * protect against pure readers; the rtnl semaphore provides the
176 * protection against other writers.
177 *
178 * See, for example usages, register_netdevice() and
179 * unregister_netdevice(), which must be called with the rtnl
180 * semaphore held.
181 */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
184
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190
191 static seqcount_t devnet_rename_seq;
192
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195 while (++net->dev_base_seq == 0);
196 }
197
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201
202 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227 struct net *net = dev_net(dev);
228
229 ASSERT_RTNL();
230
231 write_lock_bh(&dev_base_lock);
232 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 hlist_add_head_rcu(&dev->index_hlist,
235 dev_index_hash(net, dev->ifindex));
236 write_unlock_bh(&dev_base_lock);
237
238 dev_base_seq_inc(net);
239 }
240
241 /* Device list removal
242 * caller must respect a RCU grace period before freeing/reusing dev
243 */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246 ASSERT_RTNL();
247
248 /* Unlink dev from the device chain */
249 write_lock_bh(&dev_base_lock);
250 list_del_rcu(&dev->dev_list);
251 hlist_del_rcu(&dev->name_hlist);
252 hlist_del_rcu(&dev->index_hlist);
253 write_unlock_bh(&dev_base_lock);
254
255 dev_base_seq_inc(dev_net(dev));
256 }
257
258 /*
259 * Our notifier list
260 */
261
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263
264 /*
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
267 */
268
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272 #ifdef CONFIG_LOCKDEP
273 /*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
277 static const unsigned short netdev_lock_type[] =
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293
294 static const char *const netdev_lock_name[] =
295 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316 int i;
317
318 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 if (netdev_lock_type[i] == dev_type)
320 return i;
321 /* the last key is used by default */
322 return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 unsigned short dev_type)
327 {
328 int i;
329
330 i = netdev_lock_pos(dev_type);
331 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 netdev_lock_name[i]);
333 }
334
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337 int i;
338
339 i = netdev_lock_pos(dev->type);
340 lockdep_set_class_and_name(&dev->addr_list_lock,
341 &netdev_addr_lock_key[i],
342 netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353
354 /*******************************************************************************
355
356 Protocol management and registration routines
357
358 *******************************************************************************/
359
360 /*
361 * Add a protocol ID to the list. Now that the input handler is
362 * smarter we can dispense with all the messy stuff that used to be
363 * here.
364 *
365 * BEWARE!!! Protocol handlers, mangling input packets,
366 * MUST BE last in hash buckets and checking protocol handlers
367 * MUST start from promiscuous ptype_all chain in net_bh.
368 * It is true now, do not change it.
369 * Explanation follows: if protocol handler, mangling packet, will
370 * be the first on list, it is not able to sense, that packet
371 * is cloned and should be copied-on-write, so that it will
372 * change it and subsequent readers will get broken packet.
373 * --ANK (980803)
374 */
375
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378 if (pt->type == htons(ETH_P_ALL))
379 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 else
381 return pt->dev ? &pt->dev->ptype_specific :
382 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384
385 /**
386 * dev_add_pack - add packet handler
387 * @pt: packet type declaration
388 *
389 * Add a protocol handler to the networking stack. The passed &packet_type
390 * is linked into kernel lists and may not be freed until it has been
391 * removed from the kernel lists.
392 *
 393  * This call does not sleep, therefore it cannot guarantee that
 394  * all CPUs that are in the middle of receiving packets will see
 395  * the new packet type (until the next received packet).
396 */
397
398 void dev_add_pack(struct packet_type *pt)
399 {
400 struct list_head *head = ptype_head(pt);
401
402 spin_lock(&ptype_lock);
403 list_add_rcu(&pt->list, head);
404 spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407
408 /**
409 * __dev_remove_pack - remove packet handler
410 * @pt: packet type declaration
411 *
412 * Remove a protocol handler that was previously added to the kernel
413 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
414 * from the kernel lists and can be freed or reused once this function
415 * returns.
416 *
417 * The packet type might still be in use by receivers
418 * and must not be freed until after all the CPU's have gone
419 * through a quiescent state.
420 */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 struct list_head *head = ptype_head(pt);
424 struct packet_type *pt1;
425
426 spin_lock(&ptype_lock);
427
428 list_for_each_entry(pt1, head, list) {
429 if (pt == pt1) {
430 list_del_rcu(&pt->list);
431 goto out;
432 }
433 }
434
435 pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437 spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440
441 /**
442 * dev_remove_pack - remove packet handler
443 * @pt: packet type declaration
444 *
445 * Remove a protocol handler that was previously added to the kernel
446 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
447 * from the kernel lists and can be freed or reused once this function
448 * returns.
449 *
450 * This call sleeps to guarantee that no CPU is looking at the packet
451 * type after return.
452 */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 __dev_remove_pack(pt);
456
457 synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
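/*
 * Illustrative sketch (not part of the original file): how a module might
 * register and later remove a packet handler with dev_add_pack() and
 * dev_remove_pack(). The handler, struct and helper names below are
 * hypothetical; the ethertype is just an example. A real handler must
 * consume or free the skb it is handed.
 */
static int example_pack_rcv(struct sk_buff *skb, struct net_device *dev,
			    struct packet_type *pt, struct net_device *orig_dev)
{
	/* Inspect the (possibly shared) skb here, then release our reference. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pack __maybe_unused = {
	.type = cpu_to_be16(ETH_P_IP),	/* or htons(ETH_P_ALL) to tap everything */
	.func = example_pack_rcv,
};

static void __maybe_unused example_pack_toggle(bool on)
{
	if (on)
		dev_add_pack(&example_pack);	/* linked until removed */
	else
		dev_remove_pack(&example_pack);	/* sleeps via synchronize_net() */
}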
460
461
462 /**
463 * dev_add_offload - register offload handlers
464 * @po: protocol offload declaration
465 *
466 * Add protocol offload handlers to the networking stack. The passed
467 * &proto_offload is linked into kernel lists and may not be freed until
468 * it has been removed from the kernel lists.
469 *
 470  * This call does not sleep, therefore it cannot guarantee that
 471  * all CPUs that are in the middle of receiving packets will see
 472  * the new offload handlers (until the next received packet).
473 */
474 void dev_add_offload(struct packet_offload *po)
475 {
476 struct packet_offload *elem;
477
478 spin_lock(&offload_lock);
479 list_for_each_entry(elem, &offload_base, list) {
480 if (po->priority < elem->priority)
481 break;
482 }
483 list_add_rcu(&po->list, elem->list.prev);
484 spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487
488 /**
489 * __dev_remove_offload - remove offload handler
490 * @po: packet offload declaration
491 *
492 * Remove a protocol offload handler that was previously added to the
493 * kernel offload handlers by dev_add_offload(). The passed &offload_type
494 * is removed from the kernel lists and can be freed or reused once this
495 * function returns.
496 *
497 * The packet type might still be in use by receivers
498 * and must not be freed until after all the CPU's have gone
499 * through a quiescent state.
500 */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503 struct list_head *head = &offload_base;
504 struct packet_offload *po1;
505
506 spin_lock(&offload_lock);
507
508 list_for_each_entry(po1, head, list) {
509 if (po == po1) {
510 list_del_rcu(&po->list);
511 goto out;
512 }
513 }
514
515 pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517 spin_unlock(&offload_lock);
518 }
519
520 /**
521 * dev_remove_offload - remove packet offload handler
522 * @po: packet offload declaration
523 *
524 * Remove a packet offload handler that was previously added to the kernel
525 * offload handlers by dev_add_offload(). The passed &offload_type is
526 * removed from the kernel lists and can be freed or reused once this
527 * function returns.
528 *
529 * This call sleeps to guarantee that no CPU is looking at the packet
530 * type after return.
531 */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534 __dev_remove_offload(po);
535
536 synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
539
540 /******************************************************************************
541
542 Device Boot-time Settings Routines
543
544 *******************************************************************************/
545
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549 /**
550 * netdev_boot_setup_add - add new setup entry
551 * @name: name of the device
552 * @map: configured settings for the device
553 *
 554  * Adds a new setup entry to the dev_boot_setup list. The function
 555  * returns 0 on error and 1 on success. This is a generic routine for
 556  * all netdevices.
557 */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560 struct netdev_boot_setup *s;
561 int i;
562
563 s = dev_boot_setup;
564 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 memset(s[i].name, 0, sizeof(s[i].name));
567 strlcpy(s[i].name, name, IFNAMSIZ);
568 memcpy(&s[i].map, map, sizeof(s[i].map));
569 break;
570 }
571 }
572
573 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575
576 /**
577 * netdev_boot_setup_check - check boot time settings
578 * @dev: the netdevice
579 *
580 * Check boot time settings for the device.
 581  * The found settings are applied to the device so they can be used
 582  * later during device probing.
 583  * Returns 0 if no settings are found, 1 if they are.
584 */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587 struct netdev_boot_setup *s = dev_boot_setup;
588 int i;
589
590 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 !strcmp(dev->name, s[i].name)) {
593 dev->irq = s[i].map.irq;
594 dev->base_addr = s[i].map.base_addr;
595 dev->mem_start = s[i].map.mem_start;
596 dev->mem_end = s[i].map.mem_end;
597 return 1;
598 }
599 }
600 return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603
604
605 /**
606 * netdev_boot_base - get address from boot time settings
607 * @prefix: prefix for network device
608 * @unit: id for network device
609 *
 610  * Check boot time settings for the base address of the device.
 611  * The found settings are set for the device to be used
 612  * later during device probing.
 613  * Returns 0 if no settings are found.
614 */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617 const struct netdev_boot_setup *s = dev_boot_setup;
618 char name[IFNAMSIZ];
619 int i;
620
621 sprintf(name, "%s%d", prefix, unit);
622
623 /*
624 * If device already registered then return base of 1
625 * to indicate not to probe for this interface
626 */
627 if (__dev_get_by_name(&init_net, name))
628 return 1;
629
630 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 if (!strcmp(name, s[i].name))
632 return s[i].map.base_addr;
633 return 0;
634 }
635
636 /*
637 * Saves at boot time configured settings for any netdevice.
638 */
639 int __init netdev_boot_setup(char *str)
640 {
641 int ints[5];
642 struct ifmap map;
643
644 str = get_options(str, ARRAY_SIZE(ints), ints);
645 if (!str || !*str)
646 return 0;
647
648 /* Save settings */
649 memset(&map, 0, sizeof(map));
650 if (ints[0] > 0)
651 map.irq = ints[1];
652 if (ints[0] > 1)
653 map.base_addr = ints[2];
654 if (ints[0] > 2)
655 map.mem_start = ints[3];
656 if (ints[0] > 3)
657 map.mem_end = ints[4];
658
659 /* Add new entry to the list */
660 return netdev_boot_setup_add(str, &map);
661 }
662
663 __setup("netdev=", netdev_boot_setup);
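/*
 * Example (illustrative, values hypothetical): with the parser above, a
 * kernel command line such as
 *
 *	netdev=5,0x240,0,0,eth0
 *
 * stores irq=5, base_addr=0x240, mem_start=0 and mem_end=0 for the device
 * named "eth0"; the string left over by get_options() is taken as the name
 * and handed to netdev_boot_setup_add().
 */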
664
665 /*******************************************************************************
666
667 Device Interface Subroutines
668
669 *******************************************************************************/
670
671 /**
 672  * dev_get_iflink - get 'iflink' value of an interface
673 * @dev: targeted interface
674 *
675 * Indicates the ifindex the interface is linked to.
676 * Physical interfaces have the same 'ifindex' and 'iflink' values.
677 */
678
679 int dev_get_iflink(const struct net_device *dev)
680 {
681 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 return dev->netdev_ops->ndo_get_iflink(dev);
683
684 return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687
688 /**
689 * dev_fill_metadata_dst - Retrieve tunnel egress information.
690 * @dev: targeted interface
691 * @skb: The packet.
692 *
 693  * For better visibility of tunnel traffic, OVS needs to retrieve
 694  * egress tunnel information for a packet. The following API allows
 695  * the user to get this info.
696 */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699 struct ip_tunnel_info *info;
700
701 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
702 return -EINVAL;
703
704 info = skb_tunnel_info_unclone(skb);
705 if (!info)
706 return -ENOMEM;
707 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 return -EINVAL;
709
710 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
714 /**
715 * __dev_get_by_name - find a device by its name
716 * @net: the applicable net namespace
717 * @name: name to find
718 *
719 * Find an interface by name. Must be called under RTNL semaphore
720 * or @dev_base_lock. If the name is found a pointer to the device
721 * is returned. If the name is not found then %NULL is returned. The
722 * reference counters are not incremented so the caller must be
723 * careful with locks.
724 */
725
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728 struct net_device *dev;
729 struct hlist_head *head = dev_name_hash(net, name);
730
731 hlist_for_each_entry(dev, head, name_hlist)
732 if (!strncmp(dev->name, name, IFNAMSIZ))
733 return dev;
734
735 return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738
739 /**
740 * dev_get_by_name_rcu - find a device by its name
741 * @net: the applicable net namespace
742 * @name: name to find
743 *
744 * Find an interface by name.
745 * If the name is found a pointer to the device is returned.
746 * If the name is not found then %NULL is returned.
747 * The reference counters are not incremented so the caller must be
748 * careful with locks. The caller must hold RCU lock.
749 */
750
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753 struct net_device *dev;
754 struct hlist_head *head = dev_name_hash(net, name);
755
756 hlist_for_each_entry_rcu(dev, head, name_hlist)
757 if (!strncmp(dev->name, name, IFNAMSIZ))
758 return dev;
759
760 return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763
764 /**
765 * dev_get_by_name - find a device by its name
766 * @net: the applicable net namespace
767 * @name: name to find
768 *
769 * Find an interface by name. This can be called from any
770 * context and does its own locking. The returned handle has
771 * the usage count incremented and the caller must use dev_put() to
772 * release it when it is no longer needed. %NULL is returned if no
773 * matching device is found.
774 */
775
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778 struct net_device *dev;
779
780 rcu_read_lock();
781 dev = dev_get_by_name_rcu(net, name);
782 if (dev)
783 dev_hold(dev);
784 rcu_read_unlock();
785 return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
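/*
 * Illustrative sketch (not part of the original file): the two lookup styles
 * above. dev_get_by_name() takes a reference that must be dropped with
 * dev_put(); dev_get_by_name_rcu() takes none but is only valid inside an
 * RCU read-side critical section. The function and interface names used
 * here are hypothetical.
 */
static bool __maybe_unused example_lookup_by_name(struct net *net)
{
	struct net_device *dev;
	bool up;

	dev = dev_get_by_name(net, "eth0");
	if (!dev)
		return false;
	up = netif_running(dev);
	dev_put(dev);				/* release the reference we took */

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		up = netif_running(dev);	/* dev only valid until unlock */
	rcu_read_unlock();

	return up;
}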
788
789 /**
790 * __dev_get_by_index - find a device by its ifindex
791 * @net: the applicable net namespace
792 * @ifindex: index of device
793 *
794 * Search for an interface by index. Returns %NULL if the device
795 * is not found or a pointer to the device. The device has not
796 * had its reference counter increased so the caller must be careful
797 * about locking. The caller must hold either the RTNL semaphore
798 * or @dev_base_lock.
799 */
800
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803 struct net_device *dev;
804 struct hlist_head *head = dev_index_hash(net, ifindex);
805
806 hlist_for_each_entry(dev, head, index_hlist)
807 if (dev->ifindex == ifindex)
808 return dev;
809
810 return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813
814 /**
815 * dev_get_by_index_rcu - find a device by its ifindex
816 * @net: the applicable net namespace
817 * @ifindex: index of device
818 *
819 * Search for an interface by index. Returns %NULL if the device
820 * is not found or a pointer to the device. The device has not
821 * had its reference counter increased so the caller must be careful
822 * about locking. The caller must hold RCU lock.
823 */
824
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827 struct net_device *dev;
828 struct hlist_head *head = dev_index_hash(net, ifindex);
829
830 hlist_for_each_entry_rcu(dev, head, index_hlist)
831 if (dev->ifindex == ifindex)
832 return dev;
833
834 return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837
838
839 /**
840 * dev_get_by_index - find a device by its ifindex
841 * @net: the applicable net namespace
842 * @ifindex: index of device
843 *
844 * Search for an interface by index. Returns NULL if the device
845 * is not found or a pointer to the device. The device returned has
846 * had a reference added and the pointer is safe until the user calls
847 * dev_put to indicate they have finished with it.
848 */
849
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852 struct net_device *dev;
853
854 rcu_read_lock();
855 dev = dev_get_by_index_rcu(net, ifindex);
856 if (dev)
857 dev_hold(dev);
858 rcu_read_unlock();
859 return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862
863 /**
864 * netdev_get_name - get a netdevice name, knowing its ifindex.
865 * @net: network namespace
866 * @name: a pointer to the buffer where the name will be stored.
867 * @ifindex: the ifindex of the interface to get the name from.
868 *
869 * The use of raw_seqcount_begin() and cond_resched() before
870 * retrying is required as we want to give the writers a chance
871 * to complete when CONFIG_PREEMPT is not set.
872 */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875 struct net_device *dev;
876 unsigned int seq;
877
878 retry:
879 seq = raw_seqcount_begin(&devnet_rename_seq);
880 rcu_read_lock();
881 dev = dev_get_by_index_rcu(net, ifindex);
882 if (!dev) {
883 rcu_read_unlock();
884 return -ENODEV;
885 }
886
887 strcpy(name, dev->name);
888 rcu_read_unlock();
889 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 cond_resched();
891 goto retry;
892 }
893
894 return 0;
895 }
896
897 /**
898 * dev_getbyhwaddr_rcu - find a device by its hardware address
899 * @net: the applicable net namespace
900 * @type: media type of device
901 * @ha: hardware address
902 *
903 * Search for an interface by MAC address. Returns NULL if the device
904 * is not found or a pointer to the device.
905 * The caller must hold RCU or RTNL.
906 * The returned device has not had its ref count increased
907 * and the caller must therefore be careful about locking
908 *
909 */
910
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 const char *ha)
913 {
914 struct net_device *dev;
915
916 for_each_netdev_rcu(net, dev)
917 if (dev->type == type &&
918 !memcmp(dev->dev_addr, ha, dev->addr_len))
919 return dev;
920
921 return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
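/*
 * Illustrative sketch (not part of the original file): looking up an Ethernet
 * device by MAC address under RCU, as the kerneldoc above requires. The
 * function name and the address bytes are hypothetical.
 */
static int __maybe_unused example_lookup_by_hwaddr(struct net *net)
{
	static const unsigned char mac[ETH_ALEN] = {
		0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };
	struct net_device *dev;
	int ifindex = 0;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, (const char *)mac);
	if (dev)
		ifindex = dev->ifindex;	/* no reference held; copy what we need */
	rcu_read_unlock();

	return ifindex;
}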
924
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927 struct net_device *dev;
928
929 ASSERT_RTNL();
930 for_each_netdev(net, dev)
931 if (dev->type == type)
932 return dev;
933
934 return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940 struct net_device *dev, *ret = NULL;
941
942 rcu_read_lock();
943 for_each_netdev_rcu(net, dev)
944 if (dev->type == type) {
945 dev_hold(dev);
946 ret = dev;
947 break;
948 }
949 rcu_read_unlock();
950 return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954 /**
955 * __dev_get_by_flags - find any device with given flags
956 * @net: the applicable net namespace
957 * @if_flags: IFF_* values
958 * @mask: bitmask of bits in if_flags to check
959 *
960 * Search for any interface with the given flags. Returns NULL if a device
961 * is not found or a pointer to the device. Must be called inside
962 * rtnl_lock(), and result refcount is unchanged.
963 */
964
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 unsigned short mask)
967 {
968 struct net_device *dev, *ret;
969
970 ASSERT_RTNL();
971
972 ret = NULL;
973 for_each_netdev(net, dev) {
974 if (((dev->flags ^ if_flags) & mask) == 0) {
975 ret = dev;
976 break;
977 }
978 }
979 return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982
983 /**
984 * dev_valid_name - check if name is okay for network device
985 * @name: name string
986 *
 987  * Network device names need to be valid file names to
 988  * allow sysfs to work. We also disallow any kind of
989 * whitespace.
990 */
991 bool dev_valid_name(const char *name)
992 {
993 if (*name == '\0')
994 return false;
995 if (strlen(name) >= IFNAMSIZ)
996 return false;
997 if (!strcmp(name, ".") || !strcmp(name, ".."))
998 return false;
999
1000 while (*name) {
1001 if (*name == '/' || *name == ':' || isspace(*name))
1002 return false;
1003 name++;
1004 }
1005 return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008
1009 /**
1010 * __dev_alloc_name - allocate a name for a device
1011 * @net: network namespace to allocate the device name in
1012 * @name: name format string
1013 * @buf: scratch buffer and result name string
1014 *
1015  * Passed a format string - eg "lt%d" - it will try to find a suitable
1016  * id. It scans the list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid
1019 * duplicates.
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026 int i = 0;
1027 const char *p;
1028 const int max_netdevices = 8*PAGE_SIZE;
1029 unsigned long *inuse;
1030 struct net_device *d;
1031
1032 p = strnchr(name, IFNAMSIZ-1, '%');
1033 if (p) {
1034 /*
1035 * Verify the string as this thing may have come from
1036 * the user. There must be either one "%d" and no other "%"
1037 * characters.
1038 */
1039 if (p[1] != 'd' || strchr(p + 2, '%'))
1040 return -EINVAL;
1041
1042 /* Use one page as a bit array of possible slots */
1043 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044 if (!inuse)
1045 return -ENOMEM;
1046
1047 for_each_netdev(net, d) {
1048 if (!sscanf(d->name, name, &i))
1049 continue;
1050 if (i < 0 || i >= max_netdevices)
1051 continue;
1052
1053 /* avoid cases where sscanf is not exact inverse of printf */
1054 snprintf(buf, IFNAMSIZ, name, i);
1055 if (!strncmp(buf, d->name, IFNAMSIZ))
1056 set_bit(i, inuse);
1057 }
1058
1059 i = find_first_zero_bit(inuse, max_netdevices);
1060 free_page((unsigned long) inuse);
1061 }
1062
1063 if (buf != name)
1064 snprintf(buf, IFNAMSIZ, name, i);
1065 if (!__dev_get_by_name(net, buf))
1066 return i;
1067
1068 /* It is possible to run out of possible slots
1069 * when the name is long and there isn't enough space left
1070 * for the digits, or if all bits are used.
1071 */
1072 return -ENFILE;
1073 }
1074
1075 /**
1076 * dev_alloc_name - allocate a name for a device
1077 * @dev: device
1078 * @name: name format string
1079 *
1080  * Passed a format string - eg "lt%d" - it will try to find a suitable
1081  * id. It scans the list of devices to build up a free map, then chooses
1082 * the first empty slot. The caller must hold the dev_base or rtnl lock
1083 * while allocating the name and adding the device in order to avoid
1084 * duplicates.
1085 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 * Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091 char buf[IFNAMSIZ];
1092 struct net *net;
1093 int ret;
1094
1095 BUG_ON(!dev_net(dev));
1096 net = dev_net(dev);
1097 ret = __dev_alloc_name(net, name, buf);
1098 if (ret >= 0)
1099 strlcpy(dev->name, buf, IFNAMSIZ);
1100 return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
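/*
 * Illustrative sketch (not part of the original file): picking a free unit
 * number for a freshly allocated device before registering it, under the
 * RTNL lock as the kerneldoc above requires. The function name and the
 * "foo%d" template are hypothetical.
 */
static int __maybe_unused example_name_device(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();

	unit = dev_alloc_name(dev, "foo%d");	/* fills in dev->name on success */
	if (unit < 0)
		return unit;			/* -EINVAL or -ENFILE */

	netdev_info(dev, "assigned unit %d\n", unit);
	return 0;
}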
1103
1104 static int dev_alloc_name_ns(struct net *net,
1105 struct net_device *dev,
1106 const char *name)
1107 {
1108 char buf[IFNAMSIZ];
1109 int ret;
1110
1111 ret = __dev_alloc_name(net, name, buf);
1112 if (ret >= 0)
1113 strlcpy(dev->name, buf, IFNAMSIZ);
1114 return ret;
1115 }
1116
1117 static int dev_get_valid_name(struct net *net,
1118 struct net_device *dev,
1119 const char *name)
1120 {
1121 BUG_ON(!net);
1122
1123 if (!dev_valid_name(name))
1124 return -EINVAL;
1125
1126 if (strchr(name, '%'))
1127 return dev_alloc_name_ns(net, dev, name);
1128 else if (__dev_get_by_name(net, name))
1129 return -EEXIST;
1130 else if (dev->name != name)
1131 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133 return 0;
1134 }
1135
1136 /**
1137 * dev_change_name - change name of a device
1138 * @dev: device
1139 * @newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141  * Change the name of a device. A format string such as "eth%d"
1142  * may be passed for wildcarding.
1143 */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146 unsigned char old_assign_type;
1147 char oldname[IFNAMSIZ];
1148 int err = 0;
1149 int ret;
1150 struct net *net;
1151
1152 ASSERT_RTNL();
1153 BUG_ON(!dev_net(dev));
1154
1155 net = dev_net(dev);
1156 if (dev->flags & IFF_UP)
1157 return -EBUSY;
1158
1159 write_seqcount_begin(&devnet_rename_seq);
1160
1161 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162 write_seqcount_end(&devnet_rename_seq);
1163 return 0;
1164 }
1165
1166 memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168 err = dev_get_valid_name(net, dev, newname);
1169 if (err < 0) {
1170 write_seqcount_end(&devnet_rename_seq);
1171 return err;
1172 }
1173
1174 if (oldname[0] && !strchr(oldname, '%'))
1175 netdev_info(dev, "renamed from %s\n", oldname);
1176
1177 old_assign_type = dev->name_assign_type;
1178 dev->name_assign_type = NET_NAME_RENAMED;
1179
1180 rollback:
1181 ret = device_rename(&dev->dev, dev->name);
1182 if (ret) {
1183 memcpy(dev->name, oldname, IFNAMSIZ);
1184 dev->name_assign_type = old_assign_type;
1185 write_seqcount_end(&devnet_rename_seq);
1186 return ret;
1187 }
1188
1189 write_seqcount_end(&devnet_rename_seq);
1190
1191 netdev_adjacent_rename_links(dev, oldname);
1192
1193 write_lock_bh(&dev_base_lock);
1194 hlist_del_rcu(&dev->name_hlist);
1195 write_unlock_bh(&dev_base_lock);
1196
1197 synchronize_rcu();
1198
1199 write_lock_bh(&dev_base_lock);
1200 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201 write_unlock_bh(&dev_base_lock);
1202
1203 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204 ret = notifier_to_errno(ret);
1205
1206 if (ret) {
1207 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208 if (err >= 0) {
1209 err = ret;
1210 write_seqcount_begin(&devnet_rename_seq);
1211 memcpy(dev->name, oldname, IFNAMSIZ);
1212 memcpy(oldname, newname, IFNAMSIZ);
1213 dev->name_assign_type = old_assign_type;
1214 old_assign_type = NET_NAME_RENAMED;
1215 goto rollback;
1216 } else {
1217 pr_err("%s: name change rollback failed: %d\n",
1218 dev->name, ret);
1219 }
1220 }
1221
1222 return err;
1223 }
1224
1225 /**
1226 * dev_set_alias - change ifalias of a device
1227 * @dev: device
1228 * @alias: name up to IFALIASZ
1229  * @len: limit of bytes to copy from @alias
1230  *
1231  * Set the ifalias for a device.
1232 */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235 char *new_ifalias;
1236
1237 ASSERT_RTNL();
1238
1239 if (len >= IFALIASZ)
1240 return -EINVAL;
1241
1242 if (!len) {
1243 kfree(dev->ifalias);
1244 dev->ifalias = NULL;
1245 return 0;
1246 }
1247
1248 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 if (!new_ifalias)
1250 return -ENOMEM;
1251 dev->ifalias = new_ifalias;
1252
1253 strlcpy(dev->ifalias, alias, len+1);
1254 return len;
1255 }
1256
1257
1258 /**
1259 * netdev_features_change - device changes features
1260 * @dev: device to cause notification
1261 *
1262 * Called to indicate a device has changed features.
1263 */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269
1270 /**
1271 * netdev_state_change - device changes state
1272 * @dev: device to cause notification
1273 *
1274 * Called to indicate a device has changed state. This function calls
1275 * the notifier chains for netdev_chain and sends a NEWLINK message
1276 * to the routing socket.
1277 */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280 if (dev->flags & IFF_UP) {
1281 struct netdev_notifier_change_info change_info;
1282
1283 change_info.flags_changed = 0;
1284 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 &change_info.info);
1286 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 }
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290
1291 /**
1292 * netdev_notify_peers - notify network peers about existence of @dev
1293 * @dev: network device
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303 rtnl_lock();
1304 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308
1309 static int __dev_open(struct net_device *dev)
1310 {
1311 const struct net_device_ops *ops = dev->netdev_ops;
1312 int ret;
1313
1314 ASSERT_RTNL();
1315
1316 if (!netif_device_present(dev))
1317 return -ENODEV;
1318
1319 /* Block netpoll from trying to do any rx path servicing.
1320 * If we don't do this there is a chance ndo_poll_controller
1321 * or ndo_poll may be running while we open the device
1322 */
1323 netpoll_poll_disable(dev);
1324
1325 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 ret = notifier_to_errno(ret);
1327 if (ret)
1328 return ret;
1329
1330 set_bit(__LINK_STATE_START, &dev->state);
1331
1332 if (ops->ndo_validate_addr)
1333 ret = ops->ndo_validate_addr(dev);
1334
1335 if (!ret && ops->ndo_open)
1336 ret = ops->ndo_open(dev);
1337
1338 netpoll_poll_enable(dev);
1339
1340 if (ret)
1341 clear_bit(__LINK_STATE_START, &dev->state);
1342 else {
1343 dev->flags |= IFF_UP;
1344 dev_set_rx_mode(dev);
1345 dev_activate(dev);
1346 add_device_randomness(dev->dev_addr, dev->addr_len);
1347 }
1348
1349 return ret;
1350 }
1351
1352 /**
1353 * dev_open - prepare an interface for use.
1354 * @dev: device to open
1355 *
1356 * Takes a device from down to up state. The device's private open
1357 * function is invoked and then the multicast lists are loaded. Finally
1358 * the device is moved into the up state and a %NETDEV_UP message is
1359 * sent to the netdev notifier chain.
1360 *
1361 * Calling this function on an active interface is a nop. On a failure
1362 * a negative errno code is returned.
1363 */
1364 int dev_open(struct net_device *dev)
1365 {
1366 int ret;
1367
1368 if (dev->flags & IFF_UP)
1369 return 0;
1370
1371 ret = __dev_open(dev);
1372 if (ret < 0)
1373 return ret;
1374
1375 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378 return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384 struct net_device *dev;
1385
1386 ASSERT_RTNL();
1387 might_sleep();
1388
1389 list_for_each_entry(dev, head, close_list) {
1390 /* Temporarily disable netpoll until the interface is down */
1391 netpoll_poll_disable(dev);
1392
1393 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395 clear_bit(__LINK_STATE_START, &dev->state);
1396
1397 /* Synchronize to scheduled poll. We cannot touch poll list, it
1398 * can be even on different cpu. So just clear netif_running().
1399 *
1400  * dev->stop() will invoke napi_disable() on all of its
1401 * napi_struct instances on this device.
1402 */
1403 smp_mb__after_atomic(); /* Commit netif_running(). */
1404 }
1405
1406 dev_deactivate_many(head);
1407
1408 list_for_each_entry(dev, head, close_list) {
1409 const struct net_device_ops *ops = dev->netdev_ops;
1410
1411 /*
1412 * Call the device specific close. This cannot fail.
1413 * Only if device is UP
1414 *
1415 * We allow it to be called even after a DETACH hot-plug
1416 * event.
1417 */
1418 if (ops->ndo_stop)
1419 ops->ndo_stop(dev);
1420
1421 dev->flags &= ~IFF_UP;
1422 netpoll_poll_enable(dev);
1423 }
1424
1425 return 0;
1426 }
1427
1428 static int __dev_close(struct net_device *dev)
1429 {
1430 int retval;
1431 LIST_HEAD(single);
1432
1433 list_add(&dev->close_list, &single);
1434 retval = __dev_close_many(&single);
1435 list_del(&single);
1436
1437 return retval;
1438 }
1439
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442 struct net_device *dev, *tmp;
1443
1444 /* Remove the devices that don't need to be closed */
1445 list_for_each_entry_safe(dev, tmp, head, close_list)
1446 if (!(dev->flags & IFF_UP))
1447 list_del_init(&dev->close_list);
1448
1449 __dev_close_many(head);
1450
1451 list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 if (unlink)
1455 list_del_init(&dev->close_list);
1456 }
1457
1458 return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461
1462 /**
1463 * dev_close - shutdown an interface.
1464 * @dev: device to shutdown
1465 *
1466 * This function moves an active device into down state. A
1467 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 * chain.
1470 */
1471 int dev_close(struct net_device *dev)
1472 {
1473 if (dev->flags & IFF_UP) {
1474 LIST_HEAD(single);
1475
1476 list_add(&dev->close_list, &single);
1477 dev_close_many(&single, true);
1478 list_del(&single);
1479 }
1480 return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
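/*
 * Illustrative sketch (not part of the original file): toggling an interface
 * administratively up or down from process context. Both dev_open() and
 * dev_close() must run under the RTNL lock; the function name and interface
 * name are hypothetical.
 */
static int __maybe_unused example_set_if_up(struct net *net, bool up)
{
	struct net_device *dev;
	int err = 0;

	rtnl_lock();
	dev = __dev_get_by_name(net, "eth0");	/* no ref needed under RTNL */
	if (!dev)
		err = -ENODEV;
	else if (up)
		err = dev_open(dev);		/* nop if already IFF_UP */
	else
		err = dev_close(dev);		/* always returns 0 */
	rtnl_unlock();

	return err;
}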
1483
1484
1485 /**
1486 * dev_disable_lro - disable Large Receive Offload on a device
1487 * @dev: device
1488 *
1489 * Disable Large Receive Offload (LRO) on a net device. Must be
1490 * called under RTNL. This is needed if received packets may be
1491 * forwarded to another interface.
1492 */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495 struct net_device *lower_dev;
1496 struct list_head *iter;
1497
1498 dev->wanted_features &= ~NETIF_F_LRO;
1499 netdev_update_features(dev);
1500
1501 if (unlikely(dev->features & NETIF_F_LRO))
1502 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504 netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 struct net_device *dev)
1511 {
1512 struct netdev_notifier_info info;
1513
1514 netdev_notifier_info_init(&info, dev);
1515 return nb->notifier_call(nb, val, &info);
1516 }
1517
1518 static int dev_boot_phase = 1;
1519
1520 /**
1521 * register_netdevice_notifier - register a network notifier block
1522 * @nb: notifier
1523 *
1524 * Register a notifier to be called when network device events occur.
1525 * The notifier passed is linked into the kernel structures and must
1526 * not be reused until it has been unregistered. A negative errno code
1527 * is returned on a failure.
1528 *
1529  * When registered, all registration and up events are replayed
1530  * to the new notifier so that it has a race-free
1531  * view of the network device list.
1532 */
1533
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536 struct net_device *dev;
1537 struct net_device *last;
1538 struct net *net;
1539 int err;
1540
1541 rtnl_lock();
1542 err = raw_notifier_chain_register(&netdev_chain, nb);
1543 if (err)
1544 goto unlock;
1545 if (dev_boot_phase)
1546 goto unlock;
1547 for_each_net(net) {
1548 for_each_netdev(net, dev) {
1549 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550 err = notifier_to_errno(err);
1551 if (err)
1552 goto rollback;
1553
1554 if (!(dev->flags & IFF_UP))
1555 continue;
1556
1557 call_netdevice_notifier(nb, NETDEV_UP, dev);
1558 }
1559 }
1560
1561 unlock:
1562 rtnl_unlock();
1563 return err;
1564
1565 rollback:
1566 last = dev;
1567 for_each_net(net) {
1568 for_each_netdev(net, dev) {
1569 if (dev == last)
1570 goto outroll;
1571
1572 if (dev->flags & IFF_UP) {
1573 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 dev);
1575 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576 }
1577 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578 }
1579 }
1580
1581 outroll:
1582 raw_notifier_chain_unregister(&netdev_chain, nb);
1583 goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
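/*
 * Illustrative sketch (not part of the original file): a minimal notifier
 * that logs devices coming up, used with the register/unregister helpers in
 * this file. As the kerneldoc above notes, REGISTER/UP events are replayed
 * for devices that already exist at registration time. Names below are
 * hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		netdev_info(dev, "interface is up\n");

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier __maybe_unused = {
	.notifier_call = example_netdev_event,
};

/* Call register_netdevice_notifier(&example_netdev_notifier) to start
 * receiving events and unregister_netdevice_notifier() to stop. */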
1586
1587 /**
1588 * unregister_netdevice_notifier - unregister a network notifier block
1589 * @nb: notifier
1590 *
1591 * Unregister a notifier previously registered by
1592  * register_netdevice_notifier(). The notifier is unlinked from the
1593 * kernel structures and may then be reused. A negative errno code
1594 * is returned on a failure.
1595 *
1596 * After unregistering unregister and down device events are synthesized
1597 * for all devices on the device list to the removed notifier to remove
1598 * the need for special case cleanup code.
1599 */
1600
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603 struct net_device *dev;
1604 struct net *net;
1605 int err;
1606
1607 rtnl_lock();
1608 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 if (err)
1610 goto unlock;
1611
1612 for_each_net(net) {
1613 for_each_netdev(net, dev) {
1614 if (dev->flags & IFF_UP) {
1615 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 dev);
1617 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 }
1619 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 }
1621 }
1622 unlock:
1623 rtnl_unlock();
1624 return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628 /**
1629 * call_netdevice_notifiers_info - call all network notifier blocks
1630 * @val: value passed unmodified to notifier function
1631 * @dev: net_device pointer passed unmodified to notifier function
1632 * @info: notifier information data
1633 *
1634 * Call all network notifier blocks. Parameters and return value
1635 * are as for raw_notifier_call_chain().
1636 */
1637
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639 struct net_device *dev,
1640 struct netdev_notifier_info *info)
1641 {
1642 ASSERT_RTNL();
1643 netdev_notifier_info_init(info, dev);
1644 return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646
1647 /**
1648 * call_netdevice_notifiers - call all network notifier blocks
1649 * @val: value passed unmodified to notifier function
1650 * @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 * Call all network notifier blocks. Parameters and return value
1653 * are as for raw_notifier_call_chain().
1654 */
1655
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658 struct netdev_notifier_info info;
1659
1660 return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666
1667 void net_inc_ingress_queue(void)
1668 {
1669 static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672
1673 void net_dec_ingress_queue(void)
1674 {
1675 static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682
1683 void net_inc_egress_queue(void)
1684 {
1685 static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688
1689 void net_dec_egress_queue(void)
1690 {
1691 static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 static atomic_t netstamp_needed_deferred;
1699 static atomic_t netstamp_wanted;
1700 static void netstamp_clear(struct work_struct *work)
1701 {
1702 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1703 int wanted;
1704
1705 wanted = atomic_add_return(deferred, &netstamp_wanted);
1706 if (wanted > 0)
1707 static_key_enable(&netstamp_needed);
1708 else
1709 static_key_disable(&netstamp_needed);
1710 }
1711 static DECLARE_WORK(netstamp_work, netstamp_clear);
1712 #endif
1713
1714 void net_enable_timestamp(void)
1715 {
1716 #ifdef HAVE_JUMP_LABEL
1717 int wanted;
1718
1719 while (1) {
1720 wanted = atomic_read(&netstamp_wanted);
1721 if (wanted <= 0)
1722 break;
1723 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724 return;
1725 }
1726 atomic_inc(&netstamp_needed_deferred);
1727 schedule_work(&netstamp_work);
1728 #else
1729 static_key_slow_inc(&netstamp_needed);
1730 #endif
1731 }
1732 EXPORT_SYMBOL(net_enable_timestamp);
1733
1734 void net_disable_timestamp(void)
1735 {
1736 #ifdef HAVE_JUMP_LABEL
1737 int wanted;
1738
1739 while (1) {
1740 wanted = atomic_read(&netstamp_wanted);
1741 if (wanted <= 1)
1742 break;
1743 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744 return;
1745 }
1746 atomic_dec(&netstamp_needed_deferred);
1747 schedule_work(&netstamp_work);
1748 #else
1749 static_key_slow_dec(&netstamp_needed);
1750 #endif
1751 }
1752 EXPORT_SYMBOL(net_disable_timestamp);
1753
1754 static inline void net_timestamp_set(struct sk_buff *skb)
1755 {
1756 skb->tstamp = 0;
1757 if (static_key_false(&netstamp_needed))
1758 __net_timestamp(skb);
1759 }
1760
1761 #define net_timestamp_check(COND, SKB) \
1762 if (static_key_false(&netstamp_needed)) { \
1763 if ((COND) && !(SKB)->tstamp) \
1764 __net_timestamp(SKB); \
1765 } \
1766
1767 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768 {
1769 unsigned int len;
1770
1771 if (!(dev->flags & IFF_UP))
1772 return false;
1773
1774 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775 if (skb->len <= len)
1776 return true;
1777
1778 /* if TSO is enabled, we don't care about the length as the packet
1779 * could be forwarded without being segmented before
1780 */
1781 if (skb_is_gso(skb))
1782 return true;
1783
1784 return false;
1785 }
1786 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1789 {
1790 int ret = ____dev_forward_skb(dev, skb);
1791
1792 if (likely(!ret)) {
1793 skb->protocol = eth_type_trans(skb, dev);
1794 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795 }
1796
1797 return ret;
1798 }
1799 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
1801 /**
1802 * dev_forward_skb - loopback an skb to another netif
1803 *
1804 * @dev: destination network device
1805 * @skb: buffer to forward
1806 *
1807 * return values:
1808 * NET_RX_SUCCESS (no congestion)
1809 * NET_RX_DROP (packet was dropped, but freed)
1810 *
1811 * dev_forward_skb can be used for injecting an skb from the
1812 * start_xmit function of one device into the receive queue
1813 * of another device.
1814 *
1815 * The receiving device may be in another namespace, so
1816 * we have to clear all information in the skb that could
1817 * impact namespace isolation.
1818 */
1819 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1820 {
1821 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1822 }
1823 EXPORT_SYMBOL_GPL(dev_forward_skb);
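/*
 * Illustrative sketch (not part of the original file): injecting a frame
 * into a peer device from a transmit path, the pattern veth-like drivers
 * use. A real ndo_start_xmit receives the transmitting device and would
 * look up its peer itself; here the peer is passed in directly and the
 * function name is hypothetical.
 */
static netdev_tx_t __maybe_unused example_xmit_to_peer(struct sk_buff *skb,
							struct net_device *peer)
{
	/* __dev_forward_skb() scrubs and retargets the skb, then
	 * netif_rx_internal() queues it - exactly what dev_forward_skb()
	 * wraps up above.
	 */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
		return NETDEV_TX_OK;

	/* NET_RX_DROP: the skb has already been freed for us; a real driver
	 * would bump its tx_dropped counter here.
	 */
	return NETDEV_TX_OK;
}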
1824
1825 static inline int deliver_skb(struct sk_buff *skb,
1826 struct packet_type *pt_prev,
1827 struct net_device *orig_dev)
1828 {
1829 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1830 return -ENOMEM;
1831 atomic_inc(&skb->users);
1832 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1833 }
1834
1835 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836 struct packet_type **pt,
1837 struct net_device *orig_dev,
1838 __be16 type,
1839 struct list_head *ptype_list)
1840 {
1841 struct packet_type *ptype, *pt_prev = *pt;
1842
1843 list_for_each_entry_rcu(ptype, ptype_list, list) {
1844 if (ptype->type != type)
1845 continue;
1846 if (pt_prev)
1847 deliver_skb(skb, pt_prev, orig_dev);
1848 pt_prev = ptype;
1849 }
1850 *pt = pt_prev;
1851 }
1852
1853 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854 {
1855 if (!ptype->af_packet_priv || !skb->sk)
1856 return false;
1857
1858 if (ptype->id_match)
1859 return ptype->id_match(ptype, skb->sk);
1860 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861 return true;
1862
1863 return false;
1864 }
1865
1866 /*
1867 * Support routine. Sends outgoing frames to any network
1868 * taps currently in use.
1869 */
1870
1871 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1872 {
1873 struct packet_type *ptype;
1874 struct sk_buff *skb2 = NULL;
1875 struct packet_type *pt_prev = NULL;
1876 struct list_head *ptype_list = &ptype_all;
1877
1878 rcu_read_lock();
1879 again:
1880 list_for_each_entry_rcu(ptype, ptype_list, list) {
1881 /* Never send packets back to the socket
1882 * they originated from - MvS (miquels@drinkel.ow.org)
1883 */
1884 if (skb_loop_sk(ptype, skb))
1885 continue;
1886
1887 if (pt_prev) {
1888 deliver_skb(skb2, pt_prev, skb->dev);
1889 pt_prev = ptype;
1890 continue;
1891 }
1892
1893 /* need to clone skb, done only once */
1894 skb2 = skb_clone(skb, GFP_ATOMIC);
1895 if (!skb2)
1896 goto out_unlock;
1897
1898 net_timestamp_set(skb2);
1899
1900 /* skb->nh should be correctly
1901 * set by sender, so that the second statement is
1902 * just protection against buggy protocols.
1903 */
1904 skb_reset_mac_header(skb2);
1905
1906 if (skb_network_header(skb2) < skb2->data ||
1907 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1908 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1909 ntohs(skb2->protocol),
1910 dev->name);
1911 skb_reset_network_header(skb2);
1912 }
1913
1914 skb2->transport_header = skb2->network_header;
1915 skb2->pkt_type = PACKET_OUTGOING;
1916 pt_prev = ptype;
1917 }
1918
1919 if (ptype_list == &ptype_all) {
1920 ptype_list = &dev->ptype_all;
1921 goto again;
1922 }
1923 out_unlock:
1924 if (pt_prev)
1925 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1926 rcu_read_unlock();
1927 }
1928 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930 /**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
1935 * If real_num_tx_queues is changed the tc mappings may no longer be
1936 * valid. To resolve this, verify the tc mapping remains valid and, if
1937 * not, NULL the mapping. With no priorities mapping to this
1938 * offset/count pair it will no longer be used. In the worst case, TC0
1939 * is invalid and nothing can be done, so disable priority mappings. It is
1940 * expected that drivers will fix this mapping if they can before
1941 * calling netif_set_real_num_tx_queues.
1942 */
1943 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944 {
1945 int i;
1946 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948 /* If TC0 is invalidated disable TC mapping */
1949 if (tc->offset + tc->count > txq) {
1950 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951 dev->num_tc = 0;
1952 return;
1953 }
1954
1955 /* Invalidated prio to tc mappings set to TC0 */
1956 for (i = 1; i < TC_BITMASK + 1; i++) {
1957 int q = netdev_get_prio_tc_map(dev, i);
1958
1959 tc = &dev->tc_to_txq[q];
1960 if (tc->offset + tc->count > txq) {
1961 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962 i, q);
1963 netdev_set_prio_tc_map(dev, i, 0);
1964 }
1965 }
1966 }
1967
1968 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969 {
1970 if (dev->num_tc) {
1971 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972 int i;
1973
1974 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975 if ((txq - tc->offset) < tc->count)
1976 return i;
1977 }
1978
1979 return -1;
1980 }
1981
1982 return 0;
1983 }
1984
1985 #ifdef CONFIG_XPS
1986 static DEFINE_MUTEX(xps_map_mutex);
1987 #define xmap_dereference(P) \
1988 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
1990 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1991 int tci, u16 index)
1992 {
1993 struct xps_map *map = NULL;
1994 int pos;
1995
1996 if (dev_maps)
1997 map = xmap_dereference(dev_maps->cpu_map[tci]);
1998 if (!map)
1999 return false;
2000
2001 for (pos = map->len; pos--;) {
2002 if (map->queues[pos] != index)
2003 continue;
2004
2005 if (map->len > 1) {
2006 map->queues[pos] = map->queues[--map->len];
2007 break;
2008 }
2009
2010 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2011 kfree_rcu(map, rcu);
2012 return false;
2013 }
2014
2015 return true;
2016 }
2017
2018 static bool remove_xps_queue_cpu(struct net_device *dev,
2019 struct xps_dev_maps *dev_maps,
2020 int cpu, u16 offset, u16 count)
2021 {
2022 int num_tc = dev->num_tc ? : 1;
2023 bool active = false;
2024 int tci;
2025
2026 for (tci = cpu * num_tc; num_tc--; tci++) {
2027 int i, j;
2028
2029 for (i = count, j = offset; i--; j++) {
2030 if (!remove_xps_queue(dev_maps, tci, j))
2031 break;
2032 }
2033
2034 active |= i < 0;
2035 }
2036
2037 return active;
2038 }
2039
2040 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041 u16 count)
2042 {
2043 struct xps_dev_maps *dev_maps;
2044 int cpu, i;
2045 bool active = false;
2046
2047 mutex_lock(&xps_map_mutex);
2048 dev_maps = xmap_dereference(dev->xps_maps);
2049
2050 if (!dev_maps)
2051 goto out_no_maps;
2052
2053 for_each_possible_cpu(cpu)
2054 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055 offset, count);
2056
2057 if (!active) {
2058 RCU_INIT_POINTER(dev->xps_maps, NULL);
2059 kfree_rcu(dev_maps, rcu);
2060 }
2061
2062 for (i = offset + (count - 1); count--; i--)
2063 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064 NUMA_NO_NODE);
2065
2066 out_no_maps:
2067 mutex_unlock(&xps_map_mutex);
2068 }
2069
2070 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071 {
2072 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073 }
2074
2075 static struct xps_map *expand_xps_map(struct xps_map *map,
2076 int cpu, u16 index)
2077 {
2078 struct xps_map *new_map;
2079 int alloc_len = XPS_MIN_MAP_ALLOC;
2080 int i, pos;
2081
2082 for (pos = 0; map && pos < map->len; pos++) {
2083 if (map->queues[pos] != index)
2084 continue;
2085 return map;
2086 }
2087
2088 /* Need to add queue to this CPU's existing map */
2089 if (map) {
2090 if (pos < map->alloc_len)
2091 return map;
2092
2093 alloc_len = map->alloc_len * 2;
2094 }
2095
2096 /* Need to allocate a new map to store the queue on this CPU */
2097 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098 cpu_to_node(cpu));
2099 if (!new_map)
2100 return NULL;
2101
2102 for (i = 0; i < pos; i++)
2103 new_map->queues[i] = map->queues[i];
2104 new_map->alloc_len = alloc_len;
2105 new_map->len = pos;
2106
2107 return new_map;
2108 }
2109
2110 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2111 u16 index)
2112 {
2113 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2114 int i, cpu, tci, numa_node_id = -2;
2115 int maps_sz, num_tc = 1, tc = 0;
2116 struct xps_map *map, *new_map;
2117 bool active = false;
2118
2119 if (dev->num_tc) {
2120 num_tc = dev->num_tc;
2121 tc = netdev_txq_to_tc(dev, index);
2122 if (tc < 0)
2123 return -EINVAL;
2124 }
2125
2126 maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2127 if (maps_sz < L1_CACHE_BYTES)
2128 maps_sz = L1_CACHE_BYTES;
2129
2130 mutex_lock(&xps_map_mutex);
2131
2132 dev_maps = xmap_dereference(dev->xps_maps);
2133
2134 /* allocate memory for queue storage */
2135 for_each_cpu_and(cpu, cpu_online_mask, mask) {
2136 if (!new_dev_maps)
2137 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2138 if (!new_dev_maps) {
2139 mutex_unlock(&xps_map_mutex);
2140 return -ENOMEM;
2141 }
2142
2143 tci = cpu * num_tc + tc;
2144 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2145 NULL;
2146
2147 map = expand_xps_map(map, cpu, index);
2148 if (!map)
2149 goto error;
2150
2151 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2152 }
2153
2154 if (!new_dev_maps)
2155 goto out_no_new_maps;
2156
2157 for_each_possible_cpu(cpu) {
2158 /* copy maps belonging to foreign traffic classes */
2159 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2160 /* fill in the new device map from the old device map */
2161 map = xmap_dereference(dev_maps->cpu_map[tci]);
2162 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163 }
2164
2165 * We need to explicitly update tci as the previous loop
2166 * could break out early if dev_maps is NULL.
2167 */
2168 tci = cpu * num_tc + tc;
2169
2170 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2171 /* add queue to CPU maps */
2172 int pos = 0;
2173
2174 map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2175 while ((pos < map->len) && (map->queues[pos] != index))
2176 pos++;
2177
2178 if (pos == map->len)
2179 map->queues[map->len++] = index;
2180 #ifdef CONFIG_NUMA
2181 if (numa_node_id == -2)
2182 numa_node_id = cpu_to_node(cpu);
2183 else if (numa_node_id != cpu_to_node(cpu))
2184 numa_node_id = -1;
2185 #endif
2186 } else if (dev_maps) {
2187 /* fill in the new device map from the old device map */
2188 map = xmap_dereference(dev_maps->cpu_map[tci]);
2189 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2190 }
2191
2192 /* copy maps belonging to foreign traffic classes */
2193 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2194 /* fill in the new device map from the old device map */
2195 map = xmap_dereference(dev_maps->cpu_map[tci]);
2196 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2197 }
2198 }
2199
2200 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2201
2202 /* Cleanup old maps */
2203 if (!dev_maps)
2204 goto out_no_old_maps;
2205
2206 for_each_possible_cpu(cpu) {
2207 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2208 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2209 map = xmap_dereference(dev_maps->cpu_map[tci]);
2210 if (map && map != new_map)
2211 kfree_rcu(map, rcu);
2212 }
2213 }
2214
2215 kfree_rcu(dev_maps, rcu);
2216
2217 out_no_old_maps:
2218 dev_maps = new_dev_maps;
2219 active = true;
2220
2221 out_no_new_maps:
2222 /* update Tx queue numa node */
2223 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2224 (numa_node_id >= 0) ? numa_node_id :
2225 NUMA_NO_NODE);
2226
2227 if (!dev_maps)
2228 goto out_no_maps;
2229
2230 /* removes queue from unused CPUs */
2231 for_each_possible_cpu(cpu) {
2232 for (i = tc, tci = cpu * num_tc; i--; tci++)
2233 active |= remove_xps_queue(dev_maps, tci, index);
2234 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2235 active |= remove_xps_queue(dev_maps, tci, index);
2236 for (i = num_tc - tc, tci++; --i; tci++)
2237 active |= remove_xps_queue(dev_maps, tci, index);
2238 }
2239
2240 /* free map if not active */
2241 if (!active) {
2242 RCU_INIT_POINTER(dev->xps_maps, NULL);
2243 kfree_rcu(dev_maps, rcu);
2244 }
2245
2246 out_no_maps:
2247 mutex_unlock(&xps_map_mutex);
2248
2249 return 0;
2250 error:
2251 /* remove any maps that we added */
2252 for_each_possible_cpu(cpu) {
2253 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2254 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2255 map = dev_maps ?
2256 xmap_dereference(dev_maps->cpu_map[tci]) :
2257 NULL;
2258 if (new_map && new_map != map)
2259 kfree(new_map);
2260 }
2261 }
2262
2263 mutex_unlock(&xps_map_mutex);
2264
2265 kfree(new_dev_maps);
2266 return -ENOMEM;
2267 }
2268 EXPORT_SYMBOL(netif_set_xps_queue);
2269
2270 #endif
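/* A minimal sketch, not part of the original file: a multiqueue driver
 * might pin each TX queue to a single CPU with netif_set_xps_queue().
 * "example_setup_xps" is an assumed helper name.
 */
static void example_setup_xps(struct net_device *dev)
{
        u16 qid = 0;
        int cpu, err;

        for_each_online_cpu(cpu) {
                if (qid >= dev->real_num_tx_queues)
                        break;
                /* steer transmits done on this CPU to TX queue "qid" */
                err = netif_set_xps_queue(dev, cpumask_of(cpu), qid);
                if (err)
                        netdev_warn(dev, "XPS setup for queue %u failed: %d\n",
                                    qid, err);
                qid++;
        }
}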
2271 void netdev_reset_tc(struct net_device *dev)
2272 {
2273 #ifdef CONFIG_XPS
2274 netif_reset_xps_queues_gt(dev, 0);
2275 #endif
2276 dev->num_tc = 0;
2277 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279 }
2280 EXPORT_SYMBOL(netdev_reset_tc);
2281
2282 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283 {
2284 if (tc >= dev->num_tc)
2285 return -EINVAL;
2286
2287 #ifdef CONFIG_XPS
2288 netif_reset_xps_queues(dev, offset, count);
2289 #endif
2290 dev->tc_to_txq[tc].count = count;
2291 dev->tc_to_txq[tc].offset = offset;
2292 return 0;
2293 }
2294 EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297 {
2298 if (num_tc > TC_MAX_QUEUE)
2299 return -EINVAL;
2300
2301 #ifdef CONFIG_XPS
2302 netif_reset_xps_queues_gt(dev, 0);
2303 #endif
2304 dev->num_tc = num_tc;
2305 return 0;
2306 }
2307 EXPORT_SYMBOL(netdev_set_num_tc);
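/* A minimal sketch, not part of the original file: an mqprio-style setup
 * that splits eight TX queues into two traffic classes of four queues each
 * and maps priorities 0-7 to TC0 and 8-15 to TC1. The helper name is an
 * assumption.
 */
static int example_setup_two_tcs(struct net_device *dev)
{
        int err, prio;

        err = netdev_set_num_tc(dev, 2);
        if (err)
                return err;

        err = netdev_set_tc_queue(dev, 0, 4, 0);        /* TC0: queues 0-3 */
        if (!err)
                err = netdev_set_tc_queue(dev, 1, 4, 4);        /* TC1: queues 4-7 */
        if (err)
                return err;

        for (prio = 0; prio <= TC_BITMASK; prio++)
                netdev_set_prio_tc_map(dev, prio, prio < 8 ? 0 : 1);

        return 0;
}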
2308
2309 /*
2310 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2311 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2312 */
2313 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314 {
2315 int rc;
2316
2317 if (txq < 1 || txq > dev->num_tx_queues)
2318 return -EINVAL;
2319
2320 if (dev->reg_state == NETREG_REGISTERED ||
2321 dev->reg_state == NETREG_UNREGISTERING) {
2322 ASSERT_RTNL();
2323
2324 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325 txq);
2326 if (rc)
2327 return rc;
2328
2329 if (dev->num_tc)
2330 netif_setup_tc(dev, txq);
2331
2332 if (txq < dev->real_num_tx_queues) {
2333 qdisc_reset_all_tx_gt(dev, txq);
2334 #ifdef CONFIG_XPS
2335 netif_reset_xps_queues_gt(dev, txq);
2336 #endif
2337 }
2338 }
2339
2340 dev->real_num_tx_queues = txq;
2341 return 0;
2342 }
2343 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2344
2345 #ifdef CONFIG_SYSFS
2346 /**
2347 * netif_set_real_num_rx_queues - set actual number of RX queues used
2348 * @dev: Network device
2349 * @rxq: Actual number of RX queues
2350 *
2351 * This must be called either with the rtnl_lock held or before
2352 * registration of the net device. Returns 0 on success, or a
2353 * negative error code. If called before registration, it always
2354 * succeeds.
2355 */
2356 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2357 {
2358 int rc;
2359
2360 if (rxq < 1 || rxq > dev->num_rx_queues)
2361 return -EINVAL;
2362
2363 if (dev->reg_state == NETREG_REGISTERED) {
2364 ASSERT_RTNL();
2365
2366 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2367 rxq);
2368 if (rc)
2369 return rc;
2370 }
2371
2372 dev->real_num_rx_queues = rxq;
2373 return 0;
2374 }
2375 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2376 #endif
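/* A minimal sketch, not part of the original file: shrinking or growing the
 * set of queues in active use, e.g. after an ethtool channel change. Once
 * the device is registered both calls need the rtnl lock;
 * "example_set_channels" is an assumed helper name.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
        int err;

        rtnl_lock();
        err = netif_set_real_num_tx_queues(dev, count);
        if (!err)
                err = netif_set_real_num_rx_queues(dev, count);
        rtnl_unlock();

        return err;
}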
2377
2378 /**
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384 int netif_get_num_default_rss_queues(void)
2385 {
2386 return is_kdump_kernel() ?
2387 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2388 }
2389 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
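/* A minimal sketch, not part of the original file: a driver capping its
 * queue count with the default RSS limit; "hw_max_queues" stands in for a
 * hardware-specific maximum.
 */
static unsigned int example_pick_queue_count(unsigned int hw_max_queues)
{
        return min_t(unsigned int, hw_max_queues,
                     netif_get_num_default_rss_queues());
}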
2390
2391 static void __netif_reschedule(struct Qdisc *q)
2392 {
2393 struct softnet_data *sd;
2394 unsigned long flags;
2395
2396 local_irq_save(flags);
2397 sd = this_cpu_ptr(&softnet_data);
2398 q->next_sched = NULL;
2399 *sd->output_queue_tailp = q;
2400 sd->output_queue_tailp = &q->next_sched;
2401 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402 local_irq_restore(flags);
2403 }
2404
2405 void __netif_schedule(struct Qdisc *q)
2406 {
2407 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408 __netif_reschedule(q);
2409 }
2410 EXPORT_SYMBOL(__netif_schedule);
2411
2412 struct dev_kfree_skb_cb {
2413 enum skb_free_reason reason;
2414 };
2415
2416 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417 {
2418 return (struct dev_kfree_skb_cb *)skb->cb;
2419 }
2420
2421 void netif_schedule_queue(struct netdev_queue *txq)
2422 {
2423 rcu_read_lock();
2424 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425 struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427 __netif_schedule(q);
2428 }
2429 rcu_read_unlock();
2430 }
2431 EXPORT_SYMBOL(netif_schedule_queue);
2432
2433 /**
2434 * netif_wake_subqueue - allow sending packets on subqueue
2435 * @dev: network device
2436 * @queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441 {
2442 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445 struct Qdisc *q;
2446
2447 rcu_read_lock();
2448 q = rcu_dereference(txq->qdisc);
2449 __netif_schedule(q);
2450 rcu_read_unlock();
2451 }
2452 }
2453 EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456 {
2457 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458 struct Qdisc *q;
2459
2460 rcu_read_lock();
2461 q = rcu_dereference(dev_queue->qdisc);
2462 __netif_schedule(q);
2463 rcu_read_unlock();
2464 }
2465 }
2466 EXPORT_SYMBOL(netif_tx_wake_queue);
2467
2468 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2469 {
2470 unsigned long flags;
2471
2472 if (likely(atomic_read(&skb->users) == 1)) {
2473 smp_rmb();
2474 atomic_set(&skb->users, 0);
2475 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2476 return;
2477 }
2478 get_kfree_skb_cb(skb)->reason = reason;
2479 local_irq_save(flags);
2480 skb->next = __this_cpu_read(softnet_data.completion_queue);
2481 __this_cpu_write(softnet_data.completion_queue, skb);
2482 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2483 local_irq_restore(flags);
2484 }
2485 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488 {
2489 if (in_irq() || irqs_disabled())
2490 __dev_kfree_skb_irq(skb, reason);
2491 else
2492 dev_kfree_skb(skb);
2493 }
2494 EXPORT_SYMBOL(__dev_kfree_skb_any);
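/* A minimal sketch, not part of the original file: a TX completion handler
 * that can run in hard-irq context, so it frees buffers with
 * dev_consume_skb_any() (which falls back to __dev_kfree_skb_irq() when
 * needed) and then restarts the queue with netif_tx_wake_queue(). The
 * "example_tx_ring" layout is an assumption.
 */
struct example_tx_ring {
        struct sk_buff **skbs;          /* completed buffers, NULL terminated */
        struct netdev_queue *txq;       /* the queue this ring feeds */
};

static void example_tx_complete(struct example_tx_ring *ring)
{
        struct sk_buff *skb;
        int i;

        for (i = 0; (skb = ring->skbs[i]) != NULL; i++) {
                ring->skbs[i] = NULL;
                dev_consume_skb_any(skb);       /* safe in any context */
        }

        netif_tx_wake_queue(ring->txq);
}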
2495
2496
2497 /**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503 void netif_device_detach(struct net_device *dev)
2504 {
2505 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506 netif_running(dev)) {
2507 netif_tx_stop_all_queues(dev);
2508 }
2509 }
2510 EXPORT_SYMBOL(netif_device_detach);
2511
2512 /**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
2516 * Mark device as attached to the system and restart if needed.
2517 */
2518 void netif_device_attach(struct net_device *dev)
2519 {
2520 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521 netif_running(dev)) {
2522 netif_tx_wake_all_queues(dev);
2523 __netdev_watchdog_up(dev);
2524 }
2525 }
2526 EXPORT_SYMBOL(netif_device_attach);
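/* A minimal sketch, not part of the original file, of how driver
 * suspend/resume callbacks typically pair these helpers; the PM callbacks
 * and the drvdata lookup are assumptions.
 */
static int example_suspend(struct device *d)
{
        struct net_device *dev = dev_get_drvdata(d);

        netif_device_detach(dev);       /* stops all TX queues if running */
        /* ... put the hardware to sleep ... */
        return 0;
}

static int example_resume(struct device *d)
{
        struct net_device *dev = dev_get_drvdata(d);

        /* ... wake the hardware up ... */
        netif_device_attach(dev);       /* restarts queues and the watchdog */
        return 0;
}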
2527
2528 /*
2529 * Returns a Tx hash based on the given packet descriptor and the number of
2530 * Tx queues to be used as a distribution range.
2531 */
2532 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533 unsigned int num_tx_queues)
2534 {
2535 u32 hash;
2536 u16 qoffset = 0;
2537 u16 qcount = num_tx_queues;
2538
2539 if (skb_rx_queue_recorded(skb)) {
2540 hash = skb_get_rx_queue(skb);
2541 while (unlikely(hash >= num_tx_queues))
2542 hash -= num_tx_queues;
2543 return hash;
2544 }
2545
2546 if (dev->num_tc) {
2547 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2548 qoffset = dev->tc_to_txq[tc].offset;
2549 qcount = dev->tc_to_txq[tc].count;
2550 }
2551
2552 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553 }
2554 EXPORT_SYMBOL(__skb_tx_hash);
2555
2556 static void skb_warn_bad_offload(const struct sk_buff *skb)
2557 {
2558 static const netdev_features_t null_features;
2559 struct net_device *dev = skb->dev;
2560 const char *name = "";
2561
2562 if (!net_ratelimit())
2563 return;
2564
2565 if (dev) {
2566 if (dev->dev.parent)
2567 name = dev_driver_string(dev->dev.parent);
2568 else
2569 name = netdev_name(dev);
2570 }
2571 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572 "gso_type=%d ip_summed=%d\n",
2573 name, dev ? &dev->features : &null_features,
2574 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576 skb_shinfo(skb)->gso_type, skb->ip_summed);
2577 }
2578
2579 /*
2580 * Invalidate hardware checksum when packet is to be mangled, and
2581 * complete checksum manually on outgoing path.
2582 */
2583 int skb_checksum_help(struct sk_buff *skb)
2584 {
2585 __wsum csum;
2586 int ret = 0, offset;
2587
2588 if (skb->ip_summed == CHECKSUM_COMPLETE)
2589 goto out_set_summed;
2590
2591 if (unlikely(skb_shinfo(skb)->gso_size)) {
2592 skb_warn_bad_offload(skb);
2593 return -EINVAL;
2594 }
2595
2596 /* Before computing a checksum, we should make sure no frag could
2597 * be modified by an external entity: the checksum could be wrong.
2598 */
2599 if (skb_has_shared_frag(skb)) {
2600 ret = __skb_linearize(skb);
2601 if (ret)
2602 goto out;
2603 }
2604
2605 offset = skb_checksum_start_offset(skb);
2606 BUG_ON(offset >= skb_headlen(skb));
2607 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2608
2609 offset += skb->csum_offset;
2610 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2611
2612 if (skb_cloned(skb) &&
2613 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2614 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2615 if (ret)
2616 goto out;
2617 }
2618
2619 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2620 out_set_summed:
2621 skb->ip_summed = CHECKSUM_NONE;
2622 out:
2623 return ret;
2624 }
2625 EXPORT_SYMBOL(skb_checksum_help);
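/* A minimal sketch, not part of the original file: a driver whose hardware
 * cannot checksum a particular packet can resolve the checksum in software
 * before handing the buffer to the NIC. "example_hw_can_csum" is an assumed
 * capability check.
 */
static int example_prep_csum(struct sk_buff *skb, bool example_hw_can_csum)
{
        if (skb->ip_summed != CHECKSUM_PARTIAL)
                return 0;               /* no offload was requested */

        if (example_hw_can_csum)
                return 0;               /* hardware will fill in the checksum */

        /* compute the checksum in software and clear the offload request */
        return skb_checksum_help(skb);
}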
2626
2627 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628 {
2629 __be16 type = skb->protocol;
2630
2631 /* Tunnel gso handlers can set protocol to ethernet. */
2632 if (type == htons(ETH_P_TEB)) {
2633 struct ethhdr *eth;
2634
2635 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636 return 0;
2637
2638 eth = (struct ethhdr *)skb_mac_header(skb);
2639 type = eth->h_proto;
2640 }
2641
2642 return __vlan_get_protocol(skb, type, depth);
2643 }
2644
2645 /**
2646 * skb_mac_gso_segment - mac layer segmentation handler.
2647 * @skb: buffer to segment
2648 * @features: features for the output path (see dev->features)
2649 */
2650 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651 netdev_features_t features)
2652 {
2653 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654 struct packet_offload *ptype;
2655 int vlan_depth = skb->mac_len;
2656 __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658 if (unlikely(!type))
2659 return ERR_PTR(-EINVAL);
2660
2661 __skb_pull(skb, vlan_depth);
2662
2663 rcu_read_lock();
2664 list_for_each_entry_rcu(ptype, &offload_base, list) {
2665 if (ptype->type == type && ptype->callbacks.gso_segment) {
2666 segs = ptype->callbacks.gso_segment(skb, features);
2667 break;
2668 }
2669 }
2670 rcu_read_unlock();
2671
2672 __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674 return segs;
2675 }
2676 EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679 /* openvswitch calls this on rx path, so we need a different check.
2680 */
2681 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682 {
2683 if (tx_path)
2684 return skb->ip_summed != CHECKSUM_PARTIAL;
2685 else
2686 return skb->ip_summed == CHECKSUM_NONE;
2687 }
2688
2689 /**
2690 * __skb_gso_segment - Perform segmentation on skb.
2691 * @skb: buffer to segment
2692 * @features: features for the output path (see dev->features)
2693 * @tx_path: whether it is called in TX path
2694 *
2695 * This function segments the given skb and returns a list of segments.
2696 *
2697 * It may return NULL if the skb requires no segmentation. This is
2698 * only possible when GSO is used for verifying header integrity.
2699 *
2700 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703 netdev_features_t features, bool tx_path)
2704 {
2705 struct sk_buff *segs;
2706
2707 if (unlikely(skb_needs_check(skb, tx_path))) {
2708 int err;
2709
2710 /* We're going to init ->check field in TCP or UDP header */
2711 err = skb_cow_head(skb, 0);
2712 if (err < 0)
2713 return ERR_PTR(err);
2714 }
2715
2716 /* Only report GSO partial support if it will enable us to
2717 * support segmentation on this frame without needing additional
2718 * work.
2719 */
2720 if (features & NETIF_F_GSO_PARTIAL) {
2721 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2722 struct net_device *dev = skb->dev;
2723
2724 partial_features |= dev->features & dev->gso_partial_features;
2725 if (!skb_gso_ok(skb, features | partial_features))
2726 features &= ~NETIF_F_GSO_PARTIAL;
2727 }
2728
2729 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2730 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2731
2732 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2733 SKB_GSO_CB(skb)->encap_level = 0;
2734
2735 skb_reset_mac_header(skb);
2736 skb_reset_mac_len(skb);
2737
2738 segs = skb_mac_gso_segment(skb, features);
2739
2740 if (unlikely(skb_needs_check(skb, tx_path)))
2741 skb_warn_bad_offload(skb);
2742
2743 return segs;
2744 }
2745 EXPORT_SYMBOL(__skb_gso_segment);
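/* A minimal sketch, not part of the original file: segmenting a GSO skb
 * (e.g. in a tunnel or driver fallback path) and walking the returned list.
 * "example_xmit_one" stands in for whatever transmits a single segment.
 */
static int example_gso_xmit(struct sk_buff *skb, netdev_features_t features,
                            int (*example_xmit_one)(struct sk_buff *))
{
        struct sk_buff *segs, *next;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)
                return example_xmit_one(skb);   /* no segmentation was needed */

        consume_skb(skb);                       /* the original is no longer needed */

        while (segs) {
                next = segs->next;
                segs->next = NULL;
                example_xmit_one(segs);
                segs = next;
        }

        return 0;
}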
2746
2747 /* Take action when hardware reception checksum errors are detected. */
2748 #ifdef CONFIG_BUG
2749 void netdev_rx_csum_fault(struct net_device *dev)
2750 {
2751 if (net_ratelimit()) {
2752 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2753 dump_stack();
2754 }
2755 }
2756 EXPORT_SYMBOL(netdev_rx_csum_fault);
2757 #endif
2758
2759 /* Actually, we should eliminate this check as soon as we know that:
2760 * 1. An IOMMU is present and allows mapping all of the memory.
2761 * 2. No high memory really exists on this machine.
2762 */
2763
2764 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2765 {
2766 #ifdef CONFIG_HIGHMEM
2767 int i;
2768 if (!(dev->features & NETIF_F_HIGHDMA)) {
2769 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2770 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2771 if (PageHighMem(skb_frag_page(frag)))
2772 return 1;
2773 }
2774 }
2775
2776 if (PCI_DMA_BUS_IS_PHYS) {
2777 struct device *pdev = dev->dev.parent;
2778
2779 if (!pdev)
2780 return 0;
2781 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2782 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2783 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2784 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2785 return 1;
2786 }
2787 }
2788 #endif
2789 return 0;
2790 }
2791
2792 /* If MPLS offload request, verify we are testing hardware MPLS features
2793 * instead of standard features for the netdev.
2794 */
2795 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2796 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2797 netdev_features_t features,
2798 __be16 type)
2799 {
2800 if (eth_p_mpls(type))
2801 features &= skb->dev->mpls_features;
2802
2803 return features;
2804 }
2805 #else
2806 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2807 netdev_features_t features,
2808 __be16 type)
2809 {
2810 return features;
2811 }
2812 #endif
2813
2814 static netdev_features_t harmonize_features(struct sk_buff *skb,
2815 netdev_features_t features)
2816 {
2817 int tmp;
2818 __be16 type;
2819
2820 type = skb_network_protocol(skb, &tmp);
2821 features = net_mpls_features(skb, features, type);
2822
2823 if (skb->ip_summed != CHECKSUM_NONE &&
2824 !can_checksum_protocol(features, type)) {
2825 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2826 }
2827 if (illegal_highdma(skb->dev, skb))
2828 features &= ~NETIF_F_SG;
2829
2830 return features;
2831 }
2832
2833 netdev_features_t passthru_features_check(struct sk_buff *skb,
2834 struct net_device *dev,
2835 netdev_features_t features)
2836 {
2837 return features;
2838 }
2839 EXPORT_SYMBOL(passthru_features_check);
2840
2841 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2842 struct net_device *dev,
2843 netdev_features_t features)
2844 {
2845 return vlan_features_check(skb, features);
2846 }
2847
2848 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2849 struct net_device *dev,
2850 netdev_features_t features)
2851 {
2852 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2853
2854 if (gso_segs > dev->gso_max_segs)
2855 return features & ~NETIF_F_GSO_MASK;
2856
2857 /* Support for GSO partial features requires software
2858 * intervention before we can actually process the packets,
2859 * so we need to strip support for any partial features now;
2860 * we can pull them back in after we have partially
2861 * segmented the frame.
2862 */
2863 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2864 features &= ~dev->gso_partial_features;
2865
2866 /* Make sure to clear the IPv4 ID mangling feature if the
2867 * IPv4 header has the potential to be fragmented.
2868 */
2869 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2870 struct iphdr *iph = skb->encapsulation ?
2871 inner_ip_hdr(skb) : ip_hdr(skb);
2872
2873 if (!(iph->frag_off & htons(IP_DF)))
2874 features &= ~NETIF_F_TSO_MANGLEID;
2875 }
2876
2877 return features;
2878 }
2879
2880 netdev_features_t netif_skb_features(struct sk_buff *skb)
2881 {
2882 struct net_device *dev = skb->dev;
2883 netdev_features_t features = dev->features;
2884
2885 if (skb_is_gso(skb))
2886 features = gso_features_check(skb, dev, features);
2887
2888 /* If encapsulation offload request, verify we are testing
2889 * hardware encapsulation features instead of standard
2890 * features for the netdev
2891 */
2892 if (skb->encapsulation)
2893 features &= dev->hw_enc_features;
2894
2895 if (skb_vlan_tagged(skb))
2896 features = netdev_intersect_features(features,
2897 dev->vlan_features |
2898 NETIF_F_HW_VLAN_CTAG_TX |
2899 NETIF_F_HW_VLAN_STAG_TX);
2900
2901 if (dev->netdev_ops->ndo_features_check)
2902 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2903 features);
2904 else
2905 features &= dflt_features_check(skb, dev, features);
2906
2907 return harmonize_features(skb, features);
2908 }
2909 EXPORT_SYMBOL(netif_skb_features);
2910
2911 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2912 struct netdev_queue *txq, bool more)
2913 {
2914 unsigned int len;
2915 int rc;
2916
2917 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2918 dev_queue_xmit_nit(skb, dev);
2919
2920 len = skb->len;
2921 trace_net_dev_start_xmit(skb, dev);
2922 rc = netdev_start_xmit(skb, dev, txq, more);
2923 trace_net_dev_xmit(skb, rc, dev, len);
2924
2925 return rc;
2926 }
2927
2928 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2929 struct netdev_queue *txq, int *ret)
2930 {
2931 struct sk_buff *skb = first;
2932 int rc = NETDEV_TX_OK;
2933
2934 while (skb) {
2935 struct sk_buff *next = skb->next;
2936
2937 skb->next = NULL;
2938 rc = xmit_one(skb, dev, txq, next != NULL);
2939 if (unlikely(!dev_xmit_complete(rc))) {
2940 skb->next = next;
2941 goto out;
2942 }
2943
2944 skb = next;
2945 if (netif_xmit_stopped(txq) && skb) {
2946 rc = NETDEV_TX_BUSY;
2947 break;
2948 }
2949 }
2950
2951 out:
2952 *ret = rc;
2953 return skb;
2954 }
2955
2956 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2957 netdev_features_t features)
2958 {
2959 if (skb_vlan_tag_present(skb) &&
2960 !vlan_hw_offload_capable(features, skb->vlan_proto))
2961 skb = __vlan_hwaccel_push_inside(skb);
2962 return skb;
2963 }
2964
2965 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2966 {
2967 netdev_features_t features;
2968
2969 features = netif_skb_features(skb);
2970 skb = validate_xmit_vlan(skb, features);
2971 if (unlikely(!skb))
2972 goto out_null;
2973
2974 if (netif_needs_gso(skb, features)) {
2975 struct sk_buff *segs;
2976
2977 segs = skb_gso_segment(skb, features);
2978 if (IS_ERR(segs)) {
2979 goto out_kfree_skb;
2980 } else if (segs) {
2981 consume_skb(skb);
2982 skb = segs;
2983 }
2984 } else {
2985 if (skb_needs_linearize(skb, features) &&
2986 __skb_linearize(skb))
2987 goto out_kfree_skb;
2988
2989 /* If packet is not checksummed and device does not
2990 * support checksumming for this protocol, complete
2991 * checksumming here.
2992 */
2993 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2994 if (skb->encapsulation)
2995 skb_set_inner_transport_header(skb,
2996 skb_checksum_start_offset(skb));
2997 else
2998 skb_set_transport_header(skb,
2999 skb_checksum_start_offset(skb));
3000 if (!(features & NETIF_F_CSUM_MASK) &&
3001 skb_checksum_help(skb))
3002 goto out_kfree_skb;
3003 }
3004 }
3005
3006 return skb;
3007
3008 out_kfree_skb:
3009 kfree_skb(skb);
3010 out_null:
3011 atomic_long_inc(&dev->tx_dropped);
3012 return NULL;
3013 }
3014
3015 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3016 {
3017 struct sk_buff *next, *head = NULL, *tail;
3018
3019 for (; skb != NULL; skb = next) {
3020 next = skb->next;
3021 skb->next = NULL;
3022
3023 /* in case skb won't be segmented, point to itself */
3024 skb->prev = skb;
3025
3026 skb = validate_xmit_skb(skb, dev);
3027 if (!skb)
3028 continue;
3029
3030 if (!head)
3031 head = skb;
3032 else
3033 tail->next = skb;
3034 /* If skb was segmented, skb->prev points to
3035 * the last segment. If not, it still contains skb.
3036 */
3037 tail = skb->prev;
3038 }
3039 return head;
3040 }
3041 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3042
3043 static void qdisc_pkt_len_init(struct sk_buff *skb)
3044 {
3045 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3046
3047 qdisc_skb_cb(skb)->pkt_len = skb->len;
3048
3049 /* To get a more precise estimate of bytes sent on the wire,
3050 * we add to pkt_len the header size of the additional segments
3051 */
3052 if (shinfo->gso_size) {
3053 unsigned int hdr_len;
3054 u16 gso_segs = shinfo->gso_segs;
3055
3056 /* mac layer + network layer */
3057 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3058
3059 /* + transport layer */
3060 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3061 hdr_len += tcp_hdrlen(skb);
3062 else
3063 hdr_len += sizeof(struct udphdr);
3064
3065 if (shinfo->gso_type & SKB_GSO_DODGY)
3066 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3067 shinfo->gso_size);
3068
3069 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3070 }
3071 }
3072
3073 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3074 struct net_device *dev,
3075 struct netdev_queue *txq)
3076 {
3077 spinlock_t *root_lock = qdisc_lock(q);
3078 struct sk_buff *to_free = NULL;
3079 bool contended;
3080 int rc;
3081
3082 qdisc_calculate_pkt_len(skb, q);
3083 /*
3084 * Heuristic to force contended enqueues to serialize on a
3085 * separate lock before trying to get qdisc main lock.
3086 * This permits qdisc->running owner to get the lock more
3087 * often and dequeue packets faster.
3088 */
3089 contended = qdisc_is_running(q);
3090 if (unlikely(contended))
3091 spin_lock(&q->busylock);
3092
3093 spin_lock(root_lock);
3094 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3095 __qdisc_drop(skb, &to_free);
3096 rc = NET_XMIT_DROP;
3097 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3098 qdisc_run_begin(q)) {
3099 /*
3100 * This is a work-conserving queue; there are no old skbs
3101 * waiting to be sent out; and the qdisc is not running -
3102 * xmit the skb directly.
3103 */
3104
3105 qdisc_bstats_update(q, skb);
3106
3107 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3108 if (unlikely(contended)) {
3109 spin_unlock(&q->busylock);
3110 contended = false;
3111 }
3112 __qdisc_run(q);
3113 } else
3114 qdisc_run_end(q);
3115
3116 rc = NET_XMIT_SUCCESS;
3117 } else {
3118 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3119 if (qdisc_run_begin(q)) {
3120 if (unlikely(contended)) {
3121 spin_unlock(&q->busylock);
3122 contended = false;
3123 }
3124 __qdisc_run(q);
3125 }
3126 }
3127 spin_unlock(root_lock);
3128 if (unlikely(to_free))
3129 kfree_skb_list(to_free);
3130 if (unlikely(contended))
3131 spin_unlock(&q->busylock);
3132 return rc;
3133 }
3134
3135 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3136 static void skb_update_prio(struct sk_buff *skb)
3137 {
3138 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3139
3140 if (!skb->priority && skb->sk && map) {
3141 unsigned int prioidx =
3142 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3143
3144 if (prioidx < map->priomap_len)
3145 skb->priority = map->priomap[prioidx];
3146 }
3147 }
3148 #else
3149 #define skb_update_prio(skb)
3150 #endif
3151
3152 DEFINE_PER_CPU(int, xmit_recursion);
3153 EXPORT_SYMBOL(xmit_recursion);
3154
3155 /**
3156 * dev_loopback_xmit - loop back @skb
3157 * @net: network namespace this loopback is happening in
3158 * @sk: sk needed to be a netfilter okfn
3159 * @skb: buffer to transmit
3160 */
3161 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3162 {
3163 skb_reset_mac_header(skb);
3164 __skb_pull(skb, skb_network_offset(skb));
3165 skb->pkt_type = PACKET_LOOPBACK;
3166 skb->ip_summed = CHECKSUM_UNNECESSARY;
3167 WARN_ON(!skb_dst(skb));
3168 skb_dst_force(skb);
3169 netif_rx_ni(skb);
3170 return 0;
3171 }
3172 EXPORT_SYMBOL(dev_loopback_xmit);
3173
3174 #ifdef CONFIG_NET_EGRESS
3175 static struct sk_buff *
3176 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3177 {
3178 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3179 struct tcf_result cl_res;
3180
3181 if (!cl)
3182 return skb;
3183
3184 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3185 * earlier by the caller.
3186 */
3187 qdisc_bstats_cpu_update(cl->q, skb);
3188
3189 switch (tc_classify(skb, cl, &cl_res, false)) {
3190 case TC_ACT_OK:
3191 case TC_ACT_RECLASSIFY:
3192 skb->tc_index = TC_H_MIN(cl_res.classid);
3193 break;
3194 case TC_ACT_SHOT:
3195 qdisc_qstats_cpu_drop(cl->q);
3196 *ret = NET_XMIT_DROP;
3197 kfree_skb(skb);
3198 return NULL;
3199 case TC_ACT_STOLEN:
3200 case TC_ACT_QUEUED:
3201 *ret = NET_XMIT_SUCCESS;
3202 consume_skb(skb);
3203 return NULL;
3204 case TC_ACT_REDIRECT:
3205 /* No need to push/pop skb's mac_header here on egress! */
3206 skb_do_redirect(skb);
3207 *ret = NET_XMIT_SUCCESS;
3208 return NULL;
3209 default:
3210 break;
3211 }
3212
3213 return skb;
3214 }
3215 #endif /* CONFIG_NET_EGRESS */
3216
3217 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3218 {
3219 #ifdef CONFIG_XPS
3220 struct xps_dev_maps *dev_maps;
3221 struct xps_map *map;
3222 int queue_index = -1;
3223
3224 rcu_read_lock();
3225 dev_maps = rcu_dereference(dev->xps_maps);
3226 if (dev_maps) {
3227 unsigned int tci = skb->sender_cpu - 1;
3228
3229 if (dev->num_tc) {
3230 tci *= dev->num_tc;
3231 tci += netdev_get_prio_tc_map(dev, skb->priority);
3232 }
3233
3234 map = rcu_dereference(dev_maps->cpu_map[tci]);
3235 if (map) {
3236 if (map->len == 1)
3237 queue_index = map->queues[0];
3238 else
3239 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3240 map->len)];
3241 if (unlikely(queue_index >= dev->real_num_tx_queues))
3242 queue_index = -1;
3243 }
3244 }
3245 rcu_read_unlock();
3246
3247 return queue_index;
3248 #else
3249 return -1;
3250 #endif
3251 }
3252
3253 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3254 {
3255 struct sock *sk = skb->sk;
3256 int queue_index = sk_tx_queue_get(sk);
3257
3258 if (queue_index < 0 || skb->ooo_okay ||
3259 queue_index >= dev->real_num_tx_queues) {
3260 int new_index = get_xps_queue(dev, skb);
3261 if (new_index < 0)
3262 new_index = skb_tx_hash(dev, skb);
3263
3264 if (queue_index != new_index && sk &&
3265 sk_fullsock(sk) &&
3266 rcu_access_pointer(sk->sk_dst_cache))
3267 sk_tx_queue_set(sk, new_index);
3268
3269 queue_index = new_index;
3270 }
3271
3272 return queue_index;
3273 }
3274
3275 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3276 struct sk_buff *skb,
3277 void *accel_priv)
3278 {
3279 int queue_index = 0;
3280
3281 #ifdef CONFIG_XPS
3282 u32 sender_cpu = skb->sender_cpu - 1;
3283
3284 if (sender_cpu >= (u32)NR_CPUS)
3285 skb->sender_cpu = raw_smp_processor_id() + 1;
3286 #endif
3287
3288 if (dev->real_num_tx_queues != 1) {
3289 const struct net_device_ops *ops = dev->netdev_ops;
3290 if (ops->ndo_select_queue)
3291 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3292 __netdev_pick_tx);
3293 else
3294 queue_index = __netdev_pick_tx(dev, skb);
3295
3296 if (!accel_priv)
3297 queue_index = netdev_cap_txqueue(dev, queue_index);
3298 }
3299
3300 skb_set_queue_mapping(skb, queue_index);
3301 return netdev_get_tx_queue(dev, queue_index);
3302 }
3303
3304 /**
3305 * __dev_queue_xmit - transmit a buffer
3306 * @skb: buffer to transmit
3307 * @accel_priv: private data used for L2 forwarding offload
3308 *
3309 * Queue a buffer for transmission to a network device. The caller must
3310 * have set the device and priority and built the buffer before calling
3311 * this function. The function can be called from an interrupt.
3312 *
3313 * A negative errno code is returned on a failure. A success does not
3314 * guarantee the frame will be transmitted as it may be dropped due
3315 * to congestion or traffic shaping.
3316 *
3317 * -----------------------------------------------------------------------------------
3318 * I notice this method can also return errors from the queue disciplines,
3319 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3320 * be positive.
3321 *
3322 * Regardless of the return value, the skb is consumed, so it is currently
3323 * difficult to retry a send to this method. (You can bump the ref count
3324 * before sending to hold a reference for retry if you are careful.)
3325 *
3326 * When calling this method, interrupts MUST be enabled. This is because
3327 * the BH enable code must have IRQs enabled so that it will not deadlock.
3328 * --BLG
3329 */
3330 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3331 {
3332 struct net_device *dev = skb->dev;
3333 struct netdev_queue *txq;
3334 struct Qdisc *q;
3335 int rc = -ENOMEM;
3336
3337 skb_reset_mac_header(skb);
3338
3339 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3340 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3341
3342 /* Disable soft irqs for various locks below. Also
3343 * stops preemption for RCU.
3344 */
3345 rcu_read_lock_bh();
3346
3347 skb_update_prio(skb);
3348
3349 qdisc_pkt_len_init(skb);
3350 #ifdef CONFIG_NET_CLS_ACT
3351 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3352 # ifdef CONFIG_NET_EGRESS
3353 if (static_key_false(&egress_needed)) {
3354 skb = sch_handle_egress(skb, &rc, dev);
3355 if (!skb)
3356 goto out;
3357 }
3358 # endif
3359 #endif
3360 /* If device/qdisc don't need skb->dst, release it right now while
3361 * it's hot in this CPU's cache.
3362 */
3363 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3364 skb_dst_drop(skb);
3365 else
3366 skb_dst_force(skb);
3367
3368 txq = netdev_pick_tx(dev, skb, accel_priv);
3369 q = rcu_dereference_bh(txq->qdisc);
3370
3371 trace_net_dev_queue(skb);
3372 if (q->enqueue) {
3373 rc = __dev_xmit_skb(skb, q, dev, txq);
3374 goto out;
3375 }
3376
3377 /* The device has no queue. Common case for software devices:
3378 loopback, all the sorts of tunnels...
3379
3380 Really, it is unlikely that netif_tx_lock protection is necessary
3381 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
3382 counters.)
3383 However, it is possible that they rely on the protection
3384 made by us here.
3385
3386 Check this and take the lock. It is not prone to deadlocks.
3387 Either way, the noqueue qdisc case is even simpler 8)
3388 */
3389 if (dev->flags & IFF_UP) {
3390 int cpu = smp_processor_id(); /* ok because BHs are off */
3391
3392 if (txq->xmit_lock_owner != cpu) {
3393 if (unlikely(__this_cpu_read(xmit_recursion) >
3394 XMIT_RECURSION_LIMIT))
3395 goto recursion_alert;
3396
3397 skb = validate_xmit_skb(skb, dev);
3398 if (!skb)
3399 goto out;
3400
3401 HARD_TX_LOCK(dev, txq, cpu);
3402
3403 if (!netif_xmit_stopped(txq)) {
3404 __this_cpu_inc(xmit_recursion);
3405 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3406 __this_cpu_dec(xmit_recursion);
3407 if (dev_xmit_complete(rc)) {
3408 HARD_TX_UNLOCK(dev, txq);
3409 goto out;
3410 }
3411 }
3412 HARD_TX_UNLOCK(dev, txq);
3413 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3414 dev->name);
3415 } else {
3416 /* Recursion is detected! It is possible,
3417 * unfortunately
3418 */
3419 recursion_alert:
3420 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3421 dev->name);
3422 }
3423 }
3424
3425 rc = -ENETDOWN;
3426 rcu_read_unlock_bh();
3427
3428 atomic_long_inc(&dev->tx_dropped);
3429 kfree_skb_list(skb);
3430 return rc;
3431 out:
3432 rcu_read_unlock_bh();
3433 return rc;
3434 }
3435
3436 int dev_queue_xmit(struct sk_buff *skb)
3437 {
3438 return __dev_queue_xmit(skb, NULL);
3439 }
3440 EXPORT_SYMBOL(dev_queue_xmit);
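/* A minimal sketch, not part of the original file: as the comment above
 * __dev_queue_xmit() notes, the skb is consumed and the return value can be
 * a positive NET_XMIT_* code or a negative errno, so callers commonly fold
 * the result with net_xmit_eval(), which treats NET_XMIT_CN as success.
 */
static int example_send(struct sk_buff *skb)
{
        int rc = dev_queue_xmit(skb);

        return net_xmit_eval(rc);
}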
3441
3442 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3443 {
3444 return __dev_queue_xmit(skb, accel_priv);
3445 }
3446 EXPORT_SYMBOL(dev_queue_xmit_accel);
3447
3448
3449 /*=======================================================================
3450 Receiver routines
3451 =======================================================================*/
3452
3453 int netdev_max_backlog __read_mostly = 1000;
3454 EXPORT_SYMBOL(netdev_max_backlog);
3455
3456 int netdev_tstamp_prequeue __read_mostly = 1;
3457 int netdev_budget __read_mostly = 300;
3458 int weight_p __read_mostly = 64; /* old backlog weight */
3459
3460 /* Called with irq disabled */
3461 static inline void ____napi_schedule(struct softnet_data *sd,
3462 struct napi_struct *napi)
3463 {
3464 list_add_tail(&napi->poll_list, &sd->poll_list);
3465 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3466 }
3467
3468 #ifdef CONFIG_RPS
3469
3470 /* One global table that all flow-based protocols share. */
3471 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3472 EXPORT_SYMBOL(rps_sock_flow_table);
3473 u32 rps_cpu_mask __read_mostly;
3474 EXPORT_SYMBOL(rps_cpu_mask);
3475
3476 struct static_key rps_needed __read_mostly;
3477 EXPORT_SYMBOL(rps_needed);
3478 struct static_key rfs_needed __read_mostly;
3479 EXPORT_SYMBOL(rfs_needed);
3480
3481 static struct rps_dev_flow *
3482 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3483 struct rps_dev_flow *rflow, u16 next_cpu)
3484 {
3485 if (next_cpu < nr_cpu_ids) {
3486 #ifdef CONFIG_RFS_ACCEL
3487 struct netdev_rx_queue *rxqueue;
3488 struct rps_dev_flow_table *flow_table;
3489 struct rps_dev_flow *old_rflow;
3490 u32 flow_id;
3491 u16 rxq_index;
3492 int rc;
3493
3494 /* Should we steer this flow to a different hardware queue? */
3495 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3496 !(dev->features & NETIF_F_NTUPLE))
3497 goto out;
3498 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3499 if (rxq_index == skb_get_rx_queue(skb))
3500 goto out;
3501
3502 rxqueue = dev->_rx + rxq_index;
3503 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3504 if (!flow_table)
3505 goto out;
3506 flow_id = skb_get_hash(skb) & flow_table->mask;
3507 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3508 rxq_index, flow_id);
3509 if (rc < 0)
3510 goto out;
3511 old_rflow = rflow;
3512 rflow = &flow_table->flows[flow_id];
3513 rflow->filter = rc;
3514 if (old_rflow->filter == rflow->filter)
3515 old_rflow->filter = RPS_NO_FILTER;
3516 out:
3517 #endif
3518 rflow->last_qtail =
3519 per_cpu(softnet_data, next_cpu).input_queue_head;
3520 }
3521
3522 rflow->cpu = next_cpu;
3523 return rflow;
3524 }
3525
3526 /*
3527 * get_rps_cpu is called from netif_receive_skb and returns the target
3528 * CPU from the RPS map of the receiving queue for a given skb.
3529 * rcu_read_lock must be held on entry.
3530 */
3531 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3532 struct rps_dev_flow **rflowp)
3533 {
3534 const struct rps_sock_flow_table *sock_flow_table;
3535 struct netdev_rx_queue *rxqueue = dev->_rx;
3536 struct rps_dev_flow_table *flow_table;
3537 struct rps_map *map;
3538 int cpu = -1;
3539 u32 tcpu;
3540 u32 hash;
3541
3542 if (skb_rx_queue_recorded(skb)) {
3543 u16 index = skb_get_rx_queue(skb);
3544
3545 if (unlikely(index >= dev->real_num_rx_queues)) {
3546 WARN_ONCE(dev->real_num_rx_queues > 1,
3547 "%s received packet on queue %u, but number "
3548 "of RX queues is %u\n",
3549 dev->name, index, dev->real_num_rx_queues);
3550 goto done;
3551 }
3552 rxqueue += index;
3553 }
3554
3555 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3556
3557 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3558 map = rcu_dereference(rxqueue->rps_map);
3559 if (!flow_table && !map)
3560 goto done;
3561
3562 skb_reset_network_header(skb);
3563 hash = skb_get_hash(skb);
3564 if (!hash)
3565 goto done;
3566
3567 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3568 if (flow_table && sock_flow_table) {
3569 struct rps_dev_flow *rflow;
3570 u32 next_cpu;
3571 u32 ident;
3572
3573 /* First check into global flow table if there is a match */
3574 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3575 if ((ident ^ hash) & ~rps_cpu_mask)
3576 goto try_rps;
3577
3578 next_cpu = ident & rps_cpu_mask;
3579
3580 /* OK, now we know there is a match,
3581 * we can look at the local (per receive queue) flow table
3582 */
3583 rflow = &flow_table->flows[hash & flow_table->mask];
3584 tcpu = rflow->cpu;
3585
3586 /*
3587 * If the desired CPU (where last recvmsg was done) is
3588 * different from current CPU (one in the rx-queue flow
3589 * table entry), switch if one of the following holds:
3590 * - Current CPU is unset (>= nr_cpu_ids).
3591 * - Current CPU is offline.
3592 * - The current CPU's queue tail has advanced beyond the
3593 * last packet that was enqueued using this table entry.
3594 * This guarantees that all previous packets for the flow
3595 * have been dequeued, thus preserving in order delivery.
3596 */
3597 if (unlikely(tcpu != next_cpu) &&
3598 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3599 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3600 rflow->last_qtail)) >= 0)) {
3601 tcpu = next_cpu;
3602 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3603 }
3604
3605 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3606 *rflowp = rflow;
3607 cpu = tcpu;
3608 goto done;
3609 }
3610 }
3611
3612 try_rps:
3613
3614 if (map) {
3615 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3616 if (cpu_online(tcpu)) {
3617 cpu = tcpu;
3618 goto done;
3619 }
3620 }
3621
3622 done:
3623 return cpu;
3624 }
3625
3626 #ifdef CONFIG_RFS_ACCEL
3627
3628 /**
3629 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3630 * @dev: Device on which the filter was set
3631 * @rxq_index: RX queue index
3632 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3633 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3634 *
3635 * Drivers that implement ndo_rx_flow_steer() should periodically call
3636 * this function for each installed filter and remove the filters for
3637 * which it returns %true.
3638 */
3639 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3640 u32 flow_id, u16 filter_id)
3641 {
3642 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3643 struct rps_dev_flow_table *flow_table;
3644 struct rps_dev_flow *rflow;
3645 bool expire = true;
3646 unsigned int cpu;
3647
3648 rcu_read_lock();
3649 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3650 if (flow_table && flow_id <= flow_table->mask) {
3651 rflow = &flow_table->flows[flow_id];
3652 cpu = ACCESS_ONCE(rflow->cpu);
3653 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3654 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3655 rflow->last_qtail) <
3656 (int)(10 * flow_table->mask)))
3657 expire = false;
3658 }
3659 rcu_read_unlock();
3660 return expire;
3661 }
3662 EXPORT_SYMBOL(rps_may_expire_flow);
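/* A minimal sketch, not part of the original file, of the periodic scan a
 * driver implementing ndo_rx_flow_steer() is expected to run: each installed
 * filter is offered to rps_may_expire_flow() and removed from the hardware
 * when it returns true. The "example_filter" bookkeeping and
 * "example_hw_remove_filter" callback are assumptions.
 */
struct example_filter {
        u32 flow_id;    /* value passed to ndo_rx_flow_steer() */
        u16 filter_id;  /* value returned by ndo_rx_flow_steer() */
        bool in_use;
};

static void example_expire_filters(struct net_device *dev, u16 rxq_index,
                                   struct example_filter *filters, int n,
                                   void (*example_hw_remove_filter)(u16))
{
        int i;

        for (i = 0; i < n; i++) {
                if (!filters[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, rxq_index, filters[i].flow_id,
                                        filters[i].filter_id)) {
                        example_hw_remove_filter(filters[i].filter_id);
                        filters[i].in_use = false;
                }
        }
}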
3663
3664 #endif /* CONFIG_RFS_ACCEL */
3665
3666 /* Called from hardirq (IPI) context */
3667 static void rps_trigger_softirq(void *data)
3668 {
3669 struct softnet_data *sd = data;
3670
3671 ____napi_schedule(sd, &sd->backlog);
3672 sd->received_rps++;
3673 }
3674
3675 #endif /* CONFIG_RPS */
3676
3677 /*
3678 * Check if this softnet_data structure belongs to another CPU.
3679 * If yes, queue it to our IPI list and return 1.
3680 * If no, return 0.
3681 */
3682 static int rps_ipi_queued(struct softnet_data *sd)
3683 {
3684 #ifdef CONFIG_RPS
3685 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3686
3687 if (sd != mysd) {
3688 sd->rps_ipi_next = mysd->rps_ipi_list;
3689 mysd->rps_ipi_list = sd;
3690
3691 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3692 return 1;
3693 }
3694 #endif /* CONFIG_RPS */
3695 return 0;
3696 }
3697
3698 #ifdef CONFIG_NET_FLOW_LIMIT
3699 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3700 #endif
3701
3702 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3703 {
3704 #ifdef CONFIG_NET_FLOW_LIMIT
3705 struct sd_flow_limit *fl;
3706 struct softnet_data *sd;
3707 unsigned int old_flow, new_flow;
3708
3709 if (qlen < (netdev_max_backlog >> 1))
3710 return false;
3711
3712 sd = this_cpu_ptr(&softnet_data);
3713
3714 rcu_read_lock();
3715 fl = rcu_dereference(sd->flow_limit);
3716 if (fl) {
3717 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3718 old_flow = fl->history[fl->history_head];
3719 fl->history[fl->history_head] = new_flow;
3720
3721 fl->history_head++;
3722 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3723
3724 if (likely(fl->buckets[old_flow]))
3725 fl->buckets[old_flow]--;
3726
3727 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3728 fl->count++;
3729 rcu_read_unlock();
3730 return true;
3731 }
3732 }
3733 rcu_read_unlock();
3734 #endif
3735 return false;
3736 }
3737
3738 /*
3739 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3740 * queue (may be a remote CPU queue).
3741 */
3742 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3743 unsigned int *qtail)
3744 {
3745 struct softnet_data *sd;
3746 unsigned long flags;
3747 unsigned int qlen;
3748
3749 sd = &per_cpu(softnet_data, cpu);
3750
3751 local_irq_save(flags);
3752
3753 rps_lock(sd);
3754 if (!netif_running(skb->dev))
3755 goto drop;
3756 qlen = skb_queue_len(&sd->input_pkt_queue);
3757 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3758 if (qlen) {
3759 enqueue:
3760 __skb_queue_tail(&sd->input_pkt_queue, skb);
3761 input_queue_tail_incr_save(sd, qtail);
3762 rps_unlock(sd);
3763 local_irq_restore(flags);
3764 return NET_RX_SUCCESS;
3765 }
3766
3767 /* Schedule NAPI for the backlog device.
3768 * We can use a non-atomic operation since we own the queue lock.
3769 */
3770 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3771 if (!rps_ipi_queued(sd))
3772 ____napi_schedule(sd, &sd->backlog);
3773 }
3774 goto enqueue;
3775 }
3776
3777 drop:
3778 sd->dropped++;
3779 rps_unlock(sd);
3780
3781 local_irq_restore(flags);
3782
3783 atomic_long_inc(&skb->dev->rx_dropped);
3784 kfree_skb(skb);
3785 return NET_RX_DROP;
3786 }
3787
3788 static int netif_rx_internal(struct sk_buff *skb)
3789 {
3790 int ret;
3791
3792 net_timestamp_check(netdev_tstamp_prequeue, skb);
3793
3794 trace_netif_rx(skb);
3795 #ifdef CONFIG_RPS
3796 if (static_key_false(&rps_needed)) {
3797 struct rps_dev_flow voidflow, *rflow = &voidflow;
3798 int cpu;
3799
3800 preempt_disable();
3801 rcu_read_lock();
3802
3803 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3804 if (cpu < 0)
3805 cpu = smp_processor_id();
3806
3807 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3808
3809 rcu_read_unlock();
3810 preempt_enable();
3811 } else
3812 #endif
3813 {
3814 unsigned int qtail;
3815 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3816 put_cpu();
3817 }
3818 return ret;
3819 }
3820
3821 /**
3822 * netif_rx - post buffer to the network code
3823 * @skb: buffer to post
3824 *
3825 * This function receives a packet from a device driver and queues it for
3826 * the upper (protocol) levels to process. It always succeeds. The buffer
3827 * may be dropped during processing for congestion control or by the
3828 * protocol layers.
3829 *
3830 * return values:
3831 * NET_RX_SUCCESS (no congestion)
3832 * NET_RX_DROP (packet was dropped)
3833 *
3834 */
3835
3836 int netif_rx(struct sk_buff *skb)
3837 {
3838 trace_netif_rx_entry(skb);
3839
3840 return netif_rx_internal(skb);
3841 }
3842 EXPORT_SYMBOL(netif_rx);
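/* A minimal sketch, not part of the original file: the classic receive path
 * of a simple non-NAPI driver — copy the frame out of the hardware, set the
 * protocol and hand it to the stack with netif_rx(). The buffer copy and
 * statistics handling are assumptions.
 */
static void example_rx_one(struct net_device *dev, const void *hw_buf,
                           unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb(dev, len);
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }

        memcpy(skb_put(skb, len), hw_buf, len);
        skb->protocol = eth_type_trans(skb, dev);

        netif_rx(skb);                  /* queues to the per-CPU backlog */

        dev->stats.rx_packets++;
        dev->stats.rx_bytes += len;
}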
3843
3844 int netif_rx_ni(struct sk_buff *skb)
3845 {
3846 int err;
3847
3848 trace_netif_rx_ni_entry(skb);
3849
3850 preempt_disable();
3851 err = netif_rx_internal(skb);
3852 if (local_softirq_pending())
3853 do_softirq();
3854 preempt_enable();
3855
3856 return err;
3857 }
3858 EXPORT_SYMBOL(netif_rx_ni);
3859
3860 static __latent_entropy void net_tx_action(struct softirq_action *h)
3861 {
3862 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3863
3864 if (sd->completion_queue) {
3865 struct sk_buff *clist;
3866
3867 local_irq_disable();
3868 clist = sd->completion_queue;
3869 sd->completion_queue = NULL;
3870 local_irq_enable();
3871
3872 while (clist) {
3873 struct sk_buff *skb = clist;
3874 clist = clist->next;
3875
3876 WARN_ON(atomic_read(&skb->users));
3877 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3878 trace_consume_skb(skb);
3879 else
3880 trace_kfree_skb(skb, net_tx_action);
3881
3882 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3883 __kfree_skb(skb);
3884 else
3885 __kfree_skb_defer(skb);
3886 }
3887
3888 __kfree_skb_flush();
3889 }
3890
3891 if (sd->output_queue) {
3892 struct Qdisc *head;
3893
3894 local_irq_disable();
3895 head = sd->output_queue;
3896 sd->output_queue = NULL;
3897 sd->output_queue_tailp = &sd->output_queue;
3898 local_irq_enable();
3899
3900 while (head) {
3901 struct Qdisc *q = head;
3902 spinlock_t *root_lock;
3903
3904 head = head->next_sched;
3905
3906 root_lock = qdisc_lock(q);
3907 spin_lock(root_lock);
3908 /* We need to make sure head->next_sched is read
3909 * before clearing __QDISC_STATE_SCHED
3910 */
3911 smp_mb__before_atomic();
3912 clear_bit(__QDISC_STATE_SCHED, &q->state);
3913 qdisc_run(q);
3914 spin_unlock(root_lock);
3915 }
3916 }
3917 }
3918
3919 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3920 /* This hook is defined here for ATM LANE */
3921 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3922 unsigned char *addr) __read_mostly;
3923 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3924 #endif
3925
3926 static inline struct sk_buff *
3927 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3928 struct net_device *orig_dev)
3929 {
3930 #ifdef CONFIG_NET_CLS_ACT
3931 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3932 struct tcf_result cl_res;
3933
3934 /* If there's at least one ingress present somewhere (so
3935 * we get here via enabled static key), remaining devices
3936 * that are not configured with an ingress qdisc will bail
3937 * out here.
3938 */
3939 if (!cl)
3940 return skb;
3941 if (*pt_prev) {
3942 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3943 *pt_prev = NULL;
3944 }
3945
3946 qdisc_skb_cb(skb)->pkt_len = skb->len;
3947 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3948 qdisc_bstats_cpu_update(cl->q, skb);
3949
3950 switch (tc_classify(skb, cl, &cl_res, false)) {
3951 case TC_ACT_OK:
3952 case TC_ACT_RECLASSIFY:
3953 skb->tc_index = TC_H_MIN(cl_res.classid);
3954 break;
3955 case TC_ACT_SHOT:
3956 qdisc_qstats_cpu_drop(cl->q);
3957 kfree_skb(skb);
3958 return NULL;
3959 case TC_ACT_STOLEN:
3960 case TC_ACT_QUEUED:
3961 consume_skb(skb);
3962 return NULL;
3963 case TC_ACT_REDIRECT:
3964 /* skb_mac_header check was done by cls/act_bpf, so
3965 * we can safely push the L2 header back before
3966 * redirecting to another netdev
3967 */
3968 __skb_push(skb, skb->mac_len);
3969 skb_do_redirect(skb);
3970 return NULL;
3971 default:
3972 break;
3973 }
3974 #endif /* CONFIG_NET_CLS_ACT */
3975 return skb;
3976 }
3977
3978 /**
3979 * netdev_is_rx_handler_busy - check if receive handler is registered
3980 * @dev: device to check
3981 *
3982 * Check if a receive handler is already registered for a given device.
3983 * Return true if there is one.
3984 *
3985 * The caller must hold the rtnl_mutex.
3986 */
3987 bool netdev_is_rx_handler_busy(struct net_device *dev)
3988 {
3989 ASSERT_RTNL();
3990 return dev && rtnl_dereference(dev->rx_handler);
3991 }
3992 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3993
3994 /**
3995 * netdev_rx_handler_register - register receive handler
3996 * @dev: device to register a handler for
3997 * @rx_handler: receive handler to register
3998 * @rx_handler_data: data pointer that is used by rx handler
3999 *
4000 * Register a receive handler for a device. This handler will then be
4001 * called from __netif_receive_skb. A negative errno code is returned
4002 * on a failure.
4003 *
4004 * The caller must hold the rtnl_mutex.
4005 *
4006 * For a general description of rx_handler, see enum rx_handler_result.
4007 */
4008 int netdev_rx_handler_register(struct net_device *dev,
4009 rx_handler_func_t *rx_handler,
4010 void *rx_handler_data)
4011 {
4012 ASSERT_RTNL();
4013
4014 if (dev->rx_handler)
4015 return -EBUSY;
4016
4017 /* Note: rx_handler_data must be set before rx_handler */
4018 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4019 rcu_assign_pointer(dev->rx_handler, rx_handler);
4020
4021 return 0;
4022 }
4023 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4024
4025 /**
4026 * netdev_rx_handler_unregister - unregister receive handler
4027 * @dev: device to unregister a handler from
4028 *
4029 * Unregister a receive handler from a device.
4030 *
4031 * The caller must hold the rtnl_mutex.
4032 */
4033 void netdev_rx_handler_unregister(struct net_device *dev)
4034 {
4035
4036 ASSERT_RTNL();
4037 RCU_INIT_POINTER(dev->rx_handler, NULL);
4038 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4039 * section is guaranteed to see a non-NULL rx_handler_data
4040 * as well.
4041 */
4042 synchronize_net();
4043 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4044 }
4045 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
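/*
 * Usage sketch (hypothetical bridge-like module, not taken from this file):
 * attaching and detaching an rx_handler while following the rules above --
 * RTNL held, at most one handler per device.  example_port_rx() and the
 * port_priv cookie are assumptions; RX_HANDLER_PASS lets the stack continue
 * processing the packet normally.
 */
static rx_handler_result_t example_port_rx(struct sk_buff **pskb)
{
	/* inspect, mangle or steal *pskb here */
	return RX_HANDLER_PASS;
}

static int example_attach_port(struct net_device *port, void *port_priv)
{
	int err = -EBUSY;

	rtnl_lock();
	if (!netdev_is_rx_handler_busy(port))
		err = netdev_rx_handler_register(port, example_port_rx,
						 port_priv);
	rtnl_unlock();

	return err;
}

static void example_detach_port(struct net_device *port)
{
	rtnl_lock();
	netdev_rx_handler_unregister(port);
	rtnl_unlock();
}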
4046
4047 /*
4048 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4049 * the special handling of PFMEMALLOC skbs.
4050 */
4051 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4052 {
4053 switch (skb->protocol) {
4054 case htons(ETH_P_ARP):
4055 case htons(ETH_P_IP):
4056 case htons(ETH_P_IPV6):
4057 case htons(ETH_P_8021Q):
4058 case htons(ETH_P_8021AD):
4059 return true;
4060 default:
4061 return false;
4062 }
4063 }
4064
4065 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4066 int *ret, struct net_device *orig_dev)
4067 {
4068 #ifdef CONFIG_NETFILTER_INGRESS
4069 if (nf_hook_ingress_active(skb)) {
4070 int ingress_retval;
4071
4072 if (*pt_prev) {
4073 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4074 *pt_prev = NULL;
4075 }
4076
4077 rcu_read_lock();
4078 ingress_retval = nf_hook_ingress(skb);
4079 rcu_read_unlock();
4080 return ingress_retval;
4081 }
4082 #endif /* CONFIG_NETFILTER_INGRESS */
4083 return 0;
4084 }
4085
4086 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4087 {
4088 struct packet_type *ptype, *pt_prev;
4089 rx_handler_func_t *rx_handler;
4090 struct net_device *orig_dev;
4091 bool deliver_exact = false;
4092 int ret = NET_RX_DROP;
4093 __be16 type;
4094
4095 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4096
4097 trace_netif_receive_skb(skb);
4098
4099 orig_dev = skb->dev;
4100
4101 skb_reset_network_header(skb);
4102 if (!skb_transport_header_was_set(skb))
4103 skb_reset_transport_header(skb);
4104 skb_reset_mac_len(skb);
4105
4106 pt_prev = NULL;
4107
4108 another_round:
4109 skb->skb_iif = skb->dev->ifindex;
4110
4111 __this_cpu_inc(softnet_data.processed);
4112
4113 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4114 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4115 skb = skb_vlan_untag(skb);
4116 if (unlikely(!skb))
4117 goto out;
4118 }
4119
4120 #ifdef CONFIG_NET_CLS_ACT
4121 if (skb->tc_verd & TC_NCLS) {
4122 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4123 goto ncls;
4124 }
4125 #endif
4126
4127 if (pfmemalloc)
4128 goto skip_taps;
4129
4130 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4131 if (pt_prev)
4132 ret = deliver_skb(skb, pt_prev, orig_dev);
4133 pt_prev = ptype;
4134 }
4135
4136 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4137 if (pt_prev)
4138 ret = deliver_skb(skb, pt_prev, orig_dev);
4139 pt_prev = ptype;
4140 }
4141
4142 skip_taps:
4143 #ifdef CONFIG_NET_INGRESS
4144 if (static_key_false(&ingress_needed)) {
4145 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4146 if (!skb)
4147 goto out;
4148
4149 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4150 goto out;
4151 }
4152 #endif
4153 #ifdef CONFIG_NET_CLS_ACT
4154 skb->tc_verd = 0;
4155 ncls:
4156 #endif
4157 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4158 goto drop;
4159
4160 if (skb_vlan_tag_present(skb)) {
4161 if (pt_prev) {
4162 ret = deliver_skb(skb, pt_prev, orig_dev);
4163 pt_prev = NULL;
4164 }
4165 if (vlan_do_receive(&skb))
4166 goto another_round;
4167 else if (unlikely(!skb))
4168 goto out;
4169 }
4170
4171 rx_handler = rcu_dereference(skb->dev->rx_handler);
4172 if (rx_handler) {
4173 if (pt_prev) {
4174 ret = deliver_skb(skb, pt_prev, orig_dev);
4175 pt_prev = NULL;
4176 }
4177 switch (rx_handler(&skb)) {
4178 case RX_HANDLER_CONSUMED:
4179 ret = NET_RX_SUCCESS;
4180 goto out;
4181 case RX_HANDLER_ANOTHER:
4182 goto another_round;
4183 case RX_HANDLER_EXACT:
4184 deliver_exact = true; /* fall through */
4185 case RX_HANDLER_PASS:
4186 break;
4187 default:
4188 BUG();
4189 }
4190 }
4191
4192 if (unlikely(skb_vlan_tag_present(skb))) {
4193 if (skb_vlan_tag_get_id(skb))
4194 skb->pkt_type = PACKET_OTHERHOST;
4195 /* Note: we might in the future use prio bits
4196 * and set skb->priority like in vlan_do_receive().
4197 * For the time being, just ignore the Priority Code Point.
4198 */
4199 skb->vlan_tci = 0;
4200 }
4201
4202 type = skb->protocol;
4203
4204 /* deliver only exact match when indicated */
4205 if (likely(!deliver_exact)) {
4206 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4207 &ptype_base[ntohs(type) &
4208 PTYPE_HASH_MASK]);
4209 }
4210
4211 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4212 &orig_dev->ptype_specific);
4213
4214 if (unlikely(skb->dev != orig_dev)) {
4215 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4216 &skb->dev->ptype_specific);
4217 }
4218
4219 if (pt_prev) {
4220 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4221 goto drop;
4222 else
4223 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4224 } else {
4225 drop:
4226 if (!deliver_exact)
4227 atomic_long_inc(&skb->dev->rx_dropped);
4228 else
4229 atomic_long_inc(&skb->dev->rx_nohandler);
4230 kfree_skb(skb);
4231 /* Jamal, now you will not be able to escape explaining
4232 * to me how you were going to use this. :-)
4233 */
4234 ret = NET_RX_DROP;
4235 }
4236
4237 out:
4238 return ret;
4239 }
4240
4241 static int __netif_receive_skb(struct sk_buff *skb)
4242 {
4243 int ret;
4244
4245 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4246 unsigned long pflags = current->flags;
4247
4248 /*
4249 * PFMEMALLOC skbs are special, they should
4250 * - be delivered to SOCK_MEMALLOC sockets only
4251 * - stay away from userspace
4252 * - have bounded memory usage
4253 *
4254 * Use PF_MEMALLOC as this saves us from propagating the allocation
4255 * context down to all allocation sites.
4256 */
4257 current->flags |= PF_MEMALLOC;
4258 ret = __netif_receive_skb_core(skb, true);
4259 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4260 } else
4261 ret = __netif_receive_skb_core(skb, false);
4262
4263 return ret;
4264 }
4265
4266 static int netif_receive_skb_internal(struct sk_buff *skb)
4267 {
4268 int ret;
4269
4270 net_timestamp_check(netdev_tstamp_prequeue, skb);
4271
4272 if (skb_defer_rx_timestamp(skb))
4273 return NET_RX_SUCCESS;
4274
4275 rcu_read_lock();
4276
4277 #ifdef CONFIG_RPS
4278 if (static_key_false(&rps_needed)) {
4279 struct rps_dev_flow voidflow, *rflow = &voidflow;
4280 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4281
4282 if (cpu >= 0) {
4283 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4284 rcu_read_unlock();
4285 return ret;
4286 }
4287 }
4288 #endif
4289 ret = __netif_receive_skb(skb);
4290 rcu_read_unlock();
4291 return ret;
4292 }
4293
4294 /**
4295 * netif_receive_skb - process receive buffer from network
4296 * @skb: buffer to process
4297 *
4298 * netif_receive_skb() is the main receive data processing function.
4299 * It always succeeds. The buffer may be dropped during processing
4300 * for congestion control or by the protocol layers.
4301 *
4302 * This function may only be called from softirq context and interrupts
4303 * should be enabled.
4304 *
4305 * Return values (usually ignored):
4306 * NET_RX_SUCCESS: no congestion
4307 * NET_RX_DROP: packet was dropped
4308 */
4309 int netif_receive_skb(struct sk_buff *skb)
4310 {
4311 trace_netif_receive_skb_entry(skb);
4312
4313 return netif_receive_skb_internal(skb);
4314 }
4315 EXPORT_SYMBOL(netif_receive_skb);
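/*
 * Usage sketch (hypothetical driver, not taken from this file): unlike
 * netif_rx(), netif_receive_skb() processes the packet synchronously, so it
 * must be called from softirq context with interrupts enabled -- in practice
 * from a NAPI poll routine, with skb->protocol already set.
 */
static void example_deliver_from_poll(struct sk_buff *skb)
{
	/* called from the driver's napi->poll() callback */
	netif_receive_skb(skb);
}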
4316
4317 DEFINE_PER_CPU(struct work_struct, flush_works);
4318
4319 /* Network device is going away, flush any packets still pending */
4320 static void flush_backlog(struct work_struct *work)
4321 {
4322 struct sk_buff *skb, *tmp;
4323 struct softnet_data *sd;
4324
4325 local_bh_disable();
4326 sd = this_cpu_ptr(&softnet_data);
4327
4328 local_irq_disable();
4329 rps_lock(sd);
4330 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4331 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4332 __skb_unlink(skb, &sd->input_pkt_queue);
4333 kfree_skb(skb);
4334 input_queue_head_incr(sd);
4335 }
4336 }
4337 rps_unlock(sd);
4338 local_irq_enable();
4339
4340 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4341 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4342 __skb_unlink(skb, &sd->process_queue);
4343 kfree_skb(skb);
4344 input_queue_head_incr(sd);
4345 }
4346 }
4347 local_bh_enable();
4348 }
4349
4350 static void flush_all_backlogs(void)
4351 {
4352 unsigned int cpu;
4353
4354 get_online_cpus();
4355
4356 for_each_online_cpu(cpu)
4357 queue_work_on(cpu, system_highpri_wq,
4358 per_cpu_ptr(&flush_works, cpu));
4359
4360 for_each_online_cpu(cpu)
4361 flush_work(per_cpu_ptr(&flush_works, cpu));
4362
4363 put_online_cpus();
4364 }
4365
4366 static int napi_gro_complete(struct sk_buff *skb)
4367 {
4368 struct packet_offload *ptype;
4369 __be16 type = skb->protocol;
4370 struct list_head *head = &offload_base;
4371 int err = -ENOENT;
4372
4373 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4374
4375 if (NAPI_GRO_CB(skb)->count == 1) {
4376 skb_shinfo(skb)->gso_size = 0;
4377 goto out;
4378 }
4379
4380 rcu_read_lock();
4381 list_for_each_entry_rcu(ptype, head, list) {
4382 if (ptype->type != type || !ptype->callbacks.gro_complete)
4383 continue;
4384
4385 err = ptype->callbacks.gro_complete(skb, 0);
4386 break;
4387 }
4388 rcu_read_unlock();
4389
4390 if (err) {
4391 WARN_ON(&ptype->list == head);
4392 kfree_skb(skb);
4393 return NET_RX_SUCCESS;
4394 }
4395
4396 out:
4397 return netif_receive_skb_internal(skb);
4398 }
4399
4400 /* napi->gro_list contains packets ordered by age, with the
4401 * youngest packets at the head of the list.
4402 * Complete skbs in reverse order to reduce latencies.
4403 */
4404 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4405 {
4406 struct sk_buff *skb, *prev = NULL;
4407
4408 /* scan list and build reverse chain */
4409 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4410 skb->prev = prev;
4411 prev = skb;
4412 }
4413
4414 for (skb = prev; skb; skb = prev) {
4415 skb->next = NULL;
4416
4417 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4418 return;
4419
4420 prev = skb->prev;
4421 napi_gro_complete(skb);
4422 napi->gro_count--;
4423 }
4424
4425 napi->gro_list = NULL;
4426 }
4427 EXPORT_SYMBOL(napi_gro_flush);
4428
4429 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4430 {
4431 struct sk_buff *p;
4432 unsigned int maclen = skb->dev->hard_header_len;
4433 u32 hash = skb_get_hash_raw(skb);
4434
4435 for (p = napi->gro_list; p; p = p->next) {
4436 unsigned long diffs;
4437
4438 NAPI_GRO_CB(p)->flush = 0;
4439
4440 if (hash != skb_get_hash_raw(p)) {
4441 NAPI_GRO_CB(p)->same_flow = 0;
4442 continue;
4443 }
4444
4445 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4446 diffs |= p->vlan_tci ^ skb->vlan_tci;
4447 diffs |= skb_metadata_dst_cmp(p, skb);
4448 if (maclen == ETH_HLEN)
4449 diffs |= compare_ether_header(skb_mac_header(p),
4450 skb_mac_header(skb));
4451 else if (!diffs)
4452 diffs = memcmp(skb_mac_header(p),
4453 skb_mac_header(skb),
4454 maclen);
4455 NAPI_GRO_CB(p)->same_flow = !diffs;
4456 }
4457 }
4458
4459 static void skb_gro_reset_offset(struct sk_buff *skb)
4460 {
4461 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4462 const skb_frag_t *frag0 = &pinfo->frags[0];
4463
4464 NAPI_GRO_CB(skb)->data_offset = 0;
4465 NAPI_GRO_CB(skb)->frag0 = NULL;
4466 NAPI_GRO_CB(skb)->frag0_len = 0;
4467
4468 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4469 pinfo->nr_frags &&
4470 !PageHighMem(skb_frag_page(frag0))) {
4471 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4472 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4473 skb_frag_size(frag0),
4474 skb->end - skb->tail);
4475 }
4476 }
4477
4478 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4479 {
4480 struct skb_shared_info *pinfo = skb_shinfo(skb);
4481
4482 BUG_ON(skb->end - skb->tail < grow);
4483
4484 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4485
4486 skb->data_len -= grow;
4487 skb->tail += grow;
4488
4489 pinfo->frags[0].page_offset += grow;
4490 skb_frag_size_sub(&pinfo->frags[0], grow);
4491
4492 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4493 skb_frag_unref(skb, 0);
4494 memmove(pinfo->frags, pinfo->frags + 1,
4495 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4496 }
4497 }
4498
4499 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4500 {
4501 struct sk_buff **pp = NULL;
4502 struct packet_offload *ptype;
4503 __be16 type = skb->protocol;
4504 struct list_head *head = &offload_base;
4505 int same_flow;
4506 enum gro_result ret;
4507 int grow;
4508
4509 if (!(skb->dev->features & NETIF_F_GRO))
4510 goto normal;
4511
4512 if (skb->csum_bad)
4513 goto normal;
4514
4515 gro_list_prepare(napi, skb);
4516
4517 rcu_read_lock();
4518 list_for_each_entry_rcu(ptype, head, list) {
4519 if (ptype->type != type || !ptype->callbacks.gro_receive)
4520 continue;
4521
4522 skb_set_network_header(skb, skb_gro_offset(skb));
4523 skb_reset_mac_len(skb);
4524 NAPI_GRO_CB(skb)->same_flow = 0;
4525 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4526 NAPI_GRO_CB(skb)->free = 0;
4527 NAPI_GRO_CB(skb)->encap_mark = 0;
4528 NAPI_GRO_CB(skb)->recursion_counter = 0;
4529 NAPI_GRO_CB(skb)->is_fou = 0;
4530 NAPI_GRO_CB(skb)->is_atomic = 1;
4531 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4532
4533 /* Setup for GRO checksum validation */
4534 switch (skb->ip_summed) {
4535 case CHECKSUM_COMPLETE:
4536 NAPI_GRO_CB(skb)->csum = skb->csum;
4537 NAPI_GRO_CB(skb)->csum_valid = 1;
4538 NAPI_GRO_CB(skb)->csum_cnt = 0;
4539 break;
4540 case CHECKSUM_UNNECESSARY:
4541 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4542 NAPI_GRO_CB(skb)->csum_valid = 0;
4543 break;
4544 default:
4545 NAPI_GRO_CB(skb)->csum_cnt = 0;
4546 NAPI_GRO_CB(skb)->csum_valid = 0;
4547 }
4548
4549 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4550 break;
4551 }
4552 rcu_read_unlock();
4553
4554 if (&ptype->list == head)
4555 goto normal;
4556
4557 same_flow = NAPI_GRO_CB(skb)->same_flow;
4558 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4559
4560 if (pp) {
4561 struct sk_buff *nskb = *pp;
4562
4563 *pp = nskb->next;
4564 nskb->next = NULL;
4565 napi_gro_complete(nskb);
4566 napi->gro_count--;
4567 }
4568
4569 if (same_flow)
4570 goto ok;
4571
4572 if (NAPI_GRO_CB(skb)->flush)
4573 goto normal;
4574
4575 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4576 struct sk_buff *nskb = napi->gro_list;
4577
4578 /* locate the end of the list to select the 'oldest' flow */
4579 while (nskb->next) {
4580 pp = &nskb->next;
4581 nskb = *pp;
4582 }
4583 *pp = NULL;
4584 nskb->next = NULL;
4585 napi_gro_complete(nskb);
4586 } else {
4587 napi->gro_count++;
4588 }
4589 NAPI_GRO_CB(skb)->count = 1;
4590 NAPI_GRO_CB(skb)->age = jiffies;
4591 NAPI_GRO_CB(skb)->last = skb;
4592 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4593 skb->next = napi->gro_list;
4594 napi->gro_list = skb;
4595 ret = GRO_HELD;
4596
4597 pull:
4598 grow = skb_gro_offset(skb) - skb_headlen(skb);
4599 if (grow > 0)
4600 gro_pull_from_frag0(skb, grow);
4601 ok:
4602 return ret;
4603
4604 normal:
4605 ret = GRO_NORMAL;
4606 goto pull;
4607 }
4608
4609 struct packet_offload *gro_find_receive_by_type(__be16 type)
4610 {
4611 struct list_head *offload_head = &offload_base;
4612 struct packet_offload *ptype;
4613
4614 list_for_each_entry_rcu(ptype, offload_head, list) {
4615 if (ptype->type != type || !ptype->callbacks.gro_receive)
4616 continue;
4617 return ptype;
4618 }
4619 return NULL;
4620 }
4621 EXPORT_SYMBOL(gro_find_receive_by_type);
4622
4623 struct packet_offload *gro_find_complete_by_type(__be16 type)
4624 {
4625 struct list_head *offload_head = &offload_base;
4626 struct packet_offload *ptype;
4627
4628 list_for_each_entry_rcu(ptype, offload_head, list) {
4629 if (ptype->type != type || !ptype->callbacks.gro_complete)
4630 continue;
4631 return ptype;
4632 }
4633 return NULL;
4634 }
4635 EXPORT_SYMBOL(gro_find_complete_by_type);
4636
4637 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4638 {
4639 switch (ret) {
4640 case GRO_NORMAL:
4641 if (netif_receive_skb_internal(skb))
4642 ret = GRO_DROP;
4643 break;
4644
4645 case GRO_DROP:
4646 kfree_skb(skb);
4647 break;
4648
4649 case GRO_MERGED_FREE:
4650 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4651 skb_dst_drop(skb);
4652 kmem_cache_free(skbuff_head_cache, skb);
4653 } else {
4654 __kfree_skb(skb);
4655 }
4656 break;
4657
4658 case GRO_HELD:
4659 case GRO_MERGED:
4660 break;
4661 }
4662
4663 return ret;
4664 }
4665
4666 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4667 {
4668 skb_mark_napi_id(skb, napi);
4669 trace_napi_gro_receive_entry(skb);
4670
4671 skb_gro_reset_offset(skb);
4672
4673 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4674 }
4675 EXPORT_SYMBOL(napi_gro_receive);
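/*
 * Usage sketch (hypothetical driver, not taken from this file): the usual
 * NAPI receive loop feeds completed frames through napi_gro_receive() so
 * that consecutive segments of a flow can be coalesced before reaching the
 * stack.  example_fetch_skb() is an assumed helper that pops one frame off
 * the hardware ring and sets skb->protocol.
 */
static struct sk_buff *example_fetch_skb(struct napi_struct *napi);

static int example_gro_poll(struct napi_struct *napi, int budget)
{
	int done = 0;
	struct sk_buff *skb;

	while (done < budget && (skb = example_fetch_skb(napi)) != NULL) {
		napi_gro_receive(napi, skb);
		done++;
	}
	/* completion handling is shown with napi_complete_done() below */
	return done;
}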
4676
4677 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4678 {
4679 if (unlikely(skb->pfmemalloc)) {
4680 consume_skb(skb);
4681 return;
4682 }
4683 __skb_pull(skb, skb_headlen(skb));
4684 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4685 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4686 skb->vlan_tci = 0;
4687 skb->dev = napi->dev;
4688 skb->skb_iif = 0;
4689 skb->encapsulation = 0;
4690 skb_shinfo(skb)->gso_type = 0;
4691 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4692
4693 napi->skb = skb;
4694 }
4695
4696 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4697 {
4698 struct sk_buff *skb = napi->skb;
4699
4700 if (!skb) {
4701 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4702 if (skb) {
4703 napi->skb = skb;
4704 skb_mark_napi_id(skb, napi);
4705 }
4706 }
4707 return skb;
4708 }
4709 EXPORT_SYMBOL(napi_get_frags);
4710
4711 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4712 struct sk_buff *skb,
4713 gro_result_t ret)
4714 {
4715 switch (ret) {
4716 case GRO_NORMAL:
4717 case GRO_HELD:
4718 __skb_push(skb, ETH_HLEN);
4719 skb->protocol = eth_type_trans(skb, skb->dev);
4720 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4721 ret = GRO_DROP;
4722 break;
4723
4724 case GRO_DROP:
4725 case GRO_MERGED_FREE:
4726 napi_reuse_skb(napi, skb);
4727 break;
4728
4729 case GRO_MERGED:
4730 break;
4731 }
4732
4733 return ret;
4734 }
4735
4736 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4737 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4738 * we copy the Ethernet header into skb->data to have a common layout.
4739 */
4740 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4741 {
4742 struct sk_buff *skb = napi->skb;
4743 const struct ethhdr *eth;
4744 unsigned int hlen = sizeof(*eth);
4745
4746 napi->skb = NULL;
4747
4748 skb_reset_mac_header(skb);
4749 skb_gro_reset_offset(skb);
4750
4751 eth = skb_gro_header_fast(skb, 0);
4752 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4753 eth = skb_gro_header_slow(skb, hlen, 0);
4754 if (unlikely(!eth)) {
4755 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4756 __func__, napi->dev->name);
4757 napi_reuse_skb(napi, skb);
4758 return NULL;
4759 }
4760 } else {
4761 gro_pull_from_frag0(skb, hlen);
4762 NAPI_GRO_CB(skb)->frag0 += hlen;
4763 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4764 }
4765 __skb_pull(skb, hlen);
4766
4767 /*
4768 * This works because the only protocols we care about don't require
4769 * special handling.
4770 * We'll fix it up properly in napi_frags_finish()
4771 */
4772 skb->protocol = eth->h_proto;
4773
4774 return skb;
4775 }
4776
4777 gro_result_t napi_gro_frags(struct napi_struct *napi)
4778 {
4779 struct sk_buff *skb = napi_frags_skb(napi);
4780
4781 if (!skb)
4782 return GRO_DROP;
4783
4784 trace_napi_gro_frags_entry(skb);
4785
4786 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4787 }
4788 EXPORT_SYMBOL(napi_gro_frags);
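/*
 * Usage sketch (hypothetical page-based driver, not taken from this file):
 * drivers that receive into pages can skip copying by attaching the page to
 * the skb from napi_get_frags() and then calling napi_gro_frags(), which
 * pulls the Ethernet header itself and consumes napi->skb.  The page,
 * offset and length are assumed to come from a receive descriptor.
 */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(page);			/* drop: no skb available */
		return;
	}

	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);
	napi_gro_frags(napi);
}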
4789
4790 /* Compute the checksum from gro_offset and return the folded value
4791 * after adding in any pseudo checksum.
4792 */
4793 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4794 {
4795 __wsum wsum;
4796 __sum16 sum;
4797
4798 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4799
4800 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4801 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4802 if (likely(!sum)) {
4803 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4804 !skb->csum_complete_sw)
4805 netdev_rx_csum_fault(skb->dev);
4806 }
4807
4808 NAPI_GRO_CB(skb)->csum = wsum;
4809 NAPI_GRO_CB(skb)->csum_valid = 1;
4810
4811 return sum;
4812 }
4813 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4814
4815 /*
4816 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4817 * Note: called with local irq disabled, but exits with local irq enabled.
4818 */
4819 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4820 {
4821 #ifdef CONFIG_RPS
4822 struct softnet_data *remsd = sd->rps_ipi_list;
4823
4824 if (remsd) {
4825 sd->rps_ipi_list = NULL;
4826
4827 local_irq_enable();
4828
4829 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4830 while (remsd) {
4831 struct softnet_data *next = remsd->rps_ipi_next;
4832
4833 if (cpu_online(remsd->cpu))
4834 smp_call_function_single_async(remsd->cpu,
4835 &remsd->csd);
4836 remsd = next;
4837 }
4838 } else
4839 #endif
4840 local_irq_enable();
4841 }
4842
4843 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4844 {
4845 #ifdef CONFIG_RPS
4846 return sd->rps_ipi_list != NULL;
4847 #else
4848 return false;
4849 #endif
4850 }
4851
4852 static int process_backlog(struct napi_struct *napi, int quota)
4853 {
4854 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4855 bool again = true;
4856 int work = 0;
4857
4858 /* Check if we have pending IPIs; it's better to send them now
4859 * rather than waiting for net_rx_action() to end.
4860 */
4861 if (sd_has_rps_ipi_waiting(sd)) {
4862 local_irq_disable();
4863 net_rps_action_and_irq_enable(sd);
4864 }
4865
4866 napi->weight = weight_p;
4867 while (again) {
4868 struct sk_buff *skb;
4869
4870 while ((skb = __skb_dequeue(&sd->process_queue))) {
4871 rcu_read_lock();
4872 __netif_receive_skb(skb);
4873 rcu_read_unlock();
4874 input_queue_head_incr(sd);
4875 if (++work >= quota)
4876 return work;
4877
4878 }
4879
4880 local_irq_disable();
4881 rps_lock(sd);
4882 if (skb_queue_empty(&sd->input_pkt_queue)) {
4883 /*
4884 * Inline a custom version of __napi_complete().
4885 * Only the current CPU owns and manipulates this napi,
4886 * and NAPI_STATE_SCHED is the only possible flag set
4887 * on the backlog.
4888 * We can use a plain write instead of clear_bit(),
4889 * and we don't need an smp_mb() memory barrier.
4890 */
4891 napi->state = 0;
4892 again = false;
4893 } else {
4894 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4895 &sd->process_queue);
4896 }
4897 rps_unlock(sd);
4898 local_irq_enable();
4899 }
4900
4901 return work;
4902 }
4903
4904 /**
4905 * __napi_schedule - schedule for receive
4906 * @n: entry to schedule
4907 *
4908 * The entry's receive function will be scheduled to run.
4909 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4910 */
4911 void __napi_schedule(struct napi_struct *n)
4912 {
4913 unsigned long flags;
4914
4915 local_irq_save(flags);
4916 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4917 local_irq_restore(flags);
4918 }
4919 EXPORT_SYMBOL(__napi_schedule);
4920
4921 /**
4922 * napi_schedule_prep - check if napi can be scheduled
4923 * @n: napi context
4924 *
4925 * Test if NAPI routine is already running, and if not mark
4926 * it as running. This is used as a condition variable to
4927 * ensure that only one NAPI poll instance runs. We also make
4928 * sure there is no pending NAPI disable.
4929 */
4930 bool napi_schedule_prep(struct napi_struct *n)
4931 {
4932 unsigned long val, new;
4933
4934 do {
4935 val = READ_ONCE(n->state);
4936 if (unlikely(val & NAPIF_STATE_DISABLE))
4937 return false;
4938 new = val | NAPIF_STATE_SCHED;
4939
4940 /* Sets STATE_MISSED bit if STATE_SCHED was already set
4941 * This was suggested by Alexander Duyck, as the compiler
4942 * emits better code than:
4943 * if (val & NAPIF_STATE_SCHED)
4944 * new |= NAPIF_STATE_MISSED;
4945 */
4946 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4947 NAPIF_STATE_MISSED;
4948 } while (cmpxchg(&n->state, val, new) != val);
4949
4950 return !(val & NAPIF_STATE_SCHED);
4951 }
4952 EXPORT_SYMBOL(napi_schedule_prep);
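/*
 * Usage sketch (hypothetical driver, not taken from this file): the common
 * hard-IRQ pattern built on napi_schedule_prep() and __napi_schedule().
 * struct example_priv, example_disable_irqs() and the interrupt wiring are
 * assumptions for illustration; the NAPI calls are the real API.
 */
struct example_priv {
	struct napi_struct napi;
	/* ... rings, registers, etc. ... */
};

static void example_disable_irqs(struct example_priv *priv);

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct example_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		example_disable_irqs(priv);	/* quiet the device */
		/* __napi_schedule_irqoff() is an option when hard irqs
		 * are known to be masked here.
		 */
		__napi_schedule(&priv->napi);
	}
	return IRQ_HANDLED;
}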
4953
4954 /**
4955 * __napi_schedule_irqoff - schedule for receive
4956 * @n: entry to schedule
4957 *
4958 * Variant of __napi_schedule() assuming hard irqs are masked
4959 */
4960 void __napi_schedule_irqoff(struct napi_struct *n)
4961 {
4962 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4963 }
4964 EXPORT_SYMBOL(__napi_schedule_irqoff);
4965
4966 bool __napi_complete(struct napi_struct *n)
4967 {
4968 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4969
4970 /* Some drivers call us directly, instead of calling
4971 * napi_complete_done().
4972 */
4973 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4974 return false;
4975
4976 list_del_init(&n->poll_list);
4977 smp_mb__before_atomic();
4978 clear_bit(NAPI_STATE_SCHED, &n->state);
4979 return true;
4980 }
4981 EXPORT_SYMBOL(__napi_complete);
4982
4983 bool napi_complete_done(struct napi_struct *n, int work_done)
4984 {
4985 unsigned long flags, val, new;
4986
4987 /*
4988 * 1) Don't let napi dequeue from the cpu poll list
4989 * just in case it's running on a different CPU.
4990 * 2) If we are busy polling, do nothing here, we have
4991 * the guarantee we will be called later.
4992 */
4993 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4994 NAPIF_STATE_IN_BUSY_POLL)))
4995 return false;
4996
4997 if (n->gro_list) {
4998 unsigned long timeout = 0;
4999
5000 if (work_done)
5001 timeout = n->dev->gro_flush_timeout;
5002
5003 if (timeout)
5004 hrtimer_start(&n->timer, ns_to_ktime(timeout),
5005 HRTIMER_MODE_REL_PINNED);
5006 else
5007 napi_gro_flush(n, false);
5008 }
5009 if (unlikely(!list_empty(&n->poll_list))) {
5010 /* If n->poll_list is not empty, we need to mask irqs */
5011 local_irq_save(flags);
5012 list_del_init(&n->poll_list);
5013 local_irq_restore(flags);
5014 }
5015
5016 do {
5017 val = READ_ONCE(n->state);
5018
5019 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5020
5021 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5022
5023 /* If STATE_MISSED was set, leave STATE_SCHED set,
5024 * because we will call napi->poll() one more time.
5025 * This C code was suggested by Alexander Duyck to help gcc.
5026 */
5027 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5028 NAPIF_STATE_SCHED;
5029 } while (cmpxchg(&n->state, val, new) != val);
5030
5031 if (unlikely(val & NAPIF_STATE_MISSED)) {
5032 __napi_schedule(n);
5033 return false;
5034 }
5035
5036 return true;
5037 }
5038 EXPORT_SYMBOL(napi_complete_done);
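/*
 * Usage sketch (hypothetical driver, not taken from this file): how a poll
 * routine typically finishes, reusing the example_priv layout sketched near
 * napi_schedule_prep() above.  example_clean_rx() and example_enable_irqs()
 * are assumed helpers; interrupts are only re-armed when napi_complete_done()
 * returns true, since a false return means NAPI stays scheduled (busy
 * polling, or a missed schedule that will be replayed).
 */
static int example_clean_rx(struct example_priv *priv, int budget);
static void example_enable_irqs(struct example_priv *priv);

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv,
						 napi);
	int work_done = example_clean_rx(priv, budget);

	if (work_done < budget &&
	    napi_complete_done(napi, work_done))
		example_enable_irqs(priv);	/* re-arm device interrupts */

	return work_done;
}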
5039
5040 /* must be called under rcu_read_lock(), as we don't take a reference */
5041 static struct napi_struct *napi_by_id(unsigned int napi_id)
5042 {
5043 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5044 struct napi_struct *napi;
5045
5046 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5047 if (napi->napi_id == napi_id)
5048 return napi;
5049
5050 return NULL;
5051 }
5052
5053 #if defined(CONFIG_NET_RX_BUSY_POLL)
5054
5055 #define BUSY_POLL_BUDGET 8
5056
5057 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5058 {
5059 int rc;
5060
5061 /* Busy polling means there is a high chance device driver hard irq
5062 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5063 * set in napi_schedule_prep().
5064 * Since we are about to call napi->poll() once more, we can safely
5065 * clear NAPI_STATE_MISSED.
5066 *
5067 * Note: x86 could use a single "lock and ..." instruction
5068 * to perform these two clear_bit()
5069 */
5070 clear_bit(NAPI_STATE_MISSED, &napi->state);
5071 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5072
5073 local_bh_disable();
5074
5075 /* All we really want here is to re-enable device interrupts.
5076 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5077 */
5078 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5079 netpoll_poll_unlock(have_poll_lock);
5080 if (rc == BUSY_POLL_BUDGET)
5081 __napi_schedule(napi);
5082 local_bh_enable();
5083 if (local_softirq_pending())
5084 do_softirq();
5085 }
5086
5087 bool sk_busy_loop(struct sock *sk, int nonblock)
5088 {
5089 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5090 int (*napi_poll)(struct napi_struct *napi, int budget);
5091 int (*busy_poll)(struct napi_struct *dev);
5092 void *have_poll_lock = NULL;
5093 struct napi_struct *napi;
5094 int rc;
5095
5096 restart:
5097 rc = false;
5098 napi_poll = NULL;
5099
5100 rcu_read_lock();
5101
5102 napi = napi_by_id(sk->sk_napi_id);
5103 if (!napi)
5104 goto out;
5105
5106 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5107 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5108
5109 preempt_disable();
5110 for (;;) {
5111 rc = 0;
5112 local_bh_disable();
5113 if (busy_poll) {
5114 rc = busy_poll(napi);
5115 goto count;
5116 }
5117 if (!napi_poll) {
5118 unsigned long val = READ_ONCE(napi->state);
5119
5120 /* If multiple threads are competing for this napi,
5121 * we avoid dirtying napi->state as much as we can.
5122 */
5123 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5124 NAPIF_STATE_IN_BUSY_POLL))
5125 goto count;
5126 if (cmpxchg(&napi->state, val,
5127 val | NAPIF_STATE_IN_BUSY_POLL |
5128 NAPIF_STATE_SCHED) != val)
5129 goto count;
5130 have_poll_lock = netpoll_poll_lock(napi);
5131 napi_poll = napi->poll;
5132 }
5133 rc = napi_poll(napi, BUSY_POLL_BUDGET);
5134 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5135 count:
5136 if (rc > 0)
5137 __NET_ADD_STATS(sock_net(sk),
5138 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5139 local_bh_enable();
5140
5141 if (rc == LL_FLUSH_FAILED)
5142 break; /* permanent failure */
5143
5144 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5145 busy_loop_timeout(end_time))
5146 break;
5147
5148 if (unlikely(need_resched())) {
5149 if (napi_poll)
5150 busy_poll_stop(napi, have_poll_lock);
5151 preempt_enable();
5152 rcu_read_unlock();
5153 cond_resched();
5154 rc = !skb_queue_empty(&sk->sk_receive_queue);
5155 if (rc || busy_loop_timeout(end_time))
5156 return rc;
5157 goto restart;
5158 }
5159 cpu_relax();
5160 }
5161 if (napi_poll)
5162 busy_poll_stop(napi, have_poll_lock);
5163 preempt_enable();
5164 rc = !skb_queue_empty(&sk->sk_receive_queue);
5165 out:
5166 rcu_read_unlock();
5167 return rc;
5168 }
5169 EXPORT_SYMBOL(sk_busy_loop);
5170
5171 #endif /* CONFIG_NET_RX_BUSY_POLL */
5172
5173 static void napi_hash_add(struct napi_struct *napi)
5174 {
5175 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5176 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5177 return;
5178
5179 spin_lock(&napi_hash_lock);
5180
5181 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5182 do {
5183 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5184 napi_gen_id = NR_CPUS + 1;
5185 } while (napi_by_id(napi_gen_id));
5186 napi->napi_id = napi_gen_id;
5187
5188 hlist_add_head_rcu(&napi->napi_hash_node,
5189 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5190
5191 spin_unlock(&napi_hash_lock);
5192 }
5193
5194 /* Warning: the caller is responsible for making sure an RCU grace period
5195 * is respected before freeing the memory containing @napi
5196 */
5197 bool napi_hash_del(struct napi_struct *napi)
5198 {
5199 bool rcu_sync_needed = false;
5200
5201 spin_lock(&napi_hash_lock);
5202
5203 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5204 rcu_sync_needed = true;
5205 hlist_del_rcu(&napi->napi_hash_node);
5206 }
5207 spin_unlock(&napi_hash_lock);
5208 return rcu_sync_needed;
5209 }
5210 EXPORT_SYMBOL_GPL(napi_hash_del);
5211
5212 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5213 {
5214 struct napi_struct *napi;
5215
5216 napi = container_of(timer, struct napi_struct, timer);
5217
5218 /* Note: we use a relaxed variant of napi_schedule_prep(), not setting
5219 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5220 */
5221 if (napi->gro_list && !napi_disable_pending(napi) &&
5222 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5223 __napi_schedule_irqoff(napi);
5224
5225 return HRTIMER_NORESTART;
5226 }
5227
5228 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5229 int (*poll)(struct napi_struct *, int), int weight)
5230 {
5231 INIT_LIST_HEAD(&napi->poll_list);
5232 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5233 napi->timer.function = napi_watchdog;
5234 napi->gro_count = 0;
5235 napi->gro_list = NULL;
5236 napi->skb = NULL;
5237 napi->poll = poll;
5238 if (weight > NAPI_POLL_WEIGHT)
5239 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5240 weight, dev->name);
5241 napi->weight = weight;
5242 list_add(&napi->dev_list, &dev->napi_list);
5243 napi->dev = dev;
5244 #ifdef CONFIG_NETPOLL
5245 napi->poll_owner = -1;
5246 #endif
5247 set_bit(NAPI_STATE_SCHED, &napi->state);
5248 napi_hash_add(napi);
5249 }
5250 EXPORT_SYMBOL(netif_napi_add);
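/*
 * Usage sketch (hypothetical driver, not taken from this file): NAPI
 * registration normally happens at probe time, before register_netdev(),
 * and is paired with napi_enable()/napi_disable() in ndo_open()/ndo_stop()
 * and netif_napi_del() on teardown.  example_poll() and example_priv are
 * the assumed helpers sketched above.
 */
static void example_setup_napi(struct net_device *dev,
			       struct example_priv *priv)
{
	netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
	/* ndo_open():  napi_enable(&priv->napi);   */
	/* ndo_stop():  napi_disable(&priv->napi);  */
	/* teardown:    netif_napi_del(&priv->napi); */
}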
5251
5252 void napi_disable(struct napi_struct *n)
5253 {
5254 might_sleep();
5255 set_bit(NAPI_STATE_DISABLE, &n->state);
5256
5257 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5258 msleep(1);
5259 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5260 msleep(1);
5261
5262 hrtimer_cancel(&n->timer);
5263
5264 clear_bit(NAPI_STATE_DISABLE, &n->state);
5265 }
5266 EXPORT_SYMBOL(napi_disable);
5267
5268 /* Must be called in process context */
5269 void netif_napi_del(struct napi_struct *napi)
5270 {
5271 might_sleep();
5272 if (napi_hash_del(napi))
5273 synchronize_net();
5274 list_del_init(&napi->dev_list);
5275 napi_free_frags(napi);
5276
5277 kfree_skb_list(napi->gro_list);
5278 napi->gro_list = NULL;
5279 napi->gro_count = 0;
5280 }
5281 EXPORT_SYMBOL(netif_napi_del);
5282
5283 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5284 {
5285 void *have;
5286 int work, weight;
5287
5288 list_del_init(&n->poll_list);
5289
5290 have = netpoll_poll_lock(n);
5291
5292 weight = n->weight;
5293
5294 /* This NAPI_STATE_SCHED test is for avoiding a race
5295 * with netpoll's poll_napi(). Only the entity which
5296 * obtains the lock and sees NAPI_STATE_SCHED set will
5297 * actually make the ->poll() call. Therefore we avoid
5298 * accidentally calling ->poll() when NAPI is not scheduled.
5299 */
5300 work = 0;
5301 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5302 work = n->poll(n, weight);
5303 trace_napi_poll(n, work, weight);
5304 }
5305
5306 WARN_ON_ONCE(work > weight);
5307
5308 if (likely(work < weight))
5309 goto out_unlock;
5310
5311 /* Drivers must not modify the NAPI state if they
5312 * consume the entire weight. In such cases this code
5313 * still "owns" the NAPI instance and therefore can
5314 * move the instance around on the list at-will.
5315 */
5316 if (unlikely(napi_disable_pending(n))) {
5317 napi_complete(n);
5318 goto out_unlock;
5319 }
5320
5321 if (n->gro_list) {
5322 /* flush too old packets
5323 * If HZ < 1000, flush all packets.
5324 */
5325 napi_gro_flush(n, HZ >= 1000);
5326 }
5327
5328 /* Some drivers may have called napi_schedule
5329 * prior to exhausting their budget.
5330 */
5331 if (unlikely(!list_empty(&n->poll_list))) {
5332 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5333 n->dev ? n->dev->name : "backlog");
5334 goto out_unlock;
5335 }
5336
5337 list_add_tail(&n->poll_list, repoll);
5338
5339 out_unlock:
5340 netpoll_poll_unlock(have);
5341
5342 return work;
5343 }
5344
5345 static __latent_entropy void net_rx_action(struct softirq_action *h)
5346 {
5347 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5348 unsigned long time_limit = jiffies + 2;
5349 int budget = netdev_budget;
5350 LIST_HEAD(list);
5351 LIST_HEAD(repoll);
5352
5353 local_irq_disable();
5354 list_splice_init(&sd->poll_list, &list);
5355 local_irq_enable();
5356
5357 for (;;) {
5358 struct napi_struct *n;
5359
5360 if (list_empty(&list)) {
5361 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5362 goto out;
5363 break;
5364 }
5365
5366 n = list_first_entry(&list, struct napi_struct, poll_list);
5367 budget -= napi_poll(n, &repoll);
5368
5369 /* If softirq window is exhausted then punt.
5370 * Allow this to run for 2 jiffies, which allows
5371 * an average latency of 1.5/HZ.
5372 */
5373 if (unlikely(budget <= 0 ||
5374 time_after_eq(jiffies, time_limit))) {
5375 sd->time_squeeze++;
5376 break;
5377 }
5378 }
5379
5380 local_irq_disable();
5381
5382 list_splice_tail_init(&sd->poll_list, &list);
5383 list_splice_tail(&repoll, &list);
5384 list_splice(&list, &sd->poll_list);
5385 if (!list_empty(&sd->poll_list))
5386 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5387
5388 net_rps_action_and_irq_enable(sd);
5389 out:
5390 __kfree_skb_flush();
5391 }
5392
5393 struct netdev_adjacent {
5394 struct net_device *dev;
5395
5396 /* upper master flag, there can only be one master device per list */
5397 bool master;
5398
5399 /* counter for the number of times this device was added to us */
5400 u16 ref_nr;
5401
5402 /* private field for the users */
5403 void *private;
5404
5405 struct list_head list;
5406 struct rcu_head rcu;
5407 };
5408
5409 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5410 struct list_head *adj_list)
5411 {
5412 struct netdev_adjacent *adj;
5413
5414 list_for_each_entry(adj, adj_list, list) {
5415 if (adj->dev == adj_dev)
5416 return adj;
5417 }
5418 return NULL;
5419 }
5420
5421 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5422 {
5423 struct net_device *dev = data;
5424
5425 return upper_dev == dev;
5426 }
5427
5428 /**
5429 * netdev_has_upper_dev - Check if device is linked to an upper device
5430 * @dev: device
5431 * @upper_dev: upper device to check
5432 *
5433 * Find out if a device is linked to the specified upper device and return
5434 * true in case it is. Note that this checks only the immediate upper device,
5435 * not the complete stack of devices. The caller must hold the RTNL lock.
5436 */
5437 bool netdev_has_upper_dev(struct net_device *dev,
5438 struct net_device *upper_dev)
5439 {
5440 ASSERT_RTNL();
5441
5442 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5443 upper_dev);
5444 }
5445 EXPORT_SYMBOL(netdev_has_upper_dev);
5446
5447 /**
5448 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5449 * @dev: device
5450 * @upper_dev: upper device to check
5451 *
5452 * Find out if a device is linked to the specified upper device and return
5453 * true in case it is. Note that this checks the entire upper device chain.
5454 * The caller must hold the RCU read lock.
5455 */
5456
5457 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5458 struct net_device *upper_dev)
5459 {
5460 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5461 upper_dev);
5462 }
5463 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5464
5465 /**
5466 * netdev_has_any_upper_dev - Check if device is linked to some device
5467 * @dev: device
5468 *
5469 * Find out if a device is linked to an upper device and return true in case
5470 * it is. The caller must hold the RTNL lock.
5471 */
5472 static bool netdev_has_any_upper_dev(struct net_device *dev)
5473 {
5474 ASSERT_RTNL();
5475
5476 return !list_empty(&dev->adj_list.upper);
5477 }
5478
5479 /**
5480 * netdev_master_upper_dev_get - Get master upper device
5481 * @dev: device
5482 *
5483 * Find a master upper device and return pointer to it or NULL in case
5484 * it's not there. The caller must hold the RTNL lock.
5485 */
5486 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5487 {
5488 struct netdev_adjacent *upper;
5489
5490 ASSERT_RTNL();
5491
5492 if (list_empty(&dev->adj_list.upper))
5493 return NULL;
5494
5495 upper = list_first_entry(&dev->adj_list.upper,
5496 struct netdev_adjacent, list);
5497 if (likely(upper->master))
5498 return upper->dev;
5499 return NULL;
5500 }
5501 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5502
5503 /**
5504 * netdev_has_any_lower_dev - Check if device is linked to some device
5505 * @dev: device
5506 *
5507 * Find out if a device is linked to a lower device and return true in case
5508 * it is. The caller must hold the RTNL lock.
5509 */
5510 static bool netdev_has_any_lower_dev(struct net_device *dev)
5511 {
5512 ASSERT_RTNL();
5513
5514 return !list_empty(&dev->adj_list.lower);
5515 }
5516
5517 void *netdev_adjacent_get_private(struct list_head *adj_list)
5518 {
5519 struct netdev_adjacent *adj;
5520
5521 adj = list_entry(adj_list, struct netdev_adjacent, list);
5522
5523 return adj->private;
5524 }
5525 EXPORT_SYMBOL(netdev_adjacent_get_private);
5526
5527 /**
5528 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5529 * @dev: device
5530 * @iter: list_head ** of the current position
5531 *
5532 * Gets the next device from the dev's upper list, starting from iter
5533 * position. The caller must hold RCU read lock.
5534 */
5535 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5536 struct list_head **iter)
5537 {
5538 struct netdev_adjacent *upper;
5539
5540 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5541
5542 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5543
5544 if (&upper->list == &dev->adj_list.upper)
5545 return NULL;
5546
5547 *iter = &upper->list;
5548
5549 return upper->dev;
5550 }
5551 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5552
5553 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5554 struct list_head **iter)
5555 {
5556 struct netdev_adjacent *upper;
5557
5558 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5559
5560 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5561
5562 if (&upper->list == &dev->adj_list.upper)
5563 return NULL;
5564
5565 *iter = &upper->list;
5566
5567 return upper->dev;
5568 }
5569
5570 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5571 int (*fn)(struct net_device *dev,
5572 void *data),
5573 void *data)
5574 {
5575 struct net_device *udev;
5576 struct list_head *iter;
5577 int ret;
5578
5579 for (iter = &dev->adj_list.upper,
5580 udev = netdev_next_upper_dev_rcu(dev, &iter);
5581 udev;
5582 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5583 /* first is the upper device itself */
5584 ret = fn(udev, data);
5585 if (ret)
5586 return ret;
5587
5588 /* then look at all of its upper devices */
5589 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5590 if (ret)
5591 return ret;
5592 }
5593
5594 return 0;
5595 }
5596 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
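/*
 * Usage sketch (hypothetical helper, not taken from this file): the walker
 * invokes the callback for every device in the upper chain and stops as soon
 * as the callback returns non-zero.  Here the opaque data pointer carries a
 * counter; the RCU read lock satisfies the locking rule checked above.
 */
static int example_count_one_upper(struct net_device *upper, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;			/* keep walking */
}

static unsigned int example_count_uppers(struct net_device *dev)
{
	unsigned int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, example_count_one_upper, &count);
	rcu_read_unlock();

	return count;
}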
5597
5598 /**
5599 * netdev_lower_get_next_private - Get the next ->private from the
5600 * lower neighbour list
5601 * @dev: device
5602 * @iter: list_head ** of the current position
5603 *
5604 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5605 * list, starting from iter position. The caller must either hold the
5606 * RTNL lock or its own locking that guarantees that the neighbour lower
5607 * list will remain unchanged.
5608 */
5609 void *netdev_lower_get_next_private(struct net_device *dev,
5610 struct list_head **iter)
5611 {
5612 struct netdev_adjacent *lower;
5613
5614 lower = list_entry(*iter, struct netdev_adjacent, list);
5615
5616 if (&lower->list == &dev->adj_list.lower)
5617 return NULL;
5618
5619 *iter = lower->list.next;
5620
5621 return lower->private;
5622 }
5623 EXPORT_SYMBOL(netdev_lower_get_next_private);
5624
5625 /**
5626 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5627 * lower neighbour list, RCU
5628 * variant
5629 * @dev: device
5630 * @iter: list_head ** of the current position
5631 *
5632 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5633 * list, starting from iter position. The caller must hold RCU read lock.
5634 */
5635 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5636 struct list_head **iter)
5637 {
5638 struct netdev_adjacent *lower;
5639
5640 WARN_ON_ONCE(!rcu_read_lock_held());
5641
5642 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5643
5644 if (&lower->list == &dev->adj_list.lower)
5645 return NULL;
5646
5647 *iter = &lower->list;
5648
5649 return lower->private;
5650 }
5651 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5652
5653 /**
5654 * netdev_lower_get_next - Get the next device from the lower neighbour
5655 * list
5656 * @dev: device
5657 * @iter: list_head ** of the current position
5658 *
5659 * Gets the next netdev_adjacent from the dev's lower neighbour
5660 * list, starting from iter position. The caller must hold the RTNL lock or
5661 * its own locking that guarantees that the neighbour lower
5662 * list will remain unchanged.
5663 */
5664 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5665 {
5666 struct netdev_adjacent *lower;
5667
5668 lower = list_entry(*iter, struct netdev_adjacent, list);
5669
5670 if (&lower->list == &dev->adj_list.lower)
5671 return NULL;
5672
5673 *iter = lower->list.next;
5674
5675 return lower->dev;
5676 }
5677 EXPORT_SYMBOL(netdev_lower_get_next);
5678
5679 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5680 struct list_head **iter)
5681 {
5682 struct netdev_adjacent *lower;
5683
5684 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5685
5686 if (&lower->list == &dev->adj_list.lower)
5687 return NULL;
5688
5689 *iter = &lower->list;
5690
5691 return lower->dev;
5692 }
5693
5694 int netdev_walk_all_lower_dev(struct net_device *dev,
5695 int (*fn)(struct net_device *dev,
5696 void *data),
5697 void *data)
5698 {
5699 struct net_device *ldev;
5700 struct list_head *iter;
5701 int ret;
5702
5703 for (iter = &dev->adj_list.lower,
5704 ldev = netdev_next_lower_dev(dev, &iter);
5705 ldev;
5706 ldev = netdev_next_lower_dev(dev, &iter)) {
5707 /* first is the lower device itself */
5708 ret = fn(ldev, data);
5709 if (ret)
5710 return ret;
5711
5712 /* then look at all of its lower devices */
5713 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5714 if (ret)
5715 return ret;
5716 }
5717
5718 return 0;
5719 }
5720 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5721
5722 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5723 struct list_head **iter)
5724 {
5725 struct netdev_adjacent *lower;
5726
5727 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5728 if (&lower->list == &dev->adj_list.lower)
5729 return NULL;
5730
5731 *iter = &lower->list;
5732
5733 return lower->dev;
5734 }
5735
5736 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5737 int (*fn)(struct net_device *dev,
5738 void *data),
5739 void *data)
5740 {
5741 struct net_device *ldev;
5742 struct list_head *iter;
5743 int ret;
5744
5745 for (iter = &dev->adj_list.lower,
5746 ldev = netdev_next_lower_dev_rcu(dev, &iter);
5747 ldev;
5748 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5749 /* first is the lower device itself */
5750 ret = fn(ldev, data);
5751 if (ret)
5752 return ret;
5753
5754 /* then look at all of its lower devices */
5755 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5756 if (ret)
5757 return ret;
5758 }
5759
5760 return 0;
5761 }
5762 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5763
5764 /**
5765 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5766 * lower neighbour list, RCU
5767 * variant
5768 * @dev: device
5769 *
5770 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5771 * list. The caller must hold RCU read lock.
5772 */
5773 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5774 {
5775 struct netdev_adjacent *lower;
5776
5777 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5778 struct netdev_adjacent, list);
5779 if (lower)
5780 return lower->private;
5781 return NULL;
5782 }
5783 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5784
5785 /**
5786 * netdev_master_upper_dev_get_rcu - Get master upper device
5787 * @dev: device
5788 *
5789 * Find a master upper device and return pointer to it or NULL in case
5790 * it's not there. The caller must hold the RCU read lock.
5791 */
5792 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5793 {
5794 struct netdev_adjacent *upper;
5795
5796 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5797 struct netdev_adjacent, list);
5798 if (upper && likely(upper->master))
5799 return upper->dev;
5800 return NULL;
5801 }
5802 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5803
5804 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5805 struct net_device *adj_dev,
5806 struct list_head *dev_list)
5807 {
5808 char linkname[IFNAMSIZ+7];
5809 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5810 "upper_%s" : "lower_%s", adj_dev->name);
5811 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5812 linkname);
5813 }
5814 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5815 char *name,
5816 struct list_head *dev_list)
5817 {
5818 char linkname[IFNAMSIZ+7];
5819 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5820 "upper_%s" : "lower_%s", name);
5821 sysfs_remove_link(&(dev->dev.kobj), linkname);
5822 }
5823
5824 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5825 struct net_device *adj_dev,
5826 struct list_head *dev_list)
5827 {
5828 return (dev_list == &dev->adj_list.upper ||
5829 dev_list == &dev->adj_list.lower) &&
5830 net_eq(dev_net(dev), dev_net(adj_dev));
5831 }
5832
5833 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5834 struct net_device *adj_dev,
5835 struct list_head *dev_list,
5836 void *private, bool master)
5837 {
5838 struct netdev_adjacent *adj;
5839 int ret;
5840
5841 adj = __netdev_find_adj(adj_dev, dev_list);
5842
5843 if (adj) {
5844 adj->ref_nr += 1;
5845 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5846 dev->name, adj_dev->name, adj->ref_nr);
5847
5848 return 0;
5849 }
5850
5851 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5852 if (!adj)
5853 return -ENOMEM;
5854
5855 adj->dev = adj_dev;
5856 adj->master = master;
5857 adj->ref_nr = 1;
5858 adj->private = private;
5859 dev_hold(adj_dev);
5860
5861 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5862 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5863
5864 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5865 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5866 if (ret)
5867 goto free_adj;
5868 }
5869
5870 /* Ensure that master link is always the first item in list. */
5871 if (master) {
5872 ret = sysfs_create_link(&(dev->dev.kobj),
5873 &(adj_dev->dev.kobj), "master");
5874 if (ret)
5875 goto remove_symlinks;
5876
5877 list_add_rcu(&adj->list, dev_list);
5878 } else {
5879 list_add_tail_rcu(&adj->list, dev_list);
5880 }
5881
5882 return 0;
5883
5884 remove_symlinks:
5885 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5886 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5887 free_adj:
5888 kfree(adj);
5889 dev_put(adj_dev);
5890
5891 return ret;
5892 }
5893
5894 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5895 struct net_device *adj_dev,
5896 u16 ref_nr,
5897 struct list_head *dev_list)
5898 {
5899 struct netdev_adjacent *adj;
5900
5901 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5902 dev->name, adj_dev->name, ref_nr);
5903
5904 adj = __netdev_find_adj(adj_dev, dev_list);
5905
5906 if (!adj) {
5907 pr_err("Adjacency does not exist for device %s from %s\n",
5908 dev->name, adj_dev->name);
5909 WARN_ON(1);
5910 return;
5911 }
5912
5913 if (adj->ref_nr > ref_nr) {
5914 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5915 dev->name, adj_dev->name, ref_nr,
5916 adj->ref_nr - ref_nr);
5917 adj->ref_nr -= ref_nr;
5918 return;
5919 }
5920
5921 if (adj->master)
5922 sysfs_remove_link(&(dev->dev.kobj), "master");
5923
5924 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5925 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5926
5927 list_del_rcu(&adj->list);
5928 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5929 adj_dev->name, dev->name, adj_dev->name);
5930 dev_put(adj_dev);
5931 kfree_rcu(adj, rcu);
5932 }
5933
5934 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5935 struct net_device *upper_dev,
5936 struct list_head *up_list,
5937 struct list_head *down_list,
5938 void *private, bool master)
5939 {
5940 int ret;
5941
5942 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5943 private, master);
5944 if (ret)
5945 return ret;
5946
5947 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5948 private, false);
5949 if (ret) {
5950 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5951 return ret;
5952 }
5953
5954 return 0;
5955 }
5956
5957 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5958 struct net_device *upper_dev,
5959 u16 ref_nr,
5960 struct list_head *up_list,
5961 struct list_head *down_list)
5962 {
5963 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5964 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5965 }
5966
5967 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5968 struct net_device *upper_dev,
5969 void *private, bool master)
5970 {
5971 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5972 &dev->adj_list.upper,
5973 &upper_dev->adj_list.lower,
5974 private, master);
5975 }
5976
5977 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5978 struct net_device *upper_dev)
5979 {
5980 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5981 &dev->adj_list.upper,
5982 &upper_dev->adj_list.lower);
5983 }
5984
5985 static int __netdev_upper_dev_link(struct net_device *dev,
5986 struct net_device *upper_dev, bool master,
5987 void *upper_priv, void *upper_info)
5988 {
5989 struct netdev_notifier_changeupper_info changeupper_info;
5990 int ret = 0;
5991
5992 ASSERT_RTNL();
5993
5994 if (dev == upper_dev)
5995 return -EBUSY;
5996
5997 /* To prevent loops, make sure dev is not already an upper device of upper_dev. */
5998 if (netdev_has_upper_dev(upper_dev, dev))
5999 return -EBUSY;
6000
6001 if (netdev_has_upper_dev(dev, upper_dev))
6002 return -EEXIST;
6003
6004 if (master && netdev_master_upper_dev_get(dev))
6005 return -EBUSY;
6006
6007 changeupper_info.upper_dev = upper_dev;
6008 changeupper_info.master = master;
6009 changeupper_info.linking = true;
6010 changeupper_info.upper_info = upper_info;
6011
6012 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6013 &changeupper_info.info);
6014 ret = notifier_to_errno(ret);
6015 if (ret)
6016 return ret;
6017
6018 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6019 master);
6020 if (ret)
6021 return ret;
6022
6023 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6024 &changeupper_info.info);
6025 ret = notifier_to_errno(ret);
6026 if (ret)
6027 goto rollback;
6028
6029 return 0;
6030
6031 rollback:
6032 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6033
6034 return ret;
6035 }
6036
6037 /**
6038 * netdev_upper_dev_link - Add a link to the upper device
6039 * @dev: device
6040 * @upper_dev: new upper device
6041 *
6042 * Adds a link to device which is upper to this one. The caller must hold
6043 * the RTNL lock. On a failure a negative errno code is returned.
6044 * On success the reference counts are adjusted and the function
6045 * returns zero.
6046 */
6047 int netdev_upper_dev_link(struct net_device *dev,
6048 struct net_device *upper_dev)
6049 {
6050 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
6051 }
6052 EXPORT_SYMBOL(netdev_upper_dev_link);
6053
6054 /**
6055 * netdev_master_upper_dev_link - Add a master link to the upper device
6056 * @dev: device
6057 * @upper_dev: new upper device
6058 * @upper_priv: upper device private
6059 * @upper_info: upper info to be passed down via notifier
6060 *
6061 * Adds a link to device which is upper to this one. In this case, only
6062 * one master upper device can be linked, although other non-master devices
6063 * might be linked as well. The caller must hold the RTNL lock.
6064 * On a failure a negative errno code is returned. On success the reference
6065 * counts are adjusted and the function returns zero.
6066 */
6067 int netdev_master_upper_dev_link(struct net_device *dev,
6068 struct net_device *upper_dev,
6069 void *upper_priv, void *upper_info)
6070 {
6071 return __netdev_upper_dev_link(dev, upper_dev, true,
6072 upper_priv, upper_info);
6073 }
6074 EXPORT_SYMBOL(netdev_master_upper_dev_link);
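
/*
 * Illustrative sketch only: a hypothetical bonding-style driver enslaving
 * @slave under @master through the helper documented above. The function
 * name is an assumption for illustration, not an existing kernel symbol;
 * the caller must hold RTNL.
 */
static int __maybe_unused example_enslave(struct net_device *master,
					  struct net_device *slave)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(slave, master, NULL, NULL);
	if (err)	/* e.g. -EBUSY if slave already has a master */
		return err;

	/* Later failure paths would roll this back with:
	 *	netdev_upper_dev_unlink(slave, master);
	 */
	return 0;
}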
6075
6076 /**
6077 * netdev_upper_dev_unlink - Removes a link to upper device
6078 * @dev: device
6079 * @upper_dev: upper device to remove the link to
6080 *
6081 * Removes a link to device which is upper to this one. The caller must hold
6082 * the RTNL lock.
6083 */
6084 void netdev_upper_dev_unlink(struct net_device *dev,
6085 struct net_device *upper_dev)
6086 {
6087 struct netdev_notifier_changeupper_info changeupper_info;
6088 ASSERT_RTNL();
6089
6090 changeupper_info.upper_dev = upper_dev;
6091 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6092 changeupper_info.linking = false;
6093
6094 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6095 &changeupper_info.info);
6096
6097 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6098
6099 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6100 &changeupper_info.info);
6101 }
6102 EXPORT_SYMBOL(netdev_upper_dev_unlink);
6103
6104 /**
6105 * netdev_bonding_info_change - Dispatch event about slave change
6106 * @dev: device
6107 * @bonding_info: info to dispatch
6108 *
6109 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6110 * The caller must hold the RTNL lock.
6111 */
6112 void netdev_bonding_info_change(struct net_device *dev,
6113 struct netdev_bonding_info *bonding_info)
6114 {
6115 struct netdev_notifier_bonding_info info;
6116
6117 memcpy(&info.bonding_info, bonding_info,
6118 sizeof(struct netdev_bonding_info));
6119 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6120 &info.info);
6121 }
6122 EXPORT_SYMBOL(netdev_bonding_info_change);
6123
6124 static void netdev_adjacent_add_links(struct net_device *dev)
6125 {
6126 struct netdev_adjacent *iter;
6127
6128 struct net *net = dev_net(dev);
6129
6130 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6131 if (!net_eq(net, dev_net(iter->dev)))
6132 continue;
6133 netdev_adjacent_sysfs_add(iter->dev, dev,
6134 &iter->dev->adj_list.lower);
6135 netdev_adjacent_sysfs_add(dev, iter->dev,
6136 &dev->adj_list.upper);
6137 }
6138
6139 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6140 if (!net_eq(net, dev_net(iter->dev)))
6141 continue;
6142 netdev_adjacent_sysfs_add(iter->dev, dev,
6143 &iter->dev->adj_list.upper);
6144 netdev_adjacent_sysfs_add(dev, iter->dev,
6145 &dev->adj_list.lower);
6146 }
6147 }
6148
6149 static void netdev_adjacent_del_links(struct net_device *dev)
6150 {
6151 struct netdev_adjacent *iter;
6152
6153 struct net *net = dev_net(dev);
6154
6155 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6156 if (!net_eq(net, dev_net(iter->dev)))
6157 continue;
6158 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6159 &iter->dev->adj_list.lower);
6160 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6161 &dev->adj_list.upper);
6162 }
6163
6164 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6165 if (!net_eq(net, dev_net(iter->dev)))
6166 continue;
6167 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6168 &iter->dev->adj_list.upper);
6169 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6170 &dev->adj_list.lower);
6171 }
6172 }
6173
6174 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6175 {
6176 struct netdev_adjacent *iter;
6177
6178 struct net *net = dev_net(dev);
6179
6180 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6181 if (!net_eq(net, dev_net(iter->dev)))
6182 continue;
6183 netdev_adjacent_sysfs_del(iter->dev, oldname,
6184 &iter->dev->adj_list.lower);
6185 netdev_adjacent_sysfs_add(iter->dev, dev,
6186 &iter->dev->adj_list.lower);
6187 }
6188
6189 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6190 if (!net_eq(net, dev_net(iter->dev)))
6191 continue;
6192 netdev_adjacent_sysfs_del(iter->dev, oldname,
6193 &iter->dev->adj_list.upper);
6194 netdev_adjacent_sysfs_add(iter->dev, dev,
6195 &iter->dev->adj_list.upper);
6196 }
6197 }
6198
6199 void *netdev_lower_dev_get_private(struct net_device *dev,
6200 struct net_device *lower_dev)
6201 {
6202 struct netdev_adjacent *lower;
6203
6204 if (!lower_dev)
6205 return NULL;
6206 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6207 if (!lower)
6208 return NULL;
6209
6210 return lower->private;
6211 }
6212 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6213
6214
6215 int dev_get_nest_level(struct net_device *dev)
6216 {
6217 struct net_device *lower = NULL;
6218 struct list_head *iter;
6219 int max_nest = -1;
6220 int nest;
6221
6222 ASSERT_RTNL();
6223
6224 netdev_for_each_lower_dev(dev, lower, iter) {
6225 nest = dev_get_nest_level(lower);
6226 if (max_nest < nest)
6227 max_nest = nest;
6228 }
6229
6230 return max_nest + 1;
6231 }
6232 EXPORT_SYMBOL(dev_get_nest_level);
6233
6234 /**
6235 * netdev_lower_state_changed - Dispatch event about lower device state change
6236 * @lower_dev: device
6237 * @lower_state_info: state to dispatch
6238 *
6239 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6240 * The caller must hold the RTNL lock.
6241 */
6242 void netdev_lower_state_changed(struct net_device *lower_dev,
6243 void *lower_state_info)
6244 {
6245 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6246
6247 ASSERT_RTNL();
6248 changelowerstate_info.lower_state_info = lower_state_info;
6249 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6250 &changelowerstate_info.info);
6251 }
6252 EXPORT_SYMBOL(netdev_lower_state_changed);
6253
6254 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6255 struct neighbour *n)
6256 {
6257 struct net_device *lower_dev, *stop_dev;
6258 struct list_head *iter;
6259 int err;
6260
6261 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6262 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6263 continue;
6264 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6265 if (err) {
6266 stop_dev = lower_dev;
6267 goto rollback;
6268 }
6269 }
6270 return 0;
6271
6272 rollback:
6273 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6274 if (lower_dev == stop_dev)
6275 break;
6276 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6277 continue;
6278 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6279 }
6280 return err;
6281 }
6282 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6283
6284 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6285 struct neighbour *n)
6286 {
6287 struct net_device *lower_dev;
6288 struct list_head *iter;
6289
6290 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6291 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6292 continue;
6293 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6294 }
6295 }
6296 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6297
6298 static void dev_change_rx_flags(struct net_device *dev, int flags)
6299 {
6300 const struct net_device_ops *ops = dev->netdev_ops;
6301
6302 if (ops->ndo_change_rx_flags)
6303 ops->ndo_change_rx_flags(dev, flags);
6304 }
6305
6306 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6307 {
6308 unsigned int old_flags = dev->flags;
6309 kuid_t uid;
6310 kgid_t gid;
6311
6312 ASSERT_RTNL();
6313
6314 dev->flags |= IFF_PROMISC;
6315 dev->promiscuity += inc;
6316 if (dev->promiscuity == 0) {
6317 /*
6318 * Avoid overflow.
6319 * If inc causes overflow, leave promiscuity untouched and return an error.
6320 */
6321 if (inc < 0)
6322 dev->flags &= ~IFF_PROMISC;
6323 else {
6324 dev->promiscuity -= inc;
6325 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6326 dev->name);
6327 return -EOVERFLOW;
6328 }
6329 }
6330 if (dev->flags != old_flags) {
6331 pr_info("device %s %s promiscuous mode\n",
6332 dev->name,
6333 dev->flags & IFF_PROMISC ? "entered" : "left");
6334 if (audit_enabled) {
6335 current_uid_gid(&uid, &gid);
6336 audit_log(current->audit_context, GFP_ATOMIC,
6337 AUDIT_ANOM_PROMISCUOUS,
6338 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6339 dev->name, (dev->flags & IFF_PROMISC),
6340 (old_flags & IFF_PROMISC),
6341 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6342 from_kuid(&init_user_ns, uid),
6343 from_kgid(&init_user_ns, gid),
6344 audit_get_sessionid(current));
6345 }
6346
6347 dev_change_rx_flags(dev, IFF_PROMISC);
6348 }
6349 if (notify)
6350 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6351 return 0;
6352 }
6353
6354 /**
6355 * dev_set_promiscuity - update promiscuity count on a device
6356 * @dev: device
6357 * @inc: modifier
6358 *
6359 * Add or remove promiscuity from a device. While the count in the device
6360 * remains above zero the interface remains promiscuous. Once it hits zero
6361 * the device reverts to normal filtering operation. A negative inc
6362 * value is used to drop promiscuity on the device.
6363 * Return 0 if successful or a negative errno code on error.
6364 */
6365 int dev_set_promiscuity(struct net_device *dev, int inc)
6366 {
6367 unsigned int old_flags = dev->flags;
6368 int err;
6369
6370 err = __dev_set_promiscuity(dev, inc, true);
6371 if (err < 0)
6372 return err;
6373 if (dev->flags != old_flags)
6374 dev_set_rx_mode(dev);
6375 return err;
6376 }
6377 EXPORT_SYMBOL(dev_set_promiscuity);
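
/*
 * Illustrative sketch only: entering promiscuous mode for the duration of
 * a capture-style session and dropping it again afterwards, so the
 * per-device count stays balanced. The helper name is hypothetical; the
 * caller is assumed to hold RTNL.
 */
static int __maybe_unused example_capture_session(struct net_device *dev)
{
	int err;

	ASSERT_RTNL();
	err = dev_set_promiscuity(dev, 1);	/* count goes 0 -> 1 */
	if (err < 0)
		return err;

	/* ... receive traffic ... */

	dev_set_promiscuity(dev, -1);		/* count back to 0 */
	return 0;
}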
6378
6379 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6380 {
6381 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6382
6383 ASSERT_RTNL();
6384
6385 dev->flags |= IFF_ALLMULTI;
6386 dev->allmulti += inc;
6387 if (dev->allmulti == 0) {
6388 /*
6389 * Avoid overflow.
6390 * If inc causes overflow, leave allmulti untouched and return an error.
6391 */
6392 if (inc < 0)
6393 dev->flags &= ~IFF_ALLMULTI;
6394 else {
6395 dev->allmulti -= inc;
6396 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6397 dev->name);
6398 return -EOVERFLOW;
6399 }
6400 }
6401 if (dev->flags ^ old_flags) {
6402 dev_change_rx_flags(dev, IFF_ALLMULTI);
6403 dev_set_rx_mode(dev);
6404 if (notify)
6405 __dev_notify_flags(dev, old_flags,
6406 dev->gflags ^ old_gflags);
6407 }
6408 return 0;
6409 }
6410
6411 /**
6412 * dev_set_allmulti - update allmulti count on a device
6413 * @dev: device
6414 * @inc: modifier
6415 *
6416 * Add or remove reception of all multicast frames to a device. While the
6417 * count in the device remains above zero the interface remains listening
6418 * to all multicast frames. Once it hits zero the device reverts to normal
6419 * filtering operation. A negative @inc value is used to drop the counter
6420 * when releasing a resource needing all multicasts.
6421 * Return 0 if successful or a negative errno code on error.
6422 */
6423
6424 int dev_set_allmulti(struct net_device *dev, int inc)
6425 {
6426 return __dev_set_allmulti(dev, inc, true);
6427 }
6428 EXPORT_SYMBOL(dev_set_allmulti);
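
/*
 * Illustrative sketch only: taking and later releasing an all-multicast
 * reference with a single hypothetical helper. The name is an assumption;
 * the caller must hold RTNL.
 */
static int __maybe_unused example_need_allmulti(struct net_device *dev,
						bool enable)
{
	ASSERT_RTNL();
	/* A negative increment releases a reference taken earlier;
	 * -EOVERFLOW is returned if the counter would wrap.
	 */
	return dev_set_allmulti(dev, enable ? 1 : -1);
}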
6429
6430 /*
6431 * Upload unicast and multicast address lists to device and
6432 * configure RX filtering. When the device doesn't support unicast
6433 * filtering it is put in promiscuous mode while unicast addresses
6434 * are present.
6435 */
6436 void __dev_set_rx_mode(struct net_device *dev)
6437 {
6438 const struct net_device_ops *ops = dev->netdev_ops;
6439
6440 /* dev_open will call this function so the list will stay sane. */
6441 if (!(dev->flags&IFF_UP))
6442 return;
6443
6444 if (!netif_device_present(dev))
6445 return;
6446
6447 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6448 /* Unicast address changes may only happen under the rtnl,
6449 * therefore calling __dev_set_promiscuity here is safe.
6450 */
6451 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6452 __dev_set_promiscuity(dev, 1, false);
6453 dev->uc_promisc = true;
6454 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6455 __dev_set_promiscuity(dev, -1, false);
6456 dev->uc_promisc = false;
6457 }
6458 }
6459
6460 if (ops->ndo_set_rx_mode)
6461 ops->ndo_set_rx_mode(dev);
6462 }
6463
6464 void dev_set_rx_mode(struct net_device *dev)
6465 {
6466 netif_addr_lock_bh(dev);
6467 __dev_set_rx_mode(dev);
6468 netif_addr_unlock_bh(dev);
6469 }
6470
6471 /**
6472 * dev_get_flags - get flags reported to userspace
6473 * @dev: device
6474 *
6475 * Get the combination of flag bits exported through APIs to userspace.
6476 */
6477 unsigned int dev_get_flags(const struct net_device *dev)
6478 {
6479 unsigned int flags;
6480
6481 flags = (dev->flags & ~(IFF_PROMISC |
6482 IFF_ALLMULTI |
6483 IFF_RUNNING |
6484 IFF_LOWER_UP |
6485 IFF_DORMANT)) |
6486 (dev->gflags & (IFF_PROMISC |
6487 IFF_ALLMULTI));
6488
6489 if (netif_running(dev)) {
6490 if (netif_oper_up(dev))
6491 flags |= IFF_RUNNING;
6492 if (netif_carrier_ok(dev))
6493 flags |= IFF_LOWER_UP;
6494 if (netif_dormant(dev))
6495 flags |= IFF_DORMANT;
6496 }
6497
6498 return flags;
6499 }
6500 EXPORT_SYMBOL(dev_get_flags);
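
/*
 * Illustrative sketch only: combining the userspace-visible flags returned
 * above to decide whether a link is administratively up and operationally
 * usable. The helper name is hypothetical.
 */
static bool __maybe_unused example_link_usable(const struct net_device *dev)
{
	unsigned int flags = dev_get_flags(dev);

	return (flags & IFF_UP) &&		/* administratively up */
	       (flags & IFF_LOWER_UP) &&	/* carrier present */
	       (flags & IFF_RUNNING);		/* operationally up */
}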
6501
6502 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6503 {
6504 unsigned int old_flags = dev->flags;
6505 int ret;
6506
6507 ASSERT_RTNL();
6508
6509 /*
6510 * Set the flags on our device.
6511 */
6512
6513 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6514 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6515 IFF_AUTOMEDIA)) |
6516 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6517 IFF_ALLMULTI));
6518
6519 /*
6520 * Load in the correct multicast list now the flags have changed.
6521 */
6522
6523 if ((old_flags ^ flags) & IFF_MULTICAST)
6524 dev_change_rx_flags(dev, IFF_MULTICAST);
6525
6526 dev_set_rx_mode(dev);
6527
6528 /*
6529 * Have we downed the interface? We handle IFF_UP ourselves
6530 * according to user attempts to set it, rather than blindly
6531 * setting it.
6532 */
6533
6534 ret = 0;
6535 if ((old_flags ^ flags) & IFF_UP)
6536 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6537
6538 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6539 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6540 unsigned int old_flags = dev->flags;
6541
6542 dev->gflags ^= IFF_PROMISC;
6543
6544 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6545 if (dev->flags != old_flags)
6546 dev_set_rx_mode(dev);
6547 }
6548
6549 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6550 is important. Some (broken) drivers set IFF_PROMISC, when
6551 IFF_ALLMULTI is requested not asking us and not reporting.
6552 */
6553 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6554 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6555
6556 dev->gflags ^= IFF_ALLMULTI;
6557 __dev_set_allmulti(dev, inc, false);
6558 }
6559
6560 return ret;
6561 }
6562
6563 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6564 unsigned int gchanges)
6565 {
6566 unsigned int changes = dev->flags ^ old_flags;
6567
6568 if (gchanges)
6569 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6570
6571 if (changes & IFF_UP) {
6572 if (dev->flags & IFF_UP)
6573 call_netdevice_notifiers(NETDEV_UP, dev);
6574 else
6575 call_netdevice_notifiers(NETDEV_DOWN, dev);
6576 }
6577
6578 if (dev->flags & IFF_UP &&
6579 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6580 struct netdev_notifier_change_info change_info;
6581
6582 change_info.flags_changed = changes;
6583 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6584 &change_info.info);
6585 }
6586 }
6587
6588 /**
6589 * dev_change_flags - change device settings
6590 * @dev: device
6591 * @flags: device state flags
6592 *
6593 * Change settings on device based on the given state flags. The flags are
6594 * in the userspace exported format.
6595 */
6596 int dev_change_flags(struct net_device *dev, unsigned int flags)
6597 {
6598 int ret;
6599 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6600
6601 ret = __dev_change_flags(dev, flags);
6602 if (ret < 0)
6603 return ret;
6604
6605 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6606 __dev_notify_flags(dev, old_flags, changes);
6607 return ret;
6608 }
6609 EXPORT_SYMBOL(dev_change_flags);
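
/*
 * Illustrative sketch only: bringing an interface administratively up from
 * a context that does not already hold RTNL. The helper name is
 * hypothetical.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}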
6610
6611 int __dev_set_mtu(struct net_device *dev, int new_mtu)
6612 {
6613 const struct net_device_ops *ops = dev->netdev_ops;
6614
6615 if (ops->ndo_change_mtu)
6616 return ops->ndo_change_mtu(dev, new_mtu);
6617
6618 dev->mtu = new_mtu;
6619 return 0;
6620 }
6621 EXPORT_SYMBOL(__dev_set_mtu);
6622
6623 /**
6624 * dev_set_mtu - Change maximum transfer unit
6625 * @dev: device
6626 * @new_mtu: new transfer unit
6627 *
6628 * Change the maximum transfer size of the network device.
6629 */
6630 int dev_set_mtu(struct net_device *dev, int new_mtu)
6631 {
6632 int err, orig_mtu;
6633
6634 if (new_mtu == dev->mtu)
6635 return 0;
6636
6637 /* MTU must be positive, and in range */
6638 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6639 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6640 dev->name, new_mtu, dev->min_mtu);
6641 return -EINVAL;
6642 }
6643
6644 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6645 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6646 dev->name, new_mtu, dev->max_mtu);
6647 return -EINVAL;
6648 }
6649
6650 if (!netif_device_present(dev))
6651 return -ENODEV;
6652
6653 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6654 err = notifier_to_errno(err);
6655 if (err)
6656 return err;
6657
6658 orig_mtu = dev->mtu;
6659 err = __dev_set_mtu(dev, new_mtu);
6660
6661 if (!err) {
6662 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6663 err = notifier_to_errno(err);
6664 if (err) {
6665 /* setting mtu back and notifying everyone again,
6666 * so that they have a chance to revert changes.
6667 */
6668 __dev_set_mtu(dev, orig_mtu);
6669 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6670 }
6671 }
6672 return err;
6673 }
6674 EXPORT_SYMBOL(dev_set_mtu);
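
/*
 * Illustrative sketch only: requesting a jumbo MTU and reporting a failure
 * (for example when the value falls outside the device's min/max MTU range
 * checked above). The helper name and the 9000-byte value are assumptions.
 */
static int __maybe_unused example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	if (err)
		netdev_err(dev, "failed to set MTU 9000: %d\n", err);
	return err;
}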
6675
6676 /**
6677 * dev_set_group - Change group this device belongs to
6678 * @dev: device
6679 * @new_group: group this device should belong to
6680 */
6681 void dev_set_group(struct net_device *dev, int new_group)
6682 {
6683 dev->group = new_group;
6684 }
6685 EXPORT_SYMBOL(dev_set_group);
6686
6687 /**
6688 * dev_set_mac_address - Change Media Access Control Address
6689 * @dev: device
6690 * @sa: new address
6691 *
6692 * Change the hardware (MAC) address of the device
6693 */
6694 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6695 {
6696 const struct net_device_ops *ops = dev->netdev_ops;
6697 int err;
6698
6699 if (!ops->ndo_set_mac_address)
6700 return -EOPNOTSUPP;
6701 if (sa->sa_family != dev->type)
6702 return -EINVAL;
6703 if (!netif_device_present(dev))
6704 return -ENODEV;
6705 err = ops->ndo_set_mac_address(dev, sa);
6706 if (err)
6707 return err;
6708 dev->addr_assign_type = NET_ADDR_SET;
6709 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6710 add_device_randomness(dev->dev_addr, dev->addr_len);
6711 return 0;
6712 }
6713 EXPORT_SYMBOL(dev_set_mac_address);
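
/*
 * Illustrative sketch only: building the struct sockaddr expected above
 * from a raw hardware address. The helper name is hypothetical; @addr is
 * assumed to be dev->addr_len bytes long and the caller to hold RTNL.
 */
static int __maybe_unused example_set_hw_addr(struct net_device *dev,
					      const unsigned char *addr)
{
	struct sockaddr sa;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;	/* must match, see -EINVAL above */
	memcpy(sa.sa_data, addr, dev->addr_len);
	return dev_set_mac_address(dev, &sa);
}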
6714
6715 /**
6716 * dev_change_carrier - Change device carrier
6717 * @dev: device
6718 * @new_carrier: new value
6719 *
6720 * Change device carrier
6721 */
6722 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6723 {
6724 const struct net_device_ops *ops = dev->netdev_ops;
6725
6726 if (!ops->ndo_change_carrier)
6727 return -EOPNOTSUPP;
6728 if (!netif_device_present(dev))
6729 return -ENODEV;
6730 return ops->ndo_change_carrier(dev, new_carrier);
6731 }
6732 EXPORT_SYMBOL(dev_change_carrier);
6733
6734 /**
6735 * dev_get_phys_port_id - Get device physical port ID
6736 * @dev: device
6737 * @ppid: port ID
6738 *
6739 * Get device physical port ID
6740 */
6741 int dev_get_phys_port_id(struct net_device *dev,
6742 struct netdev_phys_item_id *ppid)
6743 {
6744 const struct net_device_ops *ops = dev->netdev_ops;
6745
6746 if (!ops->ndo_get_phys_port_id)
6747 return -EOPNOTSUPP;
6748 return ops->ndo_get_phys_port_id(dev, ppid);
6749 }
6750 EXPORT_SYMBOL(dev_get_phys_port_id);
6751
6752 /**
6753 * dev_get_phys_port_name - Get device physical port name
6754 * @dev: device
6755 * @name: port name
6756 * @len: limit of bytes to copy to name
6757 *
6758 * Get device physical port name
6759 */
6760 int dev_get_phys_port_name(struct net_device *dev,
6761 char *name, size_t len)
6762 {
6763 const struct net_device_ops *ops = dev->netdev_ops;
6764
6765 if (!ops->ndo_get_phys_port_name)
6766 return -EOPNOTSUPP;
6767 return ops->ndo_get_phys_port_name(dev, name, len);
6768 }
6769 EXPORT_SYMBOL(dev_get_phys_port_name);
6770
6771 /**
6772 * dev_change_proto_down - update protocol port state information
6773 * @dev: device
6774 * @proto_down: new value
6775 *
6776 * This info can be used by switch drivers to set the phys state of the
6777 * port.
6778 */
6779 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6780 {
6781 const struct net_device_ops *ops = dev->netdev_ops;
6782
6783 if (!ops->ndo_change_proto_down)
6784 return -EOPNOTSUPP;
6785 if (!netif_device_present(dev))
6786 return -ENODEV;
6787 return ops->ndo_change_proto_down(dev, proto_down);
6788 }
6789 EXPORT_SYMBOL(dev_change_proto_down);
6790
6791 /**
6792 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6793 * @dev: device
6794 * @fd: new program fd or negative value to clear
6795 * @flags: xdp-related flags
6796 *
6797 * Set or clear a bpf program for a device
6798 */
6799 int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6800 {
6801 const struct net_device_ops *ops = dev->netdev_ops;
6802 struct bpf_prog *prog = NULL;
6803 struct netdev_xdp xdp;
6804 int err;
6805
6806 ASSERT_RTNL();
6807
6808 if (!ops->ndo_xdp)
6809 return -EOPNOTSUPP;
6810 if (fd >= 0) {
6811 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6812 memset(&xdp, 0, sizeof(xdp));
6813 xdp.command = XDP_QUERY_PROG;
6814
6815 err = ops->ndo_xdp(dev, &xdp);
6816 if (err < 0)
6817 return err;
6818 if (xdp.prog_attached)
6819 return -EBUSY;
6820 }
6821
6822 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6823 if (IS_ERR(prog))
6824 return PTR_ERR(prog);
6825 }
6826
6827 memset(&xdp, 0, sizeof(xdp));
6828 xdp.command = XDP_SETUP_PROG;
6829 xdp.prog = prog;
6830
6831 err = ops->ndo_xdp(dev, &xdp);
6832 if (err < 0 && prog)
6833 bpf_prog_put(prog);
6834
6835 return err;
6836 }
6837 EXPORT_SYMBOL(dev_change_xdp_fd);
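
/*
 * Illustrative sketch only: attaching an XDP program by fd without
 * replacing one that is already loaded, using the
 * XDP_FLAGS_UPDATE_IF_NOEXIST path handled above. The helper name is
 * hypothetical.
 */
static int __maybe_unused example_attach_xdp(struct net_device *dev, int prog_fd)
{
	int err;

	rtnl_lock();
	err = dev_change_xdp_fd(dev, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST);
	rtnl_unlock();
	return err;	/* -EBUSY if a program was already attached */
}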
6838
6839 /**
6840 * dev_new_index - allocate an ifindex
6841 * @net: the applicable net namespace
6842 *
6843 * Returns a suitable unique value for a new device interface
6844 * number. The caller must hold the rtnl semaphore or the
6845 * dev_base_lock to be sure it remains unique.
6846 */
6847 static int dev_new_index(struct net *net)
6848 {
6849 int ifindex = net->ifindex;
6850 for (;;) {
6851 if (++ifindex <= 0)
6852 ifindex = 1;
6853 if (!__dev_get_by_index(net, ifindex))
6854 return net->ifindex = ifindex;
6855 }
6856 }
6857
6858 /* Delayed registration/unregistration */
6859 static LIST_HEAD(net_todo_list);
6860 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6861
6862 static void net_set_todo(struct net_device *dev)
6863 {
6864 list_add_tail(&dev->todo_list, &net_todo_list);
6865 dev_net(dev)->dev_unreg_count++;
6866 }
6867
6868 static void rollback_registered_many(struct list_head *head)
6869 {
6870 struct net_device *dev, *tmp;
6871 LIST_HEAD(close_head);
6872
6873 BUG_ON(dev_boot_phase);
6874 ASSERT_RTNL();
6875
6876 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6877 /* Some devices call without registering
6878 * for initialization unwind. Remove those
6879 * devices and proceed with the remaining.
6880 */
6881 if (dev->reg_state == NETREG_UNINITIALIZED) {
6882 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6883 dev->name, dev);
6884
6885 WARN_ON(1);
6886 list_del(&dev->unreg_list);
6887 continue;
6888 }
6889 dev->dismantle = true;
6890 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6891 }
6892
6893 /* If device is running, close it first. */
6894 list_for_each_entry(dev, head, unreg_list)
6895 list_add_tail(&dev->close_list, &close_head);
6896 dev_close_many(&close_head, true);
6897
6898 list_for_each_entry(dev, head, unreg_list) {
6899 /* And unlink it from device chain. */
6900 unlist_netdevice(dev);
6901
6902 dev->reg_state = NETREG_UNREGISTERING;
6903 }
6904 flush_all_backlogs();
6905
6906 synchronize_net();
6907
6908 list_for_each_entry(dev, head, unreg_list) {
6909 struct sk_buff *skb = NULL;
6910
6911 /* Shutdown queueing discipline. */
6912 dev_shutdown(dev);
6913
6914
6915 /* Notify protocols that we are about to destroy
6916 this device. They should clean up all of their state.
6917 */
6918 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6919
6920 if (!dev->rtnl_link_ops ||
6921 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6922 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6923 GFP_KERNEL);
6924
6925 /*
6926 * Flush the unicast and multicast chains
6927 */
6928 dev_uc_flush(dev);
6929 dev_mc_flush(dev);
6930
6931 if (dev->netdev_ops->ndo_uninit)
6932 dev->netdev_ops->ndo_uninit(dev);
6933
6934 if (skb)
6935 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6936
6937 /* Notifier chain MUST detach us all upper devices. */
6938 WARN_ON(netdev_has_any_upper_dev(dev));
6939 WARN_ON(netdev_has_any_lower_dev(dev));
6940
6941 /* Remove entries from kobject tree */
6942 netdev_unregister_kobject(dev);
6943 #ifdef CONFIG_XPS
6944 /* Remove XPS queueing entries */
6945 netif_reset_xps_queues_gt(dev, 0);
6946 #endif
6947 }
6948
6949 synchronize_net();
6950
6951 list_for_each_entry(dev, head, unreg_list)
6952 dev_put(dev);
6953 }
6954
6955 static void rollback_registered(struct net_device *dev)
6956 {
6957 LIST_HEAD(single);
6958
6959 list_add(&dev->unreg_list, &single);
6960 rollback_registered_many(&single);
6961 list_del(&single);
6962 }
6963
6964 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6965 struct net_device *upper, netdev_features_t features)
6966 {
6967 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6968 netdev_features_t feature;
6969 int feature_bit;
6970
6971 for_each_netdev_feature(&upper_disables, feature_bit) {
6972 feature = __NETIF_F_BIT(feature_bit);
6973 if (!(upper->wanted_features & feature)
6974 && (features & feature)) {
6975 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6976 &feature, upper->name);
6977 features &= ~feature;
6978 }
6979 }
6980
6981 return features;
6982 }
6983
6984 static void netdev_sync_lower_features(struct net_device *upper,
6985 struct net_device *lower, netdev_features_t features)
6986 {
6987 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6988 netdev_features_t feature;
6989 int feature_bit;
6990
6991 for_each_netdev_feature(&upper_disables, feature_bit) {
6992 feature = __NETIF_F_BIT(feature_bit);
6993 if (!(features & feature) && (lower->features & feature)) {
6994 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6995 &feature, lower->name);
6996 lower->wanted_features &= ~feature;
6997 netdev_update_features(lower);
6998
6999 if (unlikely(lower->features & feature))
7000 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
7001 &feature, lower->name);
7002 }
7003 }
7004 }
7005
7006 static netdev_features_t netdev_fix_features(struct net_device *dev,
7007 netdev_features_t features)
7008 {
7009 /* Fix illegal checksum combinations */
7010 if ((features & NETIF_F_HW_CSUM) &&
7011 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7012 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7013 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7014 }
7015
7016 /* TSO requires that SG is present as well. */
7017 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7018 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7019 features &= ~NETIF_F_ALL_TSO;
7020 }
7021
7022 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7023 !(features & NETIF_F_IP_CSUM)) {
7024 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7025 features &= ~NETIF_F_TSO;
7026 features &= ~NETIF_F_TSO_ECN;
7027 }
7028
7029 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7030 !(features & NETIF_F_IPV6_CSUM)) {
7031 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7032 features &= ~NETIF_F_TSO6;
7033 }
7034
7035 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7036 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7037 features &= ~NETIF_F_TSO_MANGLEID;
7038
7039 /* TSO ECN requires that TSO is present as well. */
7040 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7041 features &= ~NETIF_F_TSO_ECN;
7042
7043 /* Software GSO depends on SG. */
7044 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7045 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7046 features &= ~NETIF_F_GSO;
7047 }
7048
7049 /* UFO needs SG and checksumming */
7050 if (features & NETIF_F_UFO) {
7051 /* maybe split UFO into V4 and V6? */
7052 if (!(features & NETIF_F_HW_CSUM) &&
7053 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
7054 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
7055 netdev_dbg(dev,
7056 "Dropping NETIF_F_UFO since no checksum offload features.\n");
7057 features &= ~NETIF_F_UFO;
7058 }
7059
7060 if (!(features & NETIF_F_SG)) {
7061 netdev_dbg(dev,
7062 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7063 features &= ~NETIF_F_UFO;
7064 }
7065 }
7066
7067 /* GSO partial features require GSO partial be set */
7068 if ((features & dev->gso_partial_features) &&
7069 !(features & NETIF_F_GSO_PARTIAL)) {
7070 netdev_dbg(dev,
7071 "Dropping partially supported GSO features since no GSO partial.\n");
7072 features &= ~dev->gso_partial_features;
7073 }
7074
7075 #ifdef CONFIG_NET_RX_BUSY_POLL
7076 if (dev->netdev_ops->ndo_busy_poll)
7077 features |= NETIF_F_BUSY_POLL;
7078 else
7079 #endif
7080 features &= ~NETIF_F_BUSY_POLL;
7081
7082 return features;
7083 }
7084
7085 int __netdev_update_features(struct net_device *dev)
7086 {
7087 struct net_device *upper, *lower;
7088 netdev_features_t features;
7089 struct list_head *iter;
7090 int err = -1;
7091
7092 ASSERT_RTNL();
7093
7094 features = netdev_get_wanted_features(dev);
7095
7096 if (dev->netdev_ops->ndo_fix_features)
7097 features = dev->netdev_ops->ndo_fix_features(dev, features);
7098
7099 /* driver might be less strict about feature dependencies */
7100 features = netdev_fix_features(dev, features);
7101
7102 /* some features can't be enabled if they're off on an upper device */
7103 netdev_for_each_upper_dev_rcu(dev, upper, iter)
7104 features = netdev_sync_upper_features(dev, upper, features);
7105
7106 if (dev->features == features)
7107 goto sync_lower;
7108
7109 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7110 &dev->features, &features);
7111
7112 if (dev->netdev_ops->ndo_set_features)
7113 err = dev->netdev_ops->ndo_set_features(dev, features);
7114 else
7115 err = 0;
7116
7117 if (unlikely(err < 0)) {
7118 netdev_err(dev,
7119 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7120 err, &features, &dev->features);
7121 /* return non-0 since some features might have changed and
7122 * it's better to fire a spurious notification than miss it
7123 */
7124 return -1;
7125 }
7126
7127 sync_lower:
7128 /* some features must be disabled on lower devices when disabled
7129 * on an upper device (think: bonding master or bridge)
7130 */
7131 netdev_for_each_lower_dev(dev, lower, iter)
7132 netdev_sync_lower_features(dev, lower, features);
7133
7134 if (!err)
7135 dev->features = features;
7136
7137 return err < 0 ? 0 : 1;
7138 }
7139
7140 /**
7141 * netdev_update_features - recalculate device features
7142 * @dev: the device to check
7143 *
7144 * Recalculate dev->features set and send notifications if it
7145 * has changed. Should be called after driver or hardware dependent
7146 * conditions might have changed that influence the features.
7147 */
7148 void netdev_update_features(struct net_device *dev)
7149 {
7150 if (__netdev_update_features(dev))
7151 netdev_features_change(dev);
7152 }
7153 EXPORT_SYMBOL(netdev_update_features);
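
/*
 * Illustrative sketch only: how a driver typically reacts when a condition
 * that its ->ndo_fix_features() callback depends on has changed (for
 * example after an MTU or firmware change). The core re-runs the feature
 * fixups and sends NETDEV_FEAT_CHANGE only if dev->features really
 * changed. The helper name is hypothetical.
 */
static void __maybe_unused example_conditions_changed(struct net_device *dev)
{
	ASSERT_RTNL();
	netdev_update_features(dev);
}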
7154
7155 /**
7156 * netdev_change_features - recalculate device features
7157 * @dev: the device to check
7158 *
7159 * Recalculate dev->features set and send notifications even
7160 * if they have not changed. Should be called instead of
7161 * netdev_update_features() if also dev->vlan_features might
7162 * have changed to allow the changes to be propagated to stacked
7163 * VLAN devices.
7164 */
7165 void netdev_change_features(struct net_device *dev)
7166 {
7167 __netdev_update_features(dev);
7168 netdev_features_change(dev);
7169 }
7170 EXPORT_SYMBOL(netdev_change_features);
7171
7172 /**
7173 * netif_stacked_transfer_operstate - transfer operstate
7174 * @rootdev: the root or lower level device to transfer state from
7175 * @dev: the device to transfer operstate to
7176 *
7177 * Transfer operational state from root to device. This is normally
7178 * called when a stacking relationship exists between the root
7179 * device and the device (a leaf device).
7180 */
7181 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7182 struct net_device *dev)
7183 {
7184 if (rootdev->operstate == IF_OPER_DORMANT)
7185 netif_dormant_on(dev);
7186 else
7187 netif_dormant_off(dev);
7188
7189 if (netif_carrier_ok(rootdev)) {
7190 if (!netif_carrier_ok(dev))
7191 netif_carrier_on(dev);
7192 } else {
7193 if (netif_carrier_ok(dev))
7194 netif_carrier_off(dev);
7195 }
7196 }
7197 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7198
7199 #ifdef CONFIG_SYSFS
7200 static int netif_alloc_rx_queues(struct net_device *dev)
7201 {
7202 unsigned int i, count = dev->num_rx_queues;
7203 struct netdev_rx_queue *rx;
7204 size_t sz = count * sizeof(*rx);
7205
7206 BUG_ON(count < 1);
7207
7208 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7209 if (!rx) {
7210 rx = vzalloc(sz);
7211 if (!rx)
7212 return -ENOMEM;
7213 }
7214 dev->_rx = rx;
7215
7216 for (i = 0; i < count; i++)
7217 rx[i].dev = dev;
7218 return 0;
7219 }
7220 #endif
7221
7222 static void netdev_init_one_queue(struct net_device *dev,
7223 struct netdev_queue *queue, void *_unused)
7224 {
7225 /* Initialize queue lock */
7226 spin_lock_init(&queue->_xmit_lock);
7227 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7228 queue->xmit_lock_owner = -1;
7229 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7230 queue->dev = dev;
7231 #ifdef CONFIG_BQL
7232 dql_init(&queue->dql, HZ);
7233 #endif
7234 }
7235
7236 static void netif_free_tx_queues(struct net_device *dev)
7237 {
7238 kvfree(dev->_tx);
7239 }
7240
7241 static int netif_alloc_netdev_queues(struct net_device *dev)
7242 {
7243 unsigned int count = dev->num_tx_queues;
7244 struct netdev_queue *tx;
7245 size_t sz = count * sizeof(*tx);
7246
7247 if (count < 1 || count > 0xffff)
7248 return -EINVAL;
7249
7250 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7251 if (!tx) {
7252 tx = vzalloc(sz);
7253 if (!tx)
7254 return -ENOMEM;
7255 }
7256 dev->_tx = tx;
7257
7258 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7259 spin_lock_init(&dev->tx_global_lock);
7260
7261 return 0;
7262 }
7263
7264 void netif_tx_stop_all_queues(struct net_device *dev)
7265 {
7266 unsigned int i;
7267
7268 for (i = 0; i < dev->num_tx_queues; i++) {
7269 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7270 netif_tx_stop_queue(txq);
7271 }
7272 }
7273 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7274
7275 /**
7276 * register_netdevice - register a network device
7277 * @dev: device to register
7278 *
7279 * Take a completed network device structure and add it to the kernel
7280 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7281 * chain. 0 is returned on success. A negative errno code is returned
7282 * on a failure to set up the device, or if the name is a duplicate.
7283 *
7284 * Callers must hold the rtnl semaphore. You may want
7285 * register_netdev() instead of this.
7286 *
7287 * BUGS:
7288 * The locking appears insufficient to guarantee two parallel registers
7289 * will not get the same name.
7290 */
7291
7292 int register_netdevice(struct net_device *dev)
7293 {
7294 int ret;
7295 struct net *net = dev_net(dev);
7296
7297 BUG_ON(dev_boot_phase);
7298 ASSERT_RTNL();
7299
7300 might_sleep();
7301
7302 /* When net_devices are persistent, this will be fatal. */
7303 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7304 BUG_ON(!net);
7305
7306 spin_lock_init(&dev->addr_list_lock);
7307 netdev_set_addr_lockdep_class(dev);
7308
7309 ret = dev_get_valid_name(net, dev, dev->name);
7310 if (ret < 0)
7311 goto out;
7312
7313 /* Init, if this function is available */
7314 if (dev->netdev_ops->ndo_init) {
7315 ret = dev->netdev_ops->ndo_init(dev);
7316 if (ret) {
7317 if (ret > 0)
7318 ret = -EIO;
7319 goto out;
7320 }
7321 }
7322
7323 if (((dev->hw_features | dev->features) &
7324 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7325 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7326 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7327 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7328 ret = -EINVAL;
7329 goto err_uninit;
7330 }
7331
7332 ret = -EBUSY;
7333 if (!dev->ifindex)
7334 dev->ifindex = dev_new_index(net);
7335 else if (__dev_get_by_index(net, dev->ifindex))
7336 goto err_uninit;
7337
7338 /* Transfer changeable features to wanted_features and enable
7339 * software offloads (GSO and GRO).
7340 */
7341 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7342 dev->features |= NETIF_F_SOFT_FEATURES;
7343 dev->wanted_features = dev->features & dev->hw_features;
7344
7345 if (!(dev->flags & IFF_LOOPBACK))
7346 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7347
7348 /* If IPv4 TCP segmentation offload is supported we should also
7349 * allow the device to enable segmenting the frame with the option
7350 * of ignoring a static IP ID value. This doesn't enable the
7351 * feature itself but allows the user to enable it later.
7352 */
7353 if (dev->hw_features & NETIF_F_TSO)
7354 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7355 if (dev->vlan_features & NETIF_F_TSO)
7356 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7357 if (dev->mpls_features & NETIF_F_TSO)
7358 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7359 if (dev->hw_enc_features & NETIF_F_TSO)
7360 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7361
7362 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7363 */
7364 dev->vlan_features |= NETIF_F_HIGHDMA;
7365
7366 /* Make NETIF_F_SG inheritable to tunnel devices.
7367 */
7368 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7369
7370 /* Make NETIF_F_SG inheritable to MPLS.
7371 */
7372 dev->mpls_features |= NETIF_F_SG;
7373
7374 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7375 ret = notifier_to_errno(ret);
7376 if (ret)
7377 goto err_uninit;
7378
7379 ret = netdev_register_kobject(dev);
7380 if (ret)
7381 goto err_uninit;
7382 dev->reg_state = NETREG_REGISTERED;
7383
7384 __netdev_update_features(dev);
7385
7386 /*
7387 * Default initial state at registration is that the
7388 * device is present.
7389 */
7390
7391 set_bit(__LINK_STATE_PRESENT, &dev->state);
7392
7393 linkwatch_init_dev(dev);
7394
7395 dev_init_scheduler(dev);
7396 dev_hold(dev);
7397 list_netdevice(dev);
7398 add_device_randomness(dev->dev_addr, dev->addr_len);
7399
7400 /* If the device has permanent device address, driver should
7401 * set dev_addr and also addr_assign_type should be set to
7402 * NET_ADDR_PERM (default value).
7403 */
7404 if (dev->addr_assign_type == NET_ADDR_PERM)
7405 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7406
7407 /* Notify protocols that a new device appeared. */
7408 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7409 ret = notifier_to_errno(ret);
7410 if (ret) {
7411 rollback_registered(dev);
7412 dev->reg_state = NETREG_UNREGISTERED;
7413 }
7414 /*
7415 * Prevent userspace races by waiting until the network
7416 * device is fully setup before sending notifications.
7417 */
7418 if (!dev->rtnl_link_ops ||
7419 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7420 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7421
7422 out:
7423 return ret;
7424
7425 err_uninit:
7426 if (dev->netdev_ops->ndo_uninit)
7427 dev->netdev_ops->ndo_uninit(dev);
7428 goto out;
7429 }
7430 EXPORT_SYMBOL(register_netdevice);
7431
7432 /**
7433 * init_dummy_netdev - init a dummy network device for NAPI
7434 * @dev: device to init
7435 *
7436 * This takes a network device structure and initializes the minimum
7437 * number of fields so it can be used to schedule NAPI polls without
7438 * registering a full-blown interface. This is to be used by drivers
7439 * that need to tie several hardware interfaces to a single NAPI
7440 * poll scheduler due to HW limitations.
7441 */
7442 int init_dummy_netdev(struct net_device *dev)
7443 {
7444 /* Clear everything. Note we don't initialize spinlocks
7445 * as they aren't supposed to be taken by any of the
7446 * NAPI code and this dummy netdev is supposed to be
7447 * only ever used for NAPI polls
7448 */
7449 memset(dev, 0, sizeof(struct net_device));
7450
7451 /* make sure we BUG if trying to hit standard
7452 * register/unregister code path
7453 */
7454 dev->reg_state = NETREG_DUMMY;
7455
7456 /* NAPI wants this */
7457 INIT_LIST_HEAD(&dev->napi_list);
7458
7459 /* a dummy interface is started by default */
7460 set_bit(__LINK_STATE_PRESENT, &dev->state);
7461 set_bit(__LINK_STATE_START, &dev->state);
7462
7463 /* Note: We don't allocate pcpu_refcnt for dummy devices,
7464 * because users of this 'device' don't need to change
7465 * its refcount.
7466 */
7467
7468 return 0;
7469 }
7470 EXPORT_SYMBOL_GPL(init_dummy_netdev);
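
/*
 * Illustrative sketch only: a hypothetical adapter that funnels several
 * hardware queues into one NAPI context hung off a dummy netdev, as
 * described above. The struct and function names are assumptions.
 */
struct example_adapter {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static void __maybe_unused example_adapter_init(struct example_adapter *ad,
						int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, poll, NAPI_POLL_WEIGHT);
}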
7471
7472
7473 /**
7474 * register_netdev - register a network device
7475 * @dev: device to register
7476 *
7477 * Take a completed network device structure and add it to the kernel
7478 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7479 * chain. 0 is returned on success. A negative errno code is returned
7480 * on a failure to set up the device, or if the name is a duplicate.
7481 *
7482 * This is a wrapper around register_netdevice that takes the rtnl semaphore
7483 * and expands the device name if you passed a format string to
7484 * alloc_netdev.
7485 */
7486 int register_netdev(struct net_device *dev)
7487 {
7488 int err;
7489
7490 rtnl_lock();
7491 err = register_netdevice(dev);
7492 rtnl_unlock();
7493 return err;
7494 }
7495 EXPORT_SYMBOL(register_netdev);
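
/*
 * Illustrative sketch only: the usual allocate/register/error-unwind
 * pattern in a driver probe path. Assumes alloc_etherdev() from
 * <linux/etherdevice.h> is available here; the helper name is
 * hypothetical.
 */
static __maybe_unused struct net_device *example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area in this sketch */
	if (!dev)
		return NULL;

	err = register_netdev(dev);	/* takes and drops RTNL itself */
	if (err) {
		free_netdev(dev);	/* undo the allocation on failure */
		return NULL;
	}
	return dev;
}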
7496
7497 int netdev_refcnt_read(const struct net_device *dev)
7498 {
7499 int i, refcnt = 0;
7500
7501 for_each_possible_cpu(i)
7502 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7503 return refcnt;
7504 }
7505 EXPORT_SYMBOL(netdev_refcnt_read);
7506
7507 /**
7508 * netdev_wait_allrefs - wait until all references are gone.
7509 * @dev: target net_device
7510 *
7511 * This is called when unregistering network devices.
7512 *
7513 * Any protocol or device that holds a reference should register
7514 * for netdevice notification, and cleanup and put back the
7515 * reference if they receive an UNREGISTER event.
7516 * We can get stuck here if buggy protocols don't correctly
7517 * call dev_put.
7518 */
7519 static void netdev_wait_allrefs(struct net_device *dev)
7520 {
7521 unsigned long rebroadcast_time, warning_time;
7522 int refcnt;
7523
7524 linkwatch_forget_dev(dev);
7525
7526 rebroadcast_time = warning_time = jiffies;
7527 refcnt = netdev_refcnt_read(dev);
7528
7529 while (refcnt != 0) {
7530 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7531 rtnl_lock();
7532
7533 /* Rebroadcast unregister notification */
7534 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7535
7536 __rtnl_unlock();
7537 rcu_barrier();
7538 rtnl_lock();
7539
7540 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7541 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7542 &dev->state)) {
7543 /* We must not have linkwatch events
7544 * pending on unregister. If this
7545 * happens, we simply run the queue
7546 * unscheduled, resulting in a noop
7547 * for this device.
7548 */
7549 linkwatch_run_queue();
7550 }
7551
7552 __rtnl_unlock();
7553
7554 rebroadcast_time = jiffies;
7555 }
7556
7557 msleep(250);
7558
7559 refcnt = netdev_refcnt_read(dev);
7560
7561 if (time_after(jiffies, warning_time + 10 * HZ)) {
7562 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7563 dev->name, refcnt);
7564 warning_time = jiffies;
7565 }
7566 }
7567 }
7568
7569 /* The sequence is:
7570 *
7571 * rtnl_lock();
7572 * ...
7573 * register_netdevice(x1);
7574 * register_netdevice(x2);
7575 * ...
7576 * unregister_netdevice(y1);
7577 * unregister_netdevice(y2);
7578 * ...
7579 * rtnl_unlock();
7580 * free_netdev(y1);
7581 * free_netdev(y2);
7582 *
7583 * We are invoked by rtnl_unlock().
7584 * This allows us to deal with problems:
7585 * 1) We can delete sysfs objects which invoke hotplug
7586 * without deadlocking with linkwatch via keventd.
7587 * 2) Since we run with the RTNL semaphore not held, we can sleep
7588 * safely in order to wait for the netdev refcnt to drop to zero.
7589 *
7590 * We must not return until all unregister events added during
7591 * the interval the lock was held have been completed.
7592 */
7593 void netdev_run_todo(void)
7594 {
7595 struct list_head list;
7596
7597 /* Snapshot list, allow later requests */
7598 list_replace_init(&net_todo_list, &list);
7599
7600 __rtnl_unlock();
7601
7602
7603 /* Wait for rcu callbacks to finish before next phase */
7604 if (!list_empty(&list))
7605 rcu_barrier();
7606
7607 while (!list_empty(&list)) {
7608 struct net_device *dev
7609 = list_first_entry(&list, struct net_device, todo_list);
7610 list_del(&dev->todo_list);
7611
7612 rtnl_lock();
7613 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7614 __rtnl_unlock();
7615
7616 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7617 pr_err("network todo '%s' but state %d\n",
7618 dev->name, dev->reg_state);
7619 dump_stack();
7620 continue;
7621 }
7622
7623 dev->reg_state = NETREG_UNREGISTERED;
7624
7625 netdev_wait_allrefs(dev);
7626
7627 /* paranoia */
7628 BUG_ON(netdev_refcnt_read(dev));
7629 BUG_ON(!list_empty(&dev->ptype_all));
7630 BUG_ON(!list_empty(&dev->ptype_specific));
7631 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7632 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7633 WARN_ON(dev->dn_ptr);
7634
7635 if (dev->destructor)
7636 dev->destructor(dev);
7637
7638 /* Report a network device has been unregistered */
7639 rtnl_lock();
7640 dev_net(dev)->dev_unreg_count--;
7641 __rtnl_unlock();
7642 wake_up(&netdev_unregistering_wq);
7643
7644 /* Free network device */
7645 kobject_put(&dev->dev.kobj);
7646 }
7647 }
7648
7649 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7650 * all the same fields in the same order as net_device_stats, with only
7651 * the type differing, but rtnl_link_stats64 may have additional fields
7652 * at the end for newer counters.
7653 */
7654 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7655 const struct net_device_stats *netdev_stats)
7656 {
7657 #if BITS_PER_LONG == 64
7658 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7659 memcpy(stats64, netdev_stats, sizeof(*stats64));
7660 /* zero out counters that only exist in rtnl_link_stats64 */
7661 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7662 sizeof(*stats64) - sizeof(*netdev_stats));
7663 #else
7664 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7665 const unsigned long *src = (const unsigned long *)netdev_stats;
7666 u64 *dst = (u64 *)stats64;
7667
7668 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7669 for (i = 0; i < n; i++)
7670 dst[i] = src[i];
7671 /* zero out counters that only exist in rtnl_link_stats64 */
7672 memset((char *)stats64 + n * sizeof(u64), 0,
7673 sizeof(*stats64) - n * sizeof(u64));
7674 #endif
7675 }
7676 EXPORT_SYMBOL(netdev_stats_to_stats64);
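
/*
 * Illustrative sketch only: how a driver that keeps just the legacy
 * dev->stats counters can still fill a struct rtnl_link_stats64, for
 * example from its ->ndo_get_stats64() callback. The helper name is
 * hypothetical.
 */
static void __maybe_unused example_fill_stats64(struct net_device *dev,
						struct rtnl_link_stats64 *storage)
{
	/* Copies the common prefix and zeroes the 64-bit-only tail fields. */
	netdev_stats_to_stats64(storage, &dev->stats);
}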
7677
7678 /**
7679 * dev_get_stats - get network device statistics
7680 * @dev: device to get statistics from
7681 * @storage: place to store stats
7682 *
7683 * Get network statistics from device. Return @storage.
7684 * The device driver may provide its own method by setting
7685 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7686 * otherwise the internal statistics structure is used.
7687 */
7688 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7689 struct rtnl_link_stats64 *storage)
7690 {
7691 const struct net_device_ops *ops = dev->netdev_ops;
7692
7693 if (ops->ndo_get_stats64) {
7694 memset(storage, 0, sizeof(*storage));
7695 ops->ndo_get_stats64(dev, storage);
7696 } else if (ops->ndo_get_stats) {
7697 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7698 } else {
7699 netdev_stats_to_stats64(storage, &dev->stats);
7700 }
7701 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7702 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7703 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7704 return storage;
7705 }
7706 EXPORT_SYMBOL(dev_get_stats);
7707
7708 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7709 {
7710 struct netdev_queue *queue = dev_ingress_queue(dev);
7711
7712 #ifdef CONFIG_NET_CLS_ACT
7713 if (queue)
7714 return queue;
7715 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7716 if (!queue)
7717 return NULL;
7718 netdev_init_one_queue(dev, queue, NULL);
7719 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7720 queue->qdisc_sleeping = &noop_qdisc;
7721 rcu_assign_pointer(dev->ingress_queue, queue);
7722 #endif
7723 return queue;
7724 }
7725
7726 static const struct ethtool_ops default_ethtool_ops;
7727
7728 void netdev_set_default_ethtool_ops(struct net_device *dev,
7729 const struct ethtool_ops *ops)
7730 {
7731 if (dev->ethtool_ops == &default_ethtool_ops)
7732 dev->ethtool_ops = ops;
7733 }
7734 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7735
7736 void netdev_freemem(struct net_device *dev)
7737 {
7738 char *addr = (char *)dev - dev->padded;
7739
7740 kvfree(addr);
7741 }
7742
7743 /**
7744 * alloc_netdev_mqs - allocate network device
7745 * @sizeof_priv: size of private data to allocate space for
7746 * @name: device name format string
7747 * @name_assign_type: origin of device name
7748 * @setup: callback to initialize device
7749 * @txqs: the number of TX subqueues to allocate
7750 * @rxqs: the number of RX subqueues to allocate
7751 *
7752 * Allocates a struct net_device with private data area for driver use
7753 * and performs basic initialization. Also allocates subqueue structs
7754 * for each queue on the device.
7755 */
7756 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7757 unsigned char name_assign_type,
7758 void (*setup)(struct net_device *),
7759 unsigned int txqs, unsigned int rxqs)
7760 {
7761 struct net_device *dev;
7762 size_t alloc_size;
7763 struct net_device *p;
7764
7765 BUG_ON(strlen(name) >= sizeof(dev->name));
7766
7767 if (txqs < 1) {
7768 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7769 return NULL;
7770 }
7771
7772 #ifdef CONFIG_SYSFS
7773 if (rxqs < 1) {
7774 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7775 return NULL;
7776 }
7777 #endif
7778
7779 alloc_size = sizeof(struct net_device);
7780 if (sizeof_priv) {
7781 /* ensure 32-byte alignment of private area */
7782 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7783 alloc_size += sizeof_priv;
7784 }
7785 /* ensure 32-byte alignment of whole construct */
7786 alloc_size += NETDEV_ALIGN - 1;
7787
7788 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7789 if (!p)
7790 p = vzalloc(alloc_size);
7791 if (!p)
7792 return NULL;
7793
7794 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7795 dev->padded = (char *)dev - (char *)p;
7796
7797 dev->pcpu_refcnt = alloc_percpu(int);
7798 if (!dev->pcpu_refcnt)
7799 goto free_dev;
7800
7801 if (dev_addr_init(dev))
7802 goto free_pcpu;
7803
7804 dev_mc_init(dev);
7805 dev_uc_init(dev);
7806
7807 dev_net_set(dev, &init_net);
7808
7809 dev->gso_max_size = GSO_MAX_SIZE;
7810 dev->gso_max_segs = GSO_MAX_SEGS;
7811
7812 INIT_LIST_HEAD(&dev->napi_list);
7813 INIT_LIST_HEAD(&dev->unreg_list);
7814 INIT_LIST_HEAD(&dev->close_list);
7815 INIT_LIST_HEAD(&dev->link_watch_list);
7816 INIT_LIST_HEAD(&dev->adj_list.upper);
7817 INIT_LIST_HEAD(&dev->adj_list.lower);
7818 INIT_LIST_HEAD(&dev->ptype_all);
7819 INIT_LIST_HEAD(&dev->ptype_specific);
7820 #ifdef CONFIG_NET_SCHED
7821 hash_init(dev->qdisc_hash);
7822 #endif
7823 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7824 setup(dev);
7825
7826 if (!dev->tx_queue_len) {
7827 dev->priv_flags |= IFF_NO_QUEUE;
7828 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7829 }
7830
7831 dev->num_tx_queues = txqs;
7832 dev->real_num_tx_queues = txqs;
7833 if (netif_alloc_netdev_queues(dev))
7834 goto free_all;
7835
7836 #ifdef CONFIG_SYSFS
7837 dev->num_rx_queues = rxqs;
7838 dev->real_num_rx_queues = rxqs;
7839 if (netif_alloc_rx_queues(dev))
7840 goto free_all;
7841 #endif
7842
7843 strcpy(dev->name, name);
7844 dev->name_assign_type = name_assign_type;
7845 dev->group = INIT_NETDEV_GROUP;
7846 if (!dev->ethtool_ops)
7847 dev->ethtool_ops = &default_ethtool_ops;
7848
7849 nf_hook_ingress_init(dev);
7850
7851 return dev;
7852
7853 free_all:
7854 free_netdev(dev);
7855 return NULL;
7856
7857 free_pcpu:
7858 free_percpu(dev->pcpu_refcnt);
7859 free_dev:
7860 netdev_freemem(dev);
7861 return NULL;
7862 }
7863 EXPORT_SYMBOL(alloc_netdev_mqs);
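
/*
 * Illustrative sketch only: allocating a device with eight TX and eight RX
 * queues and no private area. The name template, queue counts and @setup
 * callback are assumptions of the sketch.
 */
static __maybe_unused struct net_device *
example_alloc_mq_dev(void (*setup)(struct net_device *))
{
	return alloc_netdev_mqs(0, "example%d", NET_NAME_UNKNOWN, setup, 8, 8);
}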
7864
7865 /**
7866 * free_netdev - free network device
7867 * @dev: device
7868 *
7869 * This function does the last stage of destroying an allocated device
7870 * interface. The reference to the device object is released.
7871 * If this is the last reference then it will be freed.
7872 * Must be called in process context.
7873 */
7874 void free_netdev(struct net_device *dev)
7875 {
7876 struct napi_struct *p, *n;
7877
7878 might_sleep();
7879 netif_free_tx_queues(dev);
7880 #ifdef CONFIG_SYSFS
7881 kvfree(dev->_rx);
7882 #endif
7883
7884 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7885
7886 /* Flush device addresses */
7887 dev_addr_flush(dev);
7888
7889 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7890 netif_napi_del(p);
7891
7892 free_percpu(dev->pcpu_refcnt);
7893 dev->pcpu_refcnt = NULL;
7894
7895 /* Compatibility with error handling in drivers */
7896 if (dev->reg_state == NETREG_UNINITIALIZED) {
7897 netdev_freemem(dev);
7898 return;
7899 }
7900
7901 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7902 dev->reg_state = NETREG_RELEASED;
7903
7904 /* will free via device release */
7905 put_device(&dev->dev);
7906 }
7907 EXPORT_SYMBOL(free_netdev);
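
/*
 * Sketch of the two places a driver typically calls this from (the
 * surrounding error handling is illustrative only):
 *
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);	/* never registered: memory is freed here */
 *
 *	...
 *
 *	unregister_netdev(dev);		/* normal teardown */
 *	free_netdev(dev);		/* releases the device reference */
 */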
7908
7909 /**
7910 * synchronize_net - Synchronize with packet receive processing
7911 *
7912 * Wait for packets currently being received to be done.
7913 * Does not block later packets from starting.
7914 */
7915 void synchronize_net(void)
7916 {
7917 might_sleep();
7918 if (rtnl_is_locked())
7919 synchronize_rcu_expedited();
7920 else
7921 synchronize_rcu();
7922 }
7923 EXPORT_SYMBOL(synchronize_net);
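
/*
 * A common caller pattern, sketched with a hypothetical RCU-protected
 * pointer ("some_cfg") and old object ("old"): unpublish the pointer,
 * let in-flight receive processing drain, then free the old object.
 *
 *	old = rtnl_dereference(dev->some_cfg);
 *	RCU_INIT_POINTER(dev->some_cfg, NULL);
 *	synchronize_net();
 *	kfree(old);
 */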
7924
7925 /**
7926 * unregister_netdevice_queue - remove device from the kernel
7927 * @dev: device
7928 * @head: list
7929 *
7930 * This function shuts down a device interface and removes it
7931 * from the kernel tables.
7932 * If head is not NULL, the device is queued to be unregistered later.
7933 *
7934 * Callers must hold the rtnl semaphore. You may want
7935 * unregister_netdev() instead of this.
7936 */
7937
7938 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7939 {
7940 ASSERT_RTNL();
7941
7942 if (head) {
7943 list_move_tail(&dev->unreg_list, head);
7944 } else {
7945 rollback_registered(dev);
7946 /* Finish processing unregister after unlock */
7947 net_set_todo(dev);
7948 }
7949 }
7950 EXPORT_SYMBOL(unregister_netdevice_queue);
7951
7952 /**
7953 * unregister_netdevice_many - unregister many devices
7954 * @head: list of devices
7955 *
7956 * Note: As most callers use a stack-allocated list_head,
7957 * we force a list_del() to make sure the stack won't be corrupted later.
7958 */
7959 void unregister_netdevice_many(struct list_head *head)
7960 {
7961 struct net_device *dev;
7962
7963 if (!list_empty(head)) {
7964 rollback_registered_many(head);
7965 list_for_each_entry(dev, head, unreg_list)
7966 net_set_todo(dev);
7967 list_del(head);
7968 }
7969 }
7970 EXPORT_SYMBOL(unregister_netdevice_many);
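
/*
 * Sketch of batched unregistration, assuming the caller already holds the
 * RTNL lock ("devs" and "n" are hypothetical):
 *
 *	LIST_HEAD(kill_list);
 *	int i;
 *
 *	for (i = 0; i < n; i++)
 *		unregister_netdevice_queue(devs[i], &kill_list);
 *	unregister_netdevice_many(&kill_list);
 */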
7971
7972 /**
7973 * unregister_netdev - remove device from the kernel
7974 * @dev: device
7975 *
7976 * This function shuts down a device interface and removes it
7977 * from the kernel tables.
7978 *
7979 * This is just a wrapper for unregister_netdevice that takes
7980 * the rtnl semaphore. In general you want to use this and not
7981 * unregister_netdevice.
7982 */
7983 void unregister_netdev(struct net_device *dev)
7984 {
7985 rtnl_lock();
7986 unregister_netdevice(dev);
7987 rtnl_unlock();
7988 }
7989 EXPORT_SYMBOL(unregister_netdev);
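
/*
 * Sketch of the difference between the two entry points:
 *
 *	rtnl_lock();
 *	unregister_netdevice(dev);	/* caller already holds RTNL */
 *	rtnl_unlock();
 *
 *	unregister_netdev(dev);		/* takes and releases RTNL itself */
 */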
7990
7991 /**
7992 * dev_change_net_namespace - move device to a different network namespace
7993 * @dev: device
7994 * @net: network namespace
7995 * @pat: If not NULL name pattern to try if the current device name
7996 * is already taken in the destination network namespace.
7997 *
7998 * This function shuts down a device interface and moves it
7999 * to a new network namespace. On success 0 is returned, on
8000 * failure a negative errno code is returned.
8001 *
8002 * Callers must hold the rtnl semaphore.
8003 */
8004
8005 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
8006 {
8007 int err;
8008
8009 ASSERT_RTNL();
8010
8011 /* Don't allow namespace local devices to be moved. */
8012 err = -EINVAL;
8013 if (dev->features & NETIF_F_NETNS_LOCAL)
8014 goto out;
8015
8016 /* Ensure the device has been registered */
8017 if (dev->reg_state != NETREG_REGISTERED)
8018 goto out;
8019
8020 /* Get out if there is nothing to do */
8021 err = 0;
8022 if (net_eq(dev_net(dev), net))
8023 goto out;
8024
8025 /* Pick the destination device name, and ensure
8026 * we can use it in the destination network namespace.
8027 */
8028 err = -EEXIST;
8029 if (__dev_get_by_name(net, dev->name)) {
8030 /* We get here if we can't use the current device name */
8031 if (!pat)
8032 goto out;
8033 if (dev_get_valid_name(net, dev, pat) < 0)
8034 goto out;
8035 }
8036
8037 /*
8038 * And now a mini version of register_netdevice() and unregister_netdevice().
8039 */
8040
8041 /* If device is running close it first. */
8042 dev_close(dev);
8043
8044 /* And unlink it from device chain */
8045 err = -ENODEV;
8046 unlist_netdevice(dev);
8047
8048 synchronize_net();
8049
8050 /* Shutdown queueing discipline. */
8051 dev_shutdown(dev);
8052
8053 /* Notify protocols that we are about to destroy
8054 this device. They should clean up all their state.
8055
8056 Note that dev->reg_state stays at NETREG_REGISTERED.
8057 This is wanted because this way 8021q and macvlan know
8058 the device is just moving and can keep their slaves up.
8059 */
8060 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8061 rcu_barrier();
8062 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8063 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
8064
8065 /*
8066 * Flush the unicast and multicast chains
8067 */
8068 dev_uc_flush(dev);
8069 dev_mc_flush(dev);
8070
8071 /* Send a netdev-removed uevent to the old namespace */
8072 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8073 netdev_adjacent_del_links(dev);
8074
8075 /* Actually switch the network namespace */
8076 dev_net_set(dev, net);
8077
8078 /* If there is an ifindex conflict assign a new one */
8079 if (__dev_get_by_index(net, dev->ifindex))
8080 dev->ifindex = dev_new_index(net);
8081
8082 /* Send a netdev-add uevent to the new namespace */
8083 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8084 netdev_adjacent_add_links(dev);
8085
8086 /* Fixup kobjects */
8087 err = device_rename(&dev->dev, dev->name);
8088 WARN_ON(err);
8089
8090 /* Add the device back in the hashes */
8091 list_netdevice(dev);
8092
8093 /* Notify protocols that a new device appeared. */
8094 call_netdevice_notifiers(NETDEV_REGISTER, dev);
8095
8096 /*
8097 * Prevent userspace races by waiting until the network
8098 * device is fully set up before sending notifications.
8099 */
8100 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8101
8102 synchronize_net();
8103 err = 0;
8104 out:
8105 return err;
8106 }
8107 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
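
/*
 * Sketch of a caller, holding RTNL, moving a device into another namespace
 * and falling back to a "dev%d" name if its current name is taken there
 * ("target" is a hypothetical struct net *):
 *
 *	err = dev_change_net_namespace(dev, target, "dev%d");
 *	if (err)
 *		netdev_warn(dev, "failed to change namespace: %d\n", err);
 */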
8108
8109 static int dev_cpu_dead(unsigned int oldcpu)
8110 {
8111 struct sk_buff **list_skb;
8112 struct sk_buff *skb;
8113 unsigned int cpu;
8114 struct softnet_data *sd, *oldsd;
8115
8116 local_irq_disable();
8117 cpu = smp_processor_id();
8118 sd = &per_cpu(softnet_data, cpu);
8119 oldsd = &per_cpu(softnet_data, oldcpu);
8120
8121 /* Find end of our completion_queue. */
8122 list_skb = &sd->completion_queue;
8123 while (*list_skb)
8124 list_skb = &(*list_skb)->next;
8125 /* Append completion queue from offline CPU. */
8126 *list_skb = oldsd->completion_queue;
8127 oldsd->completion_queue = NULL;
8128
8129 /* Append output queue from offline CPU. */
8130 if (oldsd->output_queue) {
8131 *sd->output_queue_tailp = oldsd->output_queue;
8132 sd->output_queue_tailp = oldsd->output_queue_tailp;
8133 oldsd->output_queue = NULL;
8134 oldsd->output_queue_tailp = &oldsd->output_queue;
8135 }
8136 /* Append NAPI poll list from offline CPU, with one exception:
8137 * process_backlog() must be called by the CPU owning the percpu backlog.
8138 * We properly handle process_queue & input_pkt_queue later.
8139 */
8140 while (!list_empty(&oldsd->poll_list)) {
8141 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8142 struct napi_struct,
8143 poll_list);
8144
8145 list_del_init(&napi->poll_list);
8146 if (napi->poll == process_backlog)
8147 napi->state = 0;
8148 else
8149 ____napi_schedule(sd, napi);
8150 }
8151
8152 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8153 local_irq_enable();
8154
8155 /* Process offline CPU's input_pkt_queue */
8156 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8157 netif_rx_ni(skb);
8158 input_queue_head_incr(oldsd);
8159 }
8160 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8161 netif_rx_ni(skb);
8162 input_queue_head_incr(oldsd);
8163 }
8164
8165 return 0;
8166 }
8167
8168 /**
8169 * netdev_increment_features - increment feature set by one
8170 * @all: current feature set
8171 * @one: new feature set
8172 * @mask: mask feature set
8173 *
8174 * Computes a new feature set after adding a device with feature set
8175 * @one to the master device with current feature set @all. Will not
8176 * enable anything that is off in @mask. Returns the new feature set.
8177 */
8178 netdev_features_t netdev_increment_features(netdev_features_t all,
8179 netdev_features_t one, netdev_features_t mask)
8180 {
8181 if (mask & NETIF_F_HW_CSUM)
8182 mask |= NETIF_F_CSUM_MASK;
8183 mask |= NETIF_F_VLAN_CHALLENGED;
8184
8185 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8186 all &= one | ~NETIF_F_ALL_FOR_ALL;
8187
8188 /* If one device supports hw checksumming, set for all. */
8189 if (all & NETIF_F_HW_CSUM)
8190 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8191
8192 return all;
8193 }
8194 EXPORT_SYMBOL(netdev_increment_features);
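
/*
 * Sketch of how a master driver (bonding, team, bridge) might fold the
 * features of its lower devices together; "bond", "slave", "mask" and the
 * list walk are illustrative only:
 *
 *	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(slave, &bond->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 */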
8195
8196 static struct hlist_head * __net_init netdev_create_hash(void)
8197 {
8198 int i;
8199 struct hlist_head *hash;
8200
8201 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8202 if (hash != NULL)
8203 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8204 INIT_HLIST_HEAD(&hash[i]);
8205
8206 return hash;
8207 }
8208
8209 /* Initialize per network namespace state */
8210 static int __net_init netdev_init(struct net *net)
8211 {
8212 if (net != &init_net)
8213 INIT_LIST_HEAD(&net->dev_base_head);
8214
8215 net->dev_name_head = netdev_create_hash();
8216 if (net->dev_name_head == NULL)
8217 goto err_name;
8218
8219 net->dev_index_head = netdev_create_hash();
8220 if (net->dev_index_head == NULL)
8221 goto err_idx;
8222
8223 return 0;
8224
8225 err_idx:
8226 kfree(net->dev_name_head);
8227 err_name:
8228 return -ENOMEM;
8229 }
8230
8231 /**
8232 * netdev_drivername - network driver for the device
8233 * @dev: network device
8234 *
8235 * Determine network driver for device.
8236 */
8237 const char *netdev_drivername(const struct net_device *dev)
8238 {
8239 const struct device_driver *driver;
8240 const struct device *parent;
8241 const char *empty = "";
8242
8243 parent = dev->dev.parent;
8244 if (!parent)
8245 return empty;
8246
8247 driver = parent->driver;
8248 if (driver && driver->name)
8249 return driver->name;
8250 return empty;
8251 }
8252
8253 static void __netdev_printk(const char *level, const struct net_device *dev,
8254 struct va_format *vaf)
8255 {
8256 if (dev && dev->dev.parent) {
8257 dev_printk_emit(level[1] - '0',
8258 dev->dev.parent,
8259 "%s %s %s%s: %pV",
8260 dev_driver_string(dev->dev.parent),
8261 dev_name(dev->dev.parent),
8262 netdev_name(dev), netdev_reg_state(dev),
8263 vaf);
8264 } else if (dev) {
8265 printk("%s%s%s: %pV",
8266 level, netdev_name(dev), netdev_reg_state(dev), vaf);
8267 } else {
8268 printk("%s(NULL net_device): %pV", level, vaf);
8269 }
8270 }
8271
8272 void netdev_printk(const char *level, const struct net_device *dev,
8273 const char *format, ...)
8274 {
8275 struct va_format vaf;
8276 va_list args;
8277
8278 va_start(args, format);
8279
8280 vaf.fmt = format;
8281 vaf.va = &args;
8282
8283 __netdev_printk(level, dev, &vaf);
8284
8285 va_end(args);
8286 }
8287 EXPORT_SYMBOL(netdev_printk);
8288
8289 #define define_netdev_printk_level(func, level) \
8290 void func(const struct net_device *dev, const char *fmt, ...) \
8291 { \
8292 struct va_format vaf; \
8293 va_list args; \
8294 \
8295 va_start(args, fmt); \
8296 \
8297 vaf.fmt = fmt; \
8298 vaf.va = &args; \
8299 \
8300 __netdev_printk(level, dev, &vaf); \
8301 \
8302 va_end(args); \
8303 } \
8304 EXPORT_SYMBOL(func);
8305
8306 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8307 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8308 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8309 define_netdev_printk_level(netdev_err, KERN_ERR);
8310 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8311 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8312 define_netdev_printk_level(netdev_info, KERN_INFO);
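
/*
 * The macro above generates netdev_emerg() through netdev_info(). Drivers
 * use them like printk(), but with the driver and interface names prefixed,
 * e.g. ("status" is a hypothetical variable):
 *
 *	netdev_err(dev, "TX stall, resetting (status %#x)\n", status);
 */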
8313
8314 static void __net_exit netdev_exit(struct net *net)
8315 {
8316 kfree(net->dev_name_head);
8317 kfree(net->dev_index_head);
8318 }
8319
8320 static struct pernet_operations __net_initdata netdev_net_ops = {
8321 .init = netdev_init,
8322 .exit = netdev_exit,
8323 };
8324
8325 static void __net_exit default_device_exit(struct net *net)
8326 {
8327 struct net_device *dev, *aux;
8328 /*
8329 * Push all migratable network devices back to the
8330 * initial network namespace
8331 */
8332 rtnl_lock();
8333 for_each_netdev_safe(net, dev, aux) {
8334 int err;
8335 char fb_name[IFNAMSIZ];
8336
8337 /* Ignore unmoveable devices (i.e. loopback) */
8338 if (dev->features & NETIF_F_NETNS_LOCAL)
8339 continue;
8340
8341 /* Leave virtual devices for the generic cleanup */
8342 if (dev->rtnl_link_ops)
8343 continue;
8344
8345 /* Push remaining network devices to init_net */
8346 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8347 err = dev_change_net_namespace(dev, &init_net, fb_name);
8348 if (err) {
8349 pr_emerg("%s: failed to move %s to init_net: %d\n",
8350 __func__, dev->name, err);
8351 BUG();
8352 }
8353 }
8354 rtnl_unlock();
8355 }
8356
8357 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8358 {
8359 /* Return with the rtnl_lock held when there are no network
8360 * devices unregistering in any network namespace in net_list.
8361 */
8362 struct net *net;
8363 bool unregistering;
8364 DEFINE_WAIT_FUNC(wait, woken_wake_function);
8365
8366 add_wait_queue(&netdev_unregistering_wq, &wait);
8367 for (;;) {
8368 unregistering = false;
8369 rtnl_lock();
8370 list_for_each_entry(net, net_list, exit_list) {
8371 if (net->dev_unreg_count > 0) {
8372 unregistering = true;
8373 break;
8374 }
8375 }
8376 if (!unregistering)
8377 break;
8378 __rtnl_unlock();
8379
8380 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8381 }
8382 remove_wait_queue(&netdev_unregistering_wq, &wait);
8383 }
8384
8385 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8386 {
8387 /* At exit all network devices must be removed from a network
8388 * namespace. Do this in the reverse order of registration.
8389 * Do this across as many network namespaces as possible to
8390 * improve batching efficiency.
8391 */
8392 struct net_device *dev;
8393 struct net *net;
8394 LIST_HEAD(dev_kill_list);
8395
8396 /* To prevent network device cleanup code from dereferencing
8397 * loopback devices or network devices that have been freed,
8398 * wait here for all pending unregistrations to complete
8399 * before unregistering the loopback device and allowing the
8400 * network namespace to be freed.
8401 *
8402 * The netdev todo list containing all network device
8403 * unregistrations that happen in default_device_exit_batch
8404 * will run in the rtnl_unlock() at the end of
8405 * default_device_exit_batch.
8406 */
8407 rtnl_lock_unregistering(net_list);
8408 list_for_each_entry(net, net_list, exit_list) {
8409 for_each_netdev_reverse(net, dev) {
8410 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8411 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8412 else
8413 unregister_netdevice_queue(dev, &dev_kill_list);
8414 }
8415 }
8416 unregister_netdevice_many(&dev_kill_list);
8417 rtnl_unlock();
8418 }
8419
8420 static struct pernet_operations __net_initdata default_device_ops = {
8421 .exit = default_device_exit,
8422 .exit_batch = default_device_exit_batch,
8423 };
8424
8425 /*
8426 * Initialize the DEV module. At boot time this walks the device list and
8427 * unhooks any devices that fail to initialise (normally hardware not
8428 * present) and leaves us with a valid list of present and active devices.
8429 *
8430 */
8431
8432 /*
8433 * This is called single threaded during boot, so no need
8434 * to take the rtnl semaphore.
8435 */
8436 static int __init net_dev_init(void)
8437 {
8438 int i, rc = -ENOMEM;
8439
8440 BUG_ON(!dev_boot_phase);
8441
8442 if (dev_proc_init())
8443 goto out;
8444
8445 if (netdev_kobject_init())
8446 goto out;
8447
8448 INIT_LIST_HEAD(&ptype_all);
8449 for (i = 0; i < PTYPE_HASH_SIZE; i++)
8450 INIT_LIST_HEAD(&ptype_base[i]);
8451
8452 INIT_LIST_HEAD(&offload_base);
8453
8454 if (register_pernet_subsys(&netdev_net_ops))
8455 goto out;
8456
8457 /*
8458 * Initialise the packet receive queues.
8459 */
8460
8461 for_each_possible_cpu(i) {
8462 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8463 struct softnet_data *sd = &per_cpu(softnet_data, i);
8464
8465 INIT_WORK(flush, flush_backlog);
8466
8467 skb_queue_head_init(&sd->input_pkt_queue);
8468 skb_queue_head_init(&sd->process_queue);
8469 INIT_LIST_HEAD(&sd->poll_list);
8470 sd->output_queue_tailp = &sd->output_queue;
8471 #ifdef CONFIG_RPS
8472 sd->csd.func = rps_trigger_softirq;
8473 sd->csd.info = sd;
8474 sd->cpu = i;
8475 #endif
8476
8477 sd->backlog.poll = process_backlog;
8478 sd->backlog.weight = weight_p;
8479 }
8480
8481 dev_boot_phase = 0;
8482
8483 /* The loopback device is special: if any other network device
8484 * is present in a network namespace, the loopback device must
8485 * be present too. Since we now dynamically allocate and free
8486 * the loopback device, ensure this invariant is maintained by
8487 * keeping the loopback device as the first device on the
8488 * list of network devices, so that it is the first device
8489 * that appears and the last network device
8490 * that disappears.
8491 */
8492 if (register_pernet_device(&loopback_net_ops))
8493 goto out;
8494
8495 if (register_pernet_device(&default_device_ops))
8496 goto out;
8497
8498 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8499 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8500
8501 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8502 NULL, dev_cpu_dead);
8503 WARN_ON(rc < 0);
8504 dst_subsys_init();
8505 rc = 0;
8506 out:
8507 return rc;
8508 }
8509
8510 subsys_initcall(net_dev_init);