net/core/dev.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *      NET3    Protocol independent device support routines.
   4  *
   5  *      Derived from the non IP parts of dev.c 1.0.19
   6  *              Authors:        Ross Biro
   7  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   8  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
   9  *
  10  *      Additional Authors:
  11  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  12  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  13  *              David Hinds <dahinds@users.sourceforge.net>
  14  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  15  *              Adam Sulmicki <adam@cfar.umd.edu>
  16  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  17  *
  18  *      Changes:
  19  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  20  *                                      to 2 if register_netdev gets called
  21  *                                      before net_dev_init & also removed a
  22  *                                      few lines of code in the process.
  23  *              Alan Cox        :       device private ioctl copies fields back.
  24  *              Alan Cox        :       Transmit queue code does relevant
  25  *                                      stunts to keep the queue safe.
  26  *              Alan Cox        :       Fixed double lock.
  27  *              Alan Cox        :       Fixed promisc NULL pointer trap
  28  *              ????????        :       Support the full private ioctl range
  29  *              Alan Cox        :       Moved ioctl permission check into
  30  *                                      drivers
  31  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  32  *              Alan Cox        :       100 backlog just doesn't cut it when
  33  *                                      you start doing multicast video 8)
  34  *              Alan Cox        :       Rewrote net_bh and list manager.
  35  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  36  *              Alan Cox        :       Took out transmit every packet pass
  37  *                                      Saved a few bytes in the ioctl handler
  38  *              Alan Cox        :       Network driver sets packet type before
  39  *                                      calling netif_rx. Saves a function
  40  *                                      call a packet.
  41  *              Alan Cox        :       Hashed net_bh()
  42  *              Richard Kooijman:       Timestamp fixes.
  43  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  44  *              Alan Cox        :       Device lock protection.
  45  *              Alan Cox        :       Fixed nasty side effect of device close
  46  *                                      changes.
  47  *              Rudi Cilibrasi  :       Pass the right thing to
  48  *                                      set_mac_address()
  49  *              Dave Miller     :       32bit quantity for the device lock to
  50  *                                      make it work out on a Sparc.
  51  *              Bjorn Ekwall    :       Added KERNELD hack.
  52  *              Alan Cox        :       Cleaned up the backlog initialise.
  53  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  54  *                                      1 device.
  55  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  56  *                                      is no device open function.
  57  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  58  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  59  *              Cyrus Durgin    :       Cleaned for KMOD
  60  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  61  *                                      A network device unload needs to purge
  62  *                                      the backlog queue.
  63  *      Paul Rusty Russell      :       SIOCSIFNAME
  64  *              Pekka Riikonen  :       Netdev boot-time settings code
  65  *              Andrew Morton   :       Make unregister_netdevice wait
  66  *                                      indefinitely on dev->refcnt
  67  *              J Hadi Salim    :       - Backlog queue sampling
  68  *                                      - netif_rx() feedback
  69  */
  70
  71 #include <linux/uaccess.h>
  72 #include <linux/bitops.h>
  73 #include <linux/capability.h>
  74 #include <linux/cpu.h>
  75 #include <linux/types.h>
  76 #include <linux/kernel.h>
  77 #include <linux/hash.h>
  78 #include <linux/slab.h>
  79 #include <linux/sched.h>
  80 #include <linux/sched/mm.h>
  81 #include <linux/mutex.h>
  82 #include <linux/string.h>
  83 #include <linux/mm.h>
  84 #include <linux/socket.h>
  85 #include <linux/sockios.h>
  86 #include <linux/errno.h>
  87 #include <linux/interrupt.h>
  88 #include <linux/if_ether.h>
  89 #include <linux/netdevice.h>
  90 #include <linux/etherdevice.h>
  91 #include <linux/ethtool.h>
  92 #include <linux/skbuff.h>
  93 #include <linux/bpf.h>
  94 #include <linux/bpf_trace.h>
  95 #include <net/net_namespace.h>
  96 #include <net/sock.h>
  97 #include <net/busy_poll.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/stat.h>
 100 #include <net/dst.h>
 101 #include <net/dst_metadata.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/pkt_cls.h>
 104 #include <net/checksum.h>
 105 #include <net/xfrm.h>
 106 #include <linux/highmem.h>
 107 #include <linux/init.h>
 108 #include <linux/module.h>
 109 #include <linux/netpoll.h>
 110 #include <linux/rcupdate.h>
 111 #include <linux/delay.h>
 112 #include <net/iw_handler.h>
 113 #include <asm/current.h>
 114 #include <linux/audit.h>
 115 #include <linux/dmaengine.h>
 116 #include <linux/err.h>
 117 #include <linux/ctype.h>
 118 #include <linux/if_arp.h>
 119 #include <linux/if_vlan.h>
 120 #include <linux/ip.h>
 121 #include <net/ip.h>
 122 #include <net/mpls.h>
 123 #include <linux/ipv6.h>
 124 #include <linux/in.h>
 125 #include <linux/jhash.h>
 126 #include <linux/random.h>
 127 #include <trace/events/napi.h>
 128 #include <trace/events/net.h>
 129 #include <trace/events/skb.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138 #include <linux/netfilter_ingress.h>
 139 #include <linux/crash_dump.h>
 140 #include <linux/sctp.h>
 141 #include <net/udp_tunnel.h>
 142 #include <linux/net_namespace.h>
 143 #include <linux/indirect_call_wrapper.h>
 144 #include <net/devlink.h>
 145
 146 #include "net-sysfs.h"
 147
/*
 * NOTE(review): presumably the cap on the number of skbs GRO keeps pending
 * at once; the users are outside this part of the file -- confirm there.
 */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 * ptype_lock serialises insertions into and removals from the packet_type
 * lists (taken by dev_add_pack() and __dev_remove_pack()); offload_lock does
 * the same for offload_base (dev_add_offload() and __dev_remove_offload()).
 * The lists themselves are modified with the _rcu list primitives.
 */
static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
/*
 * Handlers for a specific protocol that are not bound to a device, bucketed
 * by ntohs(type) & PTYPE_HASH_MASK (see ptype_head()).
 */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* ETH_P_ALL handlers that are not bound to a device (see ptype_head()). */
struct list_head ptype_all __read_mostly;       /* Taps */
/* Offload handlers, kept in ascending ->priority order by dev_add_offload(). */
static struct list_head offload_base __read_mostly;

/* Forward declarations of static helpers that are used before they are defined. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
                                         struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);
 166
 167 /*
 168  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 169  * semaphore.
 170  *
 171  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 172  *
 173  * Writers must hold the rtnl semaphore while they loop through the
 174  * dev_base_head list, and hold dev_base_lock for writing when they do the
 175  * actual updates.  This allows pure readers to access the list even
 176  * while a writer is preparing to update it.
 177  *
 178  * To put it another way, dev_base_lock is held for writing only to
 179  * protect against pure readers; the rtnl semaphore provides the
 180  * protection against other writers.
 181  *
 182  * See, for example usages, register_netdevice() and
 183  * unregister_netdevice(), which must be called with the rtnl
 184  * semaphore held.
 185  */
/* Taken for writing (BH-safe) by list_netdevice() and unlist_netdevice(). */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/*
 * NOTE(review): presumably serialises updates of a device's ifalias; the
 * users are outside this part of the file -- confirm there.
 */
static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* Starts at NR_CPUS; NOTE(review): presumably ids below that are reserved -- confirm. */
static unsigned int napi_gen_id = NR_CPUS;
/* Hash table declared with 8 hash bits. */
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

/*
 * NOTE(review): presumably lets readers detect a concurrent device rename;
 * the users are outside this part of the file -- confirm there.
 */
static seqcount_t devnet_rename_seq;
 198
 199 static inline void dev_base_seq_inc(struct net *net)
 200 {
 201         while (++net->dev_base_seq == 0)
 202                 ;
 203 }
 204
 205 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 206 {
 207         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 208
 209         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 210 }
 211
 212 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 213 {
 214         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 215 }
 216
 217 static inline void rps_lock(struct softnet_data *sd)
 218 {
 219 #ifdef CONFIG_RPS
 220         spin_lock(&sd->input_pkt_queue.lock);
 221 #endif
 222 }
 223
 224 static inline void rps_unlock(struct softnet_data *sd)
 225 {
 226 #ifdef CONFIG_RPS
 227         spin_unlock(&sd->input_pkt_queue.lock);
 228 #endif
 229 }
 230
 231 static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
 232                                                        const char *name)
 233 {
 234         struct netdev_name_node *name_node;
 235
 236         name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
 237         if (!name_node)
 238                 return NULL;
 239         INIT_HLIST_NODE(&name_node->hlist);
 240         name_node->dev = dev;
 241         name_node->name = name;
 242         return name_node;
 243 }
 244
 245 static struct netdev_name_node *
 246 netdev_name_node_head_alloc(struct net_device *dev)
 247 {
 248         struct netdev_name_node *name_node;
 249
 250         name_node = netdev_name_node_alloc(dev, dev->name);
 251         if (!name_node)
 252                 return NULL;
 253         INIT_LIST_HEAD(&name_node->list);
 254         return name_node;
 255 }
 256
 257 static void netdev_name_node_free(struct netdev_name_node *name_node)
 258 {
 259         kfree(name_node);
 260 }
 261
 262 static void netdev_name_node_add(struct net *net,
 263                                  struct netdev_name_node *name_node)
 264 {
 265         hlist_add_head_rcu(&name_node->hlist,
 266                            dev_name_hash(net, name_node->name));
 267 }
 268
 269 static void netdev_name_node_del(struct netdev_name_node *name_node)
 270 {
 271         hlist_del_rcu(&name_node->hlist);
 272 }
 273
 274 static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
 275                                                         const char *name)
 276 {
 277         struct hlist_head *head = dev_name_hash(net, name);
 278         struct netdev_name_node *name_node;
 279
 280         hlist_for_each_entry(name_node, head, hlist)
 281                 if (!strcmp(name_node->name, name))
 282                         return name_node;
 283         return NULL;
 284 }
 285
 286 static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
 287                                                             const char *name)
 288 {
 289         struct hlist_head *head = dev_name_hash(net, name);
 290         struct netdev_name_node *name_node;
 291
 292         hlist_for_each_entry_rcu(name_node, head, hlist)
 293                 if (!strcmp(name_node->name, name))
 294                         return name_node;
 295         return NULL;
 296 }
 297
 298 int netdev_name_node_alt_create(struct net_device *dev, const char *name)
 299 {
 300         struct netdev_name_node *name_node;
 301         struct net *net = dev_net(dev);
 302
 303         name_node = netdev_name_node_lookup(net, name);
 304         if (name_node)
 305                 return -EEXIST;
 306         name_node = netdev_name_node_alloc(dev, name);
 307         if (!name_node)
 308                 return -ENOMEM;
 309         netdev_name_node_add(net, name_node);
 310         /* The node that holds dev->name acts as a head of per-device list. */
 311         list_add_tail(&name_node->list, &dev->name_node->list);
 312
 313         return 0;
 314 }
 315 EXPORT_SYMBOL(netdev_name_node_alt_create);
 316
 317 static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
 318 {
 319         list_del(&name_node->list);
 320         netdev_name_node_del(name_node);
 321         kfree(name_node->name);
 322         netdev_name_node_free(name_node);
 323 }
 324
 325 int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
 326 {
 327         struct netdev_name_node *name_node;
 328         struct net *net = dev_net(dev);
 329
 330         name_node = netdev_name_node_lookup(net, name);
 331         if (!name_node)
 332                 return -ENOENT;
 333         __netdev_name_node_alt_destroy(name_node);
 334
 335         return 0;
 336 }
 337 EXPORT_SYMBOL(netdev_name_node_alt_destroy);
 338
 339 static void netdev_name_node_alt_flush(struct net_device *dev)
 340 {
 341         struct netdev_name_node *name_node, *tmp;
 342
 343         list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
 344                 __netdev_name_node_alt_destroy(name_node);
 345 }
 346
 347 /* Device list insertion */
 348 static void list_netdevice(struct net_device *dev)
 349 {
 350         struct net *net = dev_net(dev);
 351
 352         ASSERT_RTNL();
 353
 354         write_lock_bh(&dev_base_lock);
 355         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 356         netdev_name_node_add(net, dev->name_node);
 357         hlist_add_head_rcu(&dev->index_hlist,
 358                            dev_index_hash(net, dev->ifindex));
 359         write_unlock_bh(&dev_base_lock);
 360
 361         dev_base_seq_inc(net);
 362 }
 363
 364 /* Device list removal
 365  * caller must respect a RCU grace period before freeing/reusing dev
 366  */
 367 static void unlist_netdevice(struct net_device *dev)
 368 {
 369         ASSERT_RTNL();
 370
 371         /* Unlink dev from the device chain */
 372         write_lock_bh(&dev_base_lock);
 373         list_del_rcu(&dev->dev_list);
 374         netdev_name_node_del(dev->name_node);
 375         hlist_del_rcu(&dev->index_hlist);
 376         write_unlock_bh(&dev_base_lock);
 377
 378         dev_base_seq_inc(dev_net(dev));
 379 }
 380
/*
 *      Our notifier list: the chain of netdevice event notifiers.
 *      It is declared as a raw notifier head.
 *      NOTE(review): registration and invocation are presumably serialised
 *      by the callers (RTNL?) -- confirm at the call sites.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      One instance exists per CPU; the symbol is exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 394
 395 #ifdef CONFIG_LOCKDEP
 396 /*
 397  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 398  * according to dev->type
 399  */
/*
 * Device types that get a lockdep class of their own.  netdev_lock_name[]
 * below is a parallel array: entry i names the class used for
 * netdev_lock_type[i].  The last entry (ARPHRD_NONE) doubles as the
 * fallback for any type that is not listed, see netdev_lock_pos().
 */
static const unsigned short netdev_lock_type[] = {
         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names; must stay in the same order as netdev_lock_type[]. */
static const char *const netdev_lock_name[] = {
        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/*
 * One lock class key per listed device type, separately for the xmit locks
 * and for the address-list locks; indexed like netdev_lock_type[].
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 436
 437 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 438 {
 439         int i;
 440
 441         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 442                 if (netdev_lock_type[i] == dev_type)
 443                         return i;
 444         /* the last key is used by default */
 445         return ARRAY_SIZE(netdev_lock_type) - 1;
 446 }
 447
 448 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 449                                                  unsigned short dev_type)
 450 {
 451         int i;
 452
 453         i = netdev_lock_pos(dev_type);
 454         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 455                                    netdev_lock_name[i]);
 456 }
 457
 458 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 459 {
 460         int i;
 461
 462         i = netdev_lock_pos(dev->type);
 463         lockdep_set_class_and_name(&dev->addr_list_lock,
 464                                    &netdev_addr_lock_key[i],
 465                                    netdev_lock_name[i]);
 466 }
 467 #else
 468 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 469                                                  unsigned short dev_type)
 470 {
 471 }
 472 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 473 {
 474 }
 475 #endif
 476
 477 /*******************************************************************************
 478  *
 479  *              Protocol management and registration routines
 480  *
 481  *******************************************************************************/
 482
 483
 484 /*
 485  *      Add a protocol ID to the list. Now that the input handler is
 486  *      smarter we can dispense with all the messy stuff that used to be
 487  *      here.
 488  *
 489  *      BEWARE!!! Protocol handlers, mangling input packets,
 490  *      MUST BE last in hash buckets and checking protocol handlers
 491  *      MUST start from promiscuous ptype_all chain in net_bh.
 492  *      It is true now, do not change it.
 493  *      Explanation follows: if protocol handler, mangling packet, will
 494  *      be the first on list, it is not able to sense, that packet
 495  *      is cloned and should be copied-on-write, so that it will
 496  *      change it and subsequent readers will get broken packet.
 497  *                                                      --ANK (980803)
 498  */
 499
 500 static inline struct list_head *ptype_head(const struct packet_type *pt)
 501 {
 502         if (pt->type == htons(ETH_P_ALL))
 503                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 504         else
 505                 return pt->dev ? &pt->dev->ptype_specific :
 506                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 507 }
 508
 509 /**
 510  *      dev_add_pack - add packet handler
 511  *      @pt: packet type declaration
 512  *
 513  *      Add a protocol handler to the networking stack. The passed &packet_type
 514  *      is linked into kernel lists and may not be freed until it has been
 515  *      removed from the kernel lists.
 516  *
 517  *      This call does not sleep therefore it can not
 518  *      guarantee all CPU's that are in middle of receiving packets
 519  *      will see the new packet type (until the next received packet).
 520  */
 521
 522 void dev_add_pack(struct packet_type *pt)
 523 {
 524         struct list_head *head = ptype_head(pt);
 525
 526         spin_lock(&ptype_lock);
 527         list_add_rcu(&pt->list, head);
 528         spin_unlock(&ptype_lock);
 529 }
 530 EXPORT_SYMBOL(dev_add_pack);
 531
 532 /**
 533  *      __dev_remove_pack        - remove packet handler
 534  *      @pt: packet type declaration
 535  *
 536  *      Remove a protocol handler that was previously added to the kernel
 537  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 538  *      from the kernel lists and can be freed or reused once this function
 539  *      returns.
 540  *
 541  *      The packet type might still be in use by receivers
 542  *      and must not be freed until after all the CPU's have gone
 543  *      through a quiescent state.
 544  */
 545 void __dev_remove_pack(struct packet_type *pt)
 546 {
 547         struct list_head *head = ptype_head(pt);
 548         struct packet_type *pt1;
 549
 550         spin_lock(&ptype_lock);
 551
 552         list_for_each_entry(pt1, head, list) {
 553                 if (pt == pt1) {
 554                         list_del_rcu(&pt->list);
 555                         goto out;
 556                 }
 557         }
 558
 559         pr_warn("dev_remove_pack: %p not found\n", pt);
 560 out:
 561         spin_unlock(&ptype_lock);
 562 }
 563 EXPORT_SYMBOL(__dev_remove_pack);
 564
 565 /**
 566  *      dev_remove_pack  - remove packet handler
 567  *      @pt: packet type declaration
 568  *
 569  *      Remove a protocol handler that was previously added to the kernel
 570  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 571  *      from the kernel lists and can be freed or reused once this function
 572  *      returns.
 573  *
 574  *      This call sleeps to guarantee that no CPU is looking at the packet
 575  *      type after return.
 576  */
 577 void dev_remove_pack(struct packet_type *pt)
 578 {
 579         __dev_remove_pack(pt);
 580
 581         synchronize_net();
 582 }
 583 EXPORT_SYMBOL(dev_remove_pack);
 584
 585
 586 /**
 587  *      dev_add_offload - register offload handlers
 588  *      @po: protocol offload declaration
 589  *
 590  *      Add protocol offload handlers to the networking stack. The passed
 591  *      &proto_offload is linked into kernel lists and may not be freed until
 592  *      it has been removed from the kernel lists.
 593  *
 594  *      This call does not sleep therefore it can not
 595  *      guarantee all CPU's that are in middle of receiving packets
 596  *      will see the new offload handlers (until the next received packet).
 597  */
 598 void dev_add_offload(struct packet_offload *po)
 599 {
 600         struct packet_offload *elem;
 601
 602         spin_lock(&offload_lock);
 603         list_for_each_entry(elem, &offload_base, list) {
 604                 if (po->priority < elem->priority)
 605                         break;
 606         }
 607         list_add_rcu(&po->list, elem->list.prev);
 608         spin_unlock(&offload_lock);
 609 }
 610 EXPORT_SYMBOL(dev_add_offload);
 611
 612 /**
 613  *      __dev_remove_offload     - remove offload handler
 614  *      @po: packet offload declaration
 615  *
 616  *      Remove a protocol offload handler that was previously added to the
 617  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 618  *      is removed from the kernel lists and can be freed or reused once this
 619  *      function returns.
 620  *
 621  *      The packet type might still be in use by receivers
 622  *      and must not be freed until after all the CPU's have gone
 623  *      through a quiescent state.
 624  */
 625 static void __dev_remove_offload(struct packet_offload *po)
 626 {
 627         struct list_head *head = &offload_base;
 628         struct packet_offload *po1;
 629
 630         spin_lock(&offload_lock);
 631
 632         list_for_each_entry(po1, head, list) {
 633                 if (po == po1) {
 634                         list_del_rcu(&po->list);
 635                         goto out;
 636                 }
 637         }
 638
 639         pr_warn("dev_remove_offload: %p not found\n", po);
 640 out:
 641         spin_unlock(&offload_lock);
 642 }
 643
 644 /**
 645  *      dev_remove_offload       - remove packet offload handler
 646  *      @po: packet offload declaration
 647  *
 648  *      Remove a packet offload handler that was previously added to the kernel
 649  *      offload handlers by dev_add_offload(). The passed &offload_type is
 650  *      removed from the kernel lists and can be freed or reused once this
 651  *      function returns.
 652  *
 653  *      This call sleeps to guarantee that no CPU is looking at the packet
 654  *      type after return.
 655  */
 656 void dev_remove_offload(struct packet_offload *po)
 657 {
 658         __dev_remove_offload(po);
 659
 660         synchronize_net();
 661 }
 662 EXPORT_SYMBOL(dev_remove_offload);
 663
 664 /******************************************************************************
 665  *
 666  *                    Device Boot-time Settings Routines
 667  *
 668  ******************************************************************************/
 669
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as free by
 * netdev_boot_setup_add() and netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 672
 673 /**
 674  *      netdev_boot_setup_add   - add new setup entry
 675  *      @name: name of the device
 676  *      @map: configured settings for the device
 677  *
 678  *      Adds new setup entry to the dev_boot_setup list.  The function
 679  *      returns 0 on error and 1 on success.  This is a generic routine to
 680  *      all netdevices.
 681  */
 682 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 683 {
 684         struct netdev_boot_setup *s;
 685         int i;
 686
 687         s = dev_boot_setup;
 688         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 689                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 690                         memset(s[i].name, 0, sizeof(s[i].name));
 691                         strlcpy(s[i].name, name, IFNAMSIZ);
 692                         memcpy(&s[i].map, map, sizeof(s[i].map));
 693                         break;
 694                 }
 695         }
 696
 697         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 698 }
 699
 700 /**
 701  * netdev_boot_setup_check      - check boot time settings
 702  * @dev: the netdevice
 703  *
 704  * Check boot time settings for the device.
 705  * The found settings are set for the device to be used
 706  * later in the device probing.
 707  * Returns 0 if no settings found, 1 if they are.
 708  */
 709 int netdev_boot_setup_check(struct net_device *dev)
 710 {
 711         struct netdev_boot_setup *s = dev_boot_setup;
 712         int i;
 713
 714         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 715                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 716                     !strcmp(dev->name, s[i].name)) {
 717                         dev->irq = s[i].map.irq;
 718                         dev->base_addr = s[i].map.base_addr;
 719                         dev->mem_start = s[i].map.mem_start;
 720                         dev->mem_end = s[i].map.mem_end;
 721                         return 1;
 722                 }
 723         }
 724         return 0;
 725 }
 726 EXPORT_SYMBOL(netdev_boot_setup_check);
 727
 728
 729 /**
 730  * netdev_boot_base     - get address from boot time settings
 731  * @prefix: prefix for network device
 732  * @unit: id for network device
 733  *
 734  * Check boot time settings for the base address of device.
 735  * The found settings are set for the device to be used
 736  * later in the device probing.
 737  * Returns 0 if no settings found.
 738  */
 739 unsigned long netdev_boot_base(const char *prefix, int unit)
 740 {
 741         const struct netdev_boot_setup *s = dev_boot_setup;
 742         char name[IFNAMSIZ];
 743         int i;
 744
 745         sprintf(name, "%s%d", prefix, unit);
 746
 747         /*
 748          * If device already registered then return base of 1
 749          * to indicate not to probe for this interface
 750          */
 751         if (__dev_get_by_name(&init_net, name))
 752                 return 1;
 753
 754         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 755                 if (!strcmp(name, s[i].name))
 756                         return s[i].map.base_addr;
 757         return 0;
 758 }
 759
 760 /*
 761  * Saves at boot time configured settings for any netdevice.
 762  */
 763 int __init netdev_boot_setup(char *str)
 764 {
 765         int ints[5];
 766         struct ifmap map;
 767
 768         str = get_options(str, ARRAY_SIZE(ints), ints);
 769         if (!str || !*str)
 770                 return 0;
 771
 772         /* Save settings */
 773         memset(&map, 0, sizeof(map));
 774         if (ints[0] > 0)
 775                 map.irq = ints[1];
 776         if (ints[0] > 1)
 777                 map.base_addr = ints[2];
 778         if (ints[0] > 2)
 779                 map.mem_start = ints[3];
 780         if (ints[0] > 3)
 781                 map.mem_end = ints[4];
 782
 783         /* Add new entry to the list */
 784         return netdev_boot_setup_add(str, &map);
 785 }
 786
 787 __setup("netdev=", netdev_boot_setup);
 788
 789 /*******************************************************************************
 790  *
 791  *                          Device Interface Subroutines
 792  *
 793  *******************************************************************************/
 794
 795 /**
 796  *      dev_get_iflink  - get 'iflink' value of a interface
 797  *      @dev: targeted interface
 798  *
 799  *      Indicates the ifindex the interface is linked to.
 800  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 801  */
 802
 803 int dev_get_iflink(const struct net_device *dev)
 804 {
 805         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 806                 return dev->netdev_ops->ndo_get_iflink(dev);
 807
 808         return dev->ifindex;
 809 }
 810 EXPORT_SYMBOL(dev_get_iflink);
 811
 812 /**
 813  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 814  *      @dev: targeted interface
 815  *      @skb: The packet.
 816  *
 817  *      For better visibility of tunnel traffic OVS needs to retrieve
 818  *      egress tunnel information for a packet. Following API allows
 819  *      user to get this info.
 820  */
 821 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 822 {
 823         struct ip_tunnel_info *info;
 824
 825         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 826                 return -EINVAL;
 827
 828         info = skb_tunnel_info_unclone(skb);
 829         if (!info)
 830                 return -ENOMEM;
 831         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 832                 return -EINVAL;
 833
 834         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 835 }
 836 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 837
 838 /**
 839  *      __dev_get_by_name       - find a device by its name
 840  *      @net: the applicable net namespace
 841  *      @name: name to find
 842  *
 843  *      Find an interface by name. Must be called under RTNL semaphore
 844  *      or @dev_base_lock. If the name is found a pointer to the device
 845  *      is returned. If the name is not found then %NULL is returned. The
 846  *      reference counters are not incremented so the caller must be
 847  *      careful with locks.
 848  */
 849
 850 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 851 {
 852         struct netdev_name_node *node_name;
 853
 854         node_name = netdev_name_node_lookup(net, name);
 855         return node_name ? node_name->dev : NULL;
 856 }
 857 EXPORT_SYMBOL(__dev_get_by_name);
 858
 859 /**
 860  * dev_get_by_name_rcu  - find a device by its name
 861  * @net: the applicable net namespace
 862  * @name: name to find
 863  *
 864  * Find an interface by name.
 865  * If the name is found a pointer to the device is returned.
 866  * If the name is not found then %NULL is returned.
 867  * The reference counters are not incremented so the caller must be
 868  * careful with locks. The caller must hold RCU lock.
 869  */
 870
 871 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 872 {
 873         struct netdev_name_node *node_name;
 874
 875         node_name = netdev_name_node_lookup_rcu(net, name);
 876         return node_name ? node_name->dev : NULL;
 877 }
 878 EXPORT_SYMBOL(dev_get_by_name_rcu);
 879
 880 /**
 881  *      dev_get_by_name         - find a device by its name
 882  *      @net: the applicable net namespace
 883  *      @name: name to find
 884  *
 885  *      Find an interface by name. This can be called from any
 886  *      context and does its own locking. The returned handle has
 887  *      the usage count incremented and the caller must use dev_put() to
 888  *      release it when it is no longer needed. %NULL is returned if no
 889  *      matching device is found.
 890  */
 891
 892 struct net_device *dev_get_by_name(struct net *net, const char *name)
 893 {
 894         struct net_device *dev;
 895
 896         rcu_read_lock();
 897         dev = dev_get_by_name_rcu(net, name);
 898         if (dev)
 899                 dev_hold(dev);
 900         rcu_read_unlock();
 901         return dev;
 902 }
 903 EXPORT_SYMBOL(dev_get_by_name);
 904
 905 /**
 906  *      __dev_get_by_index - find a device by its ifindex
 907  *      @net: the applicable net namespace
 908  *      @ifindex: index of device
 909  *
 910  *      Search for an interface by index. Returns %NULL if the device
 911  *      is not found or a pointer to the device. The device has not
 912  *      had its reference counter increased so the caller must be careful
 913  *      about locking. The caller must hold either the RTNL semaphore
 914  *      or @dev_base_lock.
 915  */
 916
 917 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 918 {
 919         struct net_device *dev;
 920         struct hlist_head *head = dev_index_hash(net, ifindex);
 921
 922         hlist_for_each_entry(dev, head, index_hlist)
 923                 if (dev->ifindex == ifindex)
 924                         return dev;
 925
 926         return NULL;
 927 }
 928 EXPORT_SYMBOL(__dev_get_by_index);
 929
 930 /**
 931  *      dev_get_by_index_rcu - find a device by its ifindex
 932  *      @net: the applicable net namespace
 933  *      @ifindex: index of device
 934  *
 935  *      Search for an interface by index. Returns %NULL if the device
 936  *      is not found or a pointer to the device. The device has not
 937  *      had its reference counter increased so the caller must be careful
 938  *      about locking. The caller must hold RCU lock.
 939  */
 940
 941 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 942 {
 943         struct net_device *dev;
 944         struct hlist_head *head = dev_index_hash(net, ifindex);
 945
 946         hlist_for_each_entry_rcu(dev, head, index_hlist)
 947                 if (dev->ifindex == ifindex)
 948                         return dev;
 949
 950         return NULL;
 951 }
 952 EXPORT_SYMBOL(dev_get_by_index_rcu);
 953
 954
 955 /**
 956  *      dev_get_by_index - find a device by its ifindex
 957  *      @net: the applicable net namespace
 958  *      @ifindex: index of device
 959  *
 960  *      Search for an interface by index. Returns NULL if the device
 961  *      is not found or a pointer to the device. The device returned has
 962  *      had a reference added and the pointer is safe until the user calls
 963  *      dev_put to indicate they have finished with it.
 964  */
 965
 966 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 967 {
 968         struct net_device *dev;
 969
 970         rcu_read_lock();
 971         dev = dev_get_by_index_rcu(net, ifindex);
 972         if (dev)
 973                 dev_hold(dev);
 974         rcu_read_unlock();
 975         return dev;
 976 }
 977 EXPORT_SYMBOL(dev_get_by_index);
 978
 979 /**
 980  *      dev_get_by_napi_id - find a device by napi_id
 981  *      @napi_id: ID of the NAPI struct
 982  *
 983  *      Search for an interface by NAPI ID. Returns %NULL if the device
 984  *      is not found or a pointer to the device. The device has not had
 985  *      its reference counter increased so the caller must be careful
 986  *      about locking. The caller must hold RCU lock.
 987  */
 988
 989 struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 990 {
 991         struct napi_struct *napi;
 992
 993         WARN_ON_ONCE(!rcu_read_lock_held());
 994
 995         if (napi_id < MIN_NAPI_ID)
 996                 return NULL;
 997
 998         napi = napi_by_id(napi_id);
 999
1000         return napi ? napi->dev : NULL;
1001 }
1002 EXPORT_SYMBOL(dev_get_by_napi_id);
1003
1004 /**
1005  *      netdev_get_name - get a netdevice name, knowing its ifindex.
1006  *      @net: network namespace
1007  *      @name: a pointer to the buffer where the name will be stored.
1008  *      @ifindex: the ifindex of the interface to get the name from.
1009  *
1010  *      The use of raw_seqcount_begin() and cond_resched() before
1011  *      retrying is required as we want to give the writers a chance
1012  *      to complete when CONFIG_PREEMPT is not set.
1013  */
1014 int netdev_get_name(struct net *net, char *name, int ifindex)
1015 {
1016         struct net_device *dev;
1017         unsigned int seq;
1018
1019 retry:
1020         seq = raw_seqcount_begin(&devnet_rename_seq);
1021         rcu_read_lock();
1022         dev = dev_get_by_index_rcu(net, ifindex);
1023         if (!dev) {
1024                 rcu_read_unlock();
1025                 return -ENODEV;
1026         }
1027
1028         strcpy(name, dev->name);
1029         rcu_read_unlock();
1030         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
1031                 cond_resched();
1032                 goto retry;
1033         }
1034
1035         return 0;
1036 }
1037
1038 /**
1039  *      dev_getbyhwaddr_rcu - find a device by its hardware address
1040  *      @net: the applicable net namespace
1041  *      @type: media type of device
1042  *      @ha: hardware address
1043  *
1044  *      Search for an interface by MAC address. Returns NULL if the device
1045  *      is not found or a pointer to the device.
1046  *      The caller must hold RCU or RTNL.
1047  *      The returned device has not had its ref count increased
1048  *      and the caller must therefore be careful about locking
1049  *
1050  */
1051
1052 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1053                                        const char *ha)
1054 {
1055         struct net_device *dev;
1056
1057         for_each_netdev_rcu(net, dev)
1058                 if (dev->type == type &&
1059                     !memcmp(dev->dev_addr, ha, dev->addr_len))
1060                         return dev;
1061
1062         return NULL;
1063 }
1064 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
1065
1066 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1067 {
1068         struct net_device *dev;
1069
1070         ASSERT_RTNL();
1071         for_each_netdev(net, dev)
1072                 if (dev->type == type)
1073                         return dev;
1074
1075         return NULL;
1076 }
1077 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1078
1079 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1080 {
1081         struct net_device *dev, *ret = NULL;
1082
1083         rcu_read_lock();
1084         for_each_netdev_rcu(net, dev)
1085                 if (dev->type == type) {
1086                         dev_hold(dev);
1087                         ret = dev;
1088                         break;
1089                 }
1090         rcu_read_unlock();
1091         return ret;
1092 }
1093 EXPORT_SYMBOL(dev_getfirstbyhwtype);
1094
1095 /**
1096  *      __dev_get_by_flags - find any device with given flags
1097  *      @net: the applicable net namespace
1098  *      @if_flags: IFF_* values
1099  *      @mask: bitmask of bits in if_flags to check
1100  *
1101  *      Search for any interface with the given flags. Returns NULL if a device
1102  *      is not found or a pointer to the device. Must be called inside
1103  *      rtnl_lock(), and result refcount is unchanged.
1104  */
1105
1106 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1107                                       unsigned short mask)
1108 {
1109         struct net_device *dev, *ret;
1110
1111         ASSERT_RTNL();
1112
1113         ret = NULL;
1114         for_each_netdev(net, dev) {
1115                 if (((dev->flags ^ if_flags) & mask) == 0) {
1116                         ret = dev;
1117                         break;
1118                 }
1119         }
1120         return ret;
1121 }
1122 EXPORT_SYMBOL(__dev_get_by_flags);
1123
1124 /**
1125  *      dev_valid_name - check if name is okay for network device
1126  *      @name: name string
1127  *
1128  *      Network device names need to be valid file names to
1129  *      to allow sysfs to work.  We also disallow any kind of
1130  *      whitespace.
1131  */
1132 bool dev_valid_name(const char *name)
1133 {
1134         if (*name == '\0')
1135                 return false;
1136         if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1137                 return false;
1138         if (!strcmp(name, ".") || !strcmp(name, ".."))
1139                 return false;
1140
1141         while (*name) {
1142                 if (*name == '/' || *name == ':' || isspace(*name))
1143                         return false;
1144                 name++;
1145         }
1146         return true;
1147 }
1148 EXPORT_SYMBOL(dev_valid_name);
1149
1150 /**
1151  *      __dev_alloc_name - allocate a name for a device
1152  *      @net: network namespace to allocate the device name in
1153  *      @name: name format string
1154  *      @buf:  scratch buffer and result name string
1155  *
1156  *      Passed a format string - eg "lt%d" it will try and find a suitable
1157  *      id. It scans list of devices to build up a free map, then chooses
1158  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1159  *      while allocating the name and adding the device in order to avoid
1160  *      duplicates.
1161  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1162  *      Returns the number of the unit assigned or a negative errno code.
1163  */
1164
1165 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1166 {
1167         int i = 0;
1168         const char *p;
1169         const int max_netdevices = 8*PAGE_SIZE;
1170         unsigned long *inuse;
1171         struct net_device *d;
1172
1173         if (!dev_valid_name(name))
1174                 return -EINVAL;
1175
1176         p = strchr(name, '%');
1177         if (p) {
1178                 /*
1179                  * Verify the string as this thing may have come from
1180                  * the user.  There must be either one "%d" and no other "%"
1181                  * characters.
1182                  */
1183                 if (p[1] != 'd' || strchr(p + 2, '%'))
1184                         return -EINVAL;
1185
1186                 /* Use one page as a bit array of possible slots */
1187                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1188                 if (!inuse)
1189                         return -ENOMEM;
1190
1191                 for_each_netdev(net, d) {
1192                         if (!sscanf(d->name, name, &i))
1193                                 continue;
1194                         if (i < 0 || i >= max_netdevices)
1195                                 continue;
1196
1197                         /*  avoid cases where sscanf is not exact inverse of printf */
1198                         snprintf(buf, IFNAMSIZ, name, i);
1199                         if (!strncmp(buf, d->name, IFNAMSIZ))
1200                                 set_bit(i, inuse);
1201                 }
1202
1203                 i = find_first_zero_bit(inuse, max_netdevices);
1204                 free_page((unsigned long) inuse);
1205         }
1206
1207         snprintf(buf, IFNAMSIZ, name, i);
1208         if (!__dev_get_by_name(net, buf))
1209                 return i;
1210
1211         /* It is possible to run out of possible slots
1212          * when the name is long and there isn't enough space left
1213          * for the digits, or if all bits are used.
1214          */
1215         return -ENFILE;
1216 }
1217
1218 static int dev_alloc_name_ns(struct net *net,
1219                              struct net_device *dev,
1220                              const char *name)
1221 {
1222         char buf[IFNAMSIZ];
1223         int ret;
1224
1225         BUG_ON(!net);
1226         ret = __dev_alloc_name(net, name, buf);
1227         if (ret >= 0)
1228                 strlcpy(dev->name, buf, IFNAMSIZ);
1229         return ret;
1230 }
1231
/**
 *      dev_alloc_name - allocate a name for a device
 *      @dev: device
 *      @name: name format string
 *
 *      Given a format string - eg "lt%d" - find a free unit number in
 *      the device's own network namespace and store the resulting name
 *      in @dev. The caller must hold the dev_base or rtnl lock while
 *      allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);

        return dev_alloc_name_ns(net, dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
1251
1252 int dev_get_valid_name(struct net *net, struct net_device *dev,
1253                        const char *name)
1254 {
1255         BUG_ON(!net);
1256
1257         if (!dev_valid_name(name))
1258                 return -EINVAL;
1259
1260         if (strchr(name, '%'))
1261                 return dev_alloc_name_ns(net, dev, name);
1262         else if (__dev_get_by_name(net, name))
1263                 return -EEXIST;
1264         else if (dev->name != name)
1265                 strlcpy(dev->name, name, IFNAMSIZ);
1266
1267         return 0;
1268 }
1269 EXPORT_SYMBOL(dev_get_valid_name);
1270
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change the name of a device. A format string such as "eth%d"
 *      may be passed for wildcarding.
 *
 *      Must be called with RTNL held. Returns 0 on success (including
 *      when @newname equals the current name), -EBUSY if the device is
 *      up and does not allow live renames, or a negative errno from
 *      name validation, device_rename() or the %NETDEV_CHANGENAME
 *      notifier chain. When the notifiers reject the change, the old
 *      name is restored by running the rename sequence a second time.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);

        /* Some auto-enslaved devices e.g. failover slaves are
         * special, as userspace might rename the device after
         * the interface had been brought up and running since
         * the point kernel initiated auto-enslavement. Allow
         * live name change even when these slave devices are
         * up and running.
         *
         * Typically, users of these auto-enslaving devices
         * don't actually care about slave name change, as
         * they are supposed to operate on master interface
         * directly.
         */
        if (dev->flags & IFF_UP &&
            likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
                return -EBUSY;

        /* Every return path below that is reached with the write
         * section open closes it before returning.
         */
        write_seqcount_begin(&devnet_rename_seq);

        /* Nothing to do when the name is unchanged */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the current name so it can be restored on failure */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* On success this has already written the new name to dev->name */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /* Entered a second time, with the names swapped, when the
         * notifier chain below rejects the change.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /* Take the name node out, wait for an RCU grace period, then
         * put it back, each list update under dev_base_lock.
         */
        write_lock_bh(&dev_base_lock);
        netdev_name_node_del(dev->name_node);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        netdev_name_node_add(net, dev->name_node);
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /* First failure: remember the errno and undo the
                         * rename. err is now negative, so a failure on
                         * the second pass takes the else branch instead
                         * of looping again.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        /* NOTE(review): this copies the caller's string,
                         * which may be a "%d" pattern rather than the
                         * expanded name - confirm that is what
                         * netdev_adjacent_rename_links() should see on
                         * the rollback pass.
                         */
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1373
1374 /**
1375  *      dev_set_alias - change ifalias of a device
1376  *      @dev: device
1377  *      @alias: name up to IFALIASZ
1378  *      @len: limit of bytes to copy from info
1379  *
1380  *      Set ifalias for a device,
1381  */
1382 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1383 {
1384         struct dev_ifalias *new_alias = NULL;
1385
1386         if (len >= IFALIASZ)
1387                 return -EINVAL;
1388
1389         if (len) {
1390                 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1391                 if (!new_alias)
1392                         return -ENOMEM;
1393
1394                 memcpy(new_alias->ifalias, alias, len);
1395                 new_alias->ifalias[len] = 0;
1396         }
1397
1398         mutex_lock(&ifalias_mutex);
1399         rcu_swap_protected(dev->ifalias, new_alias,
1400                            mutex_is_locked(&ifalias_mutex));
1401         mutex_unlock(&ifalias_mutex);
1402
1403         if (new_alias)
1404                 kfree_rcu(new_alias, rcuhead);
1405
1406         return len;
1407 }
1408 EXPORT_SYMBOL(dev_set_alias);
1409
1410 /**
1411  *      dev_get_alias - get ifalias of a device
1412  *      @dev: device
1413  *      @name: buffer to store name of ifalias
1414  *      @len: size of buffer
1415  *
1416  *      get ifalias for a device.  Caller must make sure dev cannot go
1417  *      away,  e.g. rcu read lock or own a reference count to device.
1418  */
1419 int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1420 {
1421         const struct dev_ifalias *alias;
1422         int ret = 0;
1423
1424         rcu_read_lock();
1425         alias = rcu_dereference(dev->ifalias);
1426         if (alias)
1427                 ret = snprintf(name, len, "%s", alias->ifalias);
1428         rcu_read_unlock();
1429
1430         return ret;
1431 }
1432
1433 /**
1434  *      netdev_features_change - device changes features
1435  *      @dev: device to cause notification
1436  *
1437  *      Called to indicate a device has changed features.
1438  */
1439 void netdev_features_change(struct net_device *dev)
1440 {
1441         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1442 }
1443 EXPORT_SYMBOL(netdev_features_change);
1444
1445 /**
1446  *      netdev_state_change - device changes state
1447  *      @dev: device to cause notification
1448  *
1449  *      Called to indicate a device has changed state. This function calls
1450  *      the notifier chains for netdev_chain and sends a NEWLINK message
1451  *      to the routing socket.
1452  */
1453 void netdev_state_change(struct net_device *dev)
1454 {
1455         if (dev->flags & IFF_UP) {
1456                 struct netdev_notifier_change_info change_info = {
1457                         .info.dev = dev,
1458                 };
1459
1460                 call_netdevice_notifiers_info(NETDEV_CHANGE,
1461                                               &change_info.info);
1462                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1463         }
1464 }
1465 EXPORT_SYMBOL(netdev_state_change);
1466
1467 /**
1468  * netdev_notify_peers - notify network peers about existence of @dev
1469  * @dev: network device
1470  *
1471  * Generate traffic such that interested network peers are aware of
1472  * @dev, such as by generating a gratuitous ARP. This may be used when
1473  * a device wants to inform the rest of the network about some sort of
1474  * reconfiguration such as a failover event or virtual machine
1475  * migration.
1476  */
1477 void netdev_notify_peers(struct net_device *dev)
1478 {
1479         rtnl_lock();
1480         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1481         call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1482         rtnl_unlock();
1483 }
1484 EXPORT_SYMBOL(netdev_notify_peers);
1485
1486 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1487 {
1488         const struct net_device_ops *ops = dev->netdev_ops;
1489         int ret;
1490
1491         ASSERT_RTNL();
1492
1493         if (!netif_device_present(dev))
1494                 return -ENODEV;
1495
1496         /* Block netpoll from trying to do any rx path servicing.
1497          * If we don't do this there is a chance ndo_poll_controller
1498          * or ndo_poll may be running while we open the device
1499          */
1500         netpoll_poll_disable(dev);
1501
1502         ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1503         ret = notifier_to_errno(ret);
1504         if (ret)
1505                 return ret;
1506
1507         set_bit(__LINK_STATE_START, &dev->state);
1508
1509         if (ops->ndo_validate_addr)
1510                 ret = ops->ndo_validate_addr(dev);
1511
1512         if (!ret && ops->ndo_open)
1513                 ret = ops->ndo_open(dev);
1514
1515         netpoll_poll_enable(dev);
1516
1517         if (ret)
1518                 clear_bit(__LINK_STATE_START, &dev->state);
1519         else {
1520                 dev->flags |= IFF_UP;
1521                 dev_set_rx_mode(dev);
1522                 dev_activate(dev);
1523                 add_device_randomness(dev->dev_addr, dev->addr_len);
1524         }
1525
1526         return ret;
1527 }
1528
1529 /**
1530  *      dev_open        - prepare an interface for use.
1531  *      @dev: device to open
1532  *      @extack: netlink extended ack
1533  *
1534  *      Takes a device from down to up state. The device's private open
1535  *      function is invoked and then the multicast lists are loaded. Finally
1536  *      the device is moved into the up state and a %NETDEV_UP message is
1537  *      sent to the netdev notifier chain.
1538  *
1539  *      Calling this function on an active interface is a nop. On a failure
1540  *      a negative errno code is returned.
1541  */
1542 int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1543 {
1544         int ret;
1545
1546         if (dev->flags & IFF_UP)
1547                 return 0;
1548
1549         ret = __dev_open(dev, extack);
1550         if (ret < 0)
1551                 return ret;
1552
1553         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1554         call_netdevice_notifiers(NETDEV_UP, dev);
1555
1556         return ret;
1557 }
1558 EXPORT_SYMBOL(dev_open);
1559
/*
 * Take every device on @head (linked through close_list) down.
 *
 * Works in three passes: first each device gets NETDEV_GOING_DOWN and
 * has __LINK_STATE_START cleared, then all devices are deactivated
 * together, and finally each driver's ndo_stop is called and IFF_UP is
 * cleared. Netpoll is disabled for a device in the first pass and
 * re-enabled in the last. Must be called with RTNL held; may sleep.
 */
static void __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        /* Deactivate the whole list in one call */
        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                /* Pairs with the netpoll_poll_disable() in the first loop */
                netpoll_poll_enable(dev);
        }
}
1603
1604 static void __dev_close(struct net_device *dev)
1605 {
1606         LIST_HEAD(single);
1607
1608         list_add(&dev->close_list, &single);
1609         __dev_close_many(&single);
1610         list_del(&single);
1611 }
1612
1613 void dev_close_many(struct list_head *head, bool unlink)
1614 {
1615         struct net_device *dev, *tmp;
1616
1617         /* Remove the devices that don't need to be closed */
1618         list_for_each_entry_safe(dev, tmp, head, close_list)
1619                 if (!(dev->flags & IFF_UP))
1620                         list_del_init(&dev->close_list);
1621
1622         __dev_close_many(head);
1623
1624         list_for_each_entry_safe(dev, tmp, head, close_list) {
1625                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1626                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1627                 if (unlink)
1628                         list_del_init(&dev->close_list);
1629         }
1630 }
1631 EXPORT_SYMBOL(dev_close_many);
1632
1633 /**
1634  *      dev_close - shutdown an interface.
1635  *      @dev: device to shutdown
1636  *
1637  *      This function moves an active device into down state. A
1638  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1639  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1640  *      chain.
1641  */
1642 void dev_close(struct net_device *dev)
1643 {
1644         if (dev->flags & IFF_UP) {
1645                 LIST_HEAD(single);
1646
1647                 list_add(&dev->close_list, &single);
1648                 dev_close_many(&single, true);
1649                 list_del(&single);
1650         }
1651 }
1652 EXPORT_SYMBOL(dev_close);
1653
1654
1655 /**
1656  *      dev_disable_lro - disable Large Receive Offload on a device
1657  *      @dev: device
1658  *
1659  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1660  *      called under RTNL.  This is needed if received packets may be
1661  *      forwarded to another interface.
1662  */
1663 void dev_disable_lro(struct net_device *dev)
1664 {
1665         struct net_device *lower_dev;
1666         struct list_head *iter;
1667
1668         dev->wanted_features &= ~NETIF_F_LRO;
1669         netdev_update_features(dev);
1670
1671         if (unlikely(dev->features & NETIF_F_LRO))
1672                 netdev_WARN(dev, "failed to disable LRO!\n");
1673
1674         netdev_for_each_lower_dev(dev, lower_dev, iter)
1675                 dev_disable_lro(lower_dev);
1676 }
1677 EXPORT_SYMBOL(dev_disable_lro);
1678
1679 /**
1680  *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1681  *      @dev: device
1682  *
1683  *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1684  *      called under RTNL.  This is needed if Generic XDP is installed on
1685  *      the device.
1686  */
1687 static void dev_disable_gro_hw(struct net_device *dev)
1688 {
1689         dev->wanted_features &= ~NETIF_F_GRO_HW;
1690         netdev_update_features(dev);
1691
1692         if (unlikely(dev->features & NETIF_F_GRO_HW))
1693                 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1694 }
1695
1696 const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1697 {
1698 #define N(val)                                          \
1699         case NETDEV_##val:                              \
1700                 return "NETDEV_" __stringify(val);
1701         switch (cmd) {
1702         N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1703         N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1704         N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1705         N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1706         N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1707         N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1708         N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1709         N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1710         N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1711         N(PRE_CHANGEADDR)
1712         }
1713 #undef N
1714         return "UNKNOWN_NETDEV_EVENT";
1715 }
1716 EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1717
1718 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1719                                    struct net_device *dev)
1720 {
1721         struct netdev_notifier_info info = {
1722                 .dev = dev,
1723         };
1724
1725         return nb->notifier_call(nb, val, &info);
1726 }
1727
1728 static int dev_boot_phase = 1;
1729
1730 /**
1731  * register_netdevice_notifier - register a network notifier block
1732  * @nb: notifier
1733  *
1734  * Register a notifier to be called when network device events occur.
1735  * The notifier passed is linked into the kernel structures and must
1736  * not be reused until it has been unregistered. A negative errno code
1737  * is returned on a failure.
1738  *
1739  * When registered all registration and up events are replayed
1740  * to the new notifier to allow device to have a race free
1741  * view of the network device list.
1742  */
1743
1744 int register_netdevice_notifier(struct notifier_block *nb)
1745 {
1746         struct net_device *dev;
1747         struct net_device *last;
1748         struct net *net;
1749         int err;
1750
1751         /* Close race with setup_net() and cleanup_net() */
1752         down_write(&pernet_ops_rwsem);
1753         rtnl_lock();
1754         err = raw_notifier_chain_register(&netdev_chain, nb);
1755         if (err)
1756                 goto unlock;
1757         if (dev_boot_phase)
1758                 goto unlock;
1759         for_each_net(net) {
1760                 for_each_netdev(net, dev) {
1761                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1762                         err = notifier_to_errno(err);
1763                         if (err)
1764                                 goto rollback;
1765
1766                         if (!(dev->flags & IFF_UP))
1767                                 continue;
1768
1769                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1770                 }
1771         }
1772
1773 unlock:
1774         rtnl_unlock();
1775         up_write(&pernet_ops_rwsem);
1776         return err;
1777
1778 rollback:
1779         last = dev;
1780         for_each_net(net) {
1781                 for_each_netdev(net, dev) {
1782                         if (dev == last)
1783                                 goto outroll;
1784
1785                         if (dev->flags & IFF_UP) {
1786                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1787                                                         dev);
1788                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1789                         }
1790                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1791                 }
1792         }
1793
1794 outroll:
1795         raw_notifier_chain_unregister(&netdev_chain, nb);
1796         goto unlock;
1797 }
1798 EXPORT_SYMBOL(register_netdevice_notifier);
1799
1800 /**
1801  * unregister_netdevice_notifier - unregister a network notifier block
1802  * @nb: notifier
1803  *
1804  * Unregister a notifier previously registered by
1805  * register_netdevice_notifier(). The notifier is unlinked into the
1806  * kernel structures and may then be reused. A negative errno code
1807  * is returned on a failure.
1808  *
1809  * After unregistering unregister and down device events are synthesized
1810  * for all devices on the device list to the removed notifier to remove
1811  * the need for special case cleanup code.
1812  */
1813
1814 int unregister_netdevice_notifier(struct notifier_block *nb)
1815 {
1816         struct net_device *dev;
1817         struct net *net;
1818         int err;
1819
1820         /* Close race with setup_net() and cleanup_net() */
1821         down_write(&pernet_ops_rwsem);
1822         rtnl_lock();
1823         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1824         if (err)
1825                 goto unlock;
1826
1827         for_each_net(net) {
1828                 for_each_netdev(net, dev) {
1829                         if (dev->flags & IFF_UP) {
1830                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1831                                                         dev);
1832                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1833                         }
1834                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1835                 }
1836         }
1837 unlock:
1838         rtnl_unlock();
1839         up_write(&pernet_ops_rwsem);
1840         return err;
1841 }
1842 EXPORT_SYMBOL(unregister_netdevice_notifier);
1843
1844 /**
1845  *      call_netdevice_notifiers_info - call all network notifier blocks
1846  *      @val: value passed unmodified to notifier function
1847  *      @info: notifier information data
1848  *
1849  *      Call all network notifier blocks.  Parameters and return value
1850  *      are as for raw_notifier_call_chain().
1851  */
1852
1853 static int call_netdevice_notifiers_info(unsigned long val,
1854                                          struct netdev_notifier_info *info)
1855 {
1856         ASSERT_RTNL();
1857         return raw_notifier_call_chain(&netdev_chain, val, info);
1858 }
1859
1860 static int call_netdevice_notifiers_extack(unsigned long val,
1861                                            struct net_device *dev,
1862                                            struct netlink_ext_ack *extack)
1863 {
1864         struct netdev_notifier_info info = {
1865                 .dev = dev,
1866                 .extack = extack,
1867         };
1868
1869         return call_netdevice_notifiers_info(val, &info);
1870 }
1871
1872 /**
1873  *      call_netdevice_notifiers - call all network notifier blocks
1874  *      @val: value passed unmodified to notifier function
1875  *      @dev: net_device pointer passed unmodified to notifier function
1876  *
1877  *      Call all network notifier blocks.  Parameters and return value
1878  *      are as for raw_notifier_call_chain().
1879  */
1880
1881 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1882 {
1883         return call_netdevice_notifiers_extack(val, dev, NULL);
1884 }
1885 EXPORT_SYMBOL(call_netdevice_notifiers);
1886
1887 /**
1888  *      call_netdevice_notifiers_mtu - call all network notifier blocks
1889  *      @val: value passed unmodified to notifier function
1890  *      @dev: net_device pointer passed unmodified to notifier function
1891  *      @arg: additional u32 argument passed to the notifier function
1892  *
1893  *      Call all network notifier blocks.  Parameters and return value
1894  *      are as for raw_notifier_call_chain().
1895  */
1896 static int call_netdevice_notifiers_mtu(unsigned long val,
1897                                         struct net_device *dev, u32 arg)
1898 {
1899         struct netdev_notifier_info_ext info = {
1900                 .info.dev = dev,
1901                 .ext.mtu = arg,
1902         };
1903
1904         BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1905
1906         return call_netdevice_notifiers_info(val, &info.info);
1907 }
1908
#ifdef CONFIG_NET_INGRESS
/* Static key counting users of the ingress path; starts out false. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one reference on ingress_needed_key. */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on ingress_needed_key. */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1924
#ifdef CONFIG_NET_EGRESS
/* Static key counting users of the egress path; starts out false. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one reference on egress_needed_key. */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on egress_needed_key. */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1940
/* Enabled while at least one user wants packets to be timestamped. */
static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
/* Net change in users that has not been applied to netstamp_wanted yet. */
static atomic_t netstamp_needed_deferred;
/* Number of timestamp users already accounted for. */
static atomic_t netstamp_wanted;

/*
 * Work handler: fold the deferred delta into netstamp_wanted and switch
 * netstamp_needed_key on or off according to the resulting count.
 */
static void netstamp_clear(struct work_struct *work)
{
	int pending = atomic_xchg(&netstamp_needed_deferred, 0);

	if (atomic_add_return(pending, &netstamp_wanted) > 0)
		static_branch_enable(&netstamp_needed_key);
	else
		static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
1958
1959 void net_enable_timestamp(void)
1960 {
1961 #ifdef CONFIG_JUMP_LABEL
1962         int wanted;
1963
1964         while (1) {
1965                 wanted = atomic_read(&netstamp_wanted);
1966                 if (wanted <= 0)
1967                         break;
1968                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1969                         return;
1970         }
1971         atomic_inc(&netstamp_needed_deferred);
1972         schedule_work(&netstamp_work);
1973 #else
1974         static_branch_inc(&netstamp_needed_key);
1975 #endif
1976 }
1977 EXPORT_SYMBOL(net_enable_timestamp);
1978
1979 void net_disable_timestamp(void)
1980 {
1981 #ifdef CONFIG_JUMP_LABEL
1982         int wanted;
1983
1984         while (1) {
1985                 wanted = atomic_read(&netstamp_wanted);
1986                 if (wanted <= 1)
1987                         break;
1988                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1989                         return;
1990         }
1991         atomic_dec(&netstamp_needed_deferred);
1992         schedule_work(&netstamp_work);
1993 #else
1994         static_branch_dec(&netstamp_needed_key);
1995 #endif
1996 }
1997 EXPORT_SYMBOL(net_disable_timestamp);
1998
1999 static inline void net_timestamp_set(struct sk_buff *skb)
2000 {
2001         skb->tstamp = 0;
2002         if (static_branch_unlikely(&netstamp_needed_key))
2003                 __net_timestamp(skb);
2004 }
2005
/*
 * net_timestamp_check - timestamp @SKB if needed
 * @COND: extra condition that must hold for the packet to be stamped
 * @SKB: buffer to stamp
 *
 * When netstamp_needed_key is enabled, stamps @SKB if @COND is true and
 * the buffer does not carry a timestamp yet.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and can be used safely as the body of an if/else.
 */
#define net_timestamp_check(COND, SKB)					\
	do {								\
		if (static_branch_unlikely(&netstamp_needed_key)) {	\
			if ((COND) && !(SKB)->tstamp)			\
				__net_timestamp(SKB);			\
		}							\
	} while (0)
2011
2012 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2013 {
2014         unsigned int len;
2015
2016         if (!(dev->flags & IFF_UP))
2017                 return false;
2018
2019         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
2020         if (skb->len <= len)
2021                 return true;
2022
2023         /* if TSO is enabled, we don't care about the length as the packet
2024          * could be forwarded without being segmented before
2025          */
2026         if (skb_is_gso(skb))
2027                 return true;
2028
2029         return false;
2030 }
2031 EXPORT_SYMBOL_GPL(is_skb_forwardable);
2032
2033 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2034 {
2035         int ret = ____dev_forward_skb(dev, skb);
2036
2037         if (likely(!ret)) {
2038                 skb->protocol = eth_type_trans(skb, dev);
2039                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2040         }
2041
2042         return ret;
2043 }
2044 EXPORT_SYMBOL_GPL(__dev_forward_skb);
2045
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = __dev_forward_skb(dev, skb);

	/* Only queue the packet when the preparation step succeeded. */
	if (ret)
		return ret;

	return netif_rx_internal(skb);
}
2068 EXPORT_SYMBOL_GPL(dev_forward_skb);
2069
2070 static inline int deliver_skb(struct sk_buff *skb,
2071                               struct packet_type *pt_prev,
2072                               struct net_device *orig_dev)
2073 {
2074         if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2075                 return -ENOMEM;
2076         refcount_inc(&skb->users);
2077         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2078 }
2079
2080 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2081                                           struct packet_type **pt,
2082                                           struct net_device *orig_dev,
2083                                           __be16 type,
2084                                           struct list_head *ptype_list)
2085 {
2086         struct packet_type *ptype, *pt_prev = *pt;
2087
2088         list_for_each_entry_rcu(ptype, ptype_list, list) {
2089                 if (ptype->type != type)
2090                         continue;
2091                 if (pt_prev)
2092                         deliver_skb(skb, pt_prev, orig_dev);
2093                 pt_prev = ptype;
2094         }
2095         *pt = pt_prev;
2096 }
2097
2098 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2099 {
2100         if (!ptype->af_packet_priv || !skb->sk)
2101                 return false;
2102
2103         if (ptype->id_match)
2104                 return ptype->id_match(ptype, skb->sk);
2105         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2106                 return true;
2107
2108         return false;
2109 }
2110
2111 /**
2112  * dev_nit_active - return true if any network interface taps are in use
2113  *
2114  * @dev: network device to check for the presence of taps
2115  */
2116 bool dev_nit_active(struct net_device *dev)
2117 {
2118         return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2119 }
2120 EXPORT_SYMBOL_GPL(dev_nit_active);
2121
2122 /*
2123  *      Support routine. Sends outgoing frames to any network
2124  *      taps currently in use.
2125  */
2126
2127 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2128 {
2129         struct packet_type *ptype;
2130         struct sk_buff *skb2 = NULL;
2131         struct packet_type *pt_prev = NULL;
2132         struct list_head *ptype_list = &ptype_all;
2133
2134         rcu_read_lock();
2135 again:
2136         list_for_each_entry_rcu(ptype, ptype_list, list) {
2137                 if (ptype->ignore_outgoing)
2138                         continue;
2139
2140                 /* Never send packets back to the socket
2141                  * they originated from - MvS (miquels@drinkel.ow.org)
2142                  */
2143                 if (skb_loop_sk(ptype, skb))
2144                         continue;
2145
2146                 if (pt_prev) {
2147                         deliver_skb(skb2, pt_prev, skb->dev);
2148                         pt_prev = ptype;
2149                         continue;
2150                 }
2151
2152                 /* need to clone skb, done only once */
2153                 skb2 = skb_clone(skb, GFP_ATOMIC);
2154                 if (!skb2)
2155                         goto out_unlock;
2156
2157                 net_timestamp_set(skb2);
2158
2159                 /* skb->nh should be correctly
2160                  * set by sender, so that the second statement is
2161                  * just protection against buggy protocols.
2162                  */
2163                 skb_reset_mac_header(skb2);
2164
2165                 if (skb_network_header(skb2) < skb2->data ||
2166                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2167                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2168                                              ntohs(skb2->protocol),
2169                                              dev->name);
2170                         skb_reset_network_header(skb2);
2171                 }
2172
2173                 skb2->transport_header = skb2->network_header;
2174                 skb2->pkt_type = PACKET_OUTGOING;
2175                 pt_prev = ptype;
2176         }
2177
2178         if (ptype_list == &ptype_all) {
2179                 ptype_list = &dev->ptype_all;
2180                 goto again;
2181         }
2182 out_unlock:
2183         if (pt_prev) {
2184                 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2185                         pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2186                 else
2187                         kfree_skb(skb2);
2188         }
2189         rcu_read_unlock();
2190 }
2191 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2192
2193 /**
2194  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2195  * @dev: Network device
2196  * @txq: number of queues available
2197  *
2198  * If real_num_tx_queues is changed the tc mappings may no longer be
2199  * valid. To resolve this verify the tc mapping remains valid and if
2200  * not NULL the mapping. With no priorities mapping to this
2201  * offset/count pair it will no longer be used. In the worst case TC0
2202  * is invalid nothing can be done so disable priority mappings. If is
2203  * expected that drivers will fix this mapping if they can before
2204  * calling netif_set_real_num_tx_queues.
2205  */
2206 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2207 {
2208         int i;
2209         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2210
2211         /* If TC0 is invalidated disable TC mapping */
2212         if (tc->offset + tc->count > txq) {
2213                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2214                 dev->num_tc = 0;
2215                 return;
2216         }
2217
2218         /* Invalidated prio to tc mappings set to TC0 */
2219         for (i = 1; i < TC_BITMASK + 1; i++) {
2220                 int q = netdev_get_prio_tc_map(dev, i);
2221
2222                 tc = &dev->tc_to_txq[q];
2223                 if (tc->offset + tc->count > txq) {
2224                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2225                                 i, q);
2226                         netdev_set_prio_tc_map(dev, i, 0);
2227                 }
2228         }
2229 }
2230
2231 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2232 {
2233         if (dev->num_tc) {
2234                 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2235                 int i;
2236
2237                 /* walk through the TCs and see if it falls into any of them */
2238                 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2239                         if ((txq - tc->offset) < tc->count)
2240                                 return i;
2241                 }
2242
2243                 /* didn't find it, just return -1 to indicate no match */
2244                 return -1;
2245         }
2246
2247         return 0;
2248 }
2249 EXPORT_SYMBOL(netdev_txq_to_tc);
2250
2251 #ifdef CONFIG_XPS
2252 struct static_key xps_needed __read_mostly;
2253 EXPORT_SYMBOL(xps_needed);
2254 struct static_key xps_rxqs_needed __read_mostly;
2255 EXPORT_SYMBOL(xps_rxqs_needed);
2256 static DEFINE_MUTEX(xps_map_mutex);
2257 #define xmap_dereference(P)             \
2258         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2259
2260 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2261                              int tci, u16 index)
2262 {
2263         struct xps_map *map = NULL;
2264         int pos;
2265
2266         if (dev_maps)
2267                 map = xmap_dereference(dev_maps->attr_map[tci]);
2268         if (!map)
2269                 return false;
2270
2271         for (pos = map->len; pos--;) {
2272                 if (map->queues[pos] != index)
2273                         continue;
2274
2275                 if (map->len > 1) {
2276                         map->queues[pos] = map->queues[--map->len];
2277                         break;
2278                 }
2279
2280                 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2281                 kfree_rcu(map, rcu);
2282                 return false;
2283         }
2284
2285         return true;
2286 }
2287
2288 static bool remove_xps_queue_cpu(struct net_device *dev,
2289                                  struct xps_dev_maps *dev_maps,
2290                                  int cpu, u16 offset, u16 count)
2291 {
2292         int num_tc = dev->num_tc ? : 1;
2293         bool active = false;
2294         int tci;
2295
2296         for (tci = cpu * num_tc; num_tc--; tci++) {
2297                 int i, j;
2298
2299                 for (i = count, j = offset; i--; j++) {
2300                         if (!remove_xps_queue(dev_maps, tci, j))
2301                                 break;
2302                 }
2303
2304                 active |= i < 0;
2305         }
2306
2307         return active;
2308 }
2309
2310 static void reset_xps_maps(struct net_device *dev,
2311                            struct xps_dev_maps *dev_maps,
2312                            bool is_rxqs_map)
2313 {
2314         if (is_rxqs_map) {
2315                 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2316                 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2317         } else {
2318                 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2319         }
2320         static_key_slow_dec_cpuslocked(&xps_needed);
2321         kfree_rcu(dev_maps, rcu);
2322 }
2323
2324 static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2325                            struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2326                            u16 offset, u16 count, bool is_rxqs_map)
2327 {
2328         bool active = false;
2329         int i, j;
2330
2331         for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2332              j < nr_ids;)
2333                 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2334                                                count);
2335         if (!active)
2336                 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2337
2338         if (!is_rxqs_map) {
2339                 for (i = offset + (count - 1); count--; i--) {
2340                         netdev_queue_numa_node_write(
2341                                 netdev_get_tx_queue(dev, i),
2342                                 NUMA_NO_NODE);
2343                 }
2344         }
2345 }
2346
2347 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2348                                    u16 count)
2349 {
2350         const unsigned long *possible_mask = NULL;
2351         struct xps_dev_maps *dev_maps;
2352         unsigned int nr_ids;
2353
2354         if (!static_key_false(&xps_needed))
2355                 return;
2356
2357         cpus_read_lock();
2358         mutex_lock(&xps_map_mutex);
2359
2360         if (static_key_false(&xps_rxqs_needed)) {
2361                 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2362                 if (dev_maps) {
2363                         nr_ids = dev->num_rx_queues;
2364                         clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2365                                        offset, count, true);
2366                 }
2367         }
2368
2369         dev_maps = xmap_dereference(dev->xps_cpus_map);
2370         if (!dev_maps)
2371                 goto out_no_maps;
2372
2373         if (num_possible_cpus() > 1)
2374                 possible_mask = cpumask_bits(cpu_possible_mask);
2375         nr_ids = nr_cpu_ids;
2376         clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2377                        false);
2378
2379 out_no_maps:
2380         mutex_unlock(&xps_map_mutex);
2381         cpus_read_unlock();
2382 }
2383
2384 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2385 {
2386         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2387 }
2388
2389 static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2390                                       u16 index, bool is_rxqs_map)
2391 {
2392         struct xps_map *new_map;
2393         int alloc_len = XPS_MIN_MAP_ALLOC;
2394         int i, pos;
2395
2396         for (pos = 0; map && pos < map->len; pos++) {
2397                 if (map->queues[pos] != index)
2398                         continue;
2399                 return map;
2400         }
2401
2402         /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2403         if (map) {
2404                 if (pos < map->alloc_len)
2405                         return map;
2406
2407                 alloc_len = map->alloc_len * 2;
2408         }
2409
2410         /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2411          *  map
2412          */
2413         if (is_rxqs_map)
2414                 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2415         else
2416                 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2417                                        cpu_to_node(attr_index));
2418         if (!new_map)
2419                 return NULL;
2420
2421         for (i = 0; i < pos; i++)
2422                 new_map->queues[i] = map->queues[i];
2423         new_map->alloc_len = alloc_len;
2424         new_map->len = pos;
2425
2426         return new_map;
2427 }
2428
2429 /* Must be called under cpus_read_lock */
2430 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2431                           u16 index, bool is_rxqs_map)
2432 {
2433         const unsigned long *online_mask = NULL, *possible_mask = NULL;
2434         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2435         int i, j, tci, numa_node_id = -2;
2436         int maps_sz, num_tc = 1, tc = 0;
2437         struct xps_map *map, *new_map;
2438         bool active = false;
2439         unsigned int nr_ids;
2440
2441         if (dev->num_tc) {
2442                 /* Do not allow XPS on subordinate device directly */
2443                 num_tc = dev->num_tc;
2444                 if (num_tc < 0)
2445                         return -EINVAL;
2446
2447                 /* If queue belongs to subordinate dev use its map */
2448                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2449
2450                 tc = netdev_txq_to_tc(dev, index);
2451                 if (tc < 0)
2452                         return -EINVAL;
2453         }
2454
2455         mutex_lock(&xps_map_mutex);
2456         if (is_rxqs_map) {
2457                 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2458                 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2459                 nr_ids = dev->num_rx_queues;
2460         } else {
2461                 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2462                 if (num_possible_cpus() > 1) {
2463                         online_mask = cpumask_bits(cpu_online_mask);
2464                         possible_mask = cpumask_bits(cpu_possible_mask);
2465                 }
2466                 dev_maps = xmap_dereference(dev->xps_cpus_map);
2467                 nr_ids = nr_cpu_ids;
2468         }
2469
2470         if (maps_sz < L1_CACHE_BYTES)
2471                 maps_sz = L1_CACHE_BYTES;
2472
2473         /* allocate memory for queue storage */
2474         for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2475              j < nr_ids;) {
2476                 if (!new_dev_maps)
2477                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2478                 if (!new_dev_maps) {
2479                         mutex_unlock(&xps_map_mutex);
2480                         return -ENOMEM;
2481                 }
2482
2483                 tci = j * num_tc + tc;
2484                 map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2485                                  NULL;
2486
2487                 map = expand_xps_map(map, j, index, is_rxqs_map);
2488                 if (!map)
2489                         goto error;
2490
2491                 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2492         }
2493
2494         if (!new_dev_maps)
2495                 goto out_no_new_maps;
2496
2497         if (!dev_maps) {
2498                 /* Increment static keys at most once per type */
2499                 static_key_slow_inc_cpuslocked(&xps_needed);
2500                 if (is_rxqs_map)
2501                         static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2502         }
2503
2504         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2505              j < nr_ids;) {
2506                 /* copy maps belonging to foreign traffic classes */
2507                 for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2508                         /* fill in the new device map from the old device map */
2509                         map = xmap_dereference(dev_maps->attr_map[tci]);
2510                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2511                 }
2512
2513                 /* We need to explicitly update tci as prevous loop
2514                  * could break out early if dev_maps is NULL.
2515                  */
2516                 tci = j * num_tc + tc;
2517
2518                 if (netif_attr_test_mask(j, mask, nr_ids) &&
2519                     netif_attr_test_online(j, online_mask, nr_ids)) {
2520                         /* add tx-queue to CPU/rx-queue maps */
2521                         int pos = 0;
2522
2523                         map = xmap_dereference(new_dev_maps->attr_map[tci]);
2524                         while ((pos < map->len) && (map->queues[pos] != index))
2525                                 pos++;
2526
2527                         if (pos == map->len)
2528                                 map->queues[map->len++] = index;
2529 #ifdef CONFIG_NUMA
2530                         if (!is_rxqs_map) {
2531                                 if (numa_node_id == -2)
2532                                         numa_node_id = cpu_to_node(j);
2533                                 else if (numa_node_id != cpu_to_node(j))
2534                                         numa_node_id = -1;
2535                         }
2536 #endif
2537                 } else if (dev_maps) {
2538                         /* fill in the new device map from the old device map */
2539                         map = xmap_dereference(dev_maps->attr_map[tci]);
2540                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2541                 }
2542
2543                 /* copy maps belonging to foreign traffic classes */
2544                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2545                         /* fill in the new device map from the old device map */
2546                         map = xmap_dereference(dev_maps->attr_map[tci]);
2547                         RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2548                 }
2549         }
2550
2551         if (is_rxqs_map)
2552                 rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2553         else
2554                 rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2555
2556         /* Cleanup old maps */
2557         if (!dev_maps)
2558                 goto out_no_old_maps;
2559
2560         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2561              j < nr_ids;) {
2562                 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2563                         new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2564                         map = xmap_dereference(dev_maps->attr_map[tci]);
2565                         if (map && map != new_map)
2566                                 kfree_rcu(map, rcu);
2567                 }
2568         }
2569
2570         kfree_rcu(dev_maps, rcu);
2571
2572 out_no_old_maps:
2573         dev_maps = new_dev_maps;
2574         active = true;
2575
2576 out_no_new_maps:
2577         if (!is_rxqs_map) {
2578                 /* update Tx queue numa node */
2579                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2580                                              (numa_node_id >= 0) ?
2581                                              numa_node_id : NUMA_NO_NODE);
2582         }
2583
2584         if (!dev_maps)
2585                 goto out_no_maps;
2586
2587         /* removes tx-queue from unused CPUs/rx-queues */
2588         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2589              j < nr_ids;) {
2590                 for (i = tc, tci = j * num_tc; i--; tci++)
2591                         active |= remove_xps_queue(dev_maps, tci, index);
2592                 if (!netif_attr_test_mask(j, mask, nr_ids) ||
2593                     !netif_attr_test_online(j, online_mask, nr_ids))
2594                         active |= remove_xps_queue(dev_maps, tci, index);
2595                 for (i = num_tc - tc, tci++; --i; tci++)
2596                         active |= remove_xps_queue(dev_maps, tci, index);
2597         }
2598
2599         /* free map if not active */
2600         if (!active)
2601                 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2602
2603 out_no_maps:
2604         mutex_unlock(&xps_map_mutex);
2605
2606         return 0;
2607 error:
2608         /* remove any maps that we added */
2609         for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2610              j < nr_ids;) {
2611                 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2612                         new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2613                         map = dev_maps ?
2614                               xmap_dereference(dev_maps->attr_map[tci]) :
2615                               NULL;
2616                         if (new_map && new_map != map)
2617                                 kfree(new_map);
2618                 }
2619         }
2620
2621         mutex_unlock(&xps_map_mutex);
2622
2623         kfree(new_dev_maps);
2624         return -ENOMEM;
2625 }
2626 EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2627
2628 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2629                         u16 index)
2630 {
2631         int ret;
2632
2633         cpus_read_lock();
2634         ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2635         cpus_read_unlock();
2636
2637         return ret;
2638 }
2639 EXPORT_SYMBOL(netif_set_xps_queue);
2640
2641 #endif
2642 static void netdev_unbind_all_sb_channels(struct net_device *dev)
2643 {
2644         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2645
2646         /* Unbind any subordinate channels */
2647         while (txq-- != &dev->_tx[0]) {
2648                 if (txq->sb_dev)
2649                         netdev_unbind_sb_channel(dev, txq->sb_dev);
2650         }
2651 }
2652
2653 void netdev_reset_tc(struct net_device *dev)
2654 {
2655 #ifdef CONFIG_XPS
2656         netif_reset_xps_queues_gt(dev, 0);
2657 #endif
2658         netdev_unbind_all_sb_channels(dev);
2659
2660         /* Reset TC configuration of device */
2661         dev->num_tc = 0;
2662         memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2663         memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2664 }
2665 EXPORT_SYMBOL(netdev_reset_tc);
2666
2667 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2668 {
2669         if (tc >= dev->num_tc)
2670                 return -EINVAL;
2671
2672 #ifdef CONFIG_XPS
2673         netif_reset_xps_queues(dev, offset, count);
2674 #endif
2675         dev->tc_to_txq[tc].count = count;
2676         dev->tc_to_txq[tc].offset = offset;
2677         return 0;
2678 }
2679 EXPORT_SYMBOL(netdev_set_tc_queue);
2680
2681 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2682 {
2683         if (num_tc > TC_MAX_QUEUE)
2684                 return -EINVAL;
2685
2686 #ifdef CONFIG_XPS
2687         netif_reset_xps_queues_gt(dev, 0);
2688 #endif
2689         netdev_unbind_all_sb_channels(dev);
2690
2691         dev->num_tc = num_tc;
2692         return 0;
2693 }
2694 EXPORT_SYMBOL(netdev_set_num_tc);
2695
2696 void netdev_unbind_sb_channel(struct net_device *dev,
2697                               struct net_device *sb_dev)
2698 {
2699         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2700
2701 #ifdef CONFIG_XPS
2702         netif_reset_xps_queues_gt(sb_dev, 0);
2703 #endif
2704         memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2705         memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2706
2707         while (txq-- != &dev->_tx[0]) {
2708                 if (txq->sb_dev == sb_dev)
2709                         txq->sb_dev = NULL;
2710         }
2711 }
2712 EXPORT_SYMBOL(netdev_unbind_sb_channel);
2713
2714 int netdev_bind_sb_channel_queue(struct net_device *dev,
2715                                  struct net_device *sb_dev,
2716                                  u8 tc, u16 count, u16 offset)
2717 {
2718         /* Make certain the sb_dev and dev are already configured */
2719         if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2720                 return -EINVAL;
2721
2722         /* We cannot hand out queues we don't have */
2723         if ((offset + count) > dev->real_num_tx_queues)
2724                 return -EINVAL;
2725
2726         /* Record the mapping */
2727         sb_dev->tc_to_txq[tc].count = count;
2728         sb_dev->tc_to_txq[tc].offset = offset;
2729
2730         /* Provide a way for Tx queue to find the tc_to_txq map or
2731          * XPS map for itself.
2732          */
2733         while (count--)
2734                 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2735
2736         return 0;
2737 }
2738 EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2739
2740 int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2741 {
2742         /* Do not use a multiqueue device to represent a subordinate channel */
2743         if (netif_is_multiqueue(dev))
2744                 return -ENODEV;
2745
2746         /* We allow channels 1 - 32767 to be used for subordinate channels.
2747          * Channel 0 is meant to be "native" mode and used only to represent
2748          * the main root device. We allow writing 0 to reset the device back
2749          * to normal mode after being used as a subordinate channel.
2750          */
2751         if (channel > S16_MAX)
2752                 return -EINVAL;
2753
2754         dev->num_tc = -channel;
2755
2756         return 0;
2757 }
2758 EXPORT_SYMBOL(netdev_set_sb_channel);
2759
2760 /*
2761  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2762  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2763  */
2764 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2765 {
2766         bool disabling;
2767         int rc;
2768
2769         disabling = txq < dev->real_num_tx_queues;
2770
2771         if (txq < 1 || txq > dev->num_tx_queues)
2772                 return -EINVAL;
2773
2774         if (dev->reg_state == NETREG_REGISTERED ||
2775             dev->reg_state == NETREG_UNREGISTERING) {
2776                 ASSERT_RTNL();
2777
2778                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2779                                                   txq);
2780                 if (rc)
2781                         return rc;
2782
2783                 if (dev->num_tc)
2784                         netif_setup_tc(dev, txq);
2785
2786                 dev->real_num_tx_queues = txq;
2787
2788                 if (disabling) {
2789                         synchronize_net();
2790                         qdisc_reset_all_tx_gt(dev, txq);
2791 #ifdef CONFIG_XPS
2792                         netif_reset_xps_queues_gt(dev, txq);
2793 #endif
2794                 }
2795         } else {
2796                 dev->real_num_tx_queues = txq;
2797         }
2798
2799         return 0;
2800 }
2801 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2802
2803 #ifdef CONFIG_SYSFS
2804 /**
2805  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2806  *      @dev: Network device
2807  *      @rxq: Actual number of RX queues
2808  *
2809  *      This must be called either with the rtnl_lock held or before
2810  *      registration of the net device.  Returns 0 on success, or a
2811  *      negative error code.  If called before registration, it always
2812  *      succeeds.
2813  */
2814 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2815 {
2816         int rc;
2817
2818         if (rxq < 1 || rxq > dev->num_rx_queues)
2819                 return -EINVAL;
2820
2821         if (dev->reg_state == NETREG_REGISTERED) {
2822                 ASSERT_RTNL();
2823
2824                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2825                                                   rxq);
2826                 if (rc)
2827                         return rc;
2828         }
2829
2830         dev->real_num_rx_queues = rxq;
2831         return 0;
2832 }
2833 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2834 #endif
2835
2836 /**
2837  * netif_get_num_default_rss_queues - default number of RSS queues
2838  *
2839  * This routine should set an upper limit on the number of RSS queues
2840  * used by default by multiqueue devices.
2841  */
2842 int netif_get_num_default_rss_queues(void)
2843 {
2844         return is_kdump_kernel() ?
2845                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2846 }
2847 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2848
2849 static void __netif_reschedule(struct Qdisc *q)
2850 {
2851         struct softnet_data *sd;
2852         unsigned long flags;
2853
2854         local_irq_save(flags);
2855         sd = this_cpu_ptr(&softnet_data);
2856         q->next_sched = NULL;
2857         *sd->output_queue_tailp = q;
2858         sd->output_queue_tailp = &q->next_sched;
2859         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2860         local_irq_restore(flags);
2861 }
2862
2863 void __netif_schedule(struct Qdisc *q)
2864 {
2865         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2866                 __netif_reschedule(q);
2867 }
2868 EXPORT_SYMBOL(__netif_schedule);
2869
2870 struct dev_kfree_skb_cb {
2871         enum skb_free_reason reason;
2872 };
2873
2874 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2875 {
2876         return (struct dev_kfree_skb_cb *)skb->cb;
2877 }
2878
2879 void netif_schedule_queue(struct netdev_queue *txq)
2880 {
2881         rcu_read_lock();
2882         if (!netif_xmit_stopped(txq)) {
2883                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2884
2885                 __netif_schedule(q);
2886         }
2887         rcu_read_unlock();
2888 }
2889 EXPORT_SYMBOL(netif_schedule_queue);
2890
2891 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2892 {
2893         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2894                 struct Qdisc *q;
2895
2896                 rcu_read_lock();
2897                 q = rcu_dereference(dev_queue->qdisc);
2898                 __netif_schedule(q);
2899                 rcu_read_unlock();
2900         }
2901 }
2902 EXPORT_SYMBOL(netif_tx_wake_queue);
2903
2904 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2905 {
2906         unsigned long flags;
2907
2908         if (unlikely(!skb))
2909                 return;
2910
2911         if (likely(refcount_read(&skb->users) == 1)) {
2912                 smp_rmb();
2913                 refcount_set(&skb->users, 0);
2914         } else if (likely(!refcount_dec_and_test(&skb->users))) {
2915                 return;
2916         }
2917         get_kfree_skb_cb(skb)->reason = reason;
2918         local_irq_save(flags);
2919         skb->next = __this_cpu_read(softnet_data.completion_queue);
2920         __this_cpu_write(softnet_data.completion_queue, skb);
2921         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2922         local_irq_restore(flags);
2923 }
2924 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2925
2926 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2927 {
2928         if (in_irq() || irqs_disabled())
2929                 __dev_kfree_skb_irq(skb, reason);
2930         else
2931                 dev_kfree_skb(skb);
2932 }
2933 EXPORT_SYMBOL(__dev_kfree_skb_any);
2934
2935
2936 /**
2937  * netif_device_detach - mark device as removed
2938  * @dev: network device
2939  *
2940  * Mark device as removed from system and therefore no longer available.
2941  */
2942 void netif_device_detach(struct net_device *dev)
2943 {
2944         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2945             netif_running(dev)) {
2946                 netif_tx_stop_all_queues(dev);
2947         }
2948 }
2949 EXPORT_SYMBOL(netif_device_detach);
2950
2951 /**
2952  * netif_device_attach - mark device as attached
2953  * @dev: network device
2954  *
2955  * Mark device as attached from system and restart if needed.
2956  */
2957 void netif_device_attach(struct net_device *dev)
2958 {
2959         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2960             netif_running(dev)) {
2961                 netif_tx_wake_all_queues(dev);
2962                 __netdev_watchdog_up(dev);
2963         }
2964 }
2965 EXPORT_SYMBOL(netif_device_attach);
2966
2967 /*
2968  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2969  * to be used as a distribution range.
2970  */
2971 static u16 skb_tx_hash(const struct net_device *dev,
2972                        const struct net_device *sb_dev,
2973                        struct sk_buff *skb)
2974 {
2975         u32 hash;
2976         u16 qoffset = 0;
2977         u16 qcount = dev->real_num_tx_queues;
2978
2979         if (dev->num_tc) {
2980                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2981
2982                 qoffset = sb_dev->tc_to_txq[tc].offset;
2983                 qcount = sb_dev->tc_to_txq[tc].count;
2984         }
2985
2986         if (skb_rx_queue_recorded(skb)) {
2987                 hash = skb_get_rx_queue(skb);
2988                 while (unlikely(hash >= qcount))
2989                         hash -= qcount;
2990                 return hash + qoffset;
2991         }
2992
2993         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2994 }
2995
2996 static void skb_warn_bad_offload(const struct sk_buff *skb)
2997 {
2998         static const netdev_features_t null_features;
2999         struct net_device *dev = skb->dev;
3000         const char *name = "";
3001
3002         if (!net_ratelimit())
3003                 return;
3004
3005         if (dev) {
3006                 if (dev->dev.parent)
3007                         name = dev_driver_string(dev->dev.parent);
3008                 else
3009                         name = netdev_name(dev);
3010         }
3011         skb_dump(KERN_WARNING, skb, false);
3012         WARN(1, "%s: caps=(%pNF, %pNF)\n",
3013              name, dev ? &dev->features : &null_features,
3014              skb->sk ? &skb->sk->sk_route_caps : &null_features);
3015 }
3016
3017 /*
3018  * Invalidate hardware checksum when packet is to be mangled, and
3019  * complete checksum manually on outgoing path.
3020  */
3021 int skb_checksum_help(struct sk_buff *skb)
3022 {
3023         __wsum csum;
3024         int ret = 0, offset;
3025
3026         if (skb->ip_summed == CHECKSUM_COMPLETE)
3027                 goto out_set_summed;
3028
3029         if (unlikely(skb_shinfo(skb)->gso_size)) {
3030                 skb_warn_bad_offload(skb);
3031                 return -EINVAL;
3032         }
3033
3034         /* Before computing a checksum, we should make sure no frag could
3035          * be modified by an external entity : checksum could be wrong.
3036          */
3037         if (skb_has_shared_frag(skb)) {
3038                 ret = __skb_linearize(skb);
3039                 if (ret)
3040                         goto out;
3041         }
3042
3043         offset = skb_checksum_start_offset(skb);
3044         BUG_ON(offset >= skb_headlen(skb));
3045         csum = skb_checksum(skb, offset, skb->len - offset, 0);
3046
3047         offset += skb->csum_offset;
3048         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3049
3050         if (skb_cloned(skb) &&
3051             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
3052                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
3053                 if (ret)
3054                         goto out;
3055         }
3056
3057         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3058 out_set_summed:
3059         skb->ip_summed = CHECKSUM_NONE;
3060 out:
3061         return ret;
3062 }
3063 EXPORT_SYMBOL(skb_checksum_help);
3064
3065 int skb_crc32c_csum_help(struct sk_buff *skb)
3066 {
3067         __le32 crc32c_csum;
3068         int ret = 0, offset, start;
3069
3070         if (skb->ip_summed != CHECKSUM_PARTIAL)
3071                 goto out;
3072
3073         if (unlikely(skb_is_gso(skb)))
3074                 goto out;
3075
3076         /* Before computing a checksum, we should make sure no frag could
3077          * be modified by an external entity : checksum could be wrong.
3078          */
3079         if (unlikely(skb_has_shared_frag(skb))) {
3080                 ret = __skb_linearize(skb);
3081                 if (ret)
3082                         goto out;
3083         }
3084         start = skb_checksum_start_offset(skb);
3085         offset = start + offsetof(struct sctphdr, checksum);
3086         if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3087                 ret = -EINVAL;
3088                 goto out;
3089         }
3090         if (skb_cloned(skb) &&
3091             !skb_clone_writable(skb, offset + sizeof(__le32))) {
3092                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
3093                 if (ret)
3094                         goto out;
3095         }
3096         crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3097                                                   skb->len - start, ~(__u32)0,
3098                                                   crc32c_csum_stub));
3099         *(__le32 *)(skb->data + offset) = crc32c_csum;
3100         skb->ip_summed = CHECKSUM_NONE;
3101         skb->csum_not_inet = 0;
3102 out:
3103         return ret;
3104 }
3105
3106 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3107 {
3108         __be16 type = skb->protocol;
3109
3110         /* Tunnel gso handlers can set protocol to ethernet. */
3111         if (type == htons(ETH_P_TEB)) {
3112                 struct ethhdr *eth;
3113
3114                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3115                         return 0;
3116
3117                 eth = (struct ethhdr *)skb->data;
3118                 type = eth->h_proto;
3119         }
3120
3121         return __vlan_get_protocol(skb, type, depth);
3122 }
3123
3124 /**
3125  *      skb_mac_gso_segment - mac layer segmentation handler.
3126  *      @skb: buffer to segment
3127  *      @features: features for the output path (see dev->features)
3128  */
3129 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3130                                     netdev_features_t features)
3131 {
3132         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3133         struct packet_offload *ptype;
3134         int vlan_depth = skb->mac_len;
3135         __be16 type = skb_network_protocol(skb, &vlan_depth);
3136
3137         if (unlikely(!type))
3138                 return ERR_PTR(-EINVAL);
3139
3140         __skb_pull(skb, vlan_depth);
3141
3142         rcu_read_lock();
3143         list_for_each_entry_rcu(ptype, &offload_base, list) {
3144                 if (ptype->type == type && ptype->callbacks.gso_segment) {
3145                         segs = ptype->callbacks.gso_segment(skb, features);
3146                         break;
3147                 }
3148         }
3149         rcu_read_unlock();
3150
3151         __skb_push(skb, skb->data - skb_mac_header(skb));
3152
3153         return segs;
3154 }
3155 EXPORT_SYMBOL(skb_mac_gso_segment);
3156
3157
3158 /* openvswitch calls this on rx path, so we need a different check.
3159  */
3160 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3161 {
3162         if (tx_path)
3163                 return skb->ip_summed != CHECKSUM_PARTIAL &&
3164                        skb->ip_summed != CHECKSUM_UNNECESSARY;
3165
3166         return skb->ip_summed == CHECKSUM_NONE;
3167 }
3168
3169 /**
3170  *      __skb_gso_segment - Perform segmentation on skb.
3171  *      @skb: buffer to segment
3172  *      @features: features for the output path (see dev->features)
3173  *      @tx_path: whether it is called in TX path
3174  *
3175  *      This function segments the given skb and returns a list of segments.
3176  *
3177  *      It may return NULL if the skb requires no segmentation.  This is
3178  *      only possible when GSO is used for verifying header integrity.
3179  *
3180  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
3181  */
3182 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3183                                   netdev_features_t features, bool tx_path)
3184 {
3185         struct sk_buff *segs;
3186
3187         if (unlikely(skb_needs_check(skb, tx_path))) {
3188                 int err;
3189
3190                 /* We're going to init ->check field in TCP or UDP header */
3191                 err = skb_cow_head(skb, 0);
3192                 if (err < 0)
3193                         return ERR_PTR(err);
3194         }
3195
3196         /* Only report GSO partial support if it will enable us to
3197          * support segmentation on this frame without needing additional
3198          * work.
3199          */
3200         if (features & NETIF_F_GSO_PARTIAL) {
3201                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3202                 struct net_device *dev = skb->dev;
3203
3204                 partial_features |= dev->features & dev->gso_partial_features;
3205                 if (!skb_gso_ok(skb, features | partial_features))
3206                         features &= ~NETIF_F_GSO_PARTIAL;
3207         }
3208
3209         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3210                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3211
3212         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3213         SKB_GSO_CB(skb)->encap_level = 0;
3214
3215         skb_reset_mac_header(skb);
3216         skb_reset_mac_len(skb);
3217
3218         segs = skb_mac_gso_segment(skb, features);
3219
3220         if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3221                 skb_warn_bad_offload(skb);
3222
3223         return segs;
3224 }
3225 EXPORT_SYMBOL(__skb_gso_segment);
3226
3227 /* Take action when hardware reception checksum errors are detected. */
3228 #ifdef CONFIG_BUG
3229 void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3230 {
3231         if (net_ratelimit()) {
3232                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3233                 skb_dump(KERN_ERR, skb, true);
3234                 dump_stack();
3235         }
3236 }
3237 EXPORT_SYMBOL(netdev_rx_csum_fault);
3238 #endif
3239
/* XXX: check that highmem exists at all on the given machine.
 *
 * Returns 1 when @dev lacks NETIF_F_HIGHDMA and at least one fragment of
 * @skb lives in a highmem page, 0 otherwise.  Always 0 without
 * CONFIG_HIGHMEM.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		if (PageHighMem(skb_frag_page(frag)))
			return 1;
	}
#endif
	return 0;
}
3257
3258 /* If MPLS offload request, verify we are testing hardware MPLS features
3259  * instead of standard features for the netdev.
3260  */
3261 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3262 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3263                                            netdev_features_t features,
3264                                            __be16 type)
3265 {
3266         if (eth_p_mpls(type))
3267                 features &= skb->dev->mpls_features;
3268
3269         return features;
3270 }
3271 #else
3272 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3273                                            netdev_features_t features,
3274                                            __be16 type)
3275 {
3276         return features;
3277 }
3278 #endif
3279
3280 static netdev_features_t harmonize_features(struct sk_buff *skb,
3281         netdev_features_t features)
3282 {
3283         int tmp;
3284         __be16 type;
3285
3286         type = skb_network_protocol(skb, &tmp);
3287         features = net_mpls_features(skb, features, type);
3288
3289         if (skb->ip_summed != CHECKSUM_NONE &&
3290             !can_checksum_protocol(features, type)) {
3291                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3292         }
3293         if (illegal_highdma(skb->dev, skb))
3294                 features &= ~NETIF_F_SG;
3295
3296         return features;
3297 }
3298
3299 netdev_features_t passthru_features_check(struct sk_buff *skb,
3300                                           struct net_device *dev,
3301                                           netdev_features_t features)
3302 {
3303         return features;
3304 }
3305 EXPORT_SYMBOL(passthru_features_check);
3306
3307 static netdev_features_t dflt_features_check(struct sk_buff *skb,
3308                                              struct net_device *dev,
3309                                              netdev_features_t features)
3310 {
3311         return vlan_features_check(skb, features);
3312 }
3313
3314 static netdev_features_t gso_features_check(const struct sk_buff *skb,
3315                                             struct net_device *dev,
3316                                             netdev_features_t features)
3317 {
3318         u16 gso_segs = skb_shinfo(skb)->gso_segs;
3319
3320         if (gso_segs > dev->gso_max_segs)
3321                 return features & ~NETIF_F_GSO_MASK;
3322
3323         /* Support for GSO partial features requires software
3324          * intervention before we can actually process the packets
3325          * so we need to strip support for any partial features now
3326          * and we can pull them back in after we have partially
3327          * segmented the frame.
3328          */
3329         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3330                 features &= ~dev->gso_partial_features;
3331
3332         /* Make sure to clear the IPv4 ID mangling feature if the
3333          * IPv4 header has the potential to be fragmented.
3334          */
3335         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3336                 struct iphdr *iph = skb->encapsulation ?
3337                                     inner_ip_hdr(skb) : ip_hdr(skb);
3338
3339                 if (!(iph->frag_off & htons(IP_DF)))
3340                         features &= ~NETIF_F_TSO_MANGLEID;
3341         }
3342
3343         return features;
3344 }
3345
3346 netdev_features_t netif_skb_features(struct sk_buff *skb)
3347 {
3348         struct net_device *dev = skb->dev;
3349         netdev_features_t features = dev->features;
3350
3351         if (skb_is_gso(skb))
3352                 features = gso_features_check(skb, dev, features);
3353
3354         /* If encapsulation offload request, verify we are testing
3355          * hardware encapsulation features instead of standard
3356          * features for the netdev
3357          */
3358         if (skb->encapsulation)
3359                 features &= dev->hw_enc_features;
3360
3361         if (skb_vlan_tagged(skb))
3362                 features = netdev_intersect_features(features,
3363                                                      dev->vlan_features |
3364                                                      NETIF_F_HW_VLAN_CTAG_TX |
3365                                                      NETIF_F_HW_VLAN_STAG_TX);
3366
3367         if (dev->netdev_ops->ndo_features_check)
3368                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3369                                                                 features);
3370         else
3371                 features &= dflt_features_check(skb, dev, features);
3372
3373         return harmonize_features(skb, features);
3374 }
3375 EXPORT_SYMBOL(netif_skb_features);
3376
3377 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3378                     struct netdev_queue *txq, bool more)
3379 {
3380         unsigned int len;
3381         int rc;
3382
3383         if (dev_nit_active(dev))
3384                 dev_queue_xmit_nit(skb, dev);
3385
3386         len = skb->len;
3387         trace_net_dev_start_xmit(skb, dev);
3388         rc = netdev_start_xmit(skb, dev, txq, more);
3389         trace_net_dev_xmit(skb, rc, dev, len);
3390
3391         return rc;
3392 }
3393
3394 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3395                                     struct netdev_queue *txq, int *ret)
3396 {
3397         struct sk_buff *skb = first;
3398         int rc = NETDEV_TX_OK;
3399
3400         while (skb) {
3401                 struct sk_buff *next = skb->next;
3402
3403                 skb_mark_not_on_list(skb);
3404                 rc = xmit_one(skb, dev, txq, next != NULL);
3405                 if (unlikely(!dev_xmit_complete(rc))) {
3406                         skb->next = next;
3407                         goto out;
3408                 }
3409
3410                 skb = next;
3411                 if (netif_tx_queue_stopped(txq) && skb) {
3412                         rc = NETDEV_TX_BUSY;
3413                         break;
3414                 }
3415         }
3416
3417 out:
3418         *ret = rc;
3419         return skb;
3420 }
3421
3422 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3423                                           netdev_features_t features)
3424 {
3425         if (skb_vlan_tag_present(skb) &&
3426             !vlan_hw_offload_capable(features, skb->vlan_proto))
3427                 skb = __vlan_hwaccel_push_inside(skb);
3428         return skb;
3429 }
3430
3431 int skb_csum_hwoffload_help(struct sk_buff *skb,
3432                             const netdev_features_t features)
3433 {
3434         if (unlikely(skb->csum_not_inet))
3435                 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3436                         skb_crc32c_csum_help(skb);
3437
3438         return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3439 }
3440 EXPORT_SYMBOL(skb_csum_hwoffload_help);
3441
3442 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3443 {
3444         netdev_features_t features;
3445
3446         features = netif_skb_features(skb);
3447         skb = validate_xmit_vlan(skb, features);
3448         if (unlikely(!skb))
3449                 goto out_null;
3450
3451         skb = sk_validate_xmit_skb(skb, dev);
3452         if (unlikely(!skb))
3453                 goto out_null;
3454
3455         if (netif_needs_gso(skb, features)) {
3456                 struct sk_buff *segs;
3457
3458                 segs = skb_gso_segment(skb, features);
3459                 if (IS_ERR(segs)) {
3460                         goto out_kfree_skb;
3461                 } else if (segs) {
3462                         consume_skb(skb);
3463                         skb = segs;
3464                 }
3465         } else {
3466                 if (skb_needs_linearize(skb, features) &&
3467                     __skb_linearize(skb))
3468                         goto out_kfree_skb;
3469
3470                 /* If packet is not checksummed and device does not
3471                  * support checksumming for this protocol, complete
3472                  * checksumming here.
3473                  */
3474                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3475                         if (skb->encapsulation)
3476                                 skb_set_inner_transport_header(skb,
3477                                                                skb_checksum_start_offset(skb));
3478                         else
3479                                 skb_set_transport_header(skb,
3480                                                          skb_checksum_start_offset(skb));
3481                         if (skb_csum_hwoffload_help(skb, features))
3482                                 goto out_kfree_skb;
3483                 }
3484         }
3485
3486         skb = validate_xmit_xfrm(skb, features, again);
3487
3488         return skb;
3489
3490 out_kfree_skb:
3491         kfree_skb(skb);
3492 out_null:
3493         atomic_long_inc(&dev->tx_dropped);
3494         return NULL;
3495 }
3496
3497 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3498 {
3499         struct sk_buff *next, *head = NULL, *tail;
3500
3501         for (; skb != NULL; skb = next) {
3502                 next = skb->next;
3503                 skb_mark_not_on_list(skb);
3504
3505                 /* in case skb wont be segmented, point to itself */
3506                 skb->prev = skb;
3507
3508                 skb = validate_xmit_skb(skb, dev, again);
3509                 if (!skb)
3510                         continue;
3511
3512                 if (!head)
3513                         head = skb;
3514                 else
3515                         tail->next = skb;
3516                 /* If skb was segmented, skb->prev points to
3517                  * the last segment. If not, it still contains skb.
3518                  */
3519                 tail = skb->prev;
3520         }
3521         return head;
3522 }
3523 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3524
3525 static void qdisc_pkt_len_init(struct sk_buff *skb)
3526 {
3527         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3528
3529         qdisc_skb_cb(skb)->pkt_len = skb->len;
3530
3531         /* To get more precise estimation of bytes sent on wire,
3532          * we add to pkt_len the headers size of all segments
3533          */
3534         if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3535                 unsigned int hdr_len;
3536                 u16 gso_segs = shinfo->gso_segs;
3537
3538                 /* mac layer + network layer */
3539                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3540
3541                 /* + transport layer */
3542                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3543                         const struct tcphdr *th;
3544                         struct tcphdr _tcphdr;
3545
3546                         th = skb_header_pointer(skb, skb_transport_offset(skb),
3547                                                 sizeof(_tcphdr), &_tcphdr);
3548                         if (likely(th))
3549                                 hdr_len += __tcp_hdrlen(th);
3550                 } else {
3551                         struct udphdr _udphdr;
3552
3553                         if (skb_header_pointer(skb, skb_transport_offset(skb),
3554                                                sizeof(_udphdr), &_udphdr))
3555                                 hdr_len += sizeof(struct udphdr);
3556                 }
3557
3558                 if (shinfo->gso_type & SKB_GSO_DODGY)
3559                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3560                                                 shinfo->gso_size);
3561
3562                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3563         }
3564 }
3565
3566 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3567                                  struct net_device *dev,
3568                                  struct netdev_queue *txq)
3569 {
3570         spinlock_t *root_lock = qdisc_lock(q);
3571         struct sk_buff *to_free = NULL;
3572         bool contended;
3573         int rc;
3574
3575         qdisc_calculate_pkt_len(skb, q);
3576
3577         if (q->flags & TCQ_F_NOLOCK) {
3578                 if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
3579                     qdisc_run_begin(q)) {
3580                         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
3581                                               &q->state))) {
3582                                 __qdisc_drop(skb, &to_free);
3583                                 rc = NET_XMIT_DROP;
3584                                 goto end_run;
3585                         }
3586                         qdisc_bstats_cpu_update(q, skb);
3587
3588                         rc = NET_XMIT_SUCCESS;
3589                         if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
3590                                 __qdisc_run(q);
3591
3592 end_run:
3593                         qdisc_run_end(q);
3594                 } else {
3595                         rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3596                         qdisc_run(q);
3597                 }
3598
3599                 if (unlikely(to_free))
3600                         kfree_skb_list(to_free);
3601                 return rc;
3602         }
3603
3604         /*
3605          * Heuristic to force contended enqueues to serialize on a
3606          * separate lock before trying to get qdisc main lock.
3607          * This permits qdisc->running owner to get the lock more
3608          * often and dequeue packets faster.
3609          */
3610         contended = qdisc_is_running(q);
3611         if (unlikely(contended))
3612                 spin_lock(&q->busylock);
3613
3614         spin_lock(root_lock);
3615         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3616                 __qdisc_drop(skb, &to_free);
3617                 rc = NET_XMIT_DROP;
3618         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3619                    qdisc_run_begin(q)) {
3620                 /*
3621                  * This is a work-conserving queue; there are no old skbs
3622                  * waiting to be sent out; and the qdisc is not running -
3623                  * xmit the skb directly.
3624                  */
3625
3626                 qdisc_bstats_update(q, skb);
3627
3628                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3629                         if (unlikely(contended)) {
3630                                 spin_unlock(&q->busylock);
3631                                 contended = false;
3632                         }
3633                         __qdisc_run(q);
3634                 }
3635
3636                 qdisc_run_end(q);
3637                 rc = NET_XMIT_SUCCESS;
3638         } else {
3639                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3640                 if (qdisc_run_begin(q)) {
3641                         if (unlikely(contended)) {
3642                                 spin_unlock(&q->busylock);
3643                                 contended = false;
3644                         }
3645                         __qdisc_run(q);
3646                         qdisc_run_end(q);
3647                 }
3648         }
3649         spin_unlock(root_lock);
3650         if (unlikely(to_free))
3651                 kfree_skb_list(to_free);
3652         if (unlikely(contended))
3653                 spin_unlock(&q->busylock);
3654         return rc;
3655 }
3656
3657 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3658 static void skb_update_prio(struct sk_buff *skb)
3659 {
3660         const struct netprio_map *map;
3661         const struct sock *sk;
3662         unsigned int prioidx;
3663
3664         if (skb->priority)
3665                 return;
3666         map = rcu_dereference_bh(skb->dev->priomap);
3667         if (!map)
3668                 return;
3669         sk = skb_to_full_sk(skb);
3670         if (!sk)
3671                 return;
3672
3673         prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3674
3675         if (prioidx < map->priomap_len)
3676                 skb->priority = map->priomap[prioidx];
3677 }
3678 #else
3679 #define skb_update_prio(skb)
3680 #endif
3681
3682 /**
3683  *      dev_loopback_xmit - loop back @skb
3684  *      @net: network namespace this loopback is happening in
3685  *      @sk:  sk needed to be a netfilter okfn
3686  *      @skb: buffer to transmit
3687  */
3688 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3689 {
3690         skb_reset_mac_header(skb);
3691         __skb_pull(skb, skb_network_offset(skb));
3692         skb->pkt_type = PACKET_LOOPBACK;
3693         skb->ip_summed = CHECKSUM_UNNECESSARY;
3694         WARN_ON(!skb_dst(skb));
3695         skb_dst_force(skb);
3696         netif_rx_ni(skb);
3697         return 0;
3698 }
3699 EXPORT_SYMBOL(dev_loopback_xmit);
3700
#ifdef CONFIG_NET_EGRESS
/*
 * Run the egress classifier chain of @dev, if any, on @skb.
 *
 * Returns the skb when transmission should continue. Returns NULL when
 * the classifier verdict took the skb away (dropped, consumed or
 * redirected); in that case *@ret holds the NET_XMIT_* code to report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct mini_Qdisc *mq = rcu_dereference_bh(dev->miniq_egress);
	struct tcf_result res;
	int verdict;

	if (!mq)
		return skb;

	/* The caller has already filled in qdisc_skb_cb(skb)->pkt_len. */
	mini_qdisc_bstats_cpu_update(mq, skb);

	verdict = tcf_classify(skb, mq->filter_list, &res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;

	case TC_ACT_SHOT:
		mini_qdisc_qstats_cpu_drop(mq);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;

	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;

	case TC_ACT_REDIRECT:
		/* On egress the mac header needs no push/pop around this. */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;

	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3742
#ifdef CONFIG_XPS
/*
 * Look up a tx queue for @skb in @dev_maps at index @tci.
 *
 * With traffic classes configured (dev->num_tc != 0) the index is scaled
 * by num_tc and offset by the class mapped from skb->priority. A map with
 * several queues is spread over by the skb hash.
 *
 * Returns the queue index, or -1 when there is no map at that index or the
 * chosen queue is not below dev->real_num_tx_queues.
 */
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
			       struct xps_dev_maps *dev_maps, unsigned int tci)
{
	struct xps_map *map;
	int idx;

	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	map = rcu_dereference(dev_maps->attr_map[tci]);
	if (!map)
		return -1;

	if (map->len == 1)
		idx = map->queues[0];
	else
		idx = map->queues[reciprocal_scale(skb_get_hash(skb),
						   map->len)];

	if (unlikely(idx >= dev->real_num_tx_queues))
		return -1;

	return idx;
}
#endif
3768
/*
 * Pick a tx queue for @skb from the XPS maps of @sb_dev.
 *
 * The rx-queue based map is consulted first (only when xps_rxqs_needed is
 * enabled and the socket's recorded rx queue is valid for @dev); if that
 * yields nothing, the CPU based map is tried with skb->sender_cpu - 1.
 *
 * Returns a queue index, or -1 when XPS is compiled out, not needed, or
 * gives no answer.
 */
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
			 struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct sock *sk = skb->sk;
	int queue = -1;

	if (!static_key_false(&xps_needed))
		return -1;

	rcu_read_lock();

	if (static_key_false(&xps_rxqs_needed)) {
		maps = rcu_dereference(sb_dev->xps_rxqs_map);
		if (maps) {
			int rxq = sk_rx_queue_get(sk);

			if (rxq >= 0 && rxq < dev->num_rx_queues)
				queue = __get_xps_queue_idx(dev, skb, maps,
							    rxq);
		}
	}

	if (queue < 0) {
		maps = rcu_dereference(sb_dev->xps_cpus_map);
		if (maps) {
			unsigned int cpu_idx = skb->sender_cpu - 1;

			queue = __get_xps_queue_idx(dev, skb, maps, cpu_idx);
		}
	}

	rcu_read_unlock();

	return queue;
#else
	return -1;
#endif
}
3810
3811 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3812                      struct net_device *sb_dev)
3813 {
3814         return 0;
3815 }
3816 EXPORT_SYMBOL(dev_pick_tx_zero);
3817
3818 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3819                        struct net_device *sb_dev)
3820 {
3821         return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3822 }
3823 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
3824
3825 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3826                      struct net_device *sb_dev)
3827 {
3828         struct sock *sk = skb->sk;
3829         int queue_index = sk_tx_queue_get(sk);
3830
3831         sb_dev = sb_dev ? : dev;
3832
3833         if (queue_index < 0 || skb->ooo_okay ||
3834             queue_index >= dev->real_num_tx_queues) {
3835                 int new_index = get_xps_queue(dev, sb_dev, skb);
3836
3837                 if (new_index < 0)
3838                         new_index = skb_tx_hash(dev, sb_dev, skb);
3839
3840                 if (queue_index != new_index && sk &&
3841                     sk_fullsock(sk) &&
3842                     rcu_access_pointer(sk->sk_dst_cache))
3843                         sk_tx_queue_set(sk, new_index);
3844
3845                 queue_index = new_index;
3846         }
3847
3848         return queue_index;
3849 }
3850 EXPORT_SYMBOL(netdev_pick_tx);
3851
3852 struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
3853                                          struct sk_buff *skb,
3854                                          struct net_device *sb_dev)
3855 {
3856         int queue_index = 0;
3857
3858 #ifdef CONFIG_XPS
3859         u32 sender_cpu = skb->sender_cpu - 1;
3860
3861         if (sender_cpu >= (u32)NR_CPUS)
3862                 skb->sender_cpu = raw_smp_processor_id() + 1;
3863 #endif
3864
3865         if (dev->real_num_tx_queues != 1) {
3866                 const struct net_device_ops *ops = dev->netdev_ops;
3867
3868                 if (ops->ndo_select_queue)
3869                         queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
3870                 else
3871                         queue_index = netdev_pick_tx(dev, skb, sb_dev);
3872
3873                 queue_index = netdev_cap_txqueue(dev, queue_index);
3874         }
3875
3876         skb_set_queue_mapping(skb, queue_index);
3877         return netdev_get_tx_queue(dev, queue_index);
3878 }
3879
3880 /**
3881  *      __dev_queue_xmit - transmit a buffer
3882  *      @skb: buffer to transmit
3883  *      @sb_dev: suboordinate device used for L2 forwarding offload
3884  *
3885  *      Queue a buffer for transmission to a network device. The caller must
3886  *      have set the device and priority and built the buffer before calling
3887  *      this function. The function can be called from an interrupt.
3888  *
3889  *      A negative errno code is returned on a failure. A success does not
3890  *      guarantee the frame will be transmitted as it may be dropped due
3891  *      to congestion or traffic shaping.
3892  *
3893  * -----------------------------------------------------------------------------------
3894  *      I notice this method can also return errors from the queue disciplines,
3895  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3896  *      be positive.
3897  *
3898  *      Regardless of the return value, the skb is consumed, so it is currently
3899  *      difficult to retry a send to this method.  (You can bump the ref count
3900  *      before sending to hold a reference for retry if you are careful.)
3901  *
3902  *      When calling this method, interrupts MUST be enabled.  This is because
3903  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3904  *          --BLG
3905  */
3906 static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
3907 {
3908         struct net_device *dev = skb->dev;
3909         struct netdev_queue *txq;
3910         struct Qdisc *q;
3911         int rc = -ENOMEM;
3912         bool again = false;
3913
3914         skb_reset_mac_header(skb);
3915
3916         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3917                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3918
3919         /* Disable soft irqs for various locks below. Also
3920          * stops preemption for RCU.
3921          */
3922         rcu_read_lock_bh();
3923
3924         skb_update_prio(skb);
3925
3926         qdisc_pkt_len_init(skb);
3927 #ifdef CONFIG_NET_CLS_ACT
3928         skb->tc_at_ingress = 0;
3929 # ifdef CONFIG_NET_EGRESS
3930         if (static_branch_unlikely(&egress_needed_key)) {
3931                 skb = sch_handle_egress(skb, &rc, dev);
3932                 if (!skb)
3933                         goto out;
3934         }
3935 # endif
3936 #endif
3937         /* If device/qdisc don't need skb->dst, release it right now while
3938          * its hot in this cpu cache.
3939          */
3940         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3941                 skb_dst_drop(skb);
3942         else
3943                 skb_dst_force(skb);
3944
3945         txq = netdev_core_pick_tx(dev, skb, sb_dev);
3946         q = rcu_dereference_bh(txq->qdisc);
3947
3948         trace_net_dev_queue(skb);
3949         if (q->enqueue) {
3950                 rc = __dev_xmit_skb(skb, q, dev, txq);
3951                 goto out;
3952         }
3953
3954         /* The device has no queue. Common case for software devices:
3955          * loopback, all the sorts of tunnels...
3956
3957          * Really, it is unlikely that netif_tx_lock protection is necessary
3958          * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3959          * counters.)
3960          * However, it is possible, that they rely on protection
3961          * made by us here.
3962
3963          * Check this and shot the lock. It is not prone from deadlocks.
3964          *Either shot noqueue qdisc, it is even simpler 8)
3965          */
3966         if (dev->flags & IFF_UP) {
3967                 int cpu = smp_processor_id(); /* ok because BHs are off */
3968
3969                 if (txq->xmit_lock_owner != cpu) {
3970                         if (dev_xmit_recursion())
3971                                 goto recursion_alert;
3972
3973                         skb = validate_xmit_skb(skb, dev, &again);
3974                         if (!skb)
3975                                 goto out;
3976
3977                         HARD_TX_LOCK(dev, txq, cpu);
3978
3979                         if (!netif_xmit_stopped(txq)) {
3980                                 dev_xmit_recursion_inc();
3981                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3982                                 dev_xmit_recursion_dec();
3983                                 if (dev_xmit_complete(rc)) {
3984                                         HARD_TX_UNLOCK(dev, txq);
3985                                         goto out;
3986                                 }
3987                         }
3988                         HARD_TX_UNLOCK(dev, txq);
3989                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3990                                              dev->name);
3991                 } else {
3992                         /* Recursion is detected! It is possible,
3993                          * unfortunately
3994                          */
3995 recursion_alert:
3996                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3997                                              dev->name);
3998                 }
3999         }
4000
4001         rc = -ENETDOWN;
4002         rcu_read_unlock_bh();
4003
4004         atomic_long_inc(&dev->tx_dropped);
4005         kfree_skb_list(skb);
4006         return rc;
4007 out:
4008         rcu_read_unlock_bh();
4009         return rc;
4010 }
4011
4012 int dev_queue_xmit(struct sk_buff *skb)
4013 {
4014         return __dev_queue_xmit(skb, NULL);
4015 }
4016 EXPORT_SYMBOL(dev_queue_xmit);
4017
/*
 * Queue @skb for transmission on skb->dev on behalf of subordinate device
 * @sb_dev. Thin wrapper around __dev_queue_xmit(); its return value is
 * passed on.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
{
	return __dev_queue_xmit(skb, sb_dev);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
4023
4024 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4025 {
4026         struct net_device *dev = skb->dev;
4027         struct sk_buff *orig_skb = skb;
4028         struct netdev_queue *txq;
4029         int ret = NETDEV_TX_BUSY;
4030         bool again = false;
4031
4032         if (unlikely(!netif_running(dev) ||
4033                      !netif_carrier_ok(dev)))
4034                 goto drop;
4035
4036         skb = validate_xmit_skb_list(skb, dev, &again);
4037         if (skb != orig_skb)
4038                 goto drop;
4039
4040         skb_set_queue_mapping(skb, queue_id);
4041         txq = skb_get_tx_queue(dev, skb);
4042
4043         local_bh_disable();
4044
4045         HARD_TX_LOCK(dev, txq, smp_processor_id());
4046         if (!netif_xmit_frozen_or_drv_stopped(txq))
4047                 ret = netdev_start_xmit(skb, dev, txq, false);
4048         HARD_TX_UNLOCK(dev, txq);
4049
4050         local_bh_enable();
4051
4052         if (!dev_xmit_complete(ret))
4053                 kfree_skb(skb);
4054
4055         return ret;
4056 drop:
4057         atomic_long_inc(&dev->tx_dropped);
4058         kfree_skb_list(skb);
4059         return NET_XMIT_DROP;
4060 }
4061 EXPORT_SYMBOL(dev_direct_xmit);
4062
4063 /*************************************************************************
4064  *                      Receiver routines
4065  *************************************************************************/
4066
/*
 * Default values of the packet-processing tunables. All of them are
 * annotated __read_mostly; only netdev_max_backlog is exported here.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
/* NOTE(review): unit is presumably microseconds, going by the name - confirm */
unsigned int __read_mostly netdev_budget_usecs = 2000;
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
/* NOTE(review): presumably weight_p scaled by the rx/tx bias above - confirm */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
int gro_normal_batch __read_mostly = 8;
4080
4081 /* Called with irq disabled */
4082 static inline void ____napi_schedule(struct softnet_data *sd,
4083                                      struct napi_struct *napi)
4084 {
4085         list_add_tail(&napi->poll_list, &sd->poll_list);
4086         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4087 }
4088
4089 #ifdef CONFIG_RPS
4090
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/*
 * Mask applied to a sock flow table entry to extract the CPU number; the
 * bits outside the mask are compared against the flow hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static keys. NOTE(review): presumably enabled while RPS / RFS is configured - confirm */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
4101
4102 static struct rps_dev_flow *
4103 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4104             struct rps_dev_flow *rflow, u16 next_cpu)
4105 {
4106         if (next_cpu < nr_cpu_ids) {
4107 #ifdef CONFIG_RFS_ACCEL
4108                 struct netdev_rx_queue *rxqueue;
4109                 struct rps_dev_flow_table *flow_table;
4110                 struct rps_dev_flow *old_rflow;
4111                 u32 flow_id;
4112                 u16 rxq_index;
4113                 int rc;
4114
4115                 /* Should we steer this flow to a different hardware queue? */
4116                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4117                     !(dev->features & NETIF_F_NTUPLE))
4118                         goto out;
4119                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4120                 if (rxq_index == skb_get_rx_queue(skb))
4121                         goto out;
4122
4123                 rxqueue = dev->_rx + rxq_index;
4124                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4125                 if (!flow_table)
4126                         goto out;
4127                 flow_id = skb_get_hash(skb) & flow_table->mask;
4128                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4129                                                         rxq_index, flow_id);
4130                 if (rc < 0)
4131                         goto out;
4132                 old_rflow = rflow;
4133                 rflow = &flow_table->flows[flow_id];
4134                 rflow->filter = rc;
4135                 if (old_rflow->filter == rflow->filter)
4136                         old_rflow->filter = RPS_NO_FILTER;
4137         out:
4138 #endif
4139                 rflow->last_qtail =
4140                         per_cpu(softnet_data, next_cpu).input_queue_head;
4141         }
4142
4143         rflow->cpu = next_cpu;
4144         return rflow;
4145 }
4146
4147 /*
4148  * get_rps_cpu is called from netif_receive_skb and returns the target
4149  * CPU from the RPS map of the receiving queue for a given skb.
4150  * rcu_read_lock must be held on entry.
4151  */
4152 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4153                        struct rps_dev_flow **rflowp)
4154 {
4155         const struct rps_sock_flow_table *sock_flow_table;
4156         struct netdev_rx_queue *rxqueue = dev->_rx;
4157         struct rps_dev_flow_table *flow_table;
4158         struct rps_map *map;
4159         int cpu = -1;
4160         u32 tcpu;
4161         u32 hash;
4162
4163         if (skb_rx_queue_recorded(skb)) {
4164                 u16 index = skb_get_rx_queue(skb);
4165
4166                 if (unlikely(index >= dev->real_num_rx_queues)) {
4167                         WARN_ONCE(dev->real_num_rx_queues > 1,
4168                                   "%s received packet on queue %u, but number "
4169                                   "of RX queues is %u\n",
4170                                   dev->name, index, dev->real_num_rx_queues);
4171                         goto done;
4172                 }
4173                 rxqueue += index;
4174         }
4175
4176         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4177
4178         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4179         map = rcu_dereference(rxqueue->rps_map);
4180         if (!flow_table && !map)
4181                 goto done;
4182
4183         skb_reset_network_header(skb);
4184         hash = skb_get_hash(skb);
4185         if (!hash)
4186                 goto done;
4187
4188         sock_flow_table = rcu_dereference(rps_sock_flow_table);
4189         if (flow_table && sock_flow_table) {
4190                 struct rps_dev_flow *rflow;
4191                 u32 next_cpu;
4192                 u32 ident;
4193
4194                 /* First check into global flow table if there is a match */
4195                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4196                 if ((ident ^ hash) & ~rps_cpu_mask)
4197                         goto try_rps;
4198
4199                 next_cpu = ident & rps_cpu_mask;
4200
4201                 /* OK, now we know there is a match,
4202                  * we can look at the local (per receive queue) flow table
4203                  */
4204                 rflow = &flow_table->flows[hash & flow_table->mask];
4205                 tcpu = rflow->cpu;
4206
4207                 /*
4208                  * If the desired CPU (where last recvmsg was done) is
4209                  * different from current CPU (one in the rx-queue flow
4210                  * table entry), switch if one of the following holds:
4211                  *   - Current CPU is unset (>= nr_cpu_ids).
4212                  *   - Current CPU is offline.
4213                  *   - The current CPU's queue tail has advanced beyond the
4214                  *     last packet that was enqueued using this table entry.
4215                  *     This guarantees that all previous packets for the flow
4216                  *     have been dequeued, thus preserving in order delivery.
4217                  */
4218                 if (unlikely(tcpu != next_cpu) &&
4219                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4220                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4221                       rflow->last_qtail)) >= 0)) {
4222                         tcpu = next_cpu;
4223                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4224                 }
4225
4226                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4227                         *rflowp = rflow;
4228                         cpu = tcpu;
4229                         goto done;
4230                 }
4231         }
4232
4233 try_rps:
4234
4235         if (map) {
4236                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4237                 if (cpu_online(tcpu)) {
4238                         cpu = tcpu;
4239                         goto done;
4240                 }
4241         }
4242
4243 done:
4244         return cpu;
4245 }
4246
4247 #ifdef CONFIG_RFS_ACCEL
4248
4249 /**
4250  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4251  * @dev: Device on which the filter was set
4252  * @rxq_index: RX queue index
4253  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4254  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4255  *
4256  * Drivers that implement ndo_rx_flow_steer() should periodically call
4257  * this function for each installed filter and remove the filters for
4258  * which it returns %true.
4259  */
4260 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4261                          u32 flow_id, u16 filter_id)
4262 {
4263         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4264         struct rps_dev_flow_table *flow_table;
4265         struct rps_dev_flow *rflow;
4266         bool expire = true;
4267         unsigned int cpu;
4268
4269         rcu_read_lock();
4270         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4271         if (flow_table && flow_id <= flow_table->mask) {
4272                 rflow = &flow_table->flows[flow_id];
4273                 cpu = READ_ONCE(rflow->cpu);
4274                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4275                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4276                            rflow->last_qtail) <
4277                      (int)(10 * flow_table->mask)))
4278                         expire = false;
4279         }
4280         rcu_read_unlock();
4281         return expire;
4282 }
4283 EXPORT_SYMBOL(rps_may_expire_flow);
4284
4285 #endif /* CONFIG_RFS_ACCEL */
4286
4287 /* Called from hardirq (IPI) context */
4288 static void rps_trigger_softirq(void *data)
4289 {
4290         struct softnet_data *sd = data;
4291
4292         ____napi_schedule(sd, &sd->backlog);
4293         sd->received_rps++;
4294 }
4295
4296 #endif /* CONFIG_RPS */
4297
/*
 * Decide whether @sd belongs to another CPU.
 *
 * If it does, link it at the head of this CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.  If @sd is this CPU's own softnet_data
 * (or RPS is compiled out) nothing is queued and 0 is returned.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local = this_cpu_ptr(&softnet_data);

        if (sd == local)
                return 0;

        /* Push @sd onto the head of the local pending-IPI list. */
        sd->rps_ipi_next = local->rps_ipi_list;
        local->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
4318
4319 #ifdef CONFIG_NET_FLOW_LIMIT
4320 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4321 #endif
4322
4323 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4324 {
4325 #ifdef CONFIG_NET_FLOW_LIMIT
4326         struct sd_flow_limit *fl;
4327         struct softnet_data *sd;
4328         unsigned int old_flow, new_flow;
4329
4330         if (qlen < (netdev_max_backlog >> 1))
4331                 return false;
4332
4333         sd = this_cpu_ptr(&softnet_data);
4334
4335         rcu_read_lock();
4336         fl = rcu_dereference(sd->flow_limit);
4337         if (fl) {
4338                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4339                 old_flow = fl->history[fl->history_head];
4340                 fl->history[fl->history_head] = new_flow;
4341
4342                 fl->history_head++;
4343                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4344
4345                 if (likely(fl->buckets[old_flow]))
4346                         fl->buckets[old_flow]--;
4347
4348                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4349                         fl->count++;
4350                         rcu_read_unlock();
4351                         return true;
4352                 }
4353         }
4354         rcu_read_unlock();
4355 #endif
4356         return false;
4357 }
4358
4359 /*
4360  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4361  * queue (may be a remote CPU queue).
4362  */
4363 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4364                               unsigned int *qtail)
4365 {
4366         struct softnet_data *sd;
4367         unsigned long flags;
4368         unsigned int qlen;
4369
4370         sd = &per_cpu(softnet_data, cpu);
4371
4372         local_irq_save(flags);
4373
4374         rps_lock(sd);
4375         if (!netif_running(skb->dev))
4376                 goto drop;
4377         qlen = skb_queue_len(&sd->input_pkt_queue);
4378         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4379                 if (qlen) {
4380 enqueue:
4381                         __skb_queue_tail(&sd->input_pkt_queue, skb);
4382                         input_queue_tail_incr_save(sd, qtail);
4383                         rps_unlock(sd);
4384                         local_irq_restore(flags);
4385                         return NET_RX_SUCCESS;
4386                 }
4387
4388                 /* Schedule NAPI for backlog device
4389                  * We can use non atomic operation since we own the queue lock
4390                  */
4391                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4392                         if (!rps_ipi_queued(sd))
4393                                 ____napi_schedule(sd, &sd->backlog);
4394                 }
4395                 goto enqueue;
4396         }
4397
4398 drop:
4399         sd->dropped++;
4400         rps_unlock(sd);
4401
4402         local_irq_restore(flags);
4403
4404         atomic_long_inc(&skb->dev->rx_dropped);
4405         kfree_skb(skb);
4406         return NET_RX_DROP;
4407 }
4408
4409 static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4410 {
4411         struct net_device *dev = skb->dev;
4412         struct netdev_rx_queue *rxqueue;
4413
4414         rxqueue = dev->_rx;
4415
4416         if (skb_rx_queue_recorded(skb)) {
4417                 u16 index = skb_get_rx_queue(skb);
4418
4419                 if (unlikely(index >= dev->real_num_rx_queues)) {
4420                         WARN_ONCE(dev->real_num_rx_queues > 1,
4421                                   "%s received packet on queue %u, but number "
4422                                   "of RX queues is %u\n",
4423                                   dev->name, index, dev->real_num_rx_queues);
4424
4425                         return rxqueue; /* Return first rxqueue */
4426                 }
4427                 rxqueue += index;
4428         }
4429         return rxqueue;
4430 }
4431
4432 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4433                                      struct xdp_buff *xdp,
4434                                      struct bpf_prog *xdp_prog)
4435 {
4436         struct netdev_rx_queue *rxqueue;
4437         void *orig_data, *orig_data_end;
4438         u32 metalen, act = XDP_DROP;
4439         __be16 orig_eth_type;
4440         struct ethhdr *eth;
4441         bool orig_bcast;
4442         int hlen, off;
4443         u32 mac_len;
4444
4445         /* Reinjected packets coming from act_mirred or similar should
4446          * not get XDP generic processing.
4447          */
4448         if (skb_cloned(skb) || skb_is_tc_redirected(skb))
4449                 return XDP_PASS;
4450
4451         /* XDP packets must be linear and must have sufficient headroom
4452          * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4453          * native XDP provides, thus we need to do it here as well.
4454          */
4455         if (skb_is_nonlinear(skb) ||
4456             skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4457                 int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4458                 int troom = skb->tail + skb->data_len - skb->end;
4459
4460                 /* In case we have to go down the path and also linearize,
4461                  * then lets do the pskb_expand_head() work just once here.
4462                  */
4463                 if (pskb_expand_head(skb,
4464                                      hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4465                                      troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4466                         goto do_drop;
4467                 if (skb_linearize(skb))
4468                         goto do_drop;
4469         }
4470
4471         /* The XDP program wants to see the packet starting at the MAC
4472          * header.
4473          */
4474         mac_len = skb->data - skb_mac_header(skb);
4475         hlen = skb_headlen(skb) + mac_len;
4476         xdp->data = skb->data - mac_len;
4477         xdp->data_meta = xdp->data;
4478         xdp->data_end = xdp->data + hlen;
4479         xdp->data_hard_start = skb->data - skb_headroom(skb);
4480         orig_data_end = xdp->data_end;
4481         orig_data = xdp->data;
4482         eth = (struct ethhdr *)xdp->data;
4483         orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4484         orig_eth_type = eth->h_proto;
4485
4486         rxqueue = netif_get_rxqueue(skb);
4487         xdp->rxq = &rxqueue->xdp_rxq;
4488
4489         act = bpf_prog_run_xdp(xdp_prog, xdp);
4490
4491         /* check if bpf_xdp_adjust_head was used */
4492         off = xdp->data - orig_data;
4493         if (off) {
4494                 if (off > 0)
4495                         __skb_pull(skb, off);
4496                 else if (off < 0)
4497                         __skb_push(skb, -off);
4498
4499                 skb->mac_header += off;
4500                 skb_reset_network_header(skb);
4501         }
4502
4503         /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
4504          * pckt.
4505          */
4506         off = orig_data_end - xdp->data_end;
4507         if (off != 0) {
4508                 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4509                 skb->len -= off;
4510
4511         }
4512
4513         /* check if XDP changed eth hdr such SKB needs update */
4514         eth = (struct ethhdr *)xdp->data;
4515         if ((orig_eth_type != eth->h_proto) ||
4516             (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4517                 __skb_push(skb, ETH_HLEN);
4518                 skb->protocol = eth_type_trans(skb, skb->dev);
4519         }
4520
4521         switch (act) {
4522         case XDP_REDIRECT:
4523         case XDP_TX:
4524                 __skb_push(skb, mac_len);
4525                 break;
4526         case XDP_PASS:
4527                 metalen = xdp->data - xdp->data_meta;
4528                 if (metalen)
4529                         skb_metadata_set(skb, metalen);
4530                 break;
4531         default:
4532                 bpf_warn_invalid_xdp_action(act);
4533                 /* fall through */
4534         case XDP_ABORTED:
4535                 trace_xdp_exception(skb->dev, xdp_prog, act);
4536                 /* fall through */
4537         case XDP_DROP:
4538         do_drop:
4539                 kfree_skb(skb);
4540                 break;
4541         }
4542
4543         return act;
4544 }
4545
4546 /* When doing generic XDP we have to bypass the qdisc layer and the
4547  * network taps in order to match in-driver-XDP behavior.
4548  */
4549 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4550 {
4551         struct net_device *dev = skb->dev;
4552         struct netdev_queue *txq;
4553         bool free_skb = true;
4554         int cpu, rc;
4555
4556         txq = netdev_core_pick_tx(dev, skb, NULL);
4557         cpu = smp_processor_id();
4558         HARD_TX_LOCK(dev, txq, cpu);
4559         if (!netif_xmit_stopped(txq)) {
4560                 rc = netdev_start_xmit(skb, dev, txq, 0);
4561                 if (dev_xmit_complete(rc))
4562                         free_skb = false;
4563         }
4564         HARD_TX_UNLOCK(dev, txq);
4565         if (free_skb) {
4566                 trace_xdp_exception(dev, xdp_prog, XDP_TX);
4567                 kfree_skb(skb);
4568         }
4569 }
4570 EXPORT_SYMBOL_GPL(generic_xdp_tx);
4571
4572 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4573
4574 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4575 {
4576         if (xdp_prog) {
4577                 struct xdp_buff xdp;
4578                 u32 act;
4579                 int err;
4580
4581                 act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4582                 if (act != XDP_PASS) {
4583                         switch (act) {
4584                         case XDP_REDIRECT:
4585                                 err = xdp_do_generic_redirect(skb->dev, skb,
4586                                                               &xdp, xdp_prog);
4587                                 if (err)
4588                                         goto out_redir;
4589                                 break;
4590                         case XDP_TX:
4591                                 generic_xdp_tx(skb, xdp_prog);
4592                                 break;
4593                         }
4594                         return XDP_DROP;
4595                 }
4596         }
4597         return XDP_PASS;
4598 out_redir:
4599         kfree_skb(skb);
4600         return XDP_DROP;
4601 }
4602 EXPORT_SYMBOL_GPL(do_xdp_generic);
4603
4604 static int netif_rx_internal(struct sk_buff *skb)
4605 {
4606         int ret;
4607
4608         net_timestamp_check(netdev_tstamp_prequeue, skb);
4609
4610         trace_netif_rx(skb);
4611
4612 #ifdef CONFIG_RPS
4613         if (static_branch_unlikely(&rps_needed)) {
4614                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4615                 int cpu;
4616
4617                 preempt_disable();
4618                 rcu_read_lock();
4619
4620                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
4621                 if (cpu < 0)
4622                         cpu = smp_processor_id();
4623
4624                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4625
4626                 rcu_read_unlock();
4627                 preempt_enable();
4628         } else
4629 #endif
4630         {
4631                 unsigned int qtail;
4632
4633                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4634                 put_cpu();
4635         }
4636         return ret;
4637 }
4638
/**
 *      netif_rx        -       post buffer to the network code
 *      @skb: buffer to post
 *
 *      Takes a packet from a device driver and queues it for the upper
 *      (protocol) levels to process.  The buffer is always consumed: it is
 *      either queued or dropped, e.g. for congestion control, and may still
 *      be dropped later by the protocol layers.
 *
 *      return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
        int verdict;

        trace_netif_rx_entry(skb);
        verdict = netif_rx_internal(skb);
        trace_netif_rx_exit(verdict);

        return verdict;
}
EXPORT_SYMBOL(netif_rx);
4666
/*
 * Variant of netif_rx() that, with preemption disabled, queues @skb and
 * then runs do_softirq() if any softirq is pending on this CPU.
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int verdict;

        trace_netif_rx_ni_entry(skb);

        preempt_disable();
        verdict = netif_rx_internal(skb);
        if (local_softirq_pending())
                do_softirq();
        preempt_enable();

        trace_netif_rx_ni_exit(verdict);
        return verdict;
}
EXPORT_SYMBOL(netif_rx_ni);
4683
4684 static __latent_entropy void net_tx_action(struct softirq_action *h)
4685 {
4686         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4687
4688         if (sd->completion_queue) {
4689                 struct sk_buff *clist;
4690
4691                 local_irq_disable();
4692                 clist = sd->completion_queue;
4693                 sd->completion_queue = NULL;
4694                 local_irq_enable();
4695
4696                 while (clist) {
4697                         struct sk_buff *skb = clist;
4698
4699                         clist = clist->next;
4700
4701                         WARN_ON(refcount_read(&skb->users));
4702                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4703                                 trace_consume_skb(skb);
4704                         else
4705                                 trace_kfree_skb(skb, net_tx_action);
4706
4707                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4708                                 __kfree_skb(skb);
4709                         else
4710                                 __kfree_skb_defer(skb);
4711                 }
4712
4713                 __kfree_skb_flush();
4714         }
4715
4716         if (sd->output_queue) {
4717                 struct Qdisc *head;
4718
4719                 local_irq_disable();
4720                 head = sd->output_queue;
4721                 sd->output_queue = NULL;
4722                 sd->output_queue_tailp = &sd->output_queue;
4723                 local_irq_enable();
4724
4725                 while (head) {
4726                         struct Qdisc *q = head;
4727                         spinlock_t *root_lock = NULL;
4728
4729                         head = head->next_sched;
4730
4731                         if (!(q->flags & TCQ_F_NOLOCK)) {
4732                                 root_lock = qdisc_lock(q);
4733                                 spin_lock(root_lock);
4734                         }
4735                         /* We need to make sure head->next_sched is read
4736                          * before clearing __QDISC_STATE_SCHED
4737                          */
4738                         smp_mb__before_atomic();
4739                         clear_bit(__QDISC_STATE_SCHED, &q->state);
4740                         qdisc_run(q);
4741                         if (root_lock)
4742                                 spin_unlock(root_lock);
4743                 }
4744         }
4745
4746         xfrm_dev_backlog(sd);
4747 }
4748
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is a function pointer taking a device and an address; this part of
 * the file only defines and exports it (GPL-only) and neither assigns nor
 * calls it.  It exists only when both CONFIG_BRIDGE and CONFIG_ATM_LANE
 * are enabled.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
4755
/*
 * Run the ingress tc classifier attached to skb->dev, if any.
 *
 * A handler pending in *@pt_prev is delivered first (its result stored in
 * *@ret) and the slot is cleared.  Returns @skb when processing should
 * continue, or NULL when the classifier verdict means the skb is no longer
 * ours (dropped, stolen/queued/trapped, redirected or consumed).
 * Without CONFIG_NET_CLS_ACT this simply returns @skb.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
        struct tcf_result cl_res;
        int verdict;

        /* We get here via a static key enabled as soon as one ingress
         * qdisc exists somewhere; devices without one bail out here.
         */
        if (!miniq)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        skb->tc_at_ingress = 1;
        mini_qdisc_bstats_cpu_update(miniq, skb);

        verdict = tcf_classify(skb, miniq->filter_list, &cl_res, false);
        switch (verdict) {
        case TC_ACT_SHOT:
                mini_qdisc_qstats_cpu_drop(miniq);
                kfree_skb(skb);
                skb = NULL;
                break;
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
                consume_skb(skb);
                skb = NULL;
                break;
        case TC_ACT_REDIRECT:
                /* skb_mac_header check was done by cls/act_bpf, so
                 * we can safely push the L2 header back before
                 * redirecting to another netdev
                 */
                __skb_push(skb, skb->mac_len);
                skb_do_redirect(skb);
                skb = NULL;
                break;
        case TC_ACT_CONSUMED:
                skb = NULL;
                break;
        case TC_ACT_OK:
        case TC_ACT_RECLASSIFY:
                skb->tc_index = TC_H_MIN(cl_res.classid);
                break;
        default:
                break;
        }
#endif /* CONFIG_NET_CLS_ACT */
        return skb;
}
4811
4812 /**
4813  *      netdev_is_rx_handler_busy - check if receive handler is registered
4814  *      @dev: device to check
4815  *
4816  *      Check if a receive handler is already registered for a given device.
4817  *      Return true if there one.
4818  *
4819  *      The caller must hold the rtnl_mutex.
4820  */
4821 bool netdev_is_rx_handler_busy(struct net_device *dev)
4822 {
4823         ASSERT_RTNL();
4824         return dev && rtnl_dereference(dev->rx_handler);
4825 }
4826 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4827
4828 /**
4829  *      netdev_rx_handler_register - register receive handler
4830  *      @dev: device to register a handler for
4831  *      @rx_handler: receive handler to register
4832  *      @rx_handler_data: data pointer that is used by rx handler
4833  *
4834  *      Register a receive handler for a device. This handler will then be
4835  *      called from __netif_receive_skb. A negative errno code is returned
4836  *      on a failure.
4837  *
4838  *      The caller must hold the rtnl_mutex.
4839  *
4840  *      For a general description of rx_handler, see enum rx_handler_result.
4841  */
4842 int netdev_rx_handler_register(struct net_device *dev,
4843                                rx_handler_func_t *rx_handler,
4844                                void *rx_handler_data)
4845 {
4846         if (netdev_is_rx_handler_busy(dev))
4847                 return -EBUSY;
4848
4849         if (dev->priv_flags & IFF_NO_RX_HANDLER)
4850                 return -EINVAL;
4851
4852         /* Note: rx_handler_data must be set before rx_handler */
4853         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4854         rcu_assign_pointer(dev->rx_handler, rx_handler);
4855
4856         return 0;
4857 }
4858 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4859
/**
 *      netdev_rx_handler_unregister - unregister receive handler
 *      @dev: device to unregister a handler from
 *
 *      Unregister a receive handler from a device.  The handler pointer is
 *      cleared first, then synchronize_net() is called, and only after
 *      that is the handler data pointer cleared.
 *
 *      The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        /* Step 1: stop new readers from finding the handler. */
        RCU_INIT_POINTER(dev->rx_handler, NULL);
        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
         * section has a guarantee to see a non NULL rx_handler_data
         * as well.
         */
        synchronize_net();
        /* Step 2: the data pointer is cleared only after synchronize_net()
         * has returned.
         */
        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4881
4882 /*
4883  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4884  * the special handling of PFMEMALLOC skbs.
4885  */
4886 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4887 {
4888         switch (skb->protocol) {
4889         case htons(ETH_P_ARP):
4890         case htons(ETH_P_IP):
4891         case htons(ETH_P_IPV6):
4892         case htons(ETH_P_8021Q):
4893         case htons(ETH_P_8021AD):
4894                 return true;
4895         default:
4896                 return false;
4897         }
4898 }
4899
/*
 * Run the netfilter ingress hook for @skb when one is active.
 *
 * A handler pending in *@pt_prev is delivered first (result stored in
 * *@ret) and the slot is cleared.  Returns the value of nf_hook_ingress(),
 * or 0 when no ingress hook is active or CONFIG_NETFILTER_INGRESS is off.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
                             int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
        int verdict;

        if (!nf_hook_ingress_active(skb))
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        rcu_read_lock();
        verdict = nf_hook_ingress(skb);
        rcu_read_unlock();

        return verdict;
#else
        return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4920
/*
 * Core of the receive path for a single skb.
 *
 * In order: optional generic XDP, in-band VLAN untagging, the ptype_all
 * taps (global, then per device), tc/netfilter ingress, hardware-accelerated
 * VLAN handling, the device's rx_handler, and finally the protocol handler
 * lists.  Handlers are delivered one step late through the local pt_prev:
 * each newly found handler causes the previous one to be run.
 *
 * @skb:        packet to process
 * @pfmemalloc: when true the taps are skipped and the packet is dropped
 *              unless skb_pfmemalloc_protocol() accepts its protocol
 * @ppt_prev:   written only when a final handler is still pending at the
 *              end; that handler is NOT invoked here and is left for the
 *              caller to run
 *
 * Returns NET_RX_DROP when the packet is dropped here or taken by generic
 * XDP, NET_RX_SUCCESS when the rx_handler consumed it, and otherwise the
 * result of the last deliver_skb() call (or the initial NET_RX_DROP).
 *
 * NOTE(review): @skb is passed by value but can be reassigned in here
 * (skb_vlan_untag() result, vlan_do_receive(&skb), rx_handler(&skb)).
 * Callers that keep using their own skb pointer after a handler is
 * returned through @ppt_prev should be checked against that.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!netdev_tstamp_prequeue, skb);

        trace_netif_receive_skb(skb);

        /* Remember the device the packet arrived on; skb->dev may change
         * on later rounds.
         */
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

another_round:
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (static_branch_unlikely(&generic_xdp_needed_key)) {
                int ret2;

                preempt_disable();
                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
                preempt_enable();

                /* Anything but XDP_PASS means the skb is gone. */
                if (ret2 != XDP_PASS)
                        return NET_RX_DROP;
                skb_reset_mac_len(skb);
        }

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
                skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

        if (skb_skip_tc_classify(skb))
                goto skip_classify;

        if (pfmemalloc)
                goto skip_taps;

        /* Taps registered for all devices ... */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

        /* ... then taps registered on this device only. */
        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

skip_taps:
#ifdef CONFIG_NET_INGRESS
        if (static_branch_unlikely(&ingress_needed_key)) {
                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
                if (!skb)
                        goto out;

                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
                        goto out;
        }
#endif
        skb_reset_tc(skb);
skip_classify:
        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
                goto drop;

        if (skb_vlan_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto out;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        ret = NET_RX_SUCCESS;
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (unlikely(skb_vlan_tag_present(skb))) {
check_vlan_id:
                if (skb_vlan_tag_get_id(skb)) {
                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
                         * find vlan device.
                         */
                        skb->pkt_type = PACKET_OTHERHOST;
                } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
                           skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
                        /* Outer header is 802.1P with vlan 0, inner header is
                         * 802.1Q or 802.1AD and vlan_do_receive() above could
                         * not find vlan dev for vlan id 0.
                         */
                        __vlan_hwaccel_clear_tag(skb);
                        skb = skb_vlan_untag(skb);
                        if (unlikely(!skb))
                                goto out;
                        if (vlan_do_receive(&skb))
                                /* After stripping off 802.1P header with vlan 0
                                 * vlan dev is found for inner header.
                                 */
                                goto another_round;
                        else if (unlikely(!skb))
                                goto out;
                        else
                                /* We have stripped outer 802.1P vlan 0 header.
                                 * But could not find vlan dev.
                                 * check again for vlan id to set OTHERHOST.
                                 */
                                goto check_vlan_id;
                }
                /* Note: we might in the future use prio bits
                 * and set skb->priority like in vlan_do_receive()
                 * For the time being, just ignore Priority Code Point
                 */
                __vlan_hwaccel_clear_tag(skb);
        }

        type = skb->protocol;

        /* deliver only exact match when indicated */
        if (likely(!deliver_exact)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);
        }

        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &orig_dev->ptype_specific);

        /* The device changed during processing: also consult the handlers
         * specific to the current device.
         */
        if (unlikely(skb->dev != orig_dev)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &skb->dev->ptype_specific);
        }

        if (pt_prev) {
                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                        goto drop;
                /* Hand the last handler back to the caller instead of
                 * running it here.
                 */
                *ppt_prev = pt_prev;
        } else {
drop:
                /* Account the drop: rx_nohandler when an exact-match
                 * delivery was requested, rx_dropped otherwise.
                 */
                if (!deliver_exact)
                        atomic_long_inc(&skb->dev->rx_dropped);
                else
                        atomic_long_inc(&skb->dev->rx_nohandler);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        return ret;
}
5109
5110 static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5111 {
5112         struct net_device *orig_dev = skb->dev;
5113         struct packet_type *pt_prev = NULL;
5114         int ret;
5115
5116         ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
5117         if (pt_prev)
5118                 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5119                                          skb->dev, pt_prev, orig_dev);
5120         return ret;
5121 }
5122
5123 /**
5124  *      netif_receive_skb_core - special purpose version of netif_receive_skb
5125  *      @skb: buffer to process
5126  *
5127  *      More direct receive version of netif_receive_skb().  It should
5128  *      only be used by callers that have a need to skip RPS and Generic XDP.
5129  *      Caller must also take care of handling if (page_is_)pfmemalloc.
5130  *
5131  *      This function may only be called from softirq context and interrupts
5132  *      should be enabled.
5133  *
5134  *      Return values (usually ignored):
5135  *      NET_RX_SUCCESS: no congestion
5136  *      NET_RX_DROP: packet was dropped
5137  */
5138 int netif_receive_skb_core(struct sk_buff *skb)
5139 {
5140         int ret;
5141
5142         rcu_read_lock();
5143         ret = __netif_receive_skb_one_core(skb, false);
5144         rcu_read_unlock();
5145
5146         return ret;
5147 }
5148 EXPORT_SYMBOL(netif_receive_skb_core);
5149
5150 static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5151                                                   struct packet_type *pt_prev,
5152                                                   struct net_device *orig_dev)
5153 {
5154         struct sk_buff *skb, *next;
5155
5156         if (!pt_prev)
5157                 return;
5158         if (list_empty(head))
5159                 return;
5160         if (pt_prev->list_func != NULL)
5161                 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5162                                    ip_list_rcv, head, pt_prev, orig_dev);
5163         else
5164                 list_for_each_entry_safe(skb, next, head, list) {
5165                         skb_list_del_init(skb);
5166                         pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5167                 }
5168 }
5169
5170 static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5171 {
5172         /* Fast-path assumptions:
5173          * - There is no RX handler.
5174          * - Only one packet_type matches.
5175          * If either of these fails, we will end up doing some per-packet
5176          * processing in-line, then handling the 'last ptype' for the whole
5177          * sublist.  This can't cause out-of-order delivery to any single ptype,
5178          * because the 'last ptype' must be constant across the sublist, and all
5179          * other ptypes are handled per-packet.
5180          */
5181         /* Current (common) ptype of sublist */
5182         struct packet_type *pt_curr = NULL;
5183         /* Current (common) orig_dev of sublist */
5184         struct net_device *od_curr = NULL;
5185         struct list_head sublist;
5186         struct sk_buff *skb, *next;
5187
5188         INIT_LIST_HEAD(&sublist);
5189         list_for_each_entry_safe(skb, next, head, list) {
5190                 struct net_device *orig_dev = skb->dev;
5191                 struct packet_type *pt_prev = NULL;
5192
5193                 skb_list_del_init(skb);
5194                 __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
5195                 if (!pt_prev)
5196                         continue;
5197                 if (pt_curr != pt_prev || od_curr != orig_dev) {
5198                         /* dispatch old sublist */
5199                         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5200                         /* start new sublist */
5201                         INIT_LIST_HEAD(&sublist);
5202                         pt_curr = pt_prev;
5203                         od_curr = orig_dev;
5204                 }
5205                 list_add_tail(&skb->list, &sublist);
5206         }
5207
5208         /* dispatch final sublist */
5209         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5210 }
5211
5212 static int __netif_receive_skb(struct sk_buff *skb)
5213 {
5214         int ret;
5215
5216         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5217                 unsigned int noreclaim_flag;
5218
5219                 /*
5220                  * PFMEMALLOC skbs are special, they should
5221                  * - be delivered to SOCK_MEMALLOC sockets only
5222                  * - stay away from userspace
5223                  * - have bounded memory usage
5224                  *
5225                  * Use PF_MEMALLOC as this saves us from propagating the allocation
5226                  * context down to all allocation sites.
5227                  */
5228                 noreclaim_flag = memalloc_noreclaim_save();
5229                 ret = __netif_receive_skb_one_core(skb, true);
5230                 memalloc_noreclaim_restore(noreclaim_flag);
5231         } else
5232                 ret = __netif_receive_skb_one_core(skb, false);
5233
5234         return ret;
5235 }
5236
5237 static void __netif_receive_skb_list(struct list_head *head)
5238 {
5239         unsigned long noreclaim_flag = 0;
5240         struct sk_buff *skb, *next;
5241         bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5242
5243         list_for_each_entry_safe(skb, next, head, list) {
5244                 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5245                         struct list_head sublist;
5246
5247                         /* Handle the previous sublist */
5248                         list_cut_before(&sublist, head, &skb->list);
5249                         if (!list_empty(&sublist))
5250                                 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5251                         pfmemalloc = !pfmemalloc;
5252                         /* See comments in __netif_receive_skb */
5253                         if (pfmemalloc)
5254                                 noreclaim_flag = memalloc_noreclaim_save();
5255                         else
5256                                 memalloc_noreclaim_restore(noreclaim_flag);
5257                 }
5258         }
5259         /* Handle the remaining sublist */
5260         if (!list_empty(head))
5261                 __netif_receive_skb_list_core(head, pfmemalloc);
5262         /* Restore pflags */
5263         if (pfmemalloc)
5264                 memalloc_noreclaim_restore(noreclaim_flag);
5265 }
5266
5267 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5268 {
5269         struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5270         struct bpf_prog *new = xdp->prog;
5271         int ret = 0;
5272
5273         switch (xdp->command) {
5274         case XDP_SETUP_PROG:
5275                 rcu_assign_pointer(dev->xdp_prog, new);
5276                 if (old)
5277                         bpf_prog_put(old);
5278
5279                 if (old && !new) {
5280                         static_branch_dec(&generic_xdp_needed_key);
5281                 } else if (new && !old) {
5282                         static_branch_inc(&generic_xdp_needed_key);
5283                         dev_disable_lro(dev);
5284                         dev_disable_gro_hw(dev);
5285                 }
5286                 break;
5287
5288         case XDP_QUERY_PROG:
5289                 xdp->prog_id = old ? old->aux->id : 0;
5290                 break;
5291
5292         default:
5293                 ret = -EINVAL;
5294                 break;
5295         }
5296
5297         return ret;
5298 }
5299
5300 static int netif_receive_skb_internal(struct sk_buff *skb)
5301 {
5302         int ret;
5303
5304         net_timestamp_check(netdev_tstamp_prequeue, skb);
5305
5306         if (skb_defer_rx_timestamp(skb))
5307                 return NET_RX_SUCCESS;
5308
5309         rcu_read_lock();
5310 #ifdef CONFIG_RPS
5311         if (static_branch_unlikely(&rps_needed)) {
5312                 struct rps_dev_flow voidflow, *rflow = &voidflow;
5313                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5314
5315                 if (cpu >= 0) {
5316                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5317                         rcu_read_unlock();
5318                         return ret;
5319                 }
5320         }
5321 #endif
5322         ret = __netif_receive_skb(skb);
5323         rcu_read_unlock();
5324         return ret;
5325 }
5326
5327 static void netif_receive_skb_list_internal(struct list_head *head)
5328 {
5329         struct sk_buff *skb, *next;
5330         struct list_head sublist;
5331
5332         INIT_LIST_HEAD(&sublist);
5333         list_for_each_entry_safe(skb, next, head, list) {
5334                 net_timestamp_check(netdev_tstamp_prequeue, skb);
5335                 skb_list_del_init(skb);
5336                 if (!skb_defer_rx_timestamp(skb))
5337                         list_add_tail(&skb->list, &sublist);
5338         }
5339         list_splice_init(&sublist, head);
5340
5341         rcu_read_lock();
5342 #ifdef CONFIG_RPS
5343         if (static_branch_unlikely(&rps_needed)) {
5344                 list_for_each_entry_safe(skb, next, head, list) {
5345                         struct rps_dev_flow voidflow, *rflow = &voidflow;
5346                         int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5347
5348                         if (cpu >= 0) {
5349                                 /* Will be handled, remove from list */
5350                                 skb_list_del_init(skb);
5351                                 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5352                         }
5353                 }
5354         }
5355 #endif
5356         __netif_receive_skb_list(head);
5357         rcu_read_unlock();
5358 }
5359
/**
 *      netif_receive_skb - process receive buffer from network
 *      @skb: buffer to process
 *
 *      Main entry point for received data.  The call itself always
 *      succeeds; the buffer may still be dropped along the way, either
 *      for congestion control or by the protocol layers.  Entry and exit
 *      are reported through the netif_receive_skb tracepoints.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int rc;

        trace_netif_receive_skb_entry(skb);
        rc = netif_receive_skb_internal(skb);
        trace_netif_receive_skb_exit(rc);

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
5387
5388 /**
5389  *      netif_receive_skb_list - process many receive buffers from network
5390  *      @head: list of skbs to process.
5391  *
5392  *      Since return value of netif_receive_skb() is normally ignored, and
5393  *      wouldn't be meaningful for a list, this function returns void.
5394  *
5395  *      This function may only be called from softirq context and interrupts
5396  *      should be enabled.
5397  */
5398 void netif_receive_skb_list(struct list_head *head)
5399 {
5400         struct sk_buff *skb;
5401
5402         if (list_empty(head))
5403                 return;
5404         if (trace_netif_receive_skb_list_entry_enabled()) {
5405                 list_for_each_entry(skb, head, list)
5406                         trace_netif_receive_skb_list_entry(skb);
5407         }
5408         netif_receive_skb_list_internal(head);
5409         trace_netif_receive_skb_list_exit(0);
5410 }
5411 EXPORT_SYMBOL(netif_receive_skb_list);
5412
5413 DEFINE_PER_CPU(struct work_struct, flush_works);
5414
5415 /* Network device is going away, flush any packets still pending */
5416 static void flush_backlog(struct work_struct *work)
5417 {
5418         struct sk_buff *skb, *tmp;
5419         struct softnet_data *sd;
5420
5421         local_bh_disable();
5422         sd = this_cpu_ptr(&softnet_data);
5423
5424         local_irq_disable();
5425         rps_lock(sd);
5426         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5427                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5428                         __skb_unlink(skb, &sd->input_pkt_queue);
5429                         kfree_skb(skb);
5430                         input_queue_head_incr(sd);
5431                 }
5432         }
5433         rps_unlock(sd);
5434         local_irq_enable();
5435
5436         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5437                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5438                         __skb_unlink(skb, &sd->process_queue);
5439                         kfree_skb(skb);
5440                         input_queue_head_incr(sd);
5441                 }
5442         }
5443         local_bh_enable();
5444 }
5445
5446 static void flush_all_backlogs(void)
5447 {
5448         unsigned int cpu;
5449
5450         get_online_cpus();
5451
5452         for_each_online_cpu(cpu)
5453                 queue_work_on(cpu, system_highpri_wq,
5454                               per_cpu_ptr(&flush_works, cpu));
5455
5456         for_each_online_cpu(cpu)
5457                 flush_work(per_cpu_ptr(&flush_works, cpu));
5458
5459         put_online_cpus();
5460 }
5461
5462 INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5463 INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5464 static int napi_gro_complete(struct sk_buff *skb)
5465 {
5466         struct packet_offload *ptype;
5467         __be16 type = skb->protocol;
5468         struct list_head *head = &offload_base;
5469         int err = -ENOENT;
5470
5471         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5472
5473         if (NAPI_GRO_CB(skb)->count == 1) {
5474                 skb_shinfo(skb)->gso_size = 0;
5475                 goto out;
5476         }
5477
5478         rcu_read_lock();
5479         list_for_each_entry_rcu(ptype, head, list) {
5480                 if (ptype->type != type || !ptype->callbacks.gro_complete)
5481                         continue;
5482
5483                 err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5484                                          ipv6_gro_complete, inet_gro_complete,
5485                                          skb, 0);
5486                 break;
5487         }
5488         rcu_read_unlock();
5489
5490         if (err) {
5491                 WARN_ON(&ptype->list == head);
5492                 kfree_skb(skb);
5493                 return NET_RX_SUCCESS;
5494         }
5495
5496 out:
5497         return netif_receive_skb_internal(skb);
5498 }
5499
5500 static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5501                                    bool flush_old)
5502 {
5503         struct list_head *head = &napi->gro_hash[index].list;
5504         struct sk_buff *skb, *p;
5505
5506         list_for_each_entry_safe_reverse(skb, p, head, list) {
5507                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5508                         return;
5509                 skb_list_del_init(skb);
5510                 napi_gro_complete(skb);
5511                 napi->gro_hash[index].count--;
5512         }
5513
5514         if (!napi->gro_hash[index].count)
5515                 __clear_bit(index, &napi->gro_bitmask);
5516 }
5517
5518 /* napi->gro_hash[].list contains packets ordered by age.
5519  * youngest packets at the head of it.
5520  * Complete skbs in reverse order to reduce latencies.
5521  */
5522 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5523 {
5524         unsigned long bitmask = napi->gro_bitmask;
5525         unsigned int i, base = ~0U;
5526
5527         while ((i = ffs(bitmask)) != 0) {
5528                 bitmask >>= i;
5529                 base += i;
5530                 __napi_gro_flush_chain(napi, base, flush_old);
5531         }
5532 }
5533 EXPORT_SYMBOL(napi_gro_flush);
5534
5535 static struct list_head *gro_list_prepare(struct napi_struct *napi,
5536                                           struct sk_buff *skb)
5537 {
5538         unsigned int maclen = skb->dev->hard_header_len;
5539         u32 hash = skb_get_hash_raw(skb);
5540         struct list_head *head;
5541         struct sk_buff *p;
5542
5543         head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5544         list_for_each_entry(p, head, list) {
5545                 unsigned long diffs;
5546
5547                 NAPI_GRO_CB(p)->flush = 0;
5548
5549                 if (hash != skb_get_hash_raw(p)) {
5550                         NAPI_GRO_CB(p)->same_flow = 0;
5551                         continue;
5552                 }
5553
5554                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5555                 diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5556                 if (skb_vlan_tag_present(p))
5557                         diffs |= p->vlan_tci ^ skb->vlan_tci;
5558                 diffs |= skb_metadata_dst_cmp(p, skb);
5559                 diffs |= skb_metadata_differs(p, skb);
5560                 if (maclen == ETH_HLEN)
5561                         diffs |= compare_ether_header(skb_mac_header(p),
5562                                                       skb_mac_header(skb));
5563                 else if (!diffs)
5564                         diffs = memcmp(skb_mac_header(p),
5565                                        skb_mac_header(skb),
5566                                        maclen);
5567                 NAPI_GRO_CB(p)->same_flow = !diffs;
5568         }
5569
5570         return head;
5571 }
5572
5573 static void skb_gro_reset_offset(struct sk_buff *skb)
5574 {
5575         const struct skb_shared_info *pinfo = skb_shinfo(skb);
5576         const skb_frag_t *frag0 = &pinfo->frags[0];
5577
5578         NAPI_GRO_CB(skb)->data_offset = 0;
5579         NAPI_GRO_CB(skb)->frag0 = NULL;
5580         NAPI_GRO_CB(skb)->frag0_len = 0;
5581
5582         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
5583             pinfo->nr_frags &&
5584             !PageHighMem(skb_frag_page(frag0))) {
5585                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5586                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5587                                                     skb_frag_size(frag0),
5588                                                     skb->end - skb->tail);
5589         }
5590 }
5591
5592 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5593 {
5594         struct skb_shared_info *pinfo = skb_shinfo(skb);
5595
5596         BUG_ON(skb->end - skb->tail < grow);
5597
5598         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5599
5600         skb->data_len -= grow;
5601         skb->tail += grow;
5602
5603         skb_frag_off_add(&pinfo->frags[0], grow);
5604         skb_frag_size_sub(&pinfo->frags[0], grow);
5605
5606         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5607                 skb_frag_unref(skb, 0);
5608                 memmove(pinfo->frags, pinfo->frags + 1,
5609                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5610         }
5611 }
5612
5613 static void gro_flush_oldest(struct list_head *head)
5614 {
5615         struct sk_buff *oldest;
5616
5617         oldest = list_last_entry(head, struct sk_buff, list);
5618
5619         /* We are called with head length >= MAX_GRO_SKBS, so this is
5620          * impossible.
5621          */
5622         if (WARN_ON_ONCE(!oldest))
5623                 return;
5624
5625         /* Do not adjust napi->gro_hash[].count, caller is adding a new
5626          * SKB to the chain.
5627          */
5628         skb_list_del_init(oldest);
5629         napi_gro_complete(oldest);
5630 }
5631
5632 INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5633                                                            struct sk_buff *));
5634 INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5635                                                            struct sk_buff *));
5636 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5637 {
5638         u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5639         struct list_head *head = &offload_base;
5640         struct packet_offload *ptype;
5641         __be16 type = skb->protocol;
5642         struct list_head *gro_head;
5643         struct sk_buff *pp = NULL;
5644         enum gro_result ret;
5645         int same_flow;
5646         int grow;
5647
5648         if (netif_elide_gro(skb->dev))
5649                 goto normal;
5650
5651         gro_head = gro_list_prepare(napi, skb);
5652
5653         rcu_read_lock();
5654         list_for_each_entry_rcu(ptype, head, list) {
5655                 if (ptype->type != type || !ptype->callbacks.gro_receive)
5656                         continue;
5657
5658                 skb_set_network_header(skb, skb_gro_offset(skb));
5659                 skb_reset_mac_len(skb);
5660                 NAPI_GRO_CB(skb)->same_flow = 0;
5661                 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5662                 NAPI_GRO_CB(skb)->free = 0;
5663                 NAPI_GRO_CB(skb)->encap_mark = 0;
5664                 NAPI_GRO_CB(skb)->recursion_counter = 0;
5665                 NAPI_GRO_CB(skb)->is_fou = 0;
5666                 NAPI_GRO_CB(skb)->is_atomic = 1;
5667                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5668
5669                 /* Setup for GRO checksum validation */
5670                 switch (skb->ip_summed) {
5671                 case CHECKSUM_COMPLETE:
5672                         NAPI_GRO_CB(skb)->csum = skb->csum;
5673                         NAPI_GRO_CB(skb)->csum_valid = 1;
5674                         NAPI_GRO_CB(skb)->csum_cnt = 0;
5675                         break;
5676                 case CHECKSUM_UNNECESSARY:
5677                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5678                         NAPI_GRO_CB(skb)->csum_valid = 0;
5679                         break;
5680                 default:
5681                         NAPI_GRO_CB(skb)->csum_cnt = 0;
5682                         NAPI_GRO_CB(skb)->csum_valid = 0;
5683                 }
5684
5685                 pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
5686                                         ipv6_gro_receive, inet_gro_receive,
5687                                         gro_head, skb);
5688                 break;
5689         }
5690         rcu_read_unlock();
5691
5692         if (&ptype->list == head)
5693                 goto normal;
5694
5695         if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
5696                 ret = GRO_CONSUMED;
5697                 goto ok;
5698         }
5699
5700         same_flow = NAPI_GRO_CB(skb)->same_flow;
5701         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5702
5703         if (pp) {
5704                 skb_list_del_init(pp);
5705                 napi_gro_complete(pp);
5706                 napi->gro_hash[hash].count--;
5707         }
5708
5709         if (same_flow)
5710                 goto ok;
5711
5712         if (NAPI_GRO_CB(skb)->flush)
5713                 goto normal;
5714
5715         if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5716                 gro_flush_oldest(gro_head);
5717         } else {
5718                 napi->gro_hash[hash].count++;
5719         }
5720         NAPI_GRO_CB(skb)->count = 1;
5721         NAPI_GRO_CB(skb)->age = jiffies;
5722         NAPI_GRO_CB(skb)->last = skb;
5723         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5724         list_add(&skb->list, gro_head);
5725         ret = GRO_HELD;
5726
5727 pull:
5728         grow = skb_gro_offset(skb) - skb_headlen(skb);
5729         if (grow > 0)
5730                 gro_pull_from_frag0(skb, grow);
5731 ok:
5732         if (napi->gro_hash[hash].count) {
5733                 if (!test_bit(hash, &napi->gro_bitmask))
5734                         __set_bit(hash, &napi->gro_bitmask);
5735         } else if (test_bit(hash, &napi->gro_bitmask)) {
5736                 __clear_bit(hash, &napi->gro_bitmask);
5737         }
5738
5739         return ret;
5740
5741 normal:
5742         ret = GRO_NORMAL;
5743         goto pull;
5744 }
5745
5746 struct packet_offload *gro_find_receive_by_type(__be16 type)
5747 {
5748         struct list_head *offload_head = &offload_base;
5749         struct packet_offload *ptype;
5750
5751         list_for_each_entry_rcu(ptype, offload_head, list) {
5752                 if (ptype->type != type || !ptype->callbacks.gro_receive)
5753                         continue;
5754                 return ptype;
5755         }
5756         return NULL;
5757 }
5758 EXPORT_SYMBOL(gro_find_receive_by_type);
5759
5760 struct packet_offload *gro_find_complete_by_type(__be16 type)
5761 {
5762         struct list_head *offload_head = &offload_base;
5763         struct packet_offload *ptype;
5764
5765         list_for_each_entry_rcu(ptype, offload_head, list) {
5766                 if (ptype->type != type || !ptype->callbacks.gro_complete)
5767                         continue;
5768                 return ptype;
5769         }
5770         return NULL;
5771 }
5772 EXPORT_SYMBOL(gro_find_complete_by_type);
5773
5774 static void napi_skb_free_stolen_head(struct sk_buff *skb)
5775 {
5776         skb_dst_drop(skb);
5777         skb_ext_put(skb);
5778         kmem_cache_free(skbuff_head_cache, skb);
5779 }
5780
5781 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5782 {
5783         switch (ret) {
5784         case GRO_NORMAL:
5785                 if (netif_receive_skb_internal(skb))
5786                         ret = GRO_DROP;
5787                 break;
5788
5789         case GRO_DROP:
5790                 kfree_skb(skb);
5791                 break;
5792
5793         case GRO_MERGED_FREE:
5794                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5795                         napi_skb_free_stolen_head(skb);
5796                 else
5797                         __kfree_skb(skb);
5798                 break;
5799
5800         case GRO_HELD:
5801         case GRO_MERGED:
5802         case GRO_CONSUMED:
5803                 break;
5804         }
5805
5806         return ret;
5807 }
5808
5809 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5810 {
5811         gro_result_t ret;
5812
5813         skb_mark_napi_id(skb, napi);
5814         trace_napi_gro_receive_entry(skb);
5815
5816         skb_gro_reset_offset(skb);
5817
5818         ret = napi_skb_finish(dev_gro_receive(napi, skb), skb);
5819         trace_napi_gro_receive_exit(ret);
5820
5821         return ret;
5822 }
5823 EXPORT_SYMBOL(napi_gro_receive);
5824
5825 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5826 {
5827         if (unlikely(skb->pfmemalloc)) {
5828                 consume_skb(skb);
5829                 return;
5830         }
5831         __skb_pull(skb, skb_headlen(skb));
5832         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5833         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5834         __vlan_hwaccel_clear_tag(skb);
5835         skb->dev = napi->dev;
5836         skb->skb_iif = 0;
5837
5838         /* eth_type_trans() assumes pkt_type is PACKET_HOST */
5839         skb->pkt_type = PACKET_HOST;
5840
5841         skb->encapsulation = 0;
5842         skb_shinfo(skb)->gso_type = 0;
5843         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5844         skb_ext_reset(skb);
5845
5846         napi->skb = skb;
5847 }
5848
5849 struct sk_buff *napi_get_frags(struct napi_struct *napi)
5850 {
5851         struct sk_buff *skb = napi->skb;
5852
5853         if (!skb) {
5854                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5855                 if (skb) {
5856                         napi->skb = skb;
5857                         skb_mark_napi_id(skb, napi);
5858                 }
5859         }
5860         return skb;
5861 }
5862 EXPORT_SYMBOL(napi_get_frags);
5863
5864 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5865 static void gro_normal_list(struct napi_struct *napi)
5866 {
5867         if (!napi->rx_count)
5868                 return;
5869         netif_receive_skb_list_internal(&napi->rx_list);
5870         INIT_LIST_HEAD(&napi->rx_list);
5871         napi->rx_count = 0;
5872 }
5873
5874 /* Queue one GRO_NORMAL SKB up for list processing.  If batch size exceeded,
5875  * pass the whole batch up to the stack.
5876  */
5877 static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
5878 {
5879         list_add_tail(&skb->list, &napi->rx_list);
5880         if (++napi->rx_count >= gro_normal_batch)
5881                 gro_normal_list(napi);
5882 }
5883
5884 static gro_result_t napi_frags_finish(struct napi_struct *napi,
5885                                       struct sk_buff *skb,
5886                                       gro_result_t ret)
5887 {
5888         switch (ret) {
5889         case GRO_NORMAL:
5890         case GRO_HELD:
5891                 __skb_push(skb, ETH_HLEN);
5892                 skb->protocol = eth_type_trans(skb, skb->dev);
5893                 if (ret == GRO_NORMAL)
5894                         gro_normal_one(napi, skb);
5895                 break;
5896
5897         case GRO_DROP:
5898                 napi_reuse_skb(napi, skb);
5899                 break;
5900
5901         case GRO_MERGED_FREE:
5902                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5903                         napi_skb_free_stolen_head(skb);
5904                 else
5905                         napi_reuse_skb(napi, skb);
5906                 break;
5907
5908         case GRO_MERGED:
5909         case GRO_CONSUMED:
5910                 break;
5911         }
5912
5913         return ret;
5914 }
5915
5916 /* Upper GRO stack assumes network header starts at gro_offset=0
5917  * Drivers could call both napi_gro_frags() and napi_gro_receive()
5918  * We copy ethernet header into skb->data to have a common layout.
5919  */
5920 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5921 {
5922         struct sk_buff *skb = napi->skb;
5923         const struct ethhdr *eth;
5924         unsigned int hlen = sizeof(*eth);
5925
5926         napi->skb = NULL;
5927
5928         skb_reset_mac_header(skb);
5929         skb_gro_reset_offset(skb);
5930
5931         if (unlikely(skb_gro_header_hard(skb, hlen))) {
5932                 eth = skb_gro_header_slow(skb, hlen, 0);
5933                 if (unlikely(!eth)) {
5934                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5935                                              __func__, napi->dev->name);
5936                         napi_reuse_skb(napi, skb);
5937                         return NULL;
5938                 }
5939         } else {
5940                 eth = (const struct ethhdr *)skb->data;
5941                 gro_pull_from_frag0(skb, hlen);
5942                 NAPI_GRO_CB(skb)->frag0 += hlen;
5943                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
5944         }
5945         __skb_pull(skb, hlen);
5946
5947         /*
5948          * This works because the only protocols we care about don't require
5949          * special handling.
5950          * We'll fix it up properly in napi_frags_finish()
5951          */
5952         skb->protocol = eth->h_proto;
5953
5954         return skb;
5955 }
5956
5957 gro_result_t napi_gro_frags(struct napi_struct *napi)
5958 {
5959         gro_result_t ret;
5960         struct sk_buff *skb = napi_frags_skb(napi);
5961
5962         if (!skb)
5963                 return GRO_DROP;
5964
5965         trace_napi_gro_frags_entry(skb);
5966
5967         ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5968         trace_napi_gro_frags_exit(ret);
5969
5970         return ret;
5971 }
5972 EXPORT_SYMBOL(napi_gro_frags);
5973
5974 /* Compute the checksum from gro_offset and return the folded value
5975  * after adding in any pseudo checksum.
5976  */
5977 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5978 {
5979         __wsum wsum;
5980         __sum16 sum;
5981
5982         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5983
5984         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5985         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5986         /* See comments in __skb_checksum_complete(). */
5987         if (likely(!sum)) {
5988                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5989                     !skb->csum_complete_sw)
5990                         netdev_rx_csum_fault(skb->dev, skb);
5991         }
5992
5993         NAPI_GRO_CB(skb)->csum = wsum;
5994         NAPI_GRO_CB(skb)->csum_valid = 1;
5995
5996         return sum;
5997 }
5998 EXPORT_SYMBOL(__skb_gro_checksum_complete);
5999
/* Walk the rps_ipi_next chain starting at @remsd and send an async
 * single-cpu function call to every entry whose cpu is online.
 * Compiles to nothing without CONFIG_RPS.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
	struct softnet_data *following;

	for (; remsd; remsd = following) {
		/* Fetch the link before the entry is handed to the IPI. */
		following = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu, &remsd->csd);
	}
#endif
}
6012
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * The pending list is detached from @sd while irqs are still off; the
 * IPIs themselves are sent after irqs have been re-enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending)
		sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	if (pending)
		net_rps_send_ipi(pending);
#else
	local_irq_enable();
#endif
}
6033
6034 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6035 {
6036 #ifdef CONFIG_RPS
6037         return sd->rps_ipi_list != NULL;
6038 #else
6039         return false;
6040 #endif
6041 }
6042
6043 static int process_backlog(struct napi_struct *napi, int quota)
6044 {
6045         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6046         bool again = true;
6047         int work = 0;
6048
6049         /* Check if we have pending ipi, its better to send them now,
6050          * not waiting net_rx_action() end.
6051          */
6052         if (sd_has_rps_ipi_waiting(sd)) {
6053                 local_irq_disable();
6054                 net_rps_action_and_irq_enable(sd);
6055         }
6056
6057         napi->weight = dev_rx_weight;
6058         while (again) {
6059                 struct sk_buff *skb;
6060
6061                 while ((skb = __skb_dequeue(&sd->process_queue))) {
6062                         rcu_read_lock();
6063                         __netif_receive_skb(skb);
6064                         rcu_read_unlock();
6065                         input_queue_head_incr(sd);
6066                         if (++work >= quota)
6067                                 return work;
6068
6069                 }
6070
6071                 local_irq_disable();
6072                 rps_lock(sd);
6073                 if (skb_queue_empty(&sd->input_pkt_queue)) {
6074                         /*
6075                          * Inline a custom version of __napi_complete().
6076                          * only current cpu owns and manipulates this napi,
6077                          * and NAPI_STATE_SCHED is the only possible flag set
6078                          * on backlog.
6079                          * We can use a plain write instead of clear_bit(),
6080                          * and we dont need an smp_mb() memory barrier.
6081                          */
6082                         napi->state = 0;
6083                         again = false;
6084                 } else {
6085                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
6086                                                    &sd->process_queue);
6087                 }
6088                 rps_unlock(sd);
6089                 local_irq_enable();
6090         }
6091
6092         return work;
6093 }
6094
6095 /**
6096  * __napi_schedule - schedule for receive
6097  * @n: entry to schedule
6098  *
6099  * The entry's receive function will be scheduled to run.
6100  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6101  */
6102 void __napi_schedule(struct napi_struct *n)
6103 {
6104         unsigned long flags;
6105
6106         local_irq_save(flags);
6107         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6108         local_irq_restore(flags);
6109 }
6110 EXPORT_SYMBOL(__napi_schedule);
6111
6112 /**
6113  *      napi_schedule_prep - check if napi can be scheduled
6114  *      @n: napi context
6115  *
6116  * Test if NAPI routine is already running, and if not mark
6117  * it as running.  This is used as a condition variable
6118  * insure only one NAPI poll instance runs.  We also make
6119  * sure there is no pending NAPI disable.
6120  */
6121 bool napi_schedule_prep(struct napi_struct *n)
6122 {
6123         unsigned long val, new;
6124
6125         do {
6126                 val = READ_ONCE(n->state);
6127                 if (unlikely(val & NAPIF_STATE_DISABLE))
6128                         return false;
6129                 new = val | NAPIF_STATE_SCHED;
6130
6131                 /* Sets STATE_MISSED bit if STATE_SCHED was already set
6132                  * This was suggested by Alexander Duyck, as compiler
6133                  * emits better code than :
6134                  * if (val & NAPIF_STATE_SCHED)
6135                  *     new |= NAPIF_STATE_MISSED;
6136                  */
6137                 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6138                                                    NAPIF_STATE_MISSED;
6139         } while (cmpxchg(&n->state, val, new) != val);
6140
6141         return !(val & NAPIF_STATE_SCHED);
6142 }
6143 EXPORT_SYMBOL(napi_schedule_prep);
6144
6145 /**
6146  * __napi_schedule_irqoff - schedule for receive
6147  * @n: entry to schedule
6148  *
6149  * Variant of __napi_schedule() assuming hard irqs are masked
6150  */
6151 void __napi_schedule_irqoff(struct napi_struct *n)
6152 {
6153         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6154 }
6155 EXPORT_SYMBOL(__napi_schedule_irqoff);
6156
6157 bool napi_complete_done(struct napi_struct *n, int work_done)
6158 {
6159         unsigned long flags, val, new;
6160
6161         /*
6162          * 1) Don't let napi dequeue from the cpu poll list
6163          *    just in case its running on a different cpu.
6164          * 2) If we are busy polling, do nothing here, we have
6165          *    the guarantee we will be called later.
6166          */
6167         if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6168                                  NAPIF_STATE_IN_BUSY_POLL)))
6169                 return false;
6170
6171         gro_normal_list(n);
6172
6173         if (n->gro_bitmask) {
6174                 unsigned long timeout = 0;
6175
6176                 if (work_done)
6177                         timeout = n->dev->gro_flush_timeout;
6178
6179                 /* When the NAPI instance uses a timeout and keeps postponing
6180                  * it, we need to bound somehow the time packets are kept in
6181                  * the GRO layer
6182                  */
6183                 napi_gro_flush(n, !!timeout);
6184                 if (timeout)
6185                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
6186                                       HRTIMER_MODE_REL_PINNED);
6187         }
6188         if (unlikely(!list_empty(&n->poll_list))) {
6189                 /* If n->poll_list is not empty, we need to mask irqs */
6190                 local_irq_save(flags);
6191                 list_del_init(&n->poll_list);
6192                 local_irq_restore(flags);
6193         }
6194
6195         do {
6196                 val = READ_ONCE(n->state);
6197
6198                 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6199
6200                 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6201
6202                 /* If STATE_MISSED was set, leave STATE_SCHED set,
6203                  * because we will call napi->poll() one more time.
6204                  * This C code was suggested by Alexander Duyck to help gcc.
6205                  */
6206                 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6207                                                     NAPIF_STATE_SCHED;
6208         } while (cmpxchg(&n->state, val, new) != val);
6209
6210         if (unlikely(val & NAPIF_STATE_MISSED)) {
6211                 __napi_schedule(n);
6212                 return false;
6213         }
6214
6215         return true;
6216 }
6217 EXPORT_SYMBOL(napi_complete_done);
6218
6219 /* must be called under rcu_read_lock(), as we dont take a reference */
6220 static struct napi_struct *napi_by_id(unsigned int napi_id)
6221 {
6222         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6223         struct napi_struct *napi;
6224
6225         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6226                 if (napi->napi_id == napi_id)
6227                         return napi;
6228
6229         return NULL;
6230 }
6231
6232 #if defined(CONFIG_NET_RX_BUSY_POLL)
6233
/* Budget handed to napi->poll() on every busy-poll invocation. */
#define BUSY_POLL_BUDGET 8
6235
6236 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6237 {
6238         int rc;
6239
6240         /* Busy polling means there is a high chance device driver hard irq
6241          * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6242          * set in napi_schedule_prep().
6243          * Since we are about to call napi->poll() once more, we can safely
6244          * clear NAPI_STATE_MISSED.
6245          *
6246          * Note: x86 could use a single "lock and ..." instruction
6247          * to perform these two clear_bit()
6248          */
6249         clear_bit(NAPI_STATE_MISSED, &napi->state);
6250         clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6251
6252         local_bh_disable();
6253
6254         /* All we really want here is to re-enable device interrupts.
6255          * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6256          */
6257         rc = napi->poll(napi, BUSY_POLL_BUDGET);
6258         /* We can't gro_normal_list() here, because napi->poll() might have
6259          * rearmed the napi (napi_complete_done()) in which case it could
6260          * already be running on another CPU.
6261          */
6262         trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6263         netpoll_poll_unlock(have_poll_lock);
6264         if (rc == BUSY_POLL_BUDGET) {
6265                 /* As the whole budget was spent, we still own the napi so can
6266                  * safely handle the rx_list.
6267                  */
6268                 gro_normal_list(napi);
6269                 __napi_schedule(napi);
6270         }
6271         local_bh_enable();
6272 }
6273
6274 void napi_busy_loop(unsigned int napi_id,
6275                     bool (*loop_end)(void *, unsigned long),
6276                     void *loop_end_arg)
6277 {
6278         unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6279         int (*napi_poll)(struct napi_struct *napi, int budget);
6280         void *have_poll_lock = NULL;
6281         struct napi_struct *napi;
6282
6283 restart:
6284         napi_poll = NULL;
6285
6286         rcu_read_lock();
6287
6288         napi = napi_by_id(napi_id);
6289         if (!napi)
6290                 goto out;
6291
6292         preempt_disable();
6293         for (;;) {
6294                 int work = 0;
6295
6296                 local_bh_disable();
6297                 if (!napi_poll) {
6298                         unsigned long val = READ_ONCE(napi->state);
6299
6300                         /* If multiple threads are competing for this napi,
6301                          * we avoid dirtying napi->state as much as we can.
6302                          */
6303                         if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6304                                    NAPIF_STATE_IN_BUSY_POLL))
6305                                 goto count;
6306                         if (cmpxchg(&napi->state, val,
6307                                     val | NAPIF_STATE_IN_BUSY_POLL |
6308                                           NAPIF_STATE_SCHED) != val)
6309                                 goto count;
6310                         have_poll_lock = netpoll_poll_lock(napi);
6311                         napi_poll = napi->poll;
6312                 }
6313                 work = napi_poll(napi, BUSY_POLL_BUDGET);
6314                 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6315                 gro_normal_list(napi);
6316 count:
6317                 if (work > 0)
6318                         __NET_ADD_STATS(dev_net(napi->dev),
6319                                         LINUX_MIB_BUSYPOLLRXPACKETS, work);
6320                 local_bh_enable();
6321
6322                 if (!loop_end || loop_end(loop_end_arg, start_time))
6323                         break;
6324
6325                 if (unlikely(need_resched())) {
6326                         if (napi_poll)
6327                                 busy_poll_stop(napi, have_poll_lock);
6328                         preempt_enable();
6329                         rcu_read_unlock();
6330                         cond_resched();
6331                         if (loop_end(loop_end_arg, start_time))
6332                                 return;
6333                         goto restart;
6334                 }
6335                 cpu_relax();
6336         }
6337         if (napi_poll)
6338                 busy_poll_stop(napi, have_poll_lock);
6339         preempt_enable();
6340 out:
6341         rcu_read_unlock();
6342 }
6343 EXPORT_SYMBOL(napi_busy_loop);
6344
6345 #endif /* CONFIG_NET_RX_BUSY_POLL */
6346
6347 static void napi_hash_add(struct napi_struct *napi)
6348 {
6349         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6350             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6351                 return;
6352
6353         spin_lock(&napi_hash_lock);
6354
6355         /* 0..NR_CPUS range is reserved for sender_cpu use */
6356         do {
6357                 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6358                         napi_gen_id = MIN_NAPI_ID;
6359         } while (napi_by_id(napi_gen_id));
6360         napi->napi_id = napi_gen_id;
6361
6362         hlist_add_head_rcu(&napi->napi_hash_node,
6363                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6364
6365         spin_unlock(&napi_hash_lock);
6366 }
6367
6368 /* Warning : caller is responsible to make sure rcu grace period
6369  * is respected before freeing memory containing @napi
6370  */
6371 bool napi_hash_del(struct napi_struct *napi)
6372 {
6373         bool rcu_sync_needed = false;
6374
6375         spin_lock(&napi_hash_lock);
6376
6377         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6378                 rcu_sync_needed = true;
6379                 hlist_del_rcu(&napi->napi_hash_node);
6380         }
6381         spin_unlock(&napi_hash_lock);
6382         return rcu_sync_needed;
6383 }
6384 EXPORT_SYMBOL_GPL(napi_hash_del);
6385
6386 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6387 {
6388         struct napi_struct *napi;
6389
6390         napi = container_of(timer, struct napi_struct, timer);
6391
6392         /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6393          * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6394          */
6395         if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6396             !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6397                 __napi_schedule_irqoff(napi);
6398
6399         return HRTIMER_NORESTART;
6400 }
6401
6402 static void init_gro_hash(struct napi_struct *napi)
6403 {
6404         int i;
6405
6406         for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6407                 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6408                 napi->gro_hash[i].count = 0;
6409         }
6410         napi->gro_bitmask = 0;
6411 }
6412
6413 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6414                     int (*poll)(struct napi_struct *, int), int weight)
6415 {
6416         INIT_LIST_HEAD(&napi->poll_list);
6417         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6418         napi->timer.function = napi_watchdog;
6419         init_gro_hash(napi);
6420         napi->skb = NULL;
6421         INIT_LIST_HEAD(&napi->rx_list);
6422         napi->rx_count = 0;
6423         napi->poll = poll;
6424         if (weight > NAPI_POLL_WEIGHT)
6425                 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6426                                 weight);
6427         napi->weight = weight;
6428         list_add(&napi->dev_list, &dev->napi_list);
6429         napi->dev = dev;
6430 #ifdef CONFIG_NETPOLL
6431         napi->poll_owner = -1;
6432 #endif
6433         set_bit(NAPI_STATE_SCHED, &napi->state);
6434         napi_hash_add(napi);
6435 }
6436 EXPORT_SYMBOL(netif_napi_add);
6437
6438 void napi_disable(struct napi_struct *n)
6439 {
6440         might_sleep();
6441         set_bit(NAPI_STATE_DISABLE, &n->state);
6442
6443         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6444                 msleep(1);
6445         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6446                 msleep(1);
6447
6448         hrtimer_cancel(&n->timer);
6449
6450         clear_bit(NAPI_STATE_DISABLE, &n->state);
6451 }
6452 EXPORT_SYMBOL(napi_disable);
6453
6454 static void flush_gro_hash(struct napi_struct *napi)
6455 {
6456         int i;
6457
6458         for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6459                 struct sk_buff *skb, *n;
6460
6461                 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6462                         kfree_skb(skb);
6463                 napi->gro_hash[i].count = 0;
6464         }
6465 }
6466
6467 /* Must be called in process context */
6468 void netif_napi_del(struct napi_struct *napi)
6469 {
6470         might_sleep();
6471         if (napi_hash_del(napi))
6472                 synchronize_net();
6473         list_del_init(&napi->dev_list);
6474         napi_free_frags(napi);
6475
6476         flush_gro_hash(napi);
6477         napi->gro_bitmask = 0;
6478 }
6479 EXPORT_SYMBOL(netif_napi_del);
6480
6481 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6482 {
6483         void *have;
6484         int work, weight;
6485
6486         list_del_init(&n->poll_list);
6487
6488         have = netpoll_poll_lock(n);
6489
6490         weight = n->weight;
6491
6492         /* This NAPI_STATE_SCHED test is for avoiding a race
6493          * with netpoll's poll_napi().  Only the entity which
6494          * obtains the lock and sees NAPI_STATE_SCHED set will
6495          * actually make the ->poll() call.  Therefore we avoid
6496          * accidentally calling ->poll() when NAPI is not scheduled.
6497          */
6498         work = 0;
6499         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6500                 work = n->poll(n, weight);
6501                 trace_napi_poll(n, work, weight);
6502         }
6503
6504         WARN_ON_ONCE(work > weight);
6505
6506         if (likely(work < weight))
6507                 goto out_unlock;
6508
6509         /* Drivers must not modify the NAPI state if they
6510          * consume the entire weight.  In such cases this code
6511          * still "owns" the NAPI instance and therefore can
6512          * move the instance around on the list at-will.
6513          */
6514         if (unlikely(napi_disable_pending(n))) {
6515                 napi_complete(n);
6516                 goto out_unlock;
6517         }
6518
6519         gro_normal_list(n);
6520
6521         if (n->gro_bitmask) {
6522                 /* flush too old packets
6523                  * If HZ < 1000, flush all packets.
6524                  */
6525                 napi_gro_flush(n, HZ >= 1000);
6526         }
6527
6528         /* Some drivers may have called napi_schedule
6529          * prior to exhausting their budget.
6530          */
6531         if (unlikely(!list_empty(&n->poll_list))) {
6532                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6533                              n->dev ? n->dev->name : "backlog");
6534                 goto out_unlock;
6535         }
6536
6537         list_add_tail(&n->poll_list, repoll);
6538
6539 out_unlock:
6540         netpoll_poll_unlock(have);
6541
6542         return work;
6543 }
6544
6545 static __latent_entropy void net_rx_action(struct softirq_action *h)
6546 {
6547         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6548         unsigned long time_limit = jiffies +
6549                 usecs_to_jiffies(netdev_budget_usecs);
6550         int budget = netdev_budget;
6551         LIST_HEAD(list);
6552         LIST_HEAD(repoll);
6553
6554         local_irq_disable();
6555         list_splice_init(&sd->poll_list, &list);
6556         local_irq_enable();
6557
6558         for (;;) {
6559                 struct napi_struct *n;
6560
6561                 if (list_empty(&list)) {
6562                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6563                                 goto out;
6564                         break;
6565                 }
6566
6567                 n = list_first_entry(&list, struct napi_struct, poll_list);
6568                 budget -= napi_poll(n, &repoll);
6569
6570                 /* If softirq window is exhausted then punt.
6571                  * Allow this to run for 2 jiffies since which will allow
6572                  * an average latency of 1.5/HZ.
6573                  */
6574                 if (unlikely(budget <= 0 ||
6575                              time_after_eq(jiffies, time_limit))) {
6576                         sd->time_squeeze++;
6577                         break;
6578                 }
6579         }
6580
6581         local_irq_disable();
6582
6583         list_splice_tail_init(&sd->poll_list, &list);
6584         list_splice_tail(&repoll, &list);
6585         list_splice(&list, &sd->poll_list);
6586         if (!list_empty(&sd->poll_list))
6587                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6588
6589         net_rps_action_and_irq_enable(sd);
6590 out:
6591         __kfree_skb_flush();
6592 }
6593
/* One link between a device and a directly adjacent (upper or lower)
 * device.  Entries are chained through @list on a device's
 * adj_list.upper / adj_list.lower lists.
 */
struct netdev_adjacent {
	/* the adjacent device this entry refers to */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning device's adjacency list */
	struct list_head list;
	/* NOTE(review): presumably used for deferred (RCU) freeing - confirm */
	struct rcu_head rcu;
};
6609
6610 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6611                                                  struct list_head *adj_list)
6612 {
6613         struct netdev_adjacent *adj;
6614
6615         list_for_each_entry(adj, adj_list, list) {
6616                 if (adj->dev == adj_dev)
6617                         return adj;
6618         }
6619         return NULL;
6620 }
6621
/* netdev_walk_all_upper_dev_rcu() callback: returns non-zero when the
 * visited @upper_dev is the device passed through @data.
 */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *wanted = data;

	return wanted == upper_dev;
}
6628
6629 /**
6630  * netdev_has_upper_dev - Check if device is linked to an upper device
6631  * @dev: device
6632  * @upper_dev: upper device to check
6633  *
6634  * Find out if a device is linked to specified upper device and return true
6635  * in case it is. Note that this checks only immediate upper device,
6636  * not through a complete stack of devices. The caller must hold the RTNL lock.
6637  */
6638 bool netdev_has_upper_dev(struct net_device *dev,
6639                           struct net_device *upper_dev)
6640 {
6641         ASSERT_RTNL();
6642
6643         return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6644                                              upper_dev);
6645 }
6646 EXPORT_SYMBOL(netdev_has_upper_dev);
6647
6648 /**
6649  * netdev_has_upper_dev_all - Check if device is linked to an upper device
6650  * @dev: device
6651  * @upper_dev: upper device to check
6652  *
6653  * Find out if a device is linked to specified upper device and return true
6654  * in case it is. Note that this checks the entire upper device chain.
6655  * The caller must hold rcu lock.
6656  */
6657
6658 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6659                                   struct net_device *upper_dev)
6660 {
6661         return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6662                                                upper_dev);
6663 }
6664 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6665
6666 /**
6667  * netdev_has_any_upper_dev - Check if device is linked to some device
6668  * @dev: device
6669  *
6670  * Find out if a device is linked to an upper device and return true in case
6671  * it is. The caller must hold the RTNL lock.
6672  */
6673 bool netdev_has_any_upper_dev(struct net_device *dev)
6674 {
6675         ASSERT_RTNL();
6676
6677         return !list_empty(&dev->adj_list.upper);
6678 }
6679 EXPORT_SYMBOL(netdev_has_any_upper_dev);
6680
6681 /**
6682  * netdev_master_upper_dev_get - Get master upper device
6683  * @dev: device
6684  *
6685  * Find a master upper device and return pointer to it or NULL in case
6686  * it's not there. The caller must hold the RTNL lock.
6687  */
6688 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6689 {
6690         struct netdev_adjacent *upper;
6691
6692         ASSERT_RTNL();
6693
6694         if (list_empty(&dev->adj_list.upper))
6695                 return NULL;
6696
6697         upper = list_first_entry(&dev->adj_list.upper,
6698                                  struct netdev_adjacent, list);
6699         if (likely(upper->master))
6700                 return upper->dev;
6701         return NULL;
6702 }
6703 EXPORT_SYMBOL(netdev_master_upper_dev_get);
6704
6705 /**
6706  * netdev_has_any_lower_dev - Check if device is linked to some device
6707  * @dev: device
6708  *
6709  * Find out if a device is linked to a lower device and return true in case
6710  * it is. The caller must hold the RTNL lock.
6711  */
6712 static bool netdev_has_any_lower_dev(struct net_device *dev)
6713 {
6714         ASSERT_RTNL();
6715
6716         return !list_empty(&dev->adj_list.lower);
6717 }
6718
6719 void *netdev_adjacent_get_private(struct list_head *adj_list)
6720 {
6721         struct netdev_adjacent *adj;
6722
6723         adj = list_entry(adj_list, struct netdev_adjacent, list);
6724
6725         return adj->private;
6726 }
6727 EXPORT_SYMBOL(netdev_adjacent_get_private);
6728
6729 /**
6730  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6731  * @dev: device
6732  * @iter: list_head ** of the current position
6733  *
6734  * Gets the next device from the dev's upper list, starting from iter
6735  * position. The caller must hold RCU read lock.
6736  */
6737 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6738                                                  struct list_head **iter)
6739 {
6740         struct netdev_adjacent *upper;
6741
6742         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6743
6744         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6745
6746         if (&upper->list == &dev->adj_list.upper)
6747                 return NULL;
6748
6749         *iter = &upper->list;
6750
6751         return upper->dev;
6752 }
6753 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6754
6755 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6756                                                     struct list_head **iter)
6757 {
6758         struct netdev_adjacent *upper;
6759
6760         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6761
6762         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6763
6764         if (&upper->list == &dev->adj_list.upper)
6765                 return NULL;
6766
6767         *iter = &upper->list;
6768
6769         return upper->dev;
6770 }
6771
6772 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6773                                   int (*fn)(struct net_device *dev,
6774                                             void *data),
6775                                   void *data)
6776 {
6777         struct net_device *udev;
6778         struct list_head *iter;
6779         int ret;
6780
6781         for (iter = &dev->adj_list.upper,
6782              udev = netdev_next_upper_dev_rcu(dev, &iter);
6783              udev;
6784              udev = netdev_next_upper_dev_rcu(dev, &iter)) {
6785                 /* first is the upper device itself */
6786                 ret = fn(udev, data);
6787                 if (ret)
6788                         return ret;
6789
6790                 /* then look at all of its upper devices */
6791                 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
6792                 if (ret)
6793                         return ret;
6794         }
6795
6796         return 0;
6797 }
6798 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
6799
6800 /**
6801  * netdev_lower_get_next_private - Get the next ->private from the
6802  *                                 lower neighbour list
6803  * @dev: device
6804  * @iter: list_head ** of the current position
6805  *
6806  * Gets the next netdev_adjacent->private from the dev's lower neighbour
6807  * list, starting from iter position. The caller must hold either hold the
6808  * RTNL lock or its own locking that guarantees that the neighbour lower
6809  * list will remain unchanged.
6810  */
6811 void *netdev_lower_get_next_private(struct net_device *dev,
6812                                     struct list_head **iter)
6813 {
6814         struct netdev_adjacent *lower;
6815
6816         lower = list_entry(*iter, struct netdev_adjacent, list);
6817
6818         if (&lower->list == &dev->adj_list.lower)
6819                 return NULL;
6820
6821         *iter = lower->list.next;
6822
6823         return lower->private;
6824 }
6825 EXPORT_SYMBOL(netdev_lower_get_next_private);
6826
6827 /**
6828  * netdev_lower_get_next_private_rcu - Get the next ->private from the
6829  *                                     lower neighbour list, RCU
6830  *                                     variant
6831  * @dev: device
6832  * @iter: list_head ** of the current position
6833  *
6834  * Gets the next netdev_adjacent->private from the dev's lower neighbour
6835  * list, starting from iter position. The caller must hold RCU read lock.
6836  */
6837 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6838                                         struct list_head **iter)
6839 {
6840         struct netdev_adjacent *lower;
6841
6842         WARN_ON_ONCE(!rcu_read_lock_held());
6843
6844         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6845
6846         if (&lower->list == &dev->adj_list.lower)
6847                 return NULL;
6848
6849         *iter = &lower->list;
6850
6851         return lower->private;
6852 }
6853 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6854
6855 /**
6856  * netdev_lower_get_next - Get the next device from the lower neighbour
6857  *                         list
6858  * @dev: device
6859  * @iter: list_head ** of the current position
6860  *
6861  * Gets the next netdev_adjacent from the dev's lower neighbour
6862  * list, starting from iter position. The caller must hold RTNL lock or
6863  * its own locking that guarantees that the neighbour lower
6864  * list will remain unchanged.
6865  */
6866 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6867 {
6868         struct netdev_adjacent *lower;
6869
6870         lower = list_entry(*iter, struct netdev_adjacent, list);
6871
6872         if (&lower->list == &dev->adj_list.lower)
6873                 return NULL;
6874
6875         *iter = lower->list.next;
6876
6877         return lower->dev;
6878 }
6879 EXPORT_SYMBOL(netdev_lower_get_next);
6880
6881 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
6882                                                 struct list_head **iter)
6883 {
6884         struct netdev_adjacent *lower;
6885
6886         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6887
6888         if (&lower->list == &dev->adj_list.lower)
6889                 return NULL;
6890
6891         *iter = &lower->list;
6892
6893         return lower->dev;
6894 }
6895
6896 int netdev_walk_all_lower_dev(struct net_device *dev,
6897                               int (*fn)(struct net_device *dev,
6898                                         void *data),
6899                               void *data)
6900 {
6901         struct net_device *ldev;
6902         struct list_head *iter;
6903         int ret;
6904
6905         for (iter = &dev->adj_list.lower,
6906              ldev = netdev_next_lower_dev(dev, &iter);
6907              ldev;
6908              ldev = netdev_next_lower_dev(dev, &iter)) {
6909                 /* first is the lower device itself */
6910                 ret = fn(ldev, data);
6911                 if (ret)
6912                         return ret;
6913
6914                 /* then look at all of its lower devices */
6915                 ret = netdev_walk_all_lower_dev(ldev, fn, data);
6916                 if (ret)
6917                         return ret;
6918         }
6919
6920         return 0;
6921 }
6922 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
6923
6924 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6925                                                     struct list_head **iter)
6926 {
6927         struct netdev_adjacent *lower;
6928
6929         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6930         if (&lower->list == &dev->adj_list.lower)
6931                 return NULL;
6932
6933         *iter = &lower->list;
6934
6935         return lower->dev;
6936 }
6937
6938 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
6939                                   int (*fn)(struct net_device *dev,
6940                                             void *data),
6941                                   void *data)
6942 {
6943         struct net_device *ldev;
6944         struct list_head *iter;
6945         int ret;
6946
6947         for (iter = &dev->adj_list.lower,
6948              ldev = netdev_next_lower_dev_rcu(dev, &iter);
6949              ldev;
6950              ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
6951                 /* first is the lower device itself */
6952                 ret = fn(ldev, data);
6953                 if (ret)
6954                         return ret;
6955
6956                 /* then look at all of its lower devices */
6957                 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
6958                 if (ret)
6959                         return ret;
6960         }
6961
6962         return 0;
6963 }
6964 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
6965
6966 /**
6967  * netdev_lower_get_first_private_rcu - Get the first ->private from the
6968  *                                     lower neighbour list, RCU
6969  *                                     variant
6970  * @dev: device
6971  *
6972  * Gets the first netdev_adjacent->private from the dev's lower neighbour
6973  * list. The caller must hold RCU read lock.
6974  */
6975 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
6976 {
6977         struct netdev_adjacent *lower;
6978
6979         lower = list_first_or_null_rcu(&dev->adj_list.lower,
6980                         struct netdev_adjacent, list);
6981         if (lower)
6982                 return lower->private;
6983         return NULL;
6984 }
6985 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
6986
6987 /**
6988  * netdev_master_upper_dev_get_rcu - Get master upper device
6989  * @dev: device
6990  *
6991  * Find a master upper device and return pointer to it or NULL in case
6992  * it's not there. The caller must hold the RCU read lock.
6993  */
6994 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
6995 {
6996         struct netdev_adjacent *upper;
6997
6998         upper = list_first_or_null_rcu(&dev->adj_list.upper,
6999                                        struct netdev_adjacent, list);
7000         if (upper && likely(upper->master))
7001                 return upper->dev;
7002         return NULL;
7003 }
7004 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7005
7006 static int netdev_adjacent_sysfs_add(struct net_device *dev,
7007                               struct net_device *adj_dev,
7008                               struct list_head *dev_list)
7009 {
7010         char linkname[IFNAMSIZ+7];
7011
7012         sprintf(linkname, dev_list == &dev->adj_list.upper ?
7013                 "upper_%s" : "lower_%s", adj_dev->name);
7014         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7015                                  linkname);
7016 }
7017 static void netdev_adjacent_sysfs_del(struct net_device *dev,
7018                                char *name,
7019                                struct list_head *dev_list)
7020 {
7021         char linkname[IFNAMSIZ+7];
7022
7023         sprintf(linkname, dev_list == &dev->adj_list.upper ?
7024                 "upper_%s" : "lower_%s", name);
7025         sysfs_remove_link(&(dev->dev.kobj), linkname);
7026 }
7027
7028 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7029                                                  struct net_device *adj_dev,
7030                                                  struct list_head *dev_list)
7031 {
7032         return (dev_list == &dev->adj_list.upper ||
7033                 dev_list == &dev->adj_list.lower) &&
7034                 net_eq(dev_net(dev), dev_net(adj_dev));
7035 }
7036
7037 static int __netdev_adjacent_dev_insert(struct net_device *dev,
7038                                         struct net_device *adj_dev,
7039                                         struct list_head *dev_list,
7040                                         void *private, bool master)
7041 {
7042         struct netdev_adjacent *adj;
7043         int ret;
7044
7045         adj = __netdev_find_adj(adj_dev, dev_list);
7046
7047         if (adj) {
7048                 adj->ref_nr += 1;
7049                 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7050                          dev->name, adj_dev->name, adj->ref_nr);
7051
7052                 return 0;
7053         }
7054
7055         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7056         if (!adj)
7057                 return -ENOMEM;
7058
7059         adj->dev = adj_dev;
7060         adj->master = master;
7061         adj->ref_nr = 1;
7062         adj->private = private;
7063         dev_hold(adj_dev);
7064
7065         pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7066                  dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7067
7068         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7069                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7070                 if (ret)
7071                         goto free_adj;
7072         }
7073
7074         /* Ensure that master link is always the first item in list. */
7075         if (master) {
7076                 ret = sysfs_create_link(&(dev->dev.kobj),
7077                                         &(adj_dev->dev.kobj), "master");
7078                 if (ret)
7079                         goto remove_symlinks;
7080
7081                 list_add_rcu(&adj->list, dev_list);
7082         } else {
7083                 list_add_tail_rcu(&adj->list, dev_list);
7084         }
7085
7086         return 0;
7087
7088 remove_symlinks:
7089         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7090                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7091 free_adj:
7092         kfree(adj);
7093         dev_put(adj_dev);
7094
7095         return ret;
7096 }
7097
7098 static void __netdev_adjacent_dev_remove(struct net_device *dev,
7099                                          struct net_device *adj_dev,
7100                                          u16 ref_nr,
7101                                          struct list_head *dev_list)
7102 {
7103         struct netdev_adjacent *adj;
7104
7105         pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7106                  dev->name, adj_dev->name, ref_nr);
7107
7108         adj = __netdev_find_adj(adj_dev, dev_list);
7109
7110         if (!adj) {
7111                 pr_err("Adjacency does not exist for device %s from %s\n",
7112                        dev->name, adj_dev->name);
7113                 WARN_ON(1);
7114                 return;
7115         }
7116
7117         if (adj->ref_nr > ref_nr) {
7118                 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7119                          dev->name, adj_dev->name, ref_nr,
7120                          adj->ref_nr - ref_nr);
7121                 adj->ref_nr -= ref_nr;
7122                 return;
7123         }
7124
7125         if (adj->master)
7126                 sysfs_remove_link(&(dev->dev.kobj), "master");
7127
7128         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7129                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7130
7131         list_del_rcu(&adj->list);
7132         pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7133                  adj_dev->name, dev->name, adj_dev->name);
7134         dev_put(adj_dev);
7135         kfree_rcu(adj, rcu);
7136 }
7137
7138 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7139                                             struct net_device *upper_dev,
7140                                             struct list_head *up_list,
7141                                             struct list_head *down_list,
7142                                             void *private, bool master)
7143 {
7144         int ret;
7145
7146         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7147                                            private, master);
7148         if (ret)
7149                 return ret;
7150
7151         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7152                                            private, false);
7153         if (ret) {
7154                 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7155                 return ret;
7156         }
7157
7158         return 0;
7159 }
7160
/*
 * Drop ref_nr references from the adjacency in both directions: upper_dev
 * on up_list (as seen from dev) and dev on down_list (as seen from
 * upper_dev).
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               u16 ref_nr,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
7170
/*
 * Link dev and upper_dev as direct neighbours: upper_dev goes onto dev's
 * upper list and dev onto upper_dev's lower list. Returns 0 or the error
 * from __netdev_adjacent_dev_link_lists().
 */
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
                                                struct net_device *upper_dev,
                                                void *private, bool master)
{
        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
                                                &dev->adj_list.upper,
                                                &upper_dev->adj_list.lower,
                                                private, master);
}
7180
/*
 * Drop one reference from the direct-neighbour adjacency between dev and
 * upper_dev, on dev's upper list and on upper_dev's lower list.
 */
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
                                                   struct net_device *upper_dev)
{
        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
                                           &dev->adj_list.upper,
                                           &upper_dev->adj_list.lower);
}
7188
7189 static int __netdev_upper_dev_link(struct net_device *dev,
7190                                    struct net_device *upper_dev, bool master,
7191                                    void *upper_priv, void *upper_info,
7192                                    struct netlink_ext_ack *extack)
7193 {
7194         struct netdev_notifier_changeupper_info changeupper_info = {
7195                 .info = {
7196                         .dev = dev,
7197                         .extack = extack,
7198                 },
7199                 .upper_dev = upper_dev,
7200                 .master = master,
7201                 .linking = true,
7202                 .upper_info = upper_info,
7203         };
7204         struct net_device *master_dev;
7205         int ret = 0;
7206
7207         ASSERT_RTNL();
7208
7209         if (dev == upper_dev)
7210                 return -EBUSY;
7211
7212         /* To prevent loops, check if dev is not upper device to upper_dev. */
7213         if (netdev_has_upper_dev(upper_dev, dev))
7214                 return -EBUSY;
7215
7216         if (!master) {
7217                 if (netdev_has_upper_dev(dev, upper_dev))
7218                         return -EEXIST;
7219         } else {
7220                 master_dev = netdev_master_upper_dev_get(dev);
7221                 if (master_dev)
7222                         return master_dev == upper_dev ? -EEXIST : -EBUSY;
7223         }
7224
7225         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7226                                             &changeupper_info.info);
7227         ret = notifier_to_errno(ret);
7228         if (ret)
7229                 return ret;
7230
7231         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7232                                                    master);
7233         if (ret)
7234                 return ret;
7235
7236         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7237                                             &changeupper_info.info);
7238         ret = notifier_to_errno(ret);
7239         if (ret)
7240                 goto rollback;
7241
7242         return 0;
7243
7244 rollback:
7245         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7246
7247         return ret;
7248 }
7249
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @extack: netlink extended ack
 *
 * Adds a non-master link to a device which is upper to this one, with no
 * upper private data and no upper info. The caller must hold the RTNL
 * lock. On a failure a negative errno code is returned. On success the
 * reference counts are adjusted and the function returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
                          struct net_device *upper_dev,
                          struct netlink_ext_ack *extack)
{
        return __netdev_upper_dev_link(dev, upper_dev, false,
                                       NULL, NULL, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
7269
/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 * @extack: netlink extended ack
 *
 * Adds a master link to a device which is upper to this one. Only one
 * master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack)
{
        return __netdev_upper_dev_link(dev, upper_dev, true,
                                       upper_priv, upper_info, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
7293
7294 /**
7295  * netdev_upper_dev_unlink - Removes a link to upper device
7296  * @dev: device
7297  * @upper_dev: new upper device
7298  *
7299  * Removes a link to device which is upper to this one. The caller must hold
7300  * the RTNL lock.
7301  */
7302 void netdev_upper_dev_unlink(struct net_device *dev,
7303                              struct net_device *upper_dev)
7304 {
7305         struct netdev_notifier_changeupper_info changeupper_info = {
7306                 .info = {
7307                         .dev = dev,
7308                 },
7309                 .upper_dev = upper_dev,
7310                 .linking = false,
7311         };
7312
7313         ASSERT_RTNL();
7314
7315         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7316
7317         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7318                                       &changeupper_info.info);
7319
7320         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7321
7322         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7323                                       &changeupper_info.info);
7324 }
7325 EXPORT_SYMBOL(netdev_upper_dev_unlink);
7326
7327 /**
7328  * netdev_bonding_info_change - Dispatch event about slave change
7329  * @dev: device
7330  * @bonding_info: info to dispatch
7331  *
7332  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7333  * The caller must hold the RTNL lock.
7334  */
7335 void netdev_bonding_info_change(struct net_device *dev,
7336                                 struct netdev_bonding_info *bonding_info)
7337 {
7338         struct netdev_notifier_bonding_info info = {
7339                 .info.dev = dev,
7340         };
7341
7342         memcpy(&info.bonding_info, bonding_info,
7343                sizeof(struct netdev_bonding_info));
7344         call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7345                                       &info.info);
7346 }
7347 EXPORT_SYMBOL(netdev_bonding_info_change);
7348
7349 static void netdev_adjacent_add_links(struct net_device *dev)
7350 {
7351         struct netdev_adjacent *iter;
7352
7353         struct net *net = dev_net(dev);
7354
7355         list_for_each_entry(iter, &dev->adj_list.upper, list) {
7356                 if (!net_eq(net, dev_net(iter->dev)))
7357                         continue;
7358                 netdev_adjacent_sysfs_add(iter->dev, dev,
7359                                           &iter->dev->adj_list.lower);
7360                 netdev_adjacent_sysfs_add(dev, iter->dev,
7361                                           &dev->adj_list.upper);
7362         }
7363
7364         list_for_each_entry(iter, &dev->adj_list.lower, list) {
7365                 if (!net_eq(net, dev_net(iter->dev)))
7366                         continue;
7367                 netdev_adjacent_sysfs_add(iter->dev, dev,
7368                                           &iter->dev->adj_list.upper);
7369                 netdev_adjacent_sysfs_add(dev, iter->dev,
7370                                           &dev->adj_list.lower);
7371         }
7372 }
7373
7374 static void netdev_adjacent_del_links(struct net_device *dev)
7375 {
7376         struct netdev_adjacent *iter;
7377
7378         struct net *net = dev_net(dev);
7379
7380         list_for_each_entry(iter, &dev->adj_list.upper, list) {
7381                 if (!net_eq(net, dev_net(iter->dev)))
7382                         continue;
7383                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
7384                                           &iter->dev->adj_list.lower);
7385                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
7386                                           &dev->adj_list.upper);
7387         }
7388
7389         list_for_each_entry(iter, &dev->adj_list.lower, list) {
7390                 if (!net_eq(net, dev_net(iter->dev)))
7391                         continue;
7392                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
7393                                           &iter->dev->adj_list.upper);
7394                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
7395                                           &dev->adj_list.lower);
7396         }
7397 }
7398
7399 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
7400 {
7401         struct netdev_adjacent *iter;
7402
7403         struct net *net = dev_net(dev);
7404
7405         list_for_each_entry(iter, &dev->adj_list.upper, list) {
7406                 if (!net_eq(net, dev_net(iter->dev)))
7407                         continue;
7408                 netdev_adjacent_sysfs_del(iter->dev, oldname,
7409                                           &iter->dev->adj_list.lower);
7410                 netdev_adjacent_sysfs_add(iter->dev, dev,
7411                                           &iter->dev->adj_list.lower);
7412         }
7413
7414         list_for_each_entry(iter, &dev->adj_list.lower, list) {
7415                 if (!net_eq(net, dev_net(iter->dev)))
7416                         continue;
7417                 netdev_adjacent_sysfs_del(iter->dev, oldname,
7418                                           &iter->dev->adj_list.upper);
7419                 netdev_adjacent_sysfs_add(iter->dev, dev,
7420                                           &iter->dev->adj_list.upper);
7421         }
7422 }
7423
7424 void *netdev_lower_dev_get_private(struct net_device *dev,
7425                                    struct net_device *lower_dev)
7426 {
7427         struct netdev_adjacent *lower;
7428
7429         if (!lower_dev)
7430                 return NULL;
7431         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
7432         if (!lower)
7433                 return NULL;
7434
7435         return lower->private;
7436 }
7437 EXPORT_SYMBOL(netdev_lower_dev_get_private);
7438
7439
7440 int dev_get_nest_level(struct net_device *dev)
7441 {
7442         struct net_device *lower = NULL;
7443         struct list_head *iter;
7444         int max_nest = -1;
7445         int nest;
7446
7447         ASSERT_RTNL();
7448
7449         netdev_for_each_lower_dev(dev, lower, iter) {
7450                 nest = dev_get_nest_level(lower);
7451                 if (max_nest < nest)
7452                         max_nest = nest;
7453         }
7454
7455         return max_nest + 1;
7456 }
7457 EXPORT_SYMBOL(dev_get_nest_level);
7458
7459 /**
7460  * netdev_lower_change - Dispatch event about lower device state change
7461  * @lower_dev: device
7462  * @lower_state_info: state to dispatch
7463  *
7464  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
7465  * The caller must hold the RTNL lock.
7466  */
7467 void netdev_lower_state_changed(struct net_device *lower_dev,
7468                                 void *lower_state_info)
7469 {
7470         struct netdev_notifier_changelowerstate_info changelowerstate_info = {
7471                 .info.dev = lower_dev,
7472         };
7473
7474         ASSERT_RTNL();
7475         changelowerstate_info.lower_state_info = lower_state_info;
7476         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
7477                                       &changelowerstate_info.info);
7478 }
7479 EXPORT_SYMBOL(netdev_lower_state_changed);
7480
7481 static void dev_change_rx_flags(struct net_device *dev, int flags)
7482 {
7483         const struct net_device_ops *ops = dev->netdev_ops;
7484
7485         if (ops->ndo_change_rx_flags)
7486                 ops->ndo_change_rx_flags(dev, flags);
7487 }
7488
7489 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
7490 {
7491         unsigned int old_flags = dev->flags;
7492         kuid_t uid;
7493         kgid_t gid;
7494
7495         ASSERT_RTNL();
7496
7497         dev->flags |= IFF_PROMISC;
7498         dev->promiscuity += inc;
7499         if (dev->promiscuity == 0) {
7500                 /*
7501                  * Avoid overflow.
7502                  * If inc causes overflow, untouch promisc and return error.
7503                  */
7504                 if (inc < 0)
7505                         dev->flags &= ~IFF_PROMISC;
7506                 else {
7507                         dev->promiscuity -= inc;
7508                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
7509                                 dev->name);
7510                         return -EOVERFLOW;
7511                 }
7512         }
7513         if (dev->flags != old_flags) {
7514                 pr_info("device %s %s promiscuous mode\n",
7515                         dev->name,
7516                         dev->flags & IFF_PROMISC ? "entered" : "left");
7517                 if (audit_enabled) {
7518                         current_uid_gid(&uid, &gid);
7519                         audit_log(audit_context(), GFP_ATOMIC,
7520                                   AUDIT_ANOM_PROMISCUOUS,
7521                                   "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
7522                                   dev->name, (dev->flags & IFF_PROMISC),
7523                                   (old_flags & IFF_PROMISC),
7524                                   from_kuid(&init_user_ns, audit_get_loginuid(current)),
7525                                   from_kuid(&init_user_ns, uid),
7526                                   from_kgid(&init_user_ns, gid),
7527                                   audit_get_sessionid(current));
7528                 }
7529
7530                 dev_change_rx_flags(dev, IFF_PROMISC);
7531         }
7532         if (notify)
7533                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
7534         return 0;
7535 }
7536
7537 /**
7538  *      dev_set_promiscuity     - update promiscuity count on a device
7539  *      @dev: device
7540  *      @inc: modifier
7541  *
7542  *      Add or remove promiscuity from a device. While the count in the device
7543  *      remains above zero the interface remains promiscuous. Once it hits zero
7544  *      the device reverts back to normal filtering operation. A negative inc
7545  *      value is used to drop promiscuity on the device.
7546  *      Return 0 if successful or a negative errno code on error.
7547  */
7548 int dev_set_promiscuity(struct net_device *dev, int inc)
7549 {
7550         unsigned int old_flags = dev->flags;
7551         int err;
7552
7553         err = __dev_set_promiscuity(dev, inc, true);
7554         if (err < 0)
7555                 return err;
7556         if (dev->flags != old_flags)
7557                 dev_set_rx_mode(dev);
7558         return err;
7559 }
7560 EXPORT_SYMBOL(dev_set_promiscuity);
7561
7562 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
7563 {
7564         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
7565
7566         ASSERT_RTNL();
7567
7568         dev->flags |= IFF_ALLMULTI;
7569         dev->allmulti += inc;
7570         if (dev->allmulti == 0) {
7571                 /*
7572                  * Avoid overflow.
7573                  * If inc causes overflow, untouch allmulti and return error.
7574                  */
7575                 if (inc < 0)
7576                         dev->flags &= ~IFF_ALLMULTI;
7577                 else {
7578                         dev->allmulti -= inc;
7579                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
7580                                 dev->name);
7581                         return -EOVERFLOW;
7582                 }
7583         }
7584         if (dev->flags ^ old_flags) {
7585                 dev_change_rx_flags(dev, IFF_ALLMULTI);
7586                 dev_set_rx_mode(dev);
7587                 if (notify)
7588                         __dev_notify_flags(dev, old_flags,
7589                                            dev->gflags ^ old_gflags);
7590         }
7591         return 0;
7592 }
7593
/**
 *      dev_set_allmulti        - update allmulti count on a device
 *      @dev: device
 *      @inc: modifier
 *
 *      Add or remove reception of all multicast frames to a device. While the
 *      count in the device remains above zero the interface keeps receiving
 *      all multicast frames. Once it hits zero the device reverts back to
 *      normal filtering operation. A negative @inc value is used to drop the
 *      counter when releasing a resource needing all multicasts.
 *      Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
        return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
7612
7613 /*
7614  *      Upload unicast and multicast address lists to device and
7615  *      configure RX filtering. When the device doesn't support unicast
7616  *      filtering it is put in promiscuous mode while unicast addresses
7617  *      are present.
7618  */
7619 void __dev_set_rx_mode(struct net_device *dev)
7620 {
7621         const struct net_device_ops *ops = dev->netdev_ops;
7622
7623         /* dev_open will call this function so the list will stay sane. */
7624         if (!(dev->flags&IFF_UP))
7625                 return;
7626
7627         if (!netif_device_present(dev))
7628                 return;
7629
7630         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
7631                 /* Unicast addresses changes may only happen under the rtnl,
7632                  * therefore calling __dev_set_promiscuity here is safe.
7633                  */
7634                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
7635                         __dev_set_promiscuity(dev, 1, false);
7636                         dev->uc_promisc = true;
7637                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
7638                         __dev_set_promiscuity(dev, -1, false);
7639                         dev->uc_promisc = false;
7640                 }
7641         }
7642
7643         if (ops->ndo_set_rx_mode)
7644                 ops->ndo_set_rx_mode(dev);
7645 }
7646
/*
 *      Locked wrapper around __dev_set_rx_mode(): the RX filter update
 *      runs between netif_addr_lock_bh() and netif_addr_unlock_bh() on
 *      @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
7653
7654 /**
7655  *      dev_get_flags - get flags reported to userspace
7656  *      @dev: device
7657  *
7658  *      Get the combination of flag bits exported through APIs to userspace.
7659  */
7660 unsigned int dev_get_flags(const struct net_device *dev)
7661 {
7662         unsigned int flags;
7663
7664         flags = (dev->flags & ~(IFF_PROMISC |
7665                                 IFF_ALLMULTI |
7666                                 IFF_RUNNING |
7667                                 IFF_LOWER_UP |
7668                                 IFF_DORMANT)) |
7669                 (dev->gflags & (IFF_PROMISC |
7670                                 IFF_ALLMULTI));
7671
7672         if (netif_running(dev)) {
7673                 if (netif_oper_up(dev))
7674                         flags |= IFF_RUNNING;
7675                 if (netif_carrier_ok(dev))
7676                         flags |= IFF_LOWER_UP;
7677                 if (netif_dormant(dev))
7678                         flags |= IFF_DORMANT;
7679         }
7680
7681         return flags;
7682 }
7683 EXPORT_SYMBOL(dev_get_flags);
7684
7685 int __dev_change_flags(struct net_device *dev, unsigned int flags,
7686                        struct netlink_ext_ack *extack)
7687 {
7688         unsigned int old_flags = dev->flags;
7689         int ret;
7690
7691         ASSERT_RTNL();
7692
7693         /*
7694          *      Set the flags on our device.
7695          */
7696
7697         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
7698                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
7699                                IFF_AUTOMEDIA)) |
7700                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
7701                                     IFF_ALLMULTI));
7702
7703         /*
7704          *      Load in the correct multicast list now the flags have changed.
7705          */
7706
7707         if ((old_flags ^ flags) & IFF_MULTICAST)
7708                 dev_change_rx_flags(dev, IFF_MULTICAST);
7709
7710         dev_set_rx_mode(dev);
7711
7712         /*
7713          *      Have we downed the interface. We handle IFF_UP ourselves
7714          *      according to user attempts to set it, rather than blindly
7715          *      setting it.
7716          */
7717
7718         ret = 0;
7719         if ((old_flags ^ flags) & IFF_UP) {
7720                 if (old_flags & IFF_UP)
7721                         __dev_close(dev);
7722                 else
7723                         ret = __dev_open(dev, extack);
7724         }
7725
7726         if ((flags ^ dev->gflags) & IFF_PROMISC) {
7727                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
7728                 unsigned int old_flags = dev->flags;
7729
7730                 dev->gflags ^= IFF_PROMISC;
7731
7732                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
7733                         if (dev->flags != old_flags)
7734                                 dev_set_rx_mode(dev);
7735         }
7736
7737         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
7738          * is important. Some (broken) drivers set IFF_PROMISC, when
7739          * IFF_ALLMULTI is requested not asking us and not reporting.
7740          */
7741         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
7742                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
7743
7744                 dev->gflags ^= IFF_ALLMULTI;
7745                 __dev_set_allmulti(dev, inc, false);
7746         }
7747
7748         return ret;
7749 }
7750
7751 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
7752                         unsigned int gchanges)
7753 {
7754         unsigned int changes = dev->flags ^ old_flags;
7755
7756         if (gchanges)
7757                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
7758
7759         if (changes & IFF_UP) {
7760                 if (dev->flags & IFF_UP)
7761                         call_netdevice_notifiers(NETDEV_UP, dev);
7762                 else
7763                         call_netdevice_notifiers(NETDEV_DOWN, dev);
7764         }
7765
7766         if (dev->flags & IFF_UP &&
7767             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
7768                 struct netdev_notifier_change_info change_info = {
7769                         .info = {
7770                                 .dev = dev,
7771                         },
7772                         .flags_changed = changes,
7773                 };
7774
7775                 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
7776         }
7777 }
7778
7779 /**
7780  *      dev_change_flags - change device settings
7781  *      @dev: device
7782  *      @flags: device state flags
7783  *      @extack: netlink extended ack
7784  *
7785  *      Change settings on device based state flags. The flags are
7786  *      in the userspace exported format.
7787  */
7788 int dev_change_flags(struct net_device *dev, unsigned int flags,
7789                      struct netlink_ext_ack *extack)
7790 {
7791         int ret;
7792         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
7793
7794         ret = __dev_change_flags(dev, flags, extack);
7795         if (ret < 0)
7796                 return ret;
7797
7798         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
7799         __dev_notify_flags(dev, old_flags, changes);
7800         return ret;
7801 }
7802 EXPORT_SYMBOL(dev_change_flags);
7803
7804 int __dev_set_mtu(struct net_device *dev, int new_mtu)
7805 {
7806         const struct net_device_ops *ops = dev->netdev_ops;
7807
7808         if (ops->ndo_change_mtu)
7809                 return ops->ndo_change_mtu(dev, new_mtu);
7810
7811         dev->mtu = new_mtu;
7812         return 0;
7813 }
7814 EXPORT_SYMBOL(__dev_set_mtu);
7815
7816 /**
7817  *      dev_set_mtu_ext - Change maximum transfer unit
7818  *      @dev: device
7819  *      @new_mtu: new transfer unit
7820  *      @extack: netlink extended ack
7821  *
7822  *      Change the maximum transfer size of the network device.
7823  */
7824 int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
7825                     struct netlink_ext_ack *extack)
7826 {
7827         int err, orig_mtu;
7828
7829         if (new_mtu == dev->mtu)
7830                 return 0;
7831
7832         /* MTU must be positive, and in range */
7833         if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7834                 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
7835                 return -EINVAL;
7836         }
7837
7838         if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7839                 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
7840                 return -EINVAL;
7841         }
7842
7843         if (!netif_device_present(dev))
7844                 return -ENODEV;
7845
7846         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
7847         err = notifier_to_errno(err);
7848         if (err)
7849                 return err;
7850
7851         orig_mtu = dev->mtu;
7852         err = __dev_set_mtu(dev, new_mtu);
7853
7854         if (!err) {
7855                 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
7856                                                    orig_mtu);
7857                 err = notifier_to_errno(err);
7858                 if (err) {
7859                         /* setting mtu back and notifying everyone again,
7860                          * so that they have a chance to revert changes.
7861                          */
7862                         __dev_set_mtu(dev, orig_mtu);
7863                         call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
7864                                                      new_mtu);
7865                 }
7866         }
7867         return err;
7868 }
7869
7870 int dev_set_mtu(struct net_device *dev, int new_mtu)
7871 {
7872         struct netlink_ext_ack extack;
7873         int err;
7874
7875         memset(&extack, 0, sizeof(extack));
7876         err = dev_set_mtu_ext(dev, new_mtu, &extack);
7877         if (err && extack._msg)
7878                 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
7879         return err;
7880 }
7881 EXPORT_SYMBOL(dev_set_mtu);
7882
7883 /**
7884  *      dev_change_tx_queue_len - Change TX queue length of a netdevice
7885  *      @dev: device
7886  *      @new_len: new tx queue length
7887  */
7888 int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
7889 {
7890         unsigned int orig_len = dev->tx_queue_len;
7891         int res;
7892
7893         if (new_len != (unsigned int)new_len)
7894                 return -ERANGE;
7895
7896         if (new_len != orig_len) {
7897                 dev->tx_queue_len = new_len;
7898                 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
7899                 res = notifier_to_errno(res);
7900                 if (res)
7901                         goto err_rollback;
7902                 res = dev_qdisc_change_tx_queue_len(dev);
7903                 if (res)
7904                         goto err_rollback;
7905         }
7906
7907         return 0;
7908
7909 err_rollback:
7910         netdev_err(dev, "refused to change device tx_queue_len\n");
7911         dev->tx_queue_len = orig_len;
7912         return res;
7913 }
7914
/**
 *      dev_set_group - Change group this device belongs to
 *      @dev: device
 *      @new_group: group this device should belong to
 *
 *      Plain store of @new_group into dev->group; this function itself
 *      performs no validation and sends no notification.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
        dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
7925
7926 /**
7927  *      dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
7928  *      @dev: device
7929  *      @addr: new address
7930  *      @extack: netlink extended ack
7931  */
7932 int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
7933                               struct netlink_ext_ack *extack)
7934 {
7935         struct netdev_notifier_pre_changeaddr_info info = {
7936                 .info.dev = dev,
7937                 .info.extack = extack,
7938                 .dev_addr = addr,
7939         };
7940         int rc;
7941
7942         rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
7943         return notifier_to_errno(rc);
7944 }
7945 EXPORT_SYMBOL(dev_pre_changeaddr_notify);
7946
7947 /**
7948  *      dev_set_mac_address - Change Media Access Control Address
7949  *      @dev: device
7950  *      @sa: new address
7951  *      @extack: netlink extended ack
7952  *
7953  *      Change the hardware (MAC) address of the device
7954  */
7955 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
7956                         struct netlink_ext_ack *extack)
7957 {
7958         const struct net_device_ops *ops = dev->netdev_ops;
7959         int err;
7960
7961         if (!ops->ndo_set_mac_address)
7962                 return -EOPNOTSUPP;
7963         if (sa->sa_family != dev->type)
7964                 return -EINVAL;
7965         if (!netif_device_present(dev))
7966                 return -ENODEV;
7967         err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
7968         if (err)
7969                 return err;
7970         err = ops->ndo_set_mac_address(dev, sa);
7971         if (err)
7972                 return err;
7973         dev->addr_assign_type = NET_ADDR_SET;
7974         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7975         add_device_randomness(dev->dev_addr, dev->addr_len);
7976         return 0;
7977 }
7978 EXPORT_SYMBOL(dev_set_mac_address);
7979
7980 /**
7981  *      dev_change_carrier - Change device carrier
7982  *      @dev: device
7983  *      @new_carrier: new value
7984  *
7985  *      Change device carrier
7986  */
7987 int dev_change_carrier(struct net_device *dev, bool new_carrier)
7988 {
7989         const struct net_device_ops *ops = dev->netdev_ops;
7990
7991         if (!ops->ndo_change_carrier)
7992                 return -EOPNOTSUPP;
7993         if (!netif_device_present(dev))
7994                 return -ENODEV;
7995         return ops->ndo_change_carrier(dev, new_carrier);
7996 }
7997 EXPORT_SYMBOL(dev_change_carrier);
7998
7999 /**
8000  *      dev_get_phys_port_id - Get device physical port ID
8001  *      @dev: device
8002  *      @ppid: port ID
8003  *
8004  *      Get device physical port ID
8005  */
8006 int dev_get_phys_port_id(struct net_device *dev,
8007                          struct netdev_phys_item_id *ppid)
8008 {
8009         const struct net_device_ops *ops = dev->netdev_ops;
8010
8011         if (!ops->ndo_get_phys_port_id)
8012                 return -EOPNOTSUPP;
8013         return ops->ndo_get_phys_port_id(dev, ppid);
8014 }
8015 EXPORT_SYMBOL(dev_get_phys_port_id);
8016
8017 /**
8018  *      dev_get_phys_port_name - Get device physical port name
8019  *      @dev: device
8020  *      @name: port name
8021  *      @len: limit of bytes to copy to name
8022  *
8023  *      Get device physical port name
8024  */
8025 int dev_get_phys_port_name(struct net_device *dev,
8026                            char *name, size_t len)
8027 {
8028         const struct net_device_ops *ops = dev->netdev_ops;
8029         int err;
8030
8031         if (ops->ndo_get_phys_port_name) {
8032                 err = ops->ndo_get_phys_port_name(dev, name, len);
8033                 if (err != -EOPNOTSUPP)
8034                         return err;
8035         }
8036         return devlink_compat_phys_port_name_get(dev, name, len);
8037 }
8038 EXPORT_SYMBOL(dev_get_phys_port_name);
8039
8040 /**
8041  *      dev_get_port_parent_id - Get the device's port parent identifier
8042  *      @dev: network device
8043  *      @ppid: pointer to a storage for the port's parent identifier
8044  *      @recurse: allow/disallow recursion to lower devices
8045  *
8046  *      Get the devices's port parent identifier
8047  */
8048 int dev_get_port_parent_id(struct net_device *dev,
8049                            struct netdev_phys_item_id *ppid,
8050                            bool recurse)
8051 {
8052         const struct net_device_ops *ops = dev->netdev_ops;
8053         struct netdev_phys_item_id first = { };
8054         struct net_device *lower_dev;
8055         struct list_head *iter;
8056         int err;
8057
8058         if (ops->ndo_get_port_parent_id) {
8059                 err = ops->ndo_get_port_parent_id(dev, ppid);
8060                 if (err != -EOPNOTSUPP)
8061                         return err;
8062         }
8063
8064         err = devlink_compat_switch_id_get(dev, ppid);
8065         if (!err || err != -EOPNOTSUPP)
8066                 return err;
8067
8068         if (!recurse)
8069                 return -EOPNOTSUPP;
8070
8071         netdev_for_each_lower_dev(dev, lower_dev, iter) {
8072                 err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8073                 if (err)
8074                         break;
8075                 if (!first.id_len)
8076                         first = *ppid;
8077                 else if (memcmp(&first, ppid, sizeof(*ppid)))
8078                         return -ENODATA;
8079         }
8080
8081         return err;
8082 }
8083 EXPORT_SYMBOL(dev_get_port_parent_id);
8084
8085 /**
8086  *      netdev_port_same_parent_id - Indicate if two network devices have
8087  *      the same port parent identifier
8088  *      @a: first network device
8089  *      @b: second network device
8090  */
8091 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8092 {
8093         struct netdev_phys_item_id a_id = { };
8094         struct netdev_phys_item_id b_id = { };
8095
8096         if (dev_get_port_parent_id(a, &a_id, true) ||
8097             dev_get_port_parent_id(b, &b_id, true))
8098                 return false;
8099
8100         return netdev_phys_item_id_same(&a_id, &b_id);
8101 }
8102 EXPORT_SYMBOL(netdev_port_same_parent_id);
8103
8104 /**
8105  *      dev_change_proto_down - update protocol port state information
8106  *      @dev: device
8107  *      @proto_down: new value
8108  *
8109  *      This info can be used by switch drivers to set the phys state of the
8110  *      port.
8111  */
8112 int dev_change_proto_down(struct net_device *dev, bool proto_down)
8113 {
8114         const struct net_device_ops *ops = dev->netdev_ops;
8115
8116         if (!ops->ndo_change_proto_down)
8117                 return -EOPNOTSUPP;
8118         if (!netif_device_present(dev))
8119                 return -ENODEV;
8120         return ops->ndo_change_proto_down(dev, proto_down);
8121 }
8122 EXPORT_SYMBOL(dev_change_proto_down);
8123
8124 /**
8125  *      dev_change_proto_down_generic - generic implementation for
8126  *      ndo_change_proto_down that sets carrier according to
8127  *      proto_down.
8128  *
8129  *      @dev: device
8130  *      @proto_down: new value
8131  */
8132 int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
8133 {
8134         if (proto_down)
8135                 netif_carrier_off(dev);
8136         else
8137                 netif_carrier_on(dev);
8138         dev->proto_down = proto_down;
8139         return 0;
8140 }
8141 EXPORT_SYMBOL(dev_change_proto_down_generic);
8142
8143 u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
8144                     enum bpf_netdev_command cmd)
8145 {
8146         struct netdev_bpf xdp;
8147
8148         if (!bpf_op)
8149                 return 0;
8150
8151         memset(&xdp, 0, sizeof(xdp));
8152         xdp.command = cmd;
8153
8154         /* Query must always succeed. */
8155         WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
8156
8157         return xdp.prog_id;
8158 }
8159
8160 static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
8161                            struct netlink_ext_ack *extack, u32 flags,
8162                            struct bpf_prog *prog)
8163 {
8164         struct netdev_bpf xdp;
8165
8166         memset(&xdp, 0, sizeof(xdp));
8167         if (flags & XDP_FLAGS_HW_MODE)
8168                 xdp.command = XDP_SETUP_PROG_HW;
8169         else
8170                 xdp.command = XDP_SETUP_PROG;
8171         xdp.extack = extack;
8172         xdp.flags = flags;
8173         xdp.prog = prog;
8174
8175         return bpf_op(dev, &xdp);
8176 }
8177
8178 static void dev_xdp_uninstall(struct net_device *dev)
8179 {
8180         struct netdev_bpf xdp;
8181         bpf_op_t ndo_bpf;
8182
8183         /* Remove generic XDP */
8184         WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
8185
8186         /* Remove from the driver */
8187         ndo_bpf = dev->netdev_ops->ndo_bpf;
8188         if (!ndo_bpf)
8189                 return;
8190
8191         memset(&xdp, 0, sizeof(xdp));
8192         xdp.command = XDP_QUERY_PROG;
8193         WARN_ON(ndo_bpf(dev, &xdp));
8194         if (xdp.prog_id)
8195                 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8196                                         NULL));
8197
8198         /* Remove HW offload */
8199         memset(&xdp, 0, sizeof(xdp));
8200         xdp.command = XDP_QUERY_PROG_HW;
8201         if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
8202                 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8203                                         NULL));
8204 }
8205
8206 /**
8207  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
8208  *      @dev: device
8209  *      @extack: netlink extended ack
8210  *      @fd: new program fd or negative value to clear
8211  *      @flags: xdp-related flags
8212  *
8213  *      Set or clear a bpf program for a device
8214  */
8215 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
8216                       int fd, u32 flags)
8217 {
8218         const struct net_device_ops *ops = dev->netdev_ops;
8219         enum bpf_netdev_command query;
8220         struct bpf_prog *prog = NULL;
8221         bpf_op_t bpf_op, bpf_chk;
8222         bool offload;
8223         int err;
8224
8225         ASSERT_RTNL();
8226
8227         offload = flags & XDP_FLAGS_HW_MODE;
8228         query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
8229
8230         bpf_op = bpf_chk = ops->ndo_bpf;
8231         if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
8232                 NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
8233                 return -EOPNOTSUPP;
8234         }
8235         if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
8236                 bpf_op = generic_xdp_install;
8237         if (bpf_op == bpf_chk)
8238                 bpf_chk = generic_xdp_install;
8239
8240         if (fd >= 0) {
8241                 u32 prog_id;
8242
8243                 if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
8244                         NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
8245                         return -EEXIST;
8246                 }
8247
8248                 prog_id = __dev_xdp_query(dev, bpf_op, query);
8249                 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
8250                         NL_SET_ERR_MSG(extack, "XDP program already attached");
8251                         return -EBUSY;
8252                 }
8253
8254                 prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
8255                                              bpf_op == ops->ndo_bpf);
8256                 if (IS_ERR(prog))
8257                         return PTR_ERR(prog);
8258
8259                 if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
8260                         NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
8261                         bpf_prog_put(prog);
8262                         return -EINVAL;
8263                 }
8264
8265                 if (prog->aux->id == prog_id) {
8266                         bpf_prog_put(prog);
8267                         return 0;
8268                 }
8269         } else {
8270                 if (!__dev_xdp_query(dev, bpf_op, query))
8271                         return 0;
8272         }
8273
8274         err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
8275         if (err < 0 && prog)
8276                 bpf_prog_put(prog);
8277
8278         return err;
8279 }
8280
8281 /**
8282  *      dev_new_index   -       allocate an ifindex
8283  *      @net: the applicable net namespace
8284  *
8285  *      Returns a suitable unique value for a new device interface
8286  *      number.  The caller must hold the rtnl semaphore or the
8287  *      dev_base_lock to be sure it remains unique.
8288  */
8289 static int dev_new_index(struct net *net)
8290 {
8291         int ifindex = net->ifindex;
8292
8293         for (;;) {
8294                 if (++ifindex <= 0)
8295                         ifindex = 1;
8296                 if (!__dev_get_by_index(net, ifindex))
8297                         return net->ifindex = ifindex;
8298         }
8299 }
8300
/* Delayed registration/unregistration */
/* Devices queued by net_set_todo(); entries are appended at the tail. */
static LIST_HEAD(net_todo_list);
/* NOTE(review): presumably waited on until pending unregistrations have
 * drained; its users are outside this part of the file - confirm.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
8304
8305 static void net_set_todo(struct net_device *dev)
8306 {
8307         list_add_tail(&dev->todo_list, &net_todo_list);
8308         dev_net(dev)->dev_unreg_count++;
8309 }
8310
8311 static void rollback_registered_many(struct list_head *head)
8312 {
8313         struct net_device *dev, *tmp;
8314         LIST_HEAD(close_head);
8315
8316         BUG_ON(dev_boot_phase);
8317         ASSERT_RTNL();
8318
8319         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
8320                 /* Some devices call without registering
8321                  * for initialization unwind. Remove those
8322                  * devices and proceed with the remaining.
8323                  */
8324                 if (dev->reg_state == NETREG_UNINITIALIZED) {
8325                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
8326                                  dev->name, dev);
8327
8328                         WARN_ON(1);
8329                         list_del(&dev->unreg_list);
8330                         continue;
8331                 }
8332                 dev->dismantle = true;
8333                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
8334         }
8335
8336         /* If device is running, close it first. */
8337         list_for_each_entry(dev, head, unreg_list)
8338                 list_add_tail(&dev->close_list, &close_head);
8339         dev_close_many(&close_head, true);
8340
8341         list_for_each_entry(dev, head, unreg_list) {
8342                 /* And unlink it from device chain. */
8343                 unlist_netdevice(dev);
8344
8345                 dev->reg_state = NETREG_UNREGISTERING;
8346         }
8347         flush_all_backlogs();
8348
8349         synchronize_net();
8350
8351         list_for_each_entry(dev, head, unreg_list) {
8352                 struct sk_buff *skb = NULL;
8353
8354                 /* Shutdown queueing discipline. */
8355                 dev_shutdown(dev);
8356
8357                 dev_xdp_uninstall(dev);
8358
8359                 /* Notify protocols, that we are about to destroy
8360                  * this device. They should clean all the things.
8361                  */
8362                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8363
8364                 if (!dev->rtnl_link_ops ||
8365                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8366                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
8367                                                      GFP_KERNEL, NULL, 0);
8368
8369                 /*
8370                  *      Flush the unicast and multicast chains
8371                  */
8372                 dev_uc_flush(dev);
8373                 dev_mc_flush(dev);
8374
8375                 netdev_name_node_alt_flush(dev);
8376                 netdev_name_node_free(dev->name_node);
8377
8378                 if (dev->netdev_ops->ndo_uninit)
8379                         dev->netdev_ops->ndo_uninit(dev);
8380
8381                 if (skb)
8382                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
8383
8384                 /* Notifier chain MUST detach us all upper devices. */
8385                 WARN_ON(netdev_has_any_upper_dev(dev));
8386                 WARN_ON(netdev_has_any_lower_dev(dev));
8387
8388                 /* Remove entries from kobject tree */
8389                 netdev_unregister_kobject(dev);
8390 #ifdef CONFIG_XPS
8391                 /* Remove XPS queueing entries */
8392                 netif_reset_xps_queues_gt(dev, 0);
8393 #endif
8394         }
8395
8396         synchronize_net();
8397
8398         list_for_each_entry(dev, head, unreg_list)
8399                 dev_put(dev);
8400 }
8401
8402 static void rollback_registered(struct net_device *dev)
8403 {
8404         LIST_HEAD(single);
8405
8406         list_add(&dev->unreg_list, &single);
8407         rollback_registered_many(&single);
8408         list_del(&single);
8409 }
8410
8411 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
8412         struct net_device *upper, netdev_features_t features)
8413 {
8414         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8415         netdev_features_t feature;
8416         int feature_bit;
8417
8418         for_each_netdev_feature(upper_disables, feature_bit) {
8419                 feature = __NETIF_F_BIT(feature_bit);
8420                 if (!(upper->wanted_features & feature)
8421                     && (features & feature)) {
8422                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
8423                                    &feature, upper->name);
8424                         features &= ~feature;
8425                 }
8426         }
8427
8428         return features;
8429 }
8430
8431 static void netdev_sync_lower_features(struct net_device *upper,
8432         struct net_device *lower, netdev_features_t features)
8433 {
8434         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8435         netdev_features_t feature;
8436         int feature_bit;
8437
8438         for_each_netdev_feature(upper_disables, feature_bit) {
8439                 feature = __NETIF_F_BIT(feature_bit);
8440                 if (!(features & feature) && (lower->features & feature)) {
8441                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
8442                                    &feature, lower->name);
8443                         lower->wanted_features &= ~feature;
8444                         netdev_update_features(lower);
8445
8446                         if (unlikely(lower->features & feature))
8447                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
8448                                             &feature, lower->name);
8449                 }
8450         }
8451 }
8452
8453 static netdev_features_t netdev_fix_features(struct net_device *dev,
8454         netdev_features_t features)
8455 {
8456         /* Fix illegal checksum combinations */
8457         if ((features & NETIF_F_HW_CSUM) &&
8458             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
8459                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
8460                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
8461         }
8462
8463         /* TSO requires that SG is present as well. */
8464         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
8465                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
8466                 features &= ~NETIF_F_ALL_TSO;
8467         }
8468
8469         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
8470                                         !(features & NETIF_F_IP_CSUM)) {
8471                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
8472                 features &= ~NETIF_F_TSO;
8473                 features &= ~NETIF_F_TSO_ECN;
8474         }
8475
8476         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
8477                                          !(features & NETIF_F_IPV6_CSUM)) {
8478                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
8479                 features &= ~NETIF_F_TSO6;
8480         }
8481
8482         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
8483         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
8484                 features &= ~NETIF_F_TSO_MANGLEID;
8485
8486         /* TSO ECN requires that TSO is present as well. */
8487         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
8488                 features &= ~NETIF_F_TSO_ECN;
8489
8490         /* Software GSO depends on SG. */
8491         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
8492                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
8493                 features &= ~NETIF_F_GSO;
8494         }
8495
8496         /* GSO partial features require GSO partial be set */
8497         if ((features & dev->gso_partial_features) &&
8498             !(features & NETIF_F_GSO_PARTIAL)) {
8499                 netdev_dbg(dev,
8500                            "Dropping partially supported GSO features since no GSO partial.\n");
8501                 features &= ~dev->gso_partial_features;
8502         }
8503
8504         if (!(features & NETIF_F_RXCSUM)) {
8505                 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
8506                  * successfully merged by hardware must also have the
8507                  * checksum verified by hardware.  If the user does not
8508                  * want to enable RXCSUM, logically, we should disable GRO_HW.
8509                  */
8510                 if (features & NETIF_F_GRO_HW) {
8511                         netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
8512                         features &= ~NETIF_F_GRO_HW;
8513                 }
8514         }
8515
8516         /* LRO/HW-GRO features cannot be combined with RX-FCS */
8517         if (features & NETIF_F_RXFCS) {
8518                 if (features & NETIF_F_LRO) {
8519                         netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
8520                         features &= ~NETIF_F_LRO;
8521                 }
8522
8523                 if (features & NETIF_F_GRO_HW) {
8524                         netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
8525                         features &= ~NETIF_F_GRO_HW;
8526                 }
8527         }
8528
8529         return features;
8530 }
8531
/**
 *	__netdev_update_features - recompute and apply a device's feature set
 *	@dev: the device to update
 *
 *	Builds the new feature mask from netdev_get_wanted_features(), the
 *	driver's optional ndo_fix_features(), netdev_fix_features() and the
 *	features of every upper device, hands it to the driver's optional
 *	ndo_set_features() and then syncs the lower devices.
 *
 *	The caller must hold RTNL (checked with ASSERT_RTNL()).
 *
 *	Return: non-zero when the caller should send a features-changed
 *	notification, 0 otherwise.  In detail:
 *	  0  - the computed mask equals dev->features, or one of the VLAN
 *	       filter refresh helpers left a negative value in @err;
 *	  1  - ndo_set_features() returned a non-negative value (or does not
 *	       exist); dev->features is written here only when it returned 0;
 *	  -1 - ndo_set_features() failed.
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	/* stays negative on the "nothing changed" path so that the
	 * dev->features update below is skipped and 0 is returned
	 */
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	/* nothing to hand to the driver, but lower devices are still synced */
	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	/* only when ndo_set_features() returned exactly 0 (or is absent) */
	if (!err) {
		/* bits that differ between the new and the current mask */
		netdev_features_t diff = features ^ dev->features;

		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
			/* udp_tunnel_{get,drop}_rx_info both need
			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
			 * device, or they won't do anything.
			 * Thus we need to update dev->features
			 * *before* calling udp_tunnel_get_rx_info,
			 * but *after* calling udp_tunnel_drop_rx_info.
			 */
			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
				dev->features = features;
				udp_tunnel_get_rx_info(dev);
			} else {
				udp_tunnel_drop_rx_info(dev);
			}
		}

		/* same "set before get, set after drop" ordering as above */
		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_ctag_filter_info(dev);
			} else {
				vlan_drop_rx_ctag_filter_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_stag_filter_info(dev);
			} else {
				vlan_drop_rx_stag_filter_info(dev);
			}
		}

		dev->features = features;
	}

	/* NOTE(review): a negative result from the vlan_get_* helpers ends up
	 * here as "no notification" even though dev->features was updated -
	 * confirm that this is intended.
	 */
	return err < 0 ? 0 : 1;
}
8623
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluates dev->features through __netdev_update_features() and
 *	calls netdev_features_change() only when that helper returns a
 *	non-zero value. Intended to be called after driver or hardware
 *	dependent conditions that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int notify = __netdev_update_features(dev);

	if (!notify)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
8638
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluates dev->features and then always sends the change
 *	notification, whether or not the feature set differs. Use this
 *	instead of netdev_update_features() when dev->vlan_features might
 *	have changed as well, so that the change is propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* the result is irrelevant here: the notification is unconditional */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
8655
8656 /**
8657  *      netif_stacked_transfer_operstate -      transfer operstate
8658  *      @rootdev: the root or lower level device to transfer state from
8659  *      @dev: the device to transfer operstate to
8660  *
8661  *      Transfer operational state from root to device. This is normally
8662  *      called when a stacking relationship exists between the root
8663  *      device and the device(a leaf device).
8664  */
8665 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
8666                                         struct net_device *dev)
8667 {
8668         if (rootdev->operstate == IF_OPER_DORMANT)
8669                 netif_dormant_on(dev);
8670         else
8671                 netif_dormant_off(dev);
8672
8673         if (netif_carrier_ok(rootdev))
8674                 netif_carrier_on(dev);
8675         else
8676                 netif_carrier_off(dev);
8677 }
8678 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
8679
8680 static int netif_alloc_rx_queues(struct net_device *dev)
8681 {
8682         unsigned int i, count = dev->num_rx_queues;
8683         struct netdev_rx_queue *rx;
8684         size_t sz = count * sizeof(*rx);
8685         int err = 0;
8686
8687         BUG_ON(count < 1);
8688
8689         rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8690         if (!rx)
8691                 return -ENOMEM;
8692
8693         dev->_rx = rx;
8694
8695         for (i = 0; i < count; i++) {
8696                 rx[i].dev = dev;
8697
8698                 /* XDP RX-queue setup */
8699                 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
8700                 if (err < 0)
8701                         goto err_rxq_info;
8702         }
8703         return 0;
8704
8705 err_rxq_info:
8706         /* Rollback successful reg's and free other resources */
8707         while (i--)
8708                 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
8709         kvfree(dev->_rx);
8710         dev->_rx = NULL;
8711         return err;
8712 }
8713
8714 static void netif_free_rx_queues(struct net_device *dev)
8715 {
8716         unsigned int i, count = dev->num_rx_queues;
8717
8718         /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
8719         if (!dev->_rx)
8720                 return;
8721
8722         for (i = 0; i < count; i++)
8723                 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
8724
8725         kvfree(dev->_rx);
8726 }
8727
8728 static void netdev_init_one_queue(struct net_device *dev,
8729                                   struct netdev_queue *queue, void *_unused)
8730 {
8731         /* Initialize queue lock */
8732         spin_lock_init(&queue->_xmit_lock);
8733         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
8734         queue->xmit_lock_owner = -1;
8735         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
8736         queue->dev = dev;
8737 #ifdef CONFIG_BQL
8738         dql_init(&queue->dql, HZ);
8739 #endif
8740 }
8741
8742 static void netif_free_tx_queues(struct net_device *dev)
8743 {
8744         kvfree(dev->_tx);
8745 }
8746
8747 static int netif_alloc_netdev_queues(struct net_device *dev)
8748 {
8749         unsigned int count = dev->num_tx_queues;
8750         struct netdev_queue *tx;
8751         size_t sz = count * sizeof(*tx);
8752
8753         if (count < 1 || count > 0xffff)
8754                 return -EINVAL;
8755
8756         tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8757         if (!tx)
8758                 return -ENOMEM;
8759
8760         dev->_tx = tx;
8761
8762         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
8763         spin_lock_init(&dev->tx_global_lock);
8764
8765         return 0;
8766 }
8767
8768 void netif_tx_stop_all_queues(struct net_device *dev)
8769 {
8770         unsigned int i;
8771
8772         for (i = 0; i < dev->num_tx_queues; i++) {
8773                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
8774
8775                 netif_tx_stop_queue(txq);
8776         }
8777 }
8778 EXPORT_SYMBOL(netif_tx_stop_all_queues);
8779
8780 /**
8781  *      register_netdevice      - register a network device
8782  *      @dev: device to register
8783  *
8784  *      Take a completed network device structure and add it to the kernel
8785  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8786  *      chain. 0 is returned on success. A negative errno code is returned
8787  *      on a failure to set up the device, or if the name is a duplicate.
8788  *
8789  *      Callers must hold the rtnl semaphore. You may want
8790  *      register_netdev() instead of this.
8791  *
8792  *      BUGS:
8793  *      The locking appears insufficient to guarantee two parallel registers
8794  *      will not get the same name.
8795  */
8796
8797 int register_netdevice(struct net_device *dev)
8798 {
8799         int ret;
8800         struct net *net = dev_net(dev);
8801
8802         BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
8803                      NETDEV_FEATURE_COUNT);
8804         BUG_ON(dev_boot_phase);
8805         ASSERT_RTNL();
8806
8807         might_sleep();
8808
8809         /* When net_device's are persistent, this will be fatal. */
8810         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
8811         BUG_ON(!net);
8812
8813         spin_lock_init(&dev->addr_list_lock);
8814         netdev_set_addr_lockdep_class(dev);
8815
8816         ret = dev_get_valid_name(net, dev, dev->name);
8817         if (ret < 0)
8818                 goto out;
8819
8820         dev->name_node = netdev_name_node_head_alloc(dev);
8821         if (!dev->name_node)
8822                 goto out;
8823
8824         /* Init, if this function is available */
8825         if (dev->netdev_ops->ndo_init) {
8826                 ret = dev->netdev_ops->ndo_init(dev);
8827                 if (ret) {
8828                         if (ret > 0)
8829                                 ret = -EIO;
8830                         goto out;
8831                 }
8832         }
8833
8834         if (((dev->hw_features | dev->features) &
8835              NETIF_F_HW_VLAN_CTAG_FILTER) &&
8836             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
8837              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
8838                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
8839                 ret = -EINVAL;
8840                 goto err_uninit;
8841         }
8842
8843         ret = -EBUSY;
8844         if (!dev->ifindex)
8845                 dev->ifindex = dev_new_index(net);
8846         else if (__dev_get_by_index(net, dev->ifindex))
8847                 goto err_uninit;
8848
8849         /* Transfer changeable features to wanted_features and enable
8850          * software offloads (GSO and GRO).
8851          */
8852         dev->hw_features |= NETIF_F_SOFT_FEATURES;
8853         dev->features |= NETIF_F_SOFT_FEATURES;
8854
8855         if (dev->netdev_ops->ndo_udp_tunnel_add) {
8856                 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
8857                 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
8858         }
8859
8860         dev->wanted_features = dev->features & dev->hw_features;
8861
8862         if (!(dev->flags & IFF_LOOPBACK))
8863                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
8864
8865         /* If IPv4 TCP segmentation offload is supported we should also
8866          * allow the device to enable segmenting the frame with the option
8867          * of ignoring a static IP ID value.  This doesn't enable the
8868          * feature itself but allows the user to enable it later.
8869          */
8870         if (dev->hw_features & NETIF_F_TSO)
8871                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
8872         if (dev->vlan_features & NETIF_F_TSO)
8873                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
8874         if (dev->mpls_features & NETIF_F_TSO)
8875                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
8876         if (dev->hw_enc_features & NETIF_F_TSO)
8877                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
8878
8879         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
8880          */
8881         dev->vlan_features |= NETIF_F_HIGHDMA;
8882
8883         /* Make NETIF_F_SG inheritable to tunnel devices.
8884          */
8885         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
8886
8887         /* Make NETIF_F_SG inheritable to MPLS.
8888          */
8889         dev->mpls_features |= NETIF_F_SG;
8890
8891         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
8892         ret = notifier_to_errno(ret);
8893         if (ret)
8894                 goto err_uninit;
8895
8896         ret = netdev_register_kobject(dev);
8897         if (ret)
8898                 goto err_uninit;
8899         dev->reg_state = NETREG_REGISTERED;
8900
8901         __netdev_update_features(dev);
8902
8903         /*
8904          *      Default initial state at registry is that the
8905          *      device is present.
8906          */
8907
8908         set_bit(__LINK_STATE_PRESENT, &dev->state);
8909
8910         linkwatch_init_dev(dev);
8911
8912         dev_init_scheduler(dev);
8913         dev_hold(dev);
8914         list_netdevice(dev);
8915         add_device_randomness(dev->dev_addr, dev->addr_len);
8916
8917         /* If the device has permanent device address, driver should
8918          * set dev_addr and also addr_assign_type should be set to
8919          * NET_ADDR_PERM (default value).
8920          */
8921         if (dev->addr_assign_type == NET_ADDR_PERM)
8922                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
8923
8924         /* Notify protocols, that a new device appeared. */
8925         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
8926         ret = notifier_to_errno(ret);
8927         if (ret) {
8928                 rollback_registered(dev);
8929                 rcu_barrier();
8930
8931                 dev->reg_state = NETREG_UNREGISTERED;
8932         }
8933         /*
8934          *      Prevent userspace races by waiting until the network
8935          *      device is fully setup before sending notifications.
8936          */
8937         if (!dev->rtnl_link_ops ||
8938             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8939                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8940
8941 out:
8942         return ret;
8943
8944 err_uninit:
8945         if (dev->name_node)
8946                 netdev_name_node_free(dev->name_node);
8947         if (dev->netdev_ops->ndo_uninit)
8948                 dev->netdev_ops->ndo_uninit(dev);
8949         if (dev->priv_destructor)
8950                 dev->priv_destructor(dev);
8951         goto out;
8952 }
8953 EXPORT_SYMBOL(register_netdevice);
8954
8955 /**
8956  *      init_dummy_netdev       - init a dummy network device for NAPI
8957  *      @dev: device to init
8958  *
8959  *      This takes a network device structure and initialize the minimum
8960  *      amount of fields so it can be used to schedule NAPI polls without
8961  *      registering a full blown interface. This is to be used by drivers
8962  *      that need to tie several hardware interfaces to a single NAPI
8963  *      poll scheduler due to HW limitations.
8964  */
8965 int init_dummy_netdev(struct net_device *dev)
8966 {
8967         /* Clear everything. Note we don't initialize spinlocks
8968          * are they aren't supposed to be taken by any of the
8969          * NAPI code and this dummy netdev is supposed to be
8970          * only ever used for NAPI polls
8971          */
8972         memset(dev, 0, sizeof(struct net_device));
8973
8974         /* make sure we BUG if trying to hit standard
8975          * register/unregister code path
8976          */
8977         dev->reg_state = NETREG_DUMMY;
8978
8979         /* NAPI wants this */
8980         INIT_LIST_HEAD(&dev->napi_list);
8981
8982         /* a dummy interface is started by default */
8983         set_bit(__LINK_STATE_PRESENT, &dev->state);
8984         set_bit(__LINK_STATE_START, &dev->state);
8985
8986         /* napi_busy_loop stats accounting wants this */
8987         dev_net_set(dev, &init_net);
8988
8989         /* Note : We dont allocate pcpu_refcnt for dummy devices,
8990          * because users of this 'device' dont need to change
8991          * its refcount.
8992          */
8993
8994         return 0;
8995 }
8996 EXPORT_SYMBOL_GPL(init_dummy_netdev);
8997
8998
8999 /**
9000  *      register_netdev - register a network device
9001  *      @dev: device to register
9002  *
9003  *      Take a completed network device structure and add it to the kernel
9004  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9005  *      chain. 0 is returned on success. A negative errno code is returned
9006  *      on a failure to set up the device, or if the name is a duplicate.
9007  *
9008  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
9009  *      and expands the device name if you passed a format string to
9010  *      alloc_netdev.
9011  */
9012 int register_netdev(struct net_device *dev)
9013 {
9014         int err;
9015
9016         if (rtnl_lock_killable())
9017                 return -EINTR;
9018         err = register_netdevice(dev);
9019         rtnl_unlock();
9020         return err;
9021 }
9022 EXPORT_SYMBOL(register_netdev);
9023
9024 int netdev_refcnt_read(const struct net_device *dev)
9025 {
9026         int i, refcnt = 0;
9027
9028         for_each_possible_cpu(i)
9029                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
9030         return refcnt;
9031 }
9032 EXPORT_SYMBOL(netdev_refcnt_read);
9033
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The function polls netdev_refcnt_read() every 250 ms. While references
 * remain, the UNREGISTER notification is rebroadcast once more than a
 * second has passed since the previous rebroadcast, and an emergency
 * message is logged once more than ten seconds have passed since the
 * previous one. It takes and drops the rtnl lock itself.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	/* both timers start now */
	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* rcu_barrier() is run with the rtnl lock dropped */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* complain if references are still held after the timeout */
		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
9094
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * The function drops the rtnl lock itself (__rtnl_unlock()) right after
 * taking a private snapshot of net_todo_list, and processes that snapshot
 * one device at a time.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* an entry in an unexpected state is reported and skipped */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* sleeps until the reference count has dropped to zero */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
#if IS_ENABLED(CONFIG_DECNET)
		WARN_ON(dev->dn_ptr);
#endif
		/* driver-private teardown first, then the optional
		 * free_netdev() requested through needs_free_netdev
		 */
		if (dev->priv_destructor)
			dev->priv_destructor(dev);
		if (dev->needs_free_netdev)
			free_netdev(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
9173
9174 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
9175  * all the same fields in the same order as net_device_stats, with only
9176  * the type differing, but rtnl_link_stats64 may have additional fields
9177  * at the end for newer counters.
9178  */
9179 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
9180                              const struct net_device_stats *netdev_stats)
9181 {
9182 #if BITS_PER_LONG == 64
9183         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
9184         memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
9185         /* zero out counters that only exist in rtnl_link_stats64 */
9186         memset((char *)stats64 + sizeof(*netdev_stats), 0,
9187                sizeof(*stats64) - sizeof(*netdev_stats));
9188 #else
9189         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
9190         const unsigned long *src = (const unsigned long *)netdev_stats;
9191         u64 *dst = (u64 *)stats64;
9192
9193         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
9194         for (i = 0; i < n; i++)
9195                 dst[i] = src[i];
9196         /* zero out counters that only exist in rtnl_link_stats64 */
9197         memset((char *)stats64 + n * sizeof(u64), 0,
9198                sizeof(*stats64) - n * sizeof(u64));
9199 #endif
9200 }
9201 EXPORT_SYMBOL(netdev_stats_to_stats64);
9202
9203 /**
9204  *      dev_get_stats   - get network device statistics
9205  *      @dev: device to get statistics from
9206  *      @storage: place to store stats
9207  *
9208  *      Get network statistics from device. Return @storage.
9209  *      The device driver may provide its own method by setting
9210  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
9211  *      otherwise the internal statistics structure is used.
9212  */
9213 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
9214                                         struct rtnl_link_stats64 *storage)
9215 {
9216         const struct net_device_ops *ops = dev->netdev_ops;
9217
9218         if (ops->ndo_get_stats64) {
9219                 memset(storage, 0, sizeof(*storage));
9220                 ops->ndo_get_stats64(dev, storage);
9221         } else if (ops->ndo_get_stats) {
9222                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
9223         } else {
9224                 netdev_stats_to_stats64(storage, &dev->stats);
9225         }
9226         storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
9227         storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
9228         storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
9229         return storage;
9230 }
9231 EXPORT_SYMBOL(dev_get_stats);
9232
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing queue
 * is allocated, initialized with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned when that allocation fails.  Without
 * CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			/* publish only after the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
9250
9251 static const struct ethtool_ops default_ethtool_ops;
9252
9253 void netdev_set_default_ethtool_ops(struct net_device *dev,
9254                                     const struct ethtool_ops *ops)
9255 {
9256         if (dev->ethtool_ops == &default_ethtool_ops)
9257                 dev->ethtool_ops = ops;
9258 }
9259 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
9260
9261 void netdev_freemem(struct net_device *dev)
9262 {
9263         char *addr = (char *)dev - dev->padded;
9264
9265         kvfree(addr);
9266 }
9267
/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * @name must be shorter than the name field of struct net_device; a longer
 * name trips a BUG_ON().  @setup is called before the queue arrays are
 * allocated.
 *
 * Return: the new device, or NULL when @txqs or @rxqs is zero or when any
 * of the allocations fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	unsigned int alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!p)
		return NULL;

	/* the device starts at the first aligned address inside the block;
	 * remember the offset so that netdev_freemem() can find the block
	 */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* driver/type specific initialization supplied by the caller */
	setup(dev);

	/* a zero tx_queue_len left by setup() marks the device IFF_NO_QUEUE
	 * and is replaced by the default length
	 */
	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;

	/* length was checked by the BUG_ON() at the top */
	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	/* keep ethtool_ops chosen by setup(), otherwise use the placeholder */
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	/* past setup(): the full free_netdev() teardown is used */
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
9383
9384 /**
9385  * free_netdev - free network device
9386  * @dev: device
9387  *
9388  * This function does the last stage of destroying an allocated device
9389  * interface. The reference to the device object is released. If this
9390  * is the last reference then it will be freed.Must be called in process
9391  * context.
9392  */
9393 void free_netdev(struct net_device *dev)
9394 {
9395         struct napi_struct *p, *n;
9396
9397         might_sleep();
9398         netif_free_tx_queues(dev);
9399         netif_free_rx_queues(dev);
9400
9401         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
9402
9403         /* Flush device addresses */
9404         dev_addr_flush(dev);
9405
9406         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
9407                 netif_napi_del(p);
9408
9409         free_percpu(dev->pcpu_refcnt);
9410         dev->pcpu_refcnt = NULL;
9411
9412         /*  Compatibility with error handling in drivers */
9413         if (dev->reg_state == NETREG_UNINITIALIZED) {
9414                 netdev_freemem(dev);
9415                 return;
9416         }
9417
9418         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
9419         dev->reg_state = NETREG_RELEASED;
9420
9421         /* will free via device release */
9422         put_device(&dev->dev);
9423 }
9424 EXPORT_SYMBOL(free_netdev);
9425
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	When the RTNL lock is held the expedited RCU grace period is used;
 *	otherwise a normal grace period is waited for.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
9441
9442 /**
9443  *      unregister_netdevice_queue - remove device from the kernel
9444  *      @dev: device
9445  *      @head: list
9446  *
9447  *      This function shuts down a device interface and removes it
9448  *      from the kernel tables.
9449  *      If head not NULL, device is queued to be unregistered later.
9450  *
9451  *      Callers must hold the rtnl semaphore.  You may want
9452  *      unregister_netdev() instead of this.
9453  */
9454
9455 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
9456 {
9457         ASSERT_RTNL();
9458
9459         if (head) {
9460                 list_move_tail(&dev->unreg_list, head);
9461         } else {
9462                 rollback_registered(dev);
9463                 /* Finish processing unregister after unlock */
9464                 net_set_todo(dev);
9465         }
9466 }
9467 EXPORT_SYMBOL(unregister_netdevice_queue);
9468
9469 /**
9470  *      unregister_netdevice_many - unregister many devices
9471  *      @head: list of devices
9472  *
9473  *  Note: As most callers use a stack allocated list_head,
9474  *  we force a list_del() to make sure stack wont be corrupted later.
9475  */
9476 void unregister_netdevice_many(struct list_head *head)
9477 {
9478         struct net_device *dev;
9479
9480         if (!list_empty(head)) {
9481                 rollback_registered_many(head);
9482                 list_for_each_entry(dev, head, unreg_list)
9483                         net_set_todo(dev);
9484                 list_del(head);
9485         }
9486 }
9487 EXPORT_SYMBOL(unregister_netdevice_many);
9488
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice() that takes
 *	the rtnl semaphore around the call.  In general you want to use
 *	this and not unregister_netdevice().  Because it acquires the
 *	rtnl semaphore itself, the caller must not already hold it.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
9507
/**
 *	dev_change_net_namespace - move device to a different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned:
 *	-EINVAL if the device is namespace-local or not registered,
 *	-EEXIST if the name is taken in @net and @pat is NULL, or the
 *	error from dev_get_valid_name().  Moving a device to the
 *	namespace it already lives in is a successful no-op.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err, new_nsid, new_ifindex;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 * From here on there is no error exit: the function runs to the
	 * end and returns 0.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols that we are about to destroy
	 * this device. They should clean all the things.
	 *
	 * Note that dev->reg_state stays at NETREG_REGISTERED.
	 * This is wanted because this way 8021q and macvlan know
	 * the device is just moving and can keep their dependent
	 * devices up.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();

	/* Id of the destination namespace as seen from the current one */
	new_nsid = peernet2id_alloc(dev_net(dev), net);
	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		new_ifindex = dev_new_index(net);
	else
		new_ifindex = dev->ifindex;

	/* Announce the removal, including the new nsid and ifindex */
	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
			    new_ifindex);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);
	dev->ifindex = new_ifindex;

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects.  A failure here is only warned about; the
	 * error value is overwritten with 0 before returning.
	 */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
9630
/*
 * dev_cpu_dead - take over the networking work of a CPU that went away.
 * @oldcpu: the CPU whose per-cpu softnet_data is to be drained
 *
 * Installed by net_dev_init() through cpuhp_setup_state_nocalls() for the
 * CPUHP_NET_DEV_DEAD state.  Moves the completion queue, output queue and
 * NAPI poll list of @oldcpu onto the CPU running this function, sends the
 * RPS IPIs that were still pending there, and re-injects the packets left
 * in its process_queue and input_pkt_queue.  Always returns 0.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu;
	/* remsd stays NULL when CONFIG_RPS is not enabled */
	struct softnet_data *sd, *oldsd, *remsd = NULL;

	/* The queue splicing below runs with local interrupts disabled. */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU and reset the offline
	 * CPU's queue to the empty state (tail pointer back at the head).
	 */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			/* backlog NAPI is not migrated; just clear its state */
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

#ifdef CONFIG_RPS
	/* Detach the offline CPU's list of pending RPS IPIs. */
	remsd = oldsd->rps_ipi_list;
	oldsd->rps_ipi_list = NULL;
#endif
	/* send out pending IPI's on offline CPU */
	net_rps_send_ipi(remsd);

	/* Re-inject the offline CPU's process_queue, then its
	 * input_pkt_queue, through netif_rx_ni().
	 */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return 0;
}
9696
9697 /**
9698  *      netdev_increment_features - increment feature set by one
9699  *      @all: current feature set
9700  *      @one: new feature set
9701  *      @mask: mask feature set
9702  *
9703  *      Computes a new feature set after adding a device with feature set
9704  *      @one to the master device with current feature set @all.  Will not
9705  *      enable anything that is off in @mask. Returns the new feature set.
9706  */
9707 netdev_features_t netdev_increment_features(netdev_features_t all,
9708         netdev_features_t one, netdev_features_t mask)
9709 {
9710         if (mask & NETIF_F_HW_CSUM)
9711                 mask |= NETIF_F_CSUM_MASK;
9712         mask |= NETIF_F_VLAN_CHALLENGED;
9713
9714         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
9715         all &= one | ~NETIF_F_ALL_FOR_ALL;
9716
9717         /* If one device supports hw checksumming, set for all. */
9718         if (all & NETIF_F_HW_CSUM)
9719                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
9720
9721         return all;
9722 }
9723 EXPORT_SYMBOL(netdev_increment_features);
9724
9725 static struct hlist_head * __net_init netdev_create_hash(void)
9726 {
9727         int i;
9728         struct hlist_head *hash;
9729
9730         hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
9731         if (hash != NULL)
9732                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
9733                         INIT_HLIST_HEAD(&hash[i]);
9734
9735         return hash;
9736 }
9737
9738 /* Initialize per network namespace state */
9739 static int __net_init netdev_init(struct net *net)
9740 {
9741         BUILD_BUG_ON(GRO_HASH_BUCKETS >
9742                      8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
9743
9744         if (net != &init_net)
9745                 INIT_LIST_HEAD(&net->dev_base_head);
9746
9747         net->dev_name_head = netdev_create_hash();
9748         if (net->dev_name_head == NULL)
9749                 goto err_name;
9750
9751         net->dev_index_head = netdev_create_hash();
9752         if (net->dev_index_head == NULL)
9753                 goto err_idx;
9754
9755         return 0;
9756
9757 err_idx:
9758         kfree(net->dev_name_head);
9759 err_name:
9760         return -ENOMEM;
9761 }
9762
9763 /**
9764  *      netdev_drivername - network driver for the device
9765  *      @dev: network device
9766  *
9767  *      Determine network driver for device.
9768  */
9769 const char *netdev_drivername(const struct net_device *dev)
9770 {
9771         const struct device_driver *driver;
9772         const struct device *parent;
9773         const char *empty = "";
9774
9775         parent = dev->dev.parent;
9776         if (!parent)
9777                 return empty;
9778
9779         driver = parent->driver;
9780         if (driver && driver->name)
9781                 return driver->name;
9782         return empty;
9783 }
9784
9785 static void __netdev_printk(const char *level, const struct net_device *dev,
9786                             struct va_format *vaf)
9787 {
9788         if (dev && dev->dev.parent) {
9789                 dev_printk_emit(level[1] - '0',
9790                                 dev->dev.parent,
9791                                 "%s %s %s%s: %pV",
9792                                 dev_driver_string(dev->dev.parent),
9793                                 dev_name(dev->dev.parent),
9794                                 netdev_name(dev), netdev_reg_state(dev),
9795                                 vaf);
9796         } else if (dev) {
9797                 printk("%s%s%s: %pV",
9798                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
9799         } else {
9800                 printk("%s(NULL net_device): %pV", level, vaf);
9801         }
9802 }
9803
9804 void netdev_printk(const char *level, const struct net_device *dev,
9805                    const char *format, ...)
9806 {
9807         struct va_format vaf;
9808         va_list args;
9809
9810         va_start(args, format);
9811
9812         vaf.fmt = format;
9813         vaf.va = &args;
9814
9815         __netdev_printk(level, dev, &vaf);
9816
9817         va_end(args);
9818 }
9819 EXPORT_SYMBOL(netdev_printk);
9820
/*
 * define_netdev_printk_level - generate an exported, fixed-level logger.
 * @func: name of the function to define
 * @level: log level string passed on to __netdev_printk()
 *
 * The generated function has the signature
 *	void func(const struct net_device *dev, const char *fmt, ...)
 * and forwards its printf-style arguments to __netdev_printk() at @level.
 */
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

/* One logger per log level, from KERN_EMERG down to KERN_INFO. */
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
9845
9846 static void __net_exit netdev_exit(struct net *net)
9847 {
9848         kfree(net->dev_name_head);
9849         kfree(net->dev_index_head);
9850         if (net != &init_net)
9851                 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
9852 }
9853
/*
 * Per network namespace hooks for the core device tables: netdev_init()
 * sets them up and netdev_exit() frees them.  Registered from
 * net_dev_init() with register_pernet_subsys().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
9858
9859 static void __net_exit default_device_exit(struct net *net)
9860 {
9861         struct net_device *dev, *aux;
9862         /*
9863          * Push all migratable network devices back to the
9864          * initial network namespace
9865          */
9866         rtnl_lock();
9867         for_each_netdev_safe(net, dev, aux) {
9868                 int err;
9869                 char fb_name[IFNAMSIZ];
9870
9871                 /* Ignore unmoveable devices (i.e. loopback) */
9872                 if (dev->features & NETIF_F_NETNS_LOCAL)
9873                         continue;
9874
9875                 /* Leave virtual devices for the generic cleanup */
9876                 if (dev->rtnl_link_ops)
9877                         continue;
9878
9879                 /* Push remaining network devices to init_net */
9880                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
9881                 if (__dev_get_by_name(&init_net, fb_name))
9882                         snprintf(fb_name, IFNAMSIZ, "dev%%d");
9883                 err = dev_change_net_namespace(dev, &init_net, fb_name);
9884                 if (err) {
9885                         pr_emerg("%s: failed to move %s to init_net: %d\n",
9886                                  __func__, dev->name, err);
9887                         BUG();
9888                 }
9889         }
9890         rtnl_unlock();
9891 }
9892
/*
 * rtnl_lock_unregistering - take the rtnl lock once nothing is unregistering.
 * @net_list: network namespaces, linked through their exit_list member
 *
 * Returns with the rtnl lock held, and only when no namespace on @net_list
 * has a non-zero dev_unreg_count.
 */
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	/* Register as a waiter before checking, then loop until clear. */
	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		/* The counters are examined with the rtnl lock held. */
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		/* Nothing pending: leave the loop still holding the lock. */
		if (!unregistering)
			break;
		/* Drop the lock and wait to be woken before re-checking. */
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}
9920
9921 static void __net_exit default_device_exit_batch(struct list_head *net_list)
9922 {
9923         /* At exit all network devices most be removed from a network
9924          * namespace.  Do this in the reverse order of registration.
9925          * Do this across as many network namespaces as possible to
9926          * improve batching efficiency.
9927          */
9928         struct net_device *dev;
9929         struct net *net;
9930         LIST_HEAD(dev_kill_list);
9931
9932         /* To prevent network device cleanup code from dereferencing
9933          * loopback devices or network devices that have been freed
9934          * wait here for all pending unregistrations to complete,
9935          * before unregistring the loopback device and allowing the
9936          * network namespace be freed.
9937          *
9938          * The netdev todo list containing all network devices
9939          * unregistrations that happen in default_device_exit_batch
9940          * will run in the rtnl_unlock() at the end of
9941          * default_device_exit_batch.
9942          */
9943         rtnl_lock_unregistering(net_list);
9944         list_for_each_entry(net, net_list, exit_list) {
9945                 for_each_netdev_reverse(net, dev) {
9946                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
9947                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
9948                         else
9949                                 unregister_netdevice_queue(dev, &dev_kill_list);
9950                 }
9951         }
9952         unregister_netdevice_many(&dev_kill_list);
9953         rtnl_unlock();
9954 }
9955
/*
 * Per network namespace exit hooks for devices left in a namespace:
 * default_device_exit() runs per namespace and
 * default_device_exit_batch() runs on a list of namespaces.
 * Registered from net_dev_init() with register_pernet_device().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
9960
/*
 *	Initialize the DEV module: procfs and kobject support, the packet
 *	type and offload lists, the per network namespace device tables,
 *	the per-cpu softnet_data, the loopback and default per-namespace
 *	device operations, the networking softirqs and the CPU hotplug
 *	callback.
 *
 *	Returns 0 on success, or -ENOMEM if one of the checked setup
 *	steps fails.
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	/* rc stays -ENOMEM for every early "goto out" below */
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
		skb_queue_head_init(&sd->xfrm_backlog);
#endif
		INIT_LIST_HEAD(&sd->poll_list);
		/* empty output queue: the tail pointer refers to the head */
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		/* the per-cpu backlog is polled by process_backlog() */
		init_gro_hash(&sd->backlog);
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  This ensures the loopback device
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	/* A failure to install the hotplug callback is only warned about;
	 * the function still returns 0.
	 */
	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);