net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137
 138 #include "net-sysfs.h"
 139
 140 /* Instead of increasing this, you should create a hash table. */
 141 #define MAX_GRO_SKBS 8
 142
 143 /* This should be increased if a protocol with a bigger head is added. */
 144 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 145
 146 /*
 147  *      The list of packet types we will receive (as opposed to discard)
 148  *      and the routines to invoke.
 149  *
 150  *      Why 16. Because with 16 the only overlap we get on a hash of the
 151  *      low nibble of the protocol value is RARP/SNAP/X.25.
 152  *
 153  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 154  *             sure which should go first, but I bet it won't make much
 155  *             difference if we are running VLANs.  The good news is that
 156  *             this protocol won't be in the list unless compiled in, so
 157  *             the average user (w/out VLANs) will not be adversely affected.
 158  *             --BLG
 159  *
 160  *              0800    IP
 161  *              8100    802.1Q VLAN
 162  *              0001    802.3
 163  *              0002    AX.25
 164  *              0004    802.2
 165  *              8035    RARP
 166  *              0005    SNAP
 167  *              0805    X.25
 168  *              0806    ARP
 169  *              8137    IPX
 170  *              0009    Localtalk
 171  *              86DD    IPv6
 172  */
 173
 174 #define PTYPE_HASH_SIZE (16)
 175 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 176
 177 static DEFINE_SPINLOCK(ptype_lock);
 178 static DEFINE_SPINLOCK(offload_lock);
 179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 180 static struct list_head ptype_all __read_mostly;        /* Taps */
 181 static struct list_head offload_base __read_mostly;
 182
 183 /*
 184  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 185  * semaphore.
 186  *
 187  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 188  *
 189  * Writers must hold the rtnl semaphore while they loop through the
 190  * dev_base_head list, and hold dev_base_lock for writing when they do the
 191  * actual updates.  This allows pure readers to access the list even
 192  * while a writer is preparing to update it.
 193  *
 194  * To put it another way, dev_base_lock is held for writing only to
 195  * protect against pure readers; the rtnl semaphore provides the
 196  * protection against other writers.
 197  *
 198  * See, for example usages, register_netdevice() and
 199  * unregister_netdevice(), which must be called with the rtnl
 200  * semaphore held.
 201  */
 202 DEFINE_RWLOCK(dev_base_lock);
 203 EXPORT_SYMBOL(dev_base_lock);
 204
 205 seqcount_t devnet_rename_seq;
 206
 207 static inline void dev_base_seq_inc(struct net *net)
 208 {
 209         while (++net->dev_base_seq == 0);
 210 }
 211
 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213 {
 214         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215
 216         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 217 }
 218
 219 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 220 {
 221         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 222 }
 223
 224 static inline void rps_lock(struct softnet_data *sd)
 225 {
 226 #ifdef CONFIG_RPS
 227         spin_lock(&sd->input_pkt_queue.lock);
 228 #endif
 229 }
 230
 231 static inline void rps_unlock(struct softnet_data *sd)
 232 {
 233 #ifdef CONFIG_RPS
 234         spin_unlock(&sd->input_pkt_queue.lock);
 235 #endif
 236 }
 237
 238 /* Device list insertion */
 239 static int list_netdevice(struct net_device *dev)
 240 {
 241         struct net *net = dev_net(dev);
 242
 243         ASSERT_RTNL();
 244
 245         write_lock_bh(&dev_base_lock);
 246         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 247         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 248         hlist_add_head_rcu(&dev->index_hlist,
 249                            dev_index_hash(net, dev->ifindex));
 250         write_unlock_bh(&dev_base_lock);
 251
 252         dev_base_seq_inc(net);
 253
 254         return 0;
 255 }
 256
 257 /* Device list removal
 258  * caller must respect a RCU grace period before freeing/reusing dev
 259  */
 260 static void unlist_netdevice(struct net_device *dev)
 261 {
 262         ASSERT_RTNL();
 263
 264         /* Unlink dev from the device chain */
 265         write_lock_bh(&dev_base_lock);
 266         list_del_rcu(&dev->dev_list);
 267         hlist_del_rcu(&dev->name_hlist);
 268         hlist_del_rcu(&dev->index_hlist);
 269         write_unlock_bh(&dev_base_lock);
 270
 271         dev_base_seq_inc(dev_net(dev));
 272 }
 273
 274 /*
 275  *      Our notifier list
 276  */
 277
 278 static RAW_NOTIFIER_HEAD(netdev_chain);
 279
 280 /*
 281  *      Device drivers call our routines to queue packets here. We empty the
 282  *      queue in the local softnet handler.
 283  */
 284
 285 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 286 EXPORT_PER_CPU_SYMBOL(softnet_data);
 287
 288 #ifdef CONFIG_LOCKDEP
 289 /*
 290  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 291  * according to dev->type
 292  */
 293 static const unsigned short netdev_lock_type[] =
 294         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 295          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 296          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 297          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 298          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 299          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 300          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 301          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 302          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 303          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 304          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 305          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 306          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 307          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 308          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 309
 310 static const char *const netdev_lock_name[] =
 311         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 312          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 313          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 314          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 315          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 316          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 317          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 318          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 319          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 320          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 321          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 322          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 323          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 324          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 325          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 326
 327 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329
 330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 331 {
 332         int i;
 333
 334         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 335                 if (netdev_lock_type[i] == dev_type)
 336                         return i;
 337         /* the last key is used by default */
 338         return ARRAY_SIZE(netdev_lock_type) - 1;
 339 }
 340
 341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                  unsigned short dev_type)
 343 {
 344         int i;
 345
 346         i = netdev_lock_pos(dev_type);
 347         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 348                                    netdev_lock_name[i]);
 349 }
 350
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353         int i;
 354
 355         i = netdev_lock_pos(dev->type);
 356         lockdep_set_class_and_name(&dev->addr_list_lock,
 357                                    &netdev_addr_lock_key[i],
 358                                    netdev_lock_name[i]);
 359 }
 360 #else
 361 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 362                                                  unsigned short dev_type)
 363 {
 364 }
 365 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 366 {
 367 }
 368 #endif
 369
 370 /*******************************************************************************
 371
 372                 Protocol management and registration routines
 373
 374 *******************************************************************************/
 375
 376 /*
 377  *      Add a protocol ID to the list. Now that the input handler is
 378  *      smarter we can dispense with all the messy stuff that used to be
 379  *      here.
 380  *
 381  *      BEWARE!!! Protocol handlers, mangling input packets,
 382  *      MUST BE last in hash buckets and checking protocol handlers
 383  *      MUST start from promiscuous ptype_all chain in net_bh.
 384  *      It is true now, do not change it.
 385  *      Explanation follows: if protocol handler, mangling packet, will
 386  *      be the first on list, it is not able to sense, that packet
 387  *      is cloned and should be copied-on-write, so that it will
 388  *      change it and subsequent readers will get broken packet.
 389  *                                                      --ANK (980803)
 390  */
 391
 392 static inline struct list_head *ptype_head(const struct packet_type *pt)
 393 {
 394         if (pt->type == htons(ETH_P_ALL))
 395                 return &ptype_all;
 396         else
 397                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398 }
 399
 400 /**
 401  *      dev_add_pack - add packet handler
 402  *      @pt: packet type declaration
 403  *
 404  *      Add a protocol handler to the networking stack. The passed &packet_type
 405  *      is linked into kernel lists and may not be freed until it has been
 406  *      removed from the kernel lists.
 407  *
 408  *      This call does not sleep therefore it can not
 409  *      guarantee all CPU's that are in middle of receiving packets
 410  *      will see the new packet type (until the next received packet).
 411  */
 412
 413 void dev_add_pack(struct packet_type *pt)
 414 {
 415         struct list_head *head = ptype_head(pt);
 416
 417         spin_lock(&ptype_lock);
 418         list_add_rcu(&pt->list, head);
 419         spin_unlock(&ptype_lock);
 420 }
 421 EXPORT_SYMBOL(dev_add_pack);
 422
 423 /**
 424  *      __dev_remove_pack        - remove packet handler
 425  *      @pt: packet type declaration
 426  *
 427  *      Remove a protocol handler that was previously added to the kernel
 428  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 429  *      from the kernel lists and can be freed or reused once this function
 430  *      returns.
 431  *
 432  *      The packet type might still be in use by receivers
 433  *      and must not be freed until after all the CPU's have gone
 434  *      through a quiescent state.
 435  */
 436 void __dev_remove_pack(struct packet_type *pt)
 437 {
 438         struct list_head *head = ptype_head(pt);
 439         struct packet_type *pt1;
 440
 441         spin_lock(&ptype_lock);
 442
 443         list_for_each_entry(pt1, head, list) {
 444                 if (pt == pt1) {
 445                         list_del_rcu(&pt->list);
 446                         goto out;
 447                 }
 448         }
 449
 450         pr_warn("dev_remove_pack: %p not found\n", pt);
 451 out:
 452         spin_unlock(&ptype_lock);
 453 }
 454 EXPORT_SYMBOL(__dev_remove_pack);
 455
 456 /**
 457  *      dev_remove_pack  - remove packet handler
 458  *      @pt: packet type declaration
 459  *
 460  *      Remove a protocol handler that was previously added to the kernel
 461  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 462  *      from the kernel lists and can be freed or reused once this function
 463  *      returns.
 464  *
 465  *      This call sleeps to guarantee that no CPU is looking at the packet
 466  *      type after return.
 467  */
 468 void dev_remove_pack(struct packet_type *pt)
 469 {
 470         __dev_remove_pack(pt);
 471
 472         synchronize_net();
 473 }
 474 EXPORT_SYMBOL(dev_remove_pack);
 475
 476
 477 /**
 478  *      dev_add_offload - register offload handlers
 479  *      @po: protocol offload declaration
 480  *
 481  *      Add protocol offload handlers to the networking stack. The passed
 482  *      &proto_offload is linked into kernel lists and may not be freed until
 483  *      it has been removed from the kernel lists.
 484  *
 485  *      This call does not sleep therefore it can not
 486  *      guarantee all CPU's that are in middle of receiving packets
 487  *      will see the new offload handlers (until the next received packet).
 488  */
 489 void dev_add_offload(struct packet_offload *po)
 490 {
 491         struct list_head *head = &offload_base;
 492
 493         spin_lock(&offload_lock);
 494         list_add_rcu(&po->list, head);
 495         spin_unlock(&offload_lock);
 496 }
 497 EXPORT_SYMBOL(dev_add_offload);
 498
 499 /**
 500  *      __dev_remove_offload     - remove offload handler
 501  *      @po: packet offload declaration
 502  *
 503  *      Remove a protocol offload handler that was previously added to the
 504  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 505  *      is removed from the kernel lists and can be freed or reused once this
 506  *      function returns.
 507  *
 508  *      The packet type might still be in use by receivers
 509  *      and must not be freed until after all the CPU's have gone
 510  *      through a quiescent state.
 511  */
 512 void __dev_remove_offload(struct packet_offload *po)
 513 {
 514         struct list_head *head = &offload_base;
 515         struct packet_offload *po1;
 516
 517         spin_lock(&offload_lock);
 518
 519         list_for_each_entry(po1, head, list) {
 520                 if (po == po1) {
 521                         list_del_rcu(&po->list);
 522                         goto out;
 523                 }
 524         }
 525
 526         pr_warn("dev_remove_offload: %p not found\n", po);
 527 out:
 528         spin_unlock(&offload_lock);
 529 }
 530 EXPORT_SYMBOL(__dev_remove_offload);
 531
 532 /**
 533  *      dev_remove_offload       - remove packet offload handler
 534  *      @po: packet offload declaration
 535  *
 536  *      Remove a packet offload handler that was previously added to the kernel
 537  *      offload handlers by dev_add_offload(). The passed &offload_type is
 538  *      removed from the kernel lists and can be freed or reused once this
 539  *      function returns.
 540  *
 541  *      This call sleeps to guarantee that no CPU is looking at the packet
 542  *      type after return.
 543  */
 544 void dev_remove_offload(struct packet_offload *po)
 545 {
 546         __dev_remove_offload(po);
 547
 548         synchronize_net();
 549 }
 550 EXPORT_SYMBOL(dev_remove_offload);
 551
 552 /******************************************************************************
 553
 554                       Device Boot-time Settings Routines
 555
 556 *******************************************************************************/
 557
 558 /* Boot time configuration table */
 559 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 560
 561 /**
 562  *      netdev_boot_setup_add   - add new setup entry
 563  *      @name: name of the device
 564  *      @map: configured settings for the device
 565  *
 566  *      Adds new setup entry to the dev_boot_setup list.  The function
 567  *      returns 0 on error and 1 on success.  This is a generic routine to
 568  *      all netdevices.
 569  */
 570 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 571 {
 572         struct netdev_boot_setup *s;
 573         int i;
 574
 575         s = dev_boot_setup;
 576         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 577                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 578                         memset(s[i].name, 0, sizeof(s[i].name));
 579                         strlcpy(s[i].name, name, IFNAMSIZ);
 580                         memcpy(&s[i].map, map, sizeof(s[i].map));
 581                         break;
 582                 }
 583         }
 584
 585         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 586 }
 587
 588 /**
 589  *      netdev_boot_setup_check - check boot time settings
 590  *      @dev: the netdevice
 591  *
 592  *      Check boot time settings for the device.
 593  *      The found settings are set for the device to be used
 594  *      later in the device probing.
 595  *      Returns 0 if no settings found, 1 if they are.
 596  */
 597 int netdev_boot_setup_check(struct net_device *dev)
 598 {
 599         struct netdev_boot_setup *s = dev_boot_setup;
 600         int i;
 601
 602         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 603                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 604                     !strcmp(dev->name, s[i].name)) {
 605                         dev->irq        = s[i].map.irq;
 606                         dev->base_addr  = s[i].map.base_addr;
 607                         dev->mem_start  = s[i].map.mem_start;
 608                         dev->mem_end    = s[i].map.mem_end;
 609                         return 1;
 610                 }
 611         }
 612         return 0;
 613 }
 614 EXPORT_SYMBOL(netdev_boot_setup_check);
 615
 616
 617 /**
 618  *      netdev_boot_base        - get address from boot time settings
 619  *      @prefix: prefix for network device
 620  *      @unit: id for network device
 621  *
 622  *      Check boot time settings for the base address of device.
 623  *      The found settings are set for the device to be used
 624  *      later in the device probing.
 625  *      Returns 0 if no settings found.
 626  */
 627 unsigned long netdev_boot_base(const char *prefix, int unit)
 628 {
 629         const struct netdev_boot_setup *s = dev_boot_setup;
 630         char name[IFNAMSIZ];
 631         int i;
 632
 633         sprintf(name, "%s%d", prefix, unit);
 634
 635         /*
 636          * If device already registered then return base of 1
 637          * to indicate not to probe for this interface
 638          */
 639         if (__dev_get_by_name(&init_net, name))
 640                 return 1;
 641
 642         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 643                 if (!strcmp(name, s[i].name))
 644                         return s[i].map.base_addr;
 645         return 0;
 646 }
 647
 648 /*
 649  * Saves at boot time configured settings for any netdevice.
 650  */
 651 int __init netdev_boot_setup(char *str)
 652 {
 653         int ints[5];
 654         struct ifmap map;
 655
 656         str = get_options(str, ARRAY_SIZE(ints), ints);
 657         if (!str || !*str)
 658                 return 0;
 659
 660         /* Save settings */
 661         memset(&map, 0, sizeof(map));
 662         if (ints[0] > 0)
 663                 map.irq = ints[1];
 664         if (ints[0] > 1)
 665                 map.base_addr = ints[2];
 666         if (ints[0] > 2)
 667                 map.mem_start = ints[3];
 668         if (ints[0] > 3)
 669                 map.mem_end = ints[4];
 670
 671         /* Add new entry to the list */
 672         return netdev_boot_setup_add(str, &map);
 673 }
 674
 675 __setup("netdev=", netdev_boot_setup);
 676
 677 /*******************************************************************************
 678
 679                             Device Interface Subroutines
 680
 681 *******************************************************************************/
 682
 683 /**
 684  *      __dev_get_by_name       - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name. Must be called under RTNL semaphore
 689  *      or @dev_base_lock. If the name is found a pointer to the device
 690  *      is returned. If the name is not found then %NULL is returned. The
 691  *      reference counters are not incremented so the caller must be
 692  *      careful with locks.
 693  */
 694
 695 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 696 {
 697         struct hlist_node *p;
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_name_hash(net, name);
 700
 701         hlist_for_each_entry(dev, p, head, name_hlist)
 702                 if (!strncmp(dev->name, name, IFNAMSIZ))
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(__dev_get_by_name);
 708
 709 /**
 710  *      dev_get_by_name_rcu     - find a device by its name
 711  *      @net: the applicable net namespace
 712  *      @name: name to find
 713  *
 714  *      Find an interface by name.
 715  *      If the name is found a pointer to the device is returned.
 716  *      If the name is not found then %NULL is returned.
 717  *      The reference counters are not incremented so the caller must be
 718  *      careful with locks. The caller must hold RCU lock.
 719  */
 720
 721 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 722 {
 723         struct hlist_node *p;
 724         struct net_device *dev;
 725         struct hlist_head *head = dev_name_hash(net, name);
 726
 727         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 728                 if (!strncmp(dev->name, name, IFNAMSIZ))
 729                         return dev;
 730
 731         return NULL;
 732 }
 733 EXPORT_SYMBOL(dev_get_by_name_rcu);
 734
 735 /**
 736  *      dev_get_by_name         - find a device by its name
 737  *      @net: the applicable net namespace
 738  *      @name: name to find
 739  *
 740  *      Find an interface by name. This can be called from any
 741  *      context and does its own locking. The returned handle has
 742  *      the usage count incremented and the caller must use dev_put() to
 743  *      release it when it is no longer needed. %NULL is returned if no
 744  *      matching device is found.
 745  */
 746
 747 struct net_device *dev_get_by_name(struct net *net, const char *name)
 748 {
 749         struct net_device *dev;
 750
 751         rcu_read_lock();
 752         dev = dev_get_by_name_rcu(net, name);
 753         if (dev)
 754                 dev_hold(dev);
 755         rcu_read_unlock();
 756         return dev;
 757 }
 758 EXPORT_SYMBOL(dev_get_by_name);
 759
 760 /**
 761  *      __dev_get_by_index - find a device by its ifindex
 762  *      @net: the applicable net namespace
 763  *      @ifindex: index of device
 764  *
 765  *      Search for an interface by index. Returns %NULL if the device
 766  *      is not found or a pointer to the device. The device has not
 767  *      had its reference counter increased so the caller must be careful
 768  *      about locking. The caller must hold either the RTNL semaphore
 769  *      or @dev_base_lock.
 770  */
 771
 772 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 773 {
 774         struct hlist_node *p;
 775         struct net_device *dev;
 776         struct hlist_head *head = dev_index_hash(net, ifindex);
 777
 778         hlist_for_each_entry(dev, p, head, index_hlist)
 779                 if (dev->ifindex == ifindex)
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(__dev_get_by_index);
 785
 786 /**
 787  *      dev_get_by_index_rcu - find a device by its ifindex
 788  *      @net: the applicable net namespace
 789  *      @ifindex: index of device
 790  *
 791  *      Search for an interface by index. Returns %NULL if the device
 792  *      is not found or a pointer to the device. The device has not
 793  *      had its reference counter increased so the caller must be careful
 794  *      about locking. The caller must hold RCU lock.
 795  */
 796
 797 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 798 {
 799         struct hlist_node *p;
 800         struct net_device *dev;
 801         struct hlist_head *head = dev_index_hash(net, ifindex);
 802
 803         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 804                 if (dev->ifindex == ifindex)
 805                         return dev;
 806
 807         return NULL;
 808 }
 809 EXPORT_SYMBOL(dev_get_by_index_rcu);
 810
 811
 812 /**
 813  *      dev_get_by_index - find a device by its ifindex
 814  *      @net: the applicable net namespace
 815  *      @ifindex: index of device
 816  *
 817  *      Search for an interface by index. Returns NULL if the device
 818  *      is not found or a pointer to the device. The device returned has
 819  *      had a reference added and the pointer is safe until the user calls
 820  *      dev_put to indicate they have finished with it.
 821  */
 822
 823 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 824 {
 825         struct net_device *dev;
 826
 827         rcu_read_lock();
 828         dev = dev_get_by_index_rcu(net, ifindex);
 829         if (dev)
 830                 dev_hold(dev);
 831         rcu_read_unlock();
 832         return dev;
 833 }
 834 EXPORT_SYMBOL(dev_get_by_index);
 835
 836 /**
 837  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 838  *      @net: the applicable net namespace
 839  *      @type: media type of device
 840  *      @ha: hardware address
 841  *
 842  *      Search for an interface by MAC address. Returns NULL if the device
 843  *      is not found or a pointer to the device.
 844  *      The caller must hold RCU or RTNL.
 845  *      The returned device has not had its ref count increased
 846  *      and the caller must therefore be careful about locking
 847  *
 848  */
 849
 850 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 851                                        const char *ha)
 852 {
 853         struct net_device *dev;
 854
 855         for_each_netdev_rcu(net, dev)
 856                 if (dev->type == type &&
 857                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 858                         return dev;
 859
 860         return NULL;
 861 }
 862 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 863
 864 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 865 {
 866         struct net_device *dev;
 867
 868         ASSERT_RTNL();
 869         for_each_netdev(net, dev)
 870                 if (dev->type == type)
 871                         return dev;
 872
 873         return NULL;
 874 }
 875 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 876
 877 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 878 {
 879         struct net_device *dev, *ret = NULL;
 880
 881         rcu_read_lock();
 882         for_each_netdev_rcu(net, dev)
 883                 if (dev->type == type) {
 884                         dev_hold(dev);
 885                         ret = dev;
 886                         break;
 887                 }
 888         rcu_read_unlock();
 889         return ret;
 890 }
 891 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 892
 893 /**
 894  *      dev_get_by_flags_rcu - find any device with given flags
 895  *      @net: the applicable net namespace
 896  *      @if_flags: IFF_* values
 897  *      @mask: bitmask of bits in if_flags to check
 898  *
 899  *      Search for any interface with the given flags. Returns NULL if a device
 900  *      is not found or a pointer to the device. Must be called inside
 901  *      rcu_read_lock(), and result refcount is unchanged.
 902  */
 903
 904 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 905                                     unsigned short mask)
 906 {
 907         struct net_device *dev, *ret;
 908
 909         ret = NULL;
 910         for_each_netdev_rcu(net, dev) {
 911                 if (((dev->flags ^ if_flags) & mask) == 0) {
 912                         ret = dev;
 913                         break;
 914                 }
 915         }
 916         return ret;
 917 }
 918 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 919
 920 /**
 921  *      dev_valid_name - check if name is okay for network device
 922  *      @name: name string
 923  *
 924  *      Network device names need to be valid file names to
 925  *      to allow sysfs to work.  We also disallow any kind of
 926  *      whitespace.
 927  */
 928 bool dev_valid_name(const char *name)
 929 {
 930         if (*name == '\0')
 931                 return false;
 932         if (strlen(name) >= IFNAMSIZ)
 933                 return false;
 934         if (!strcmp(name, ".") || !strcmp(name, ".."))
 935                 return false;
 936
 937         while (*name) {
 938                 if (*name == '/' || isspace(*name))
 939                         return false;
 940                 name++;
 941         }
 942         return true;
 943 }
 944 EXPORT_SYMBOL(dev_valid_name);
 945
 946 /**
 947  *      __dev_alloc_name - allocate a name for a device
 948  *      @net: network namespace to allocate the device name in
 949  *      @name: name format string
 950  *      @buf:  scratch buffer and result name string
 951  *
 952  *      Passed a format string - eg "lt%d" it will try and find a suitable
 953  *      id. It scans list of devices to build up a free map, then chooses
 954  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 955  *      while allocating the name and adding the device in order to avoid
 956  *      duplicates.
 957  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 958  *      Returns the number of the unit assigned or a negative errno code.
 959  */
 960
 961 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 962 {
 963         int i = 0;
 964         const char *p;
 965         const int max_netdevices = 8*PAGE_SIZE;
 966         unsigned long *inuse;
 967         struct net_device *d;
 968
 969         p = strnchr(name, IFNAMSIZ-1, '%');
 970         if (p) {
 971                 /*
 972                  * Verify the string as this thing may have come from
 973                  * the user.  There must be either one "%d" and no other "%"
 974                  * characters.
 975                  */
 976                 if (p[1] != 'd' || strchr(p + 2, '%'))
 977                         return -EINVAL;
 978
 979                 /* Use one page as a bit array of possible slots */
 980                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 981                 if (!inuse)
 982                         return -ENOMEM;
 983
 984                 for_each_netdev(net, d) {
 985                         if (!sscanf(d->name, name, &i))
 986                                 continue;
 987                         if (i < 0 || i >= max_netdevices)
 988                                 continue;
 989
 990                         /*  avoid cases where sscanf is not exact inverse of printf */
 991                         snprintf(buf, IFNAMSIZ, name, i);
 992                         if (!strncmp(buf, d->name, IFNAMSIZ))
 993                                 set_bit(i, inuse);
 994                 }
 995
 996                 i = find_first_zero_bit(inuse, max_netdevices);
 997                 free_page((unsigned long) inuse);
 998         }
 999
1000         if (buf != name)
1001                 snprintf(buf, IFNAMSIZ, name, i);
1002         if (!__dev_get_by_name(net, buf))
1003                 return i;
1004
1005         /* It is possible to run out of possible slots
1006          * when the name is long and there isn't enough space left
1007          * for the digits, or if all bits are used.
1008          */
1009         return -ENFILE;
1010 }
1011
1012 /**
1013  *      dev_alloc_name - allocate a name for a device
1014  *      @dev: device
1015  *      @name: name format string
1016  *
1017  *      Passed a format string - eg "lt%d" it will try and find a suitable
1018  *      id. It scans list of devices to build up a free map, then chooses
1019  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1020  *      while allocating the name and adding the device in order to avoid
1021  *      duplicates.
1022  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1023  *      Returns the number of the unit assigned or a negative errno code.
1024  */
1025
1026 int dev_alloc_name(struct net_device *dev, const char *name)
1027 {
1028         char buf[IFNAMSIZ];
1029         struct net *net;
1030         int ret;
1031
1032         BUG_ON(!dev_net(dev));
1033         net = dev_net(dev);
1034         ret = __dev_alloc_name(net, name, buf);
1035         if (ret >= 0)
1036                 strlcpy(dev->name, buf, IFNAMSIZ);
1037         return ret;
1038 }
1039 EXPORT_SYMBOL(dev_alloc_name);
1040
1041 static int dev_alloc_name_ns(struct net *net,
1042                              struct net_device *dev,
1043                              const char *name)
1044 {
1045         char buf[IFNAMSIZ];
1046         int ret;
1047
1048         ret = __dev_alloc_name(net, name, buf);
1049         if (ret >= 0)
1050                 strlcpy(dev->name, buf, IFNAMSIZ);
1051         return ret;
1052 }
1053
1054 static int dev_get_valid_name(struct net *net,
1055                               struct net_device *dev,
1056                               const char *name)
1057 {
1058         BUG_ON(!net);
1059
1060         if (!dev_valid_name(name))
1061                 return -EINVAL;
1062
1063         if (strchr(name, '%'))
1064                 return dev_alloc_name_ns(net, dev, name);
1065         else if (__dev_get_by_name(net, name))
1066                 return -EEXIST;
1067         else if (dev->name != name)
1068                 strlcpy(dev->name, name, IFNAMSIZ);
1069
1070         return 0;
1071 }
1072
1073 /**
1074  *      dev_change_name - change name of a device
1075  *      @dev: device
1076  *      @newname: name (or format string) must be at least IFNAMSIZ
1077  *
1078  *      Change name of a device, can pass format strings "eth%d".
1079  *      for wildcarding.
1080  */
1081 int dev_change_name(struct net_device *dev, const char *newname)
1082 {
1083         char oldname[IFNAMSIZ];
1084         int err = 0;
1085         int ret;
1086         struct net *net;
1087
1088         ASSERT_RTNL();
1089         BUG_ON(!dev_net(dev));
1090
1091         net = dev_net(dev);
1092         if (dev->flags & IFF_UP)
1093                 return -EBUSY;
1094
1095         write_seqcount_begin(&devnet_rename_seq);
1096
1097         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1098                 write_seqcount_end(&devnet_rename_seq);
1099                 return 0;
1100         }
1101
1102         memcpy(oldname, dev->name, IFNAMSIZ);
1103
1104         err = dev_get_valid_name(net, dev, newname);
1105         if (err < 0) {
1106                 write_seqcount_end(&devnet_rename_seq);
1107                 return err;
1108         }
1109
1110 rollback:
1111         ret = device_rename(&dev->dev, dev->name);
1112         if (ret) {
1113                 memcpy(dev->name, oldname, IFNAMSIZ);
1114                 write_seqcount_end(&devnet_rename_seq);
1115                 return ret;
1116         }
1117
1118         write_seqcount_end(&devnet_rename_seq);
1119
1120         write_lock_bh(&dev_base_lock);
1121         hlist_del_rcu(&dev->name_hlist);
1122         write_unlock_bh(&dev_base_lock);
1123
1124         synchronize_rcu();
1125
1126         write_lock_bh(&dev_base_lock);
1127         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1128         write_unlock_bh(&dev_base_lock);
1129
1130         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1131         ret = notifier_to_errno(ret);
1132
1133         if (ret) {
1134                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1135                 if (err >= 0) {
1136                         err = ret;
1137                         write_seqcount_begin(&devnet_rename_seq);
1138                         memcpy(dev->name, oldname, IFNAMSIZ);
1139                         goto rollback;
1140                 } else {
1141                         pr_err("%s: name change rollback failed: %d\n",
1142                                dev->name, ret);
1143                 }
1144         }
1145
1146         return err;
1147 }
1148
1149 /**
1150  *      dev_set_alias - change ifalias of a device
1151  *      @dev: device
1152  *      @alias: name up to IFALIASZ
1153  *      @len: limit of bytes to copy from info
1154  *
1155  *      Set ifalias for a device,
1156  */
1157 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1158 {
1159         char *new_ifalias;
1160
1161         ASSERT_RTNL();
1162
1163         if (len >= IFALIASZ)
1164                 return -EINVAL;
1165
1166         if (!len) {
1167                 kfree(dev->ifalias);
1168                 dev->ifalias = NULL;
1169                 return 0;
1170         }
1171
1172         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1173         if (!new_ifalias)
1174                 return -ENOMEM;
1175         dev->ifalias = new_ifalias;
1176
1177         strlcpy(dev->ifalias, alias, len+1);
1178         return len;
1179 }
1180
1181
1182 /**
1183  *      netdev_features_change - device changes features
1184  *      @dev: device to cause notification
1185  *
1186  *      Called to indicate a device has changed features.
1187  */
1188 void netdev_features_change(struct net_device *dev)
1189 {
1190         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1191 }
1192 EXPORT_SYMBOL(netdev_features_change);
1193
1194 /**
1195  *      netdev_state_change - device changes state
1196  *      @dev: device to cause notification
1197  *
1198  *      Called to indicate a device has changed state. This function calls
1199  *      the notifier chains for netdev_chain and sends a NEWLINK message
1200  *      to the routing socket.
1201  */
1202 void netdev_state_change(struct net_device *dev)
1203 {
1204         if (dev->flags & IFF_UP) {
1205                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1206                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1207         }
1208 }
1209 EXPORT_SYMBOL(netdev_state_change);
1210
1211 /**
1212  *      netdev_notify_peers - notify network peers about existence of @dev
1213  *      @dev: network device
1214  *
1215  * Generate traffic such that interested network peers are aware of
1216  * @dev, such as by generating a gratuitous ARP. This may be used when
1217  * a device wants to inform the rest of the network about some sort of
1218  * reconfiguration such as a failover event or virtual machine
1219  * migration.
1220  */
1221 void netdev_notify_peers(struct net_device *dev)
1222 {
1223         rtnl_lock();
1224         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1225         rtnl_unlock();
1226 }
1227 EXPORT_SYMBOL(netdev_notify_peers);
1228
1229 /**
1230  *      dev_load        - load a network module
1231  *      @net: the applicable net namespace
1232  *      @name: name of interface
1233  *
1234  *      If a network interface is not present and the process has suitable
1235  *      privileges this function loads the module. If module loading is not
1236  *      available in this kernel then it becomes a nop.
1237  */
1238
1239 void dev_load(struct net *net, const char *name)
1240 {
1241         struct net_device *dev;
1242         int no_module;
1243
1244         rcu_read_lock();
1245         dev = dev_get_by_name_rcu(net, name);
1246         rcu_read_unlock();
1247
1248         no_module = !dev;
1249         if (no_module && capable(CAP_NET_ADMIN))
1250                 no_module = request_module("netdev-%s", name);
1251         if (no_module && capable(CAP_SYS_MODULE)) {
1252                 if (!request_module("%s", name))
1253                         pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1254                                 name);
1255         }
1256 }
1257 EXPORT_SYMBOL(dev_load);
1258
1259 static int __dev_open(struct net_device *dev)
1260 {
1261         const struct net_device_ops *ops = dev->netdev_ops;
1262         int ret;
1263
1264         ASSERT_RTNL();
1265
1266         if (!netif_device_present(dev))
1267                 return -ENODEV;
1268
1269         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1270         ret = notifier_to_errno(ret);
1271         if (ret)
1272                 return ret;
1273
1274         set_bit(__LINK_STATE_START, &dev->state);
1275
1276         if (ops->ndo_validate_addr)
1277                 ret = ops->ndo_validate_addr(dev);
1278
1279         if (!ret && ops->ndo_open)
1280                 ret = ops->ndo_open(dev);
1281
1282         if (ret)
1283                 clear_bit(__LINK_STATE_START, &dev->state);
1284         else {
1285                 dev->flags |= IFF_UP;
1286                 net_dmaengine_get();
1287                 dev_set_rx_mode(dev);
1288                 dev_activate(dev);
1289                 add_device_randomness(dev->dev_addr, dev->addr_len);
1290         }
1291
1292         return ret;
1293 }
1294
1295 /**
1296  *      dev_open        - prepare an interface for use.
1297  *      @dev:   device to open
1298  *
1299  *      Takes a device from down to up state. The device's private open
1300  *      function is invoked and then the multicast lists are loaded. Finally
1301  *      the device is moved into the up state and a %NETDEV_UP message is
1302  *      sent to the netdev notifier chain.
1303  *
1304  *      Calling this function on an active interface is a nop. On a failure
1305  *      a negative errno code is returned.
1306  */
1307 int dev_open(struct net_device *dev)
1308 {
1309         int ret;
1310
1311         if (dev->flags & IFF_UP)
1312                 return 0;
1313
1314         ret = __dev_open(dev);
1315         if (ret < 0)
1316                 return ret;
1317
1318         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1319         call_netdevice_notifiers(NETDEV_UP, dev);
1320
1321         return ret;
1322 }
1323 EXPORT_SYMBOL(dev_open);
1324
1325 static int __dev_close_many(struct list_head *head)
1326 {
1327         struct net_device *dev;
1328
1329         ASSERT_RTNL();
1330         might_sleep();
1331
1332         list_for_each_entry(dev, head, unreg_list) {
1333                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1334
1335                 clear_bit(__LINK_STATE_START, &dev->state);
1336
1337                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1338                  * can be even on different cpu. So just clear netif_running().
1339                  *
1340                  * dev->stop() will invoke napi_disable() on all of it's
1341                  * napi_struct instances on this device.
1342                  */
1343                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1344         }
1345
1346         dev_deactivate_many(head);
1347
1348         list_for_each_entry(dev, head, unreg_list) {
1349                 const struct net_device_ops *ops = dev->netdev_ops;
1350
1351                 /*
1352                  *      Call the device specific close. This cannot fail.
1353                  *      Only if device is UP
1354                  *
1355                  *      We allow it to be called even after a DETACH hot-plug
1356                  *      event.
1357                  */
1358                 if (ops->ndo_stop)
1359                         ops->ndo_stop(dev);
1360
1361                 dev->flags &= ~IFF_UP;
1362                 net_dmaengine_put();
1363         }
1364
1365         return 0;
1366 }
1367
1368 static int __dev_close(struct net_device *dev)
1369 {
1370         int retval;
1371         LIST_HEAD(single);
1372
1373         list_add(&dev->unreg_list, &single);
1374         retval = __dev_close_many(&single);
1375         list_del(&single);
1376         return retval;
1377 }
1378
1379 static int dev_close_many(struct list_head *head)
1380 {
1381         struct net_device *dev, *tmp;
1382         LIST_HEAD(tmp_list);
1383
1384         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1385                 if (!(dev->flags & IFF_UP))
1386                         list_move(&dev->unreg_list, &tmp_list);
1387
1388         __dev_close_many(head);
1389
1390         list_for_each_entry(dev, head, unreg_list) {
1391                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1392                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1393         }
1394
1395         /* rollback_registered_many needs the complete original list */
1396         list_splice(&tmp_list, head);
1397         return 0;
1398 }
1399
1400 /**
1401  *      dev_close - shutdown an interface.
1402  *      @dev: device to shutdown
1403  *
1404  *      This function moves an active device into down state. A
1405  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1406  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1407  *      chain.
1408  */
1409 int dev_close(struct net_device *dev)
1410 {
1411         if (dev->flags & IFF_UP) {
1412                 LIST_HEAD(single);
1413
1414                 list_add(&dev->unreg_list, &single);
1415                 dev_close_many(&single);
1416                 list_del(&single);
1417         }
1418         return 0;
1419 }
1420 EXPORT_SYMBOL(dev_close);
1421
1422
1423 /**
1424  *      dev_disable_lro - disable Large Receive Offload on a device
1425  *      @dev: device
1426  *
1427  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1428  *      called under RTNL.  This is needed if received packets may be
1429  *      forwarded to another interface.
1430  */
1431 void dev_disable_lro(struct net_device *dev)
1432 {
1433         /*
1434          * If we're trying to disable lro on a vlan device
1435          * use the underlying physical device instead
1436          */
1437         if (is_vlan_dev(dev))
1438                 dev = vlan_dev_real_dev(dev);
1439
1440         dev->wanted_features &= ~NETIF_F_LRO;
1441         netdev_update_features(dev);
1442
1443         if (unlikely(dev->features & NETIF_F_LRO))
1444                 netdev_WARN(dev, "failed to disable LRO!\n");
1445 }
1446 EXPORT_SYMBOL(dev_disable_lro);
1447
1448
/*
 * Starts out as 1. While it is non-zero register_netdevice_notifier()
 * registers the notifier but skips replaying events to it.
 * NOTE(review): the place where this is cleared is outside this section.
 */
static int dev_boot_phase = 1;
1450
1451 /**
1452  *      register_netdevice_notifier - register a network notifier block
1453  *      @nb: notifier
1454  *
1455  *      Register a notifier to be called when network device events occur.
1456  *      The notifier passed is linked into the kernel structures and must
1457  *      not be reused until it has been unregistered. A negative errno code
1458  *      is returned on a failure.
1459  *
1460  *      When registered all registration and up events are replayed
1461  *      to the new notifier to allow device to have a race free
1462  *      view of the network device list.
1463  */
1464
1465 int register_netdevice_notifier(struct notifier_block *nb)
1466 {
1467         struct net_device *dev;
1468         struct net_device *last;
1469         struct net *net;
1470         int err;
1471
1472         rtnl_lock();
1473         err = raw_notifier_chain_register(&netdev_chain, nb);
1474         if (err)
1475                 goto unlock;
1476         if (dev_boot_phase)
1477                 goto unlock;
1478         for_each_net(net) {
1479                 for_each_netdev(net, dev) {
1480                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1481                         err = notifier_to_errno(err);
1482                         if (err)
1483                                 goto rollback;
1484
1485                         if (!(dev->flags & IFF_UP))
1486                                 continue;
1487
1488                         nb->notifier_call(nb, NETDEV_UP, dev);
1489                 }
1490         }
1491
1492 unlock:
1493         rtnl_unlock();
1494         return err;
1495
1496 rollback:
1497         last = dev;
1498         for_each_net(net) {
1499                 for_each_netdev(net, dev) {
1500                         if (dev == last)
1501                                 goto outroll;
1502
1503                         if (dev->flags & IFF_UP) {
1504                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1505                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1506                         }
1507                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1508                 }
1509         }
1510
1511 outroll:
1512         raw_notifier_chain_unregister(&netdev_chain, nb);
1513         goto unlock;
1514 }
1515 EXPORT_SYMBOL(register_netdevice_notifier);
1516
1517 /**
1518  *      unregister_netdevice_notifier - unregister a network notifier block
1519  *      @nb: notifier
1520  *
1521  *      Unregister a notifier previously registered by
1522  *      register_netdevice_notifier(). The notifier is unlinked into the
1523  *      kernel structures and may then be reused. A negative errno code
1524  *      is returned on a failure.
1525  *
1526  *      After unregistering unregister and down device events are synthesized
1527  *      for all devices on the device list to the removed notifier to remove
1528  *      the need for special case cleanup code.
1529  */
1530
1531 int unregister_netdevice_notifier(struct notifier_block *nb)
1532 {
1533         struct net_device *dev;
1534         struct net *net;
1535         int err;
1536
1537         rtnl_lock();
1538         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1539         if (err)
1540                 goto unlock;
1541
1542         for_each_net(net) {
1543                 for_each_netdev(net, dev) {
1544                         if (dev->flags & IFF_UP) {
1545                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1546                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1547                         }
1548                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1549                 }
1550         }
1551 unlock:
1552         rtnl_unlock();
1553         return err;
1554 }
1555 EXPORT_SYMBOL(unregister_netdevice_notifier);
1556
1557 /**
1558  *      call_netdevice_notifiers - call all network notifier blocks
1559  *      @val: value passed unmodified to notifier function
1560  *      @dev: net_device pointer passed unmodified to notifier function
1561  *
1562  *      Call all network notifier blocks.  Parameters and return value
1563  *      are as for raw_notifier_call_chain().
1564  */
1565
1566 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1567 {
1568         ASSERT_RTNL();
1569         return raw_notifier_call_chain(&netdev_chain, val, dev);
1570 }
1571 EXPORT_SYMBOL(call_netdevice_notifiers);
1572
/*
 * Static key tested by the timestamp helpers in this file.
 * net_enable_timestamp() increments it and net_disable_timestamp()
 * decrements it.
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counter holds the number of deferred decrements; it is read and
 * reset by net_enable_timestamp().
 */
static atomic_t netstamp_needed_deferred;
#endif
1581
1582 void net_enable_timestamp(void)
1583 {
1584 #ifdef HAVE_JUMP_LABEL
1585         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1586
1587         if (deferred) {
1588                 while (--deferred)
1589                         static_key_slow_dec(&netstamp_needed);
1590                 return;
1591         }
1592 #endif
1593         WARN_ON(in_interrupt());
1594         static_key_slow_inc(&netstamp_needed);
1595 }
1596 EXPORT_SYMBOL(net_enable_timestamp);
1597
1598 void net_disable_timestamp(void)
1599 {
1600 #ifdef HAVE_JUMP_LABEL
1601         if (in_interrupt()) {
1602                 atomic_inc(&netstamp_needed_deferred);
1603                 return;
1604         }
1605 #endif
1606         static_key_slow_dec(&netstamp_needed);
1607 }
1608 EXPORT_SYMBOL(net_disable_timestamp);
1609
1610 static inline void net_timestamp_set(struct sk_buff *skb)
1611 {
1612         skb->tstamp.tv64 = 0;
1613         if (static_key_false(&netstamp_needed))
1614                 __net_timestamp(skb);
1615 }
1616
/*
 * net_timestamp_check - stamp @SKB if the netstamp_needed static key test
 * passes, @COND is true and the skb carries no timestamp yet
 * (tstamp.tv64 == 0).
 *
 * NOTE(review): this expands to a bare if statement, not a
 * do { } while (0) block, so be careful when using it as the body of an
 * unbraced if/else.
 */
#define net_timestamp_check(COND, SKB)                  \
        if (static_key_false(&netstamp_needed)) {               \
                if ((COND) && !(SKB)->tstamp.tv64)      \
                        __net_timestamp(SKB);           \
        }                                               \
1622
1623 static int net_hwtstamp_validate(struct ifreq *ifr)
1624 {
1625         struct hwtstamp_config cfg;
1626         enum hwtstamp_tx_types tx_type;
1627         enum hwtstamp_rx_filters rx_filter;
1628         int tx_type_valid = 0;
1629         int rx_filter_valid = 0;
1630
1631         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1632                 return -EFAULT;
1633
1634         if (cfg.flags) /* reserved for future extensions */
1635                 return -EINVAL;
1636
1637         tx_type = cfg.tx_type;
1638         rx_filter = cfg.rx_filter;
1639
1640         switch (tx_type) {
1641         case HWTSTAMP_TX_OFF:
1642         case HWTSTAMP_TX_ON:
1643         case HWTSTAMP_TX_ONESTEP_SYNC:
1644                 tx_type_valid = 1;
1645                 break;
1646         }
1647
1648         switch (rx_filter) {
1649         case HWTSTAMP_FILTER_NONE:
1650         case HWTSTAMP_FILTER_ALL:
1651         case HWTSTAMP_FILTER_SOME:
1652         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1653         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1654         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1655         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1656         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1657         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1658         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1659         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1660         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1661         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1662         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1663         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1664                 rx_filter_valid = 1;
1665                 break;
1666         }
1667
1668         if (!tx_type_valid || !rx_filter_valid)
1669                 return -ERANGE;
1670
1671         return 0;
1672 }
1673
1674 static inline bool is_skb_forwardable(struct net_device *dev,
1675                                       struct sk_buff *skb)
1676 {
1677         unsigned int len;
1678
1679         if (!(dev->flags & IFF_UP))
1680                 return false;
1681
1682         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1683         if (skb->len <= len)
1684                 return true;
1685
1686         /* if TSO is enabled, we don't care about the length as the packet
1687          * could be forwarded without being segmented before
1688          */
1689         if (skb_is_gso(skb))
1690                 return true;
1691
1692         return false;
1693 }
1694
1695 /**
1696  * dev_forward_skb - loopback an skb to another netif
1697  *
1698  * @dev: destination network device
1699  * @skb: buffer to forward
1700  *
1701  * return values:
1702  *      NET_RX_SUCCESS  (no congestion)
1703  *      NET_RX_DROP     (packet was dropped, but freed)
1704  *
1705  * dev_forward_skb can be used for injecting an skb from the
1706  * start_xmit function of one device into the receive queue
1707  * of another device.
1708  *
1709  * The receiving device may be in another namespace, so
1710  * we have to clear all information in the skb that could
1711  * impact namespace isolation.
1712  */
1713 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1714 {
1715         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1716                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1717                         atomic_long_inc(&dev->rx_dropped);
1718                         kfree_skb(skb);
1719                         return NET_RX_DROP;
1720                 }
1721         }
1722
1723         skb_orphan(skb);
1724         nf_reset(skb);
1725
1726         if (unlikely(!is_skb_forwardable(dev, skb))) {
1727                 atomic_long_inc(&dev->rx_dropped);
1728                 kfree_skb(skb);
1729                 return NET_RX_DROP;
1730         }
1731         skb->skb_iif = 0;
1732         skb->dev = dev;
1733         skb_dst_drop(skb);
1734         skb->tstamp.tv64 = 0;
1735         skb->pkt_type = PACKET_HOST;
1736         skb->protocol = eth_type_trans(skb, dev);
1737         skb->mark = 0;
1738         secpath_reset(skb);
1739         nf_reset(skb);
1740         return netif_rx(skb);
1741 }
1742 EXPORT_SYMBOL_GPL(dev_forward_skb);
1743
1744 static inline int deliver_skb(struct sk_buff *skb,
1745                               struct packet_type *pt_prev,
1746                               struct net_device *orig_dev)
1747 {
1748         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1749                 return -ENOMEM;
1750         atomic_inc(&skb->users);
1751         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1752 }
1753
1754 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1755 {
1756         if (!ptype->af_packet_priv || !skb->sk)
1757                 return false;
1758
1759         if (ptype->id_match)
1760                 return ptype->id_match(ptype, skb->sk);
1761         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1762                 return true;
1763
1764         return false;
1765 }
1766
1767 /*
1768  *      Support routine. Sends outgoing frames to any network
1769  *      taps currently in use.
1770  */
1771
1772 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1773 {
1774         struct packet_type *ptype;
1775         struct sk_buff *skb2 = NULL;
1776         struct packet_type *pt_prev = NULL;
1777
1778         rcu_read_lock();
1779         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1780                 /* Never send packets back to the socket
1781                  * they originated from - MvS (miquels@drinkel.ow.org)
1782                  */
1783                 if ((ptype->dev == dev || !ptype->dev) &&
1784                     (!skb_loop_sk(ptype, skb))) {
1785                         if (pt_prev) {
1786                                 deliver_skb(skb2, pt_prev, skb->dev);
1787                                 pt_prev = ptype;
1788                                 continue;
1789                         }
1790
1791                         skb2 = skb_clone(skb, GFP_ATOMIC);
1792                         if (!skb2)
1793                                 break;
1794
1795                         net_timestamp_set(skb2);
1796
1797                         /* skb->nh should be correctly
1798                            set by sender, so that the second statement is
1799                            just protection against buggy protocols.
1800                          */
1801                         skb_reset_mac_header(skb2);
1802
1803                         if (skb_network_header(skb2) < skb2->data ||
1804                             skb2->network_header > skb2->tail) {
1805                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1806                                                      ntohs(skb2->protocol),
1807                                                      dev->name);
1808                                 skb_reset_network_header(skb2);
1809                         }
1810
1811                         skb2->transport_header = skb2->network_header;
1812                         skb2->pkt_type = PACKET_OUTGOING;
1813                         pt_prev = ptype;
1814                 }
1815         }
1816         if (pt_prev)
1817                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1818         rcu_read_unlock();
1819 }
1820
1821 /**
1822  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1823  * @dev: Network device
1824  * @txq: number of queues available
1825  *
1826  * If real_num_tx_queues is changed the tc mappings may no longer be
1827  * valid. To resolve this verify the tc mapping remains valid and if
1828  * not NULL the mapping. With no priorities mapping to this
1829  * offset/count pair it will no longer be used. In the worst case TC0
1830  * is invalid nothing can be done so disable priority mappings. If is
1831  * expected that drivers will fix this mapping if they can before
1832  * calling netif_set_real_num_tx_queues.
1833  */
1834 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1835 {
1836         int i;
1837         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1838
1839         /* If TC0 is invalidated disable TC mapping */
1840         if (tc->offset + tc->count > txq) {
1841                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1842                 dev->num_tc = 0;
1843                 return;
1844         }
1845
1846         /* Invalidated prio to tc mappings set to TC0 */
1847         for (i = 1; i < TC_BITMASK + 1; i++) {
1848                 int q = netdev_get_prio_tc_map(dev, i);
1849
1850                 tc = &dev->tc_to_txq[q];
1851                 if (tc->offset + tc->count > txq) {
1852                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1853                                 i, q);
1854                         netdev_set_prio_tc_map(dev, i, 0);
1855                 }
1856         }
1857 }
1858
1859 #ifdef CONFIG_XPS
/* Held by the XPS helpers below while they read or rewrite XPS maps */
static DEFINE_MUTEX(xps_map_mutex);

/* Update-side dereference of an RCU-managed XPS pointer; lockdep checks
 * that xps_map_mutex is held.
 */
#define xmap_dereference(ptr)						\
	rcu_dereference_protected((ptr), lockdep_is_held(&xps_map_mutex))
1863
1864 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1865                                         int cpu, u16 index)
1866 {
1867         struct xps_map *map = NULL;
1868         int pos;
1869
1870         if (dev_maps)
1871                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1872
1873         for (pos = 0; map && pos < map->len; pos++) {
1874                 if (map->queues[pos] == index) {
1875                         if (map->len > 1) {
1876                                 map->queues[pos] = map->queues[--map->len];
1877                         } else {
1878                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1879                                 kfree_rcu(map, rcu);
1880                                 map = NULL;
1881                         }
1882                         break;
1883                 }
1884         }
1885
1886         return map;
1887 }
1888
1889 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1890 {
1891         struct xps_dev_maps *dev_maps;
1892         int cpu, i;
1893         bool active = false;
1894
1895         mutex_lock(&xps_map_mutex);
1896         dev_maps = xmap_dereference(dev->xps_maps);
1897
1898         if (!dev_maps)
1899                 goto out_no_maps;
1900
1901         for_each_possible_cpu(cpu) {
1902                 for (i = index; i < dev->num_tx_queues; i++) {
1903                         if (!remove_xps_queue(dev_maps, cpu, i))
1904                                 break;
1905                 }
1906                 if (i == dev->num_tx_queues)
1907                         active = true;
1908         }
1909
1910         if (!active) {
1911                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1912                 kfree_rcu(dev_maps, rcu);
1913         }
1914
1915         for (i = index; i < dev->num_tx_queues; i++)
1916                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1917                                              NUMA_NO_NODE);
1918
1919 out_no_maps:
1920         mutex_unlock(&xps_map_mutex);
1921 }
1922
1923 static struct xps_map *expand_xps_map(struct xps_map *map,
1924                                       int cpu, u16 index)
1925 {
1926         struct xps_map *new_map;
1927         int alloc_len = XPS_MIN_MAP_ALLOC;
1928         int i, pos;
1929
1930         for (pos = 0; map && pos < map->len; pos++) {
1931                 if (map->queues[pos] != index)
1932                         continue;
1933                 return map;
1934         }
1935
1936         /* Need to add queue to this CPU's existing map */
1937         if (map) {
1938                 if (pos < map->alloc_len)
1939                         return map;
1940
1941                 alloc_len = map->alloc_len * 2;
1942         }
1943
1944         /* Need to allocate new map to store queue on this CPU's map */
1945         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1946                                cpu_to_node(cpu));
1947         if (!new_map)
1948                 return NULL;
1949
1950         for (i = 0; i < pos; i++)
1951                 new_map->queues[i] = map->queues[i];
1952         new_map->alloc_len = alloc_len;
1953         new_map->len = pos;
1954
1955         return new_map;
1956 }
1957
1958 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1959 {
1960         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1961         struct xps_map *map, *new_map;
1962         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1963         int cpu, numa_node_id = -2;
1964         bool active = false;
1965
1966         mutex_lock(&xps_map_mutex);
1967
1968         dev_maps = xmap_dereference(dev->xps_maps);
1969
1970         /* allocate memory for queue storage */
1971         for_each_online_cpu(cpu) {
1972                 if (!cpumask_test_cpu(cpu, mask))
1973                         continue;
1974
1975                 if (!new_dev_maps)
1976                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1977                 if (!new_dev_maps)
1978                         return -ENOMEM;
1979
1980                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1981                                  NULL;
1982
1983                 map = expand_xps_map(map, cpu, index);
1984                 if (!map)
1985                         goto error;
1986
1987                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1988         }
1989
1990         if (!new_dev_maps)
1991                 goto out_no_new_maps;
1992
1993         for_each_possible_cpu(cpu) {
1994                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1995                         /* add queue to CPU maps */
1996                         int pos = 0;
1997
1998                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1999                         while ((pos < map->len) && (map->queues[pos] != index))
2000                                 pos++;
2001
2002                         if (pos == map->len)
2003                                 map->queues[map->len++] = index;
2004 #ifdef CONFIG_NUMA
2005                         if (numa_node_id == -2)
2006                                 numa_node_id = cpu_to_node(cpu);
2007                         else if (numa_node_id != cpu_to_node(cpu))
2008                                 numa_node_id = -1;
2009 #endif
2010                 } else if (dev_maps) {
2011                         /* fill in the new device map from the old device map */
2012                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2013                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2014                 }
2015
2016         }
2017
2018         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2019
2020         /* Cleanup old maps */
2021         if (dev_maps) {
2022                 for_each_possible_cpu(cpu) {
2023                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2024                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2025                         if (map && map != new_map)
2026                                 kfree_rcu(map, rcu);
2027                 }
2028
2029                 kfree_rcu(dev_maps, rcu);
2030         }
2031
2032         dev_maps = new_dev_maps;
2033         active = true;
2034
2035 out_no_new_maps:
2036         /* update Tx queue numa node */
2037         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2038                                      (numa_node_id >= 0) ? numa_node_id :
2039                                      NUMA_NO_NODE);
2040
2041         if (!dev_maps)
2042                 goto out_no_maps;
2043
2044         /* removes queue from unused CPUs */
2045         for_each_possible_cpu(cpu) {
2046                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2047                         continue;
2048
2049                 if (remove_xps_queue(dev_maps, cpu, index))
2050                         active = true;
2051         }
2052
2053         /* free map if not active */
2054         if (!active) {
2055                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2056                 kfree_rcu(dev_maps, rcu);
2057         }
2058
2059 out_no_maps:
2060         mutex_unlock(&xps_map_mutex);
2061
2062         return 0;
2063 error:
2064         /* remove any maps that we added */
2065         for_each_possible_cpu(cpu) {
2066                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2067                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2068                                  NULL;
2069                 if (new_map && new_map != map)
2070                         kfree(new_map);
2071         }
2072
2073         mutex_unlock(&xps_map_mutex);
2074
2075         kfree(new_dev_maps);
2076         return -ENOMEM;
2077 }
2078 EXPORT_SYMBOL(netif_set_xps_queue);
2079
2080 #endif
2081 /*
2082  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2083  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2084  */
2085 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2086 {
2087         int rc;
2088
2089         if (txq < 1 || txq > dev->num_tx_queues)
2090                 return -EINVAL;
2091
2092         if (dev->reg_state == NETREG_REGISTERED ||
2093             dev->reg_state == NETREG_UNREGISTERING) {
2094                 ASSERT_RTNL();
2095
2096                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2097                                                   txq);
2098                 if (rc)
2099                         return rc;
2100
2101                 if (dev->num_tc)
2102                         netif_setup_tc(dev, txq);
2103
2104                 if (txq < dev->real_num_tx_queues) {
2105                         qdisc_reset_all_tx_gt(dev, txq);
2106 #ifdef CONFIG_XPS
2107                         netif_reset_xps_queues_gt(dev, txq);
2108 #endif
2109                 }
2110         }
2111
2112         dev->real_num_tx_queues = txq;
2113         return 0;
2114 }
2115 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2116
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called either with the rtnl_lock held or before the net
 *	device is registered.  Returns 0 on success, or a negative error
 *	code: -EINVAL when @rxq is 0 or larger than dev->num_rx_queues, or
 *	the error reported while updating the queue kobjects of a
 *	registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2149
2150 /**
2151  * netif_get_num_default_rss_queues - default number of RSS queues
2152  *
2153  * This routine should set an upper limit on the number of RSS queues
2154  * used by default by multiqueue devices.
2155  */
2156 int netif_get_num_default_rss_queues(void)
2157 {
2158         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2159 }
2160 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2161
2162 static inline void __netif_reschedule(struct Qdisc *q)
2163 {
2164         struct softnet_data *sd;
2165         unsigned long flags;
2166
2167         local_irq_save(flags);
2168         sd = &__get_cpu_var(softnet_data);
2169         q->next_sched = NULL;
2170         *sd->output_queue_tailp = q;
2171         sd->output_queue_tailp = &q->next_sched;
2172         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2173         local_irq_restore(flags);
2174 }
2175
2176 void __netif_schedule(struct Qdisc *q)
2177 {
2178         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2179                 __netif_reschedule(q);
2180 }
2181 EXPORT_SYMBOL(__netif_schedule);
2182
2183 void dev_kfree_skb_irq(struct sk_buff *skb)
2184 {
2185         if (atomic_dec_and_test(&skb->users)) {
2186                 struct softnet_data *sd;
2187                 unsigned long flags;
2188
2189                 local_irq_save(flags);
2190                 sd = &__get_cpu_var(softnet_data);
2191                 skb->next = sd->completion_queue;
2192                 sd->completion_queue = skb;
2193                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2194                 local_irq_restore(flags);
2195         }
2196 }
2197 EXPORT_SYMBOL(dev_kfree_skb_irq);
2198
/*
 * Free @skb from any context: use dev_kfree_skb_irq() when running in
 * hard-irq context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
2207
2208
2209 /**
2210  * netif_device_detach - mark device as removed
2211  * @dev: network device
2212  *
2213  * Mark device as removed from system and therefore no longer available.
2214  */
2215 void netif_device_detach(struct net_device *dev)
2216 {
2217         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2218             netif_running(dev)) {
2219                 netif_tx_stop_all_queues(dev);
2220         }
2221 }
2222 EXPORT_SYMBOL(netif_device_detach);
2223
2224 /**
2225  * netif_device_attach - mark device as attached
2226  * @dev: network device
2227  *
2228  * Mark device as attached from system and restart if needed.
2229  */
2230 void netif_device_attach(struct net_device *dev)
2231 {
2232         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2233             netif_running(dev)) {
2234                 netif_tx_wake_all_queues(dev);
2235                 __netdev_watchdog_up(dev);
2236         }
2237 }
2238 EXPORT_SYMBOL(netif_device_attach);
2239
2240 static void skb_warn_bad_offload(const struct sk_buff *skb)
2241 {
2242         static const netdev_features_t null_features = 0;
2243         struct net_device *dev = skb->dev;
2244         const char *driver = "";
2245
2246         if (dev && dev->dev.parent)
2247                 driver = dev_driver_string(dev->dev.parent);
2248
2249         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2250              "gso_type=%d ip_summed=%d\n",
2251              driver, dev ? &dev->features : &null_features,
2252              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2253              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2254              skb_shinfo(skb)->gso_type, skb->ip_summed);
2255 }
2256
2257 /*
2258  * Invalidate hardware checksum when packet is to be mangled, and
2259  * complete checksum manually on outgoing path.
2260  */
2261 int skb_checksum_help(struct sk_buff *skb)
2262 {
2263         __wsum csum;
2264         int ret = 0, offset;
2265
2266         if (skb->ip_summed == CHECKSUM_COMPLETE)
2267                 goto out_set_summed;
2268
2269         if (unlikely(skb_shinfo(skb)->gso_size)) {
2270                 skb_warn_bad_offload(skb);
2271                 return -EINVAL;
2272         }
2273
2274         /* Before computing a checksum, we should make sure no frag could
2275          * be modified by an external entity : checksum could be wrong.
2276          */
2277         if (skb_has_shared_frag(skb)) {
2278                 ret = __skb_linearize(skb);
2279                 if (ret)
2280                         goto out;
2281         }
2282
2283         offset = skb_checksum_start_offset(skb);
2284         BUG_ON(offset >= skb_headlen(skb));
2285         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2286
2287         offset += skb->csum_offset;
2288         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2289
2290         if (skb_cloned(skb) &&
2291             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2292                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2293                 if (ret)
2294                         goto out;
2295         }
2296
2297         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2298 out_set_summed:
2299         skb->ip_summed = CHECKSUM_NONE;
2300 out:
2301         return ret;
2302 }
2303 EXPORT_SYMBOL(skb_checksum_help);
2304
2305 /**
2306  *      skb_gso_segment - Perform segmentation on skb.
2307  *      @skb: buffer to segment
2308  *      @features: features for the output path (see dev->features)
2309  *
2310  *      This function segments the given skb and returns a list of segments.
2311  *
2312  *      It may return NULL if the skb requires no segmentation.  This is
2313  *      only possible when GSO is used for verifying header integrity.
2314  */
2315 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2316         netdev_features_t features)
2317 {
2318         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2319         struct packet_offload *ptype;
2320         __be16 type = skb->protocol;
2321         int vlan_depth = ETH_HLEN;
2322         int err;
2323
2324         while (type == htons(ETH_P_8021Q)) {
2325                 struct vlan_hdr *vh;
2326
2327                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2328                         return ERR_PTR(-EINVAL);
2329
2330                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2331                 type = vh->h_vlan_encapsulated_proto;
2332                 vlan_depth += VLAN_HLEN;
2333         }
2334
2335         skb_reset_mac_header(skb);
2336         skb->mac_len = skb->network_header - skb->mac_header;
2337         __skb_pull(skb, skb->mac_len);
2338
2339         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2340                 skb_warn_bad_offload(skb);
2341
2342                 if (skb_header_cloned(skb) &&
2343                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2344                         return ERR_PTR(err);
2345         }
2346
2347         rcu_read_lock();
2348         list_for_each_entry_rcu(ptype, &offload_base, list) {
2349                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2350                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2351                                 err = ptype->callbacks.gso_send_check(skb);
2352                                 segs = ERR_PTR(err);
2353                                 if (err || skb_gso_ok(skb, features))
2354                                         break;
2355                                 __skb_push(skb, (skb->data -
2356                                                  skb_network_header(skb)));
2357                         }
2358                         segs = ptype->callbacks.gso_segment(skb, features);
2359                         break;
2360                 }
2361         }
2362         rcu_read_unlock();
2363
2364         __skb_push(skb, skb->data - skb_mac_header(skb));
2365
2366         return segs;
2367 }
2368 EXPORT_SYMBOL(skb_gso_segment);
2369
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2381
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb is in high memory while @dev
 * lacks NETIF_F_HIGHDMA, or (with PCI_DMA_BUS_IS_PHYS) lies beyond the
 * parent device's DMA mask; 0 otherwise.  Always 0 without
 * CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		dma_addr_t addr = page_to_phys(skb_frag_page(&shinfo->frags[i]));

		/* No mask at all, or the page ends above what it allows */
		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2414
/* Layout of skb->cb as accessed through DEV_GSO_CB() */
struct dev_gso_cb {
	/* destructor to invoke on the skb, may be NULL */
	void (*destructor)(struct sk_buff *skb);
};

/* Interpret the control buffer of @skb as a struct dev_gso_cb */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2420
2421 static void dev_gso_skb_destructor(struct sk_buff *skb)
2422 {
2423         struct dev_gso_cb *cb;
2424
2425         do {
2426                 struct sk_buff *nskb = skb->next;
2427
2428                 skb->next = nskb->next;
2429                 nskb->next = NULL;
2430                 kfree_skb(nskb);
2431         } while (skb->next);
2432
2433         cb = DEV_GSO_CB(skb);
2434         if (cb->destructor)
2435                 cb->destructor(skb);
2436 }
2437
2438 /**
2439  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2440  *      @skb: buffer to segment
2441  *      @features: device features as applicable to this skb
2442  *
2443  *      This function segments the given skb and stores the list of segments
2444  *      in skb->next.
2445  */
2446 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2447 {
2448         struct sk_buff *segs;
2449
2450         segs = skb_gso_segment(skb, features);
2451
2452         /* Verifying header integrity only. */
2453         if (!segs)
2454                 return 0;
2455
2456         if (IS_ERR(segs))
2457                 return PTR_ERR(segs);
2458
2459         skb->next = segs;
2460         DEV_GSO_CB(skb)->destructor = skb->destructor;
2461         skb->destructor = dev_gso_skb_destructor;
2462
2463         return 0;
2464 }
2465
2466 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2467 {
2468         return ((features & NETIF_F_GEN_CSUM) ||
2469                 ((features & NETIF_F_V4_CSUM) &&
2470                  protocol == htons(ETH_P_IP)) ||
2471                 ((features & NETIF_F_V6_CSUM) &&
2472                  protocol == htons(ETH_P_IPV6)) ||
2473                 ((features & NETIF_F_FCOE_CRC) &&
2474                  protocol == htons(ETH_P_FCOE)));
2475 }
2476
2477 static netdev_features_t harmonize_features(struct sk_buff *skb,
2478         __be16 protocol, netdev_features_t features)
2479 {
2480         if (skb->ip_summed != CHECKSUM_NONE &&
2481             !can_checksum_protocol(features, protocol)) {
2482                 features &= ~NETIF_F_ALL_CSUM;
2483                 features &= ~NETIF_F_SG;
2484         } else if (illegal_highdma(skb->dev, skb)) {
2485                 features &= ~NETIF_F_SG;
2486         }
2487
2488         return features;
2489 }
2490
2491 netdev_features_t netif_skb_features(struct sk_buff *skb)
2492 {
2493         __be16 protocol = skb->protocol;
2494         netdev_features_t features = skb->dev->features;
2495
2496         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2497                 features &= ~NETIF_F_GSO_MASK;
2498
2499         if (protocol == htons(ETH_P_8021Q)) {
2500                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2501                 protocol = veh->h_vlan_encapsulated_proto;
2502         } else if (!vlan_tx_tag_present(skb)) {
2503                 return harmonize_features(skb, protocol, features);
2504         }
2505
2506         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2507
2508         if (protocol != htons(ETH_P_8021Q)) {
2509                 return harmonize_features(skb, protocol, features);
2510         } else {
2511                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2512                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2513                 return harmonize_features(skb, protocol, features);
2514         }
2515 }
2516 EXPORT_SYMBOL(netif_skb_features);
2517
2518 /*
2519  * Returns true if either:
2520  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2521  *      2. skb is fragmented and the device does not support SG.
2522  */
2523 static inline int skb_needs_linearize(struct sk_buff *skb,
2524                                       int features)
2525 {
2526         return skb_is_nonlinear(skb) &&
2527                         ((skb_has_frag_list(skb) &&
2528                                 !(features & NETIF_F_FRAGLIST)) ||
2529                         (skb_shinfo(skb)->nr_frags &&
2530                                 !(features & NETIF_F_SG)));
2531 }
2532
/*
 * dev_hard_start_xmit - hand an skb, or a list of GSO segments, to the driver
 *
 * When skb->next is NULL the skb is prepared here (optional dst release,
 * software VLAN tag insertion, GSO segmentation, linearization and checksum
 * completion) and passed to ops->ndo_start_xmit().  When skb->next is set,
 * or segmentation has just produced a list, the segments are unlinked and
 * transmitted one at a time in the "gso" loop.
 *
 * Returns the ndo_start_xmit() result.  On the preparation-failure paths
 * (out_kfree_skb / out) rc still holds its initial NETDEV_TX_OK.
 */
2533 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2534                         struct netdev_queue *txq)
2535 {
2536         const struct net_device_ops *ops = dev->netdev_ops;
2537         int rc = NETDEV_TX_OK;
2538         unsigned int skb_len;
2539
2540         if (likely(!skb->next)) { /* single skb, no segment list attached */
2541                 netdev_features_t features;
2542
2543                 /*
2544                  * If device doesn't need skb->dst, release it right now while
2545                  * it's hot in this cpu cache
2546                  */
2547                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2548                         skb_dst_drop(skb);
2549
2550                 features = netif_skb_features(skb);
2551
2552                 if (vlan_tx_tag_present(skb) &&
2553                     !(features & NETIF_F_HW_VLAN_TX)) {
2554                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2555                         if (unlikely(!skb))
2556                                 goto out; /* no skb left to free; rc is still NETDEV_TX_OK */
2557
2558                         skb->vlan_tci = 0;
2559                 }
2560
2561                 /* If encapsulation offload request, verify we are testing
2562                  * hardware encapsulation features instead of standard
2563                  * features for the netdev
2564                  */
2565                 if (skb->encapsulation)
2566                         features &= dev->hw_enc_features;
2567
2568                 if (netif_needs_gso(skb, features)) {
2569                         if (unlikely(dev_gso_segment(skb, features)))
2570                                 goto out_kfree_skb;
2571                         if (skb->next) /* segmentation attached a list: send it below */
2572                                 goto gso;
2573                 } else {
2574                         if (skb_needs_linearize(skb, features) &&
2575                             __skb_linearize(skb))
2576                                 goto out_kfree_skb;
2577
2578                         /* If packet is not checksummed and device does not
2579                          * support checksumming for this protocol, complete
2580                          * checksumming here.
2581                          */
2582                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2583                                 if (skb->encapsulation)
2584                                         skb_set_inner_transport_header(skb,
2585                                                 skb_checksum_start_offset(skb));
2586                                 else
2587                                         skb_set_transport_header(skb,
2588                                                 skb_checksum_start_offset(skb));
2589                                 if (!(features & NETIF_F_ALL_CSUM) &&
2590                                      skb_checksum_help(skb))
2591                                         goto out_kfree_skb;
2592                         }
2593                 }
2594
2595                 if (!list_empty(&ptype_all)) /* something is registered on ptype_all */
2596                         dev_queue_xmit_nit(skb, dev);
2597
2598                 skb_len = skb->len; /* saved before the driver call, used by the tracepoint */
2599                 rc = ops->ndo_start_xmit(skb, dev);
2600                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2601                 if (rc == NETDEV_TX_OK)
2602                         txq_trans_update(txq);
2603                 return rc;
2604         }
2605
2606 gso:
2607         do {
2608                 struct sk_buff *nskb = skb->next;
2609
2610                 skb->next = nskb->next; /* unlink the first segment from the list */
2611                 nskb->next = NULL;
2612
2613                 /*
2614                  * If device doesn't need nskb->dst, release it right now while
2615                  * it's hot in this cpu cache
2616                  */
2617                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2618                         skb_dst_drop(nskb);
2619
2620                 if (!list_empty(&ptype_all))
2621                         dev_queue_xmit_nit(nskb, dev);
2622
2623                 skb_len = nskb->len;
2624                 rc = ops->ndo_start_xmit(nskb, dev);
2625                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2626                 if (unlikely(rc != NETDEV_TX_OK)) {
2627                         if (rc & ~NETDEV_TX_MASK) /* rc has bits outside NETDEV_TX_MASK: free what is left */
2628                                 goto out_kfree_gso_skb;
2629                         nskb->next = skb->next; /* otherwise relink the unsent segment at the head */
2630                         skb->next = nskb;
2631                         return rc;
2632                 }
2633                 txq_trans_update(txq);
2634                 if (unlikely(netif_xmit_stopped(txq) && skb->next)) /* queue stopped with segments still pending */
2635                         return NETDEV_TX_BUSY;
2636         } while (skb->next);
2637
2638 out_kfree_gso_skb:
2639         if (likely(skb->next == NULL)) /* no segments left: restore the destructor kept in DEV_GSO_CB */
2640                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2641 out_kfree_skb:
2642         kfree_skb(skb);
2643 out:
2644         return rc;
2645 }
2646
/*
 * qdisc_pkt_len_init - initialise qdisc_skb_cb(skb)->pkt_len
 *
 * Starts from skb->len.  For GSO skbs (gso_size != 0) the combined
 * mac + network + transport header length is added once for every
 * segment after the first.
 */
2647 static void qdisc_pkt_len_init(struct sk_buff *skb)
2648 {
2649         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2650
2651         qdisc_skb_cb(skb)->pkt_len = skb->len;
2652
2653         /* To get more precise estimation of bytes sent on wire,
2654          * we add to pkt_len the headers size of all segments
2655          */
2656         if (shinfo->gso_size)  {
2657                 unsigned int hdr_len;
2658
2659                 /* mac layer + network layer */
2660                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2661
2662                 /* + transport layer */
2663                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2664                         hdr_len += tcp_hdrlen(skb);
2665                 else
2666                         hdr_len += sizeof(struct udphdr); /* every non-TCP GSO type is sized as UDP */
2667                 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len; /* NOTE(review): assumes gso_segs >= 1; 0 would wrap in the unsigned multiply - confirm */
2668         }
2669 }
2670
/*
 * __dev_xmit_skb - queue or directly transmit @skb through qdisc @q
 *
 * Takes the qdisc root lock (preceded by q->busylock when the qdisc was
 * seen running) and then does one of:
 *  - frees the skb and returns NET_XMIT_DROP if the qdisc is deactivated;
 *  - transmits directly with sch_direct_xmit() when the qdisc has
 *    TCQ_F_CAN_BYPASS, is empty and qdisc_run_begin() succeeds, returning
 *    NET_XMIT_SUCCESS;
 *  - otherwise calls q->enqueue() and, if qdisc_run_begin() succeeds,
 *    runs the qdisc; the masked enqueue result is returned.
 */
2671 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2672                                  struct net_device *dev,
2673                                  struct netdev_queue *txq)
2674 {
2675         spinlock_t *root_lock = qdisc_lock(q);
2676         bool contended;
2677         int rc;
2678
2679         qdisc_pkt_len_init(skb);
2680         qdisc_calculate_pkt_len(skb, q);
2681         /*
2682          * Heuristic to force contended enqueues to serialize on a
2683          * separate lock before trying to get qdisc main lock.
2684          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2685          * and dequeue packets faster.
2686          */
2687         contended = qdisc_is_running(q);
2688         if (unlikely(contended))
2689                 spin_lock(&q->busylock);
2690
2691         spin_lock(root_lock);
2692         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2693                 kfree_skb(skb);
2694                 rc = NET_XMIT_DROP;
2695         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2696                    qdisc_run_begin(q)) {
2697                 /*
2698                  * This is a work-conserving queue; there are no old skbs
2699                  * waiting to be sent out; and the qdisc is not running -
2700                  * xmit the skb directly.
2701                  */
2702                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2703                         skb_dst_force(skb);
2704
2705                 qdisc_bstats_update(q, skb);
2706
2707                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2708                         if (unlikely(contended)) {
2709                                 spin_unlock(&q->busylock); /* drop busylock before running the qdisc */
2710                                 contended = false;
2711                         }
2712                         __qdisc_run(q);
2713                 } else
2714                         qdisc_run_end(q);
2715
2716                 rc = NET_XMIT_SUCCESS;
2717         } else {
2718                 skb_dst_force(skb);
2719                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2720                 if (qdisc_run_begin(q)) {
2721                         if (unlikely(contended)) {
2722                                 spin_unlock(&q->busylock);
2723                                 contended = false;
2724                         }
2725                         __qdisc_run(q);
2726                 }
2727         }
2728         spin_unlock(root_lock);
2729         if (unlikely(contended)) /* busylock was taken and not released on the paths above */
2730                 spin_unlock(&q->busylock);
2731         return rc;
2732 }
2733
/*
 * skb_update_prio - fill in skb->priority from the device's netprio map
 *
 * Acts only when the skb has no priority yet, has a socket attached and
 * the device has a priomap; the socket's sk_cgrp_prioidx selects the map
 * entry.  Expands to nothing when CONFIG_NETPRIO_CGROUP is not enabled.
 */
2734 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2735 static void skb_update_prio(struct sk_buff *skb)
2736 {
2737         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2738
2739         if (!skb->priority && skb->sk && map) {
2740                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2741
2742                 if (prioidx < map->priomap_len) /* indices outside the map leave priority untouched */
2743                         skb->priority = map->priomap[prioidx];
2744         }
2745 }
2746 #else
2747 #define skb_update_prio(skb)
2748 #endif
2749
2750 static DEFINE_PER_CPU(int, xmit_recursion); /* per-cpu nesting depth of dev_hard_start_xmit() calls made by dev_queue_xmit() */
2751 #define RECURSION_LIMIT 10 /* dev_queue_xmit() gives up once xmit_recursion exceeds this */
2752
2753 /**
2754  *      dev_loopback_xmit - loop back @skb
2755  *      @skb: buffer to transmit
2756  */
2757 int dev_loopback_xmit(struct sk_buff *skb)
2758 {
2759         skb_reset_mac_header(skb);
2760         __skb_pull(skb, skb_network_offset(skb)); /* pull everything in front of the network header */
2761         skb->pkt_type = PACKET_LOOPBACK;
2762         skb->ip_summed = CHECKSUM_UNNECESSARY;
2763         WARN_ON(!skb_dst(skb)); /* a looped-back skb is expected to carry a dst */
2764         skb_dst_force(skb);
2765         netif_rx_ni(skb); /* feed it to the receive path; the result is ignored */
2766         return 0; /* always 0 */
2767 }
2768 EXPORT_SYMBOL(dev_loopback_xmit);
2769
2770 /**
2771  *      dev_queue_xmit - transmit a buffer
2772  *      @skb: buffer to transmit
2773  *
2774  *      Queue a buffer for transmission to a network device. The caller must
2775  *      have set the device and priority and built the buffer before calling
2776  *      this function. The function can be called from an interrupt.
2777  *
2778  *      A negative errno code is returned on a failure. A success does not
2779  *      guarantee the frame will be transmitted as it may be dropped due
2780  *      to congestion or traffic shaping.
2781  *
2782  * -----------------------------------------------------------------------------------
2783  *      I notice this method can also return errors from the queue disciplines,
2784  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2785  *      be positive.
2786  *
2787  *      Regardless of the return value, the skb is consumed, so it is currently
2788  *      difficult to retry a send to this method.  (You can bump the ref count
2789  *      before sending to hold a reference for retry if you are careful.)
2790  *
2791  *      When calling this method, interrupts MUST be enabled.  This is because
2792  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2793  *          --BLG
2794  */
2795 int dev_queue_xmit(struct sk_buff *skb)
2796 {
2797         struct net_device *dev = skb->dev;
2798         struct netdev_queue *txq;
2799         struct Qdisc *q;
2800         int rc = -ENOMEM;
2801
2802         /* Disable soft irqs for various locks below. Also
2803          * stops preemption for RCU.
2804          */
2805         rcu_read_lock_bh();
2806
2807         skb_update_prio(skb);
2808
2809         txq = netdev_pick_tx(dev, skb);
2810         q = rcu_dereference_bh(txq->qdisc);
2811
2812 #ifdef CONFIG_NET_CLS_ACT
2813         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2814 #endif
2815         trace_net_dev_queue(skb);
2816         if (q->enqueue) { /* the qdisc has an enqueue op: go through it */
2817                 rc = __dev_xmit_skb(skb, q, dev, txq);
2818                 goto out;
2819         }
2820
2821         /* The device has no queue. Common case for software devices:
2822            loopback, all the sorts of tunnels...
2823
2824            Really, it is unlikely that netif_tx_lock protection is necessary
2825            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2826            counters.)
2827            However, it is possible, that they rely on protection
2828            made by us here.
2829
2830            Check this and shot the lock. It is not prone from deadlocks.
2831            Either shot noqueue qdisc, it is even simpler 8)
2832          */
2833         if (dev->flags & IFF_UP) {
2834                 int cpu = smp_processor_id(); /* ok because BHs are off */
2835
2836                 if (txq->xmit_lock_owner != cpu) { /* this cpu does not already own the tx lock */
2837
2838                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2839                                 goto recursion_alert;
2840
2841                         HARD_TX_LOCK(dev, txq, cpu);
2842
2843                         if (!netif_xmit_stopped(txq)) {
2844                                 __this_cpu_inc(xmit_recursion); /* depth is tracked only around the driver call */
2845                                 rc = dev_hard_start_xmit(skb, dev, txq);
2846                                 __this_cpu_dec(xmit_recursion);
2847                                 if (dev_xmit_complete(rc)) {
2848                                         HARD_TX_UNLOCK(dev, txq);
2849                                         goto out;
2850                                 }
2851                         }
2852                         HARD_TX_UNLOCK(dev, txq);
2853                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2854                                              dev->name);
2855                 } else {
2856                         /* Recursion is detected! It is possible,
2857                          * unfortunately
2858                          */
2859 recursion_alert:
2860                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2861                                              dev->name);
2862                 }
2863         }
2864
2865         rc = -ENETDOWN; /* reached when the device is down, the queue is stopped, the xmit did not complete, or recursion was detected */
2866         rcu_read_unlock_bh();
2867
2868         kfree_skb(skb); /* the skb is freed here on this failure path */
2869         return rc;
2870 out:
2871         rcu_read_unlock_bh();
2872         return rc;
2873 }
2874 EXPORT_SYMBOL(dev_queue_xmit);
2875
2876
2877 /*=======================================================================
2878                         Receiver routines
2879   =======================================================================*/
2880
2881 int netdev_max_backlog __read_mostly = 1000; /* enqueue_to_backlog() drops once input_pkt_queue is longer than this */
2882 EXPORT_SYMBOL(netdev_max_backlog);
2883
2884 int netdev_tstamp_prequeue __read_mostly = 1; /* passed to net_timestamp_check() in netif_rx(); its negation is used in __netif_receive_skb() */
2885 int netdev_budget __read_mostly = 300;
2886 int weight_p __read_mostly = 64;            /* old backlog weight */
2887
2888 /* Called with irq disabled */
2889 static inline void ____napi_schedule(struct softnet_data *sd,
2890                                      struct napi_struct *napi)
2891 {
2892         list_add_tail(&napi->poll_list, &sd->poll_list); /* append napi to sd's poll list */
2893         __raise_softirq_irqoff(NET_RX_SOFTIRQ); /* then raise NET_RX_SOFTIRQ */
2894 }
2895
2896 #ifdef CONFIG_RPS
2897
2898 /* One global table that all flow-based protocols share. */
2899 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; /* read under RCU in get_rps_cpu() */
2900 EXPORT_SYMBOL(rps_sock_flow_table);
2901
2902 struct static_key rps_needed __read_mostly; /* tested by netif_rx() to decide whether to call get_rps_cpu() */
2903
/*
 * set_rps_cpu - record @next_cpu as the CPU for a flow
 *
 * With CONFIG_RFS_ACCEL, and when the device qualifies, the driver is also
 * asked (ndo_rx_flow_steer) to steer the flow to the RX queue that
 * cpu_rmap_lookup_index() returns for @next_cpu; on success the entry in
 * that queue's flow table replaces @rflow.
 *
 * Returns the rps_dev_flow that was updated: either @rflow or the entry
 * taken from the new queue's flow table.
 */
2904 static struct rps_dev_flow *
2905 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2906             struct rps_dev_flow *rflow, u16 next_cpu)
2907 {
2908         if (next_cpu != RPS_NO_CPU) {
2909 #ifdef CONFIG_RFS_ACCEL
2910                 struct netdev_rx_queue *rxqueue;
2911                 struct rps_dev_flow_table *flow_table;
2912                 struct rps_dev_flow *old_rflow;
2913                 u32 flow_id;
2914                 u16 rxq_index;
2915                 int rc;
2916
2917                 /* Should we steer this flow to a different hardware queue? */
2918                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2919                     !(dev->features & NETIF_F_NTUPLE))
2920                         goto out;
2921                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2922                 if (rxq_index == skb_get_rx_queue(skb)) /* already arriving on that queue */
2923                         goto out;
2924
2925                 rxqueue = dev->_rx + rxq_index;
2926                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2927                 if (!flow_table)
2928                         goto out;
2929                 flow_id = skb->rxhash & flow_table->mask;
2930                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2931                                                         rxq_index, flow_id);
2932                 if (rc < 0) /* driver refused: keep the current flow entry */
2933                         goto out;
2934                 old_rflow = rflow;
2935                 rflow = &flow_table->flows[flow_id];
2936                 rflow->filter = rc; /* non-negative rc is stored as the filter id */
2937                 if (old_rflow->filter == rflow->filter) /* old entry held the same id: clear it there */
2938                         old_rflow->filter = RPS_NO_FILTER;
2939         out:
2940 #endif
2941                 rflow->last_qtail =
2942                         per_cpu(softnet_data, next_cpu).input_queue_head; /* compared against the head again in get_rps_cpu() */
2943         }
2944
2945         rflow->cpu = next_cpu;
2946         return rflow;
2947 }
2948
2949 /*
2950  * get_rps_cpu is called from netif_receive_skb and returns the target
2951  * CPU from the RPS map of the receiving queue for a given skb.
2952  * rcu_read_lock must be held on entry.
2953  */
2954 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2955                        struct rps_dev_flow **rflowp)
2956 {
2957         struct netdev_rx_queue *rxqueue;
2958         struct rps_map *map;
2959         struct rps_dev_flow_table *flow_table;
2960         struct rps_sock_flow_table *sock_flow_table;
2961         int cpu = -1; /* -1 is returned when no target cpu was selected */
2962         u16 tcpu;
2963
2964         if (skb_rx_queue_recorded(skb)) {
2965                 u16 index = skb_get_rx_queue(skb);
2966                 if (unlikely(index >= dev->real_num_rx_queues)) {
2967                         WARN_ONCE(dev->real_num_rx_queues > 1,
2968                                   "%s received packet on queue %u, but number "
2969                                   "of RX queues is %u\n",
2970                                   dev->name, index, dev->real_num_rx_queues);
2971                         goto done;
2972                 }
2973                 rxqueue = dev->_rx + index;
2974         } else
2975                 rxqueue = dev->_rx; /* no queue recorded: use the first RX queue */
2976
2977         map = rcu_dereference(rxqueue->rps_map);
2978         if (map) {
2979                 if (map->len == 1 &&
2980                     !rcu_access_pointer(rxqueue->rps_flow_table)) { /* single-cpu map, no flow table: no hash needed */
2981                         tcpu = map->cpus[0];
2982                         if (cpu_online(tcpu))
2983                                 cpu = tcpu;
2984                         goto done;
2985                 }
2986         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { /* neither map nor flow table configured */
2987                 goto done;
2988         }
2989
2990         skb_reset_network_header(skb);
2991         if (!skb_get_rxhash(skb)) /* a zero hash means the skb cannot be steered */
2992                 goto done;
2993
2994         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2995         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2996         if (flow_table && sock_flow_table) {
2997                 u16 next_cpu;
2998                 struct rps_dev_flow *rflow;
2999
3000                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3001                 tcpu = rflow->cpu;
3002
3003                 next_cpu = sock_flow_table->ents[skb->rxhash &
3004                     sock_flow_table->mask];
3005
3006                 /*
3007                  * If the desired CPU (where last recvmsg was done) is
3008                  * different from current CPU (one in the rx-queue flow
3009                  * table entry), switch if one of the following holds:
3010                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3011                  *   - Current CPU is offline.
3012                  *   - The current CPU's queue tail has advanced beyond the
3013                  *     last packet that was enqueued using this table entry.
3014                  *     This guarantees that all previous packets for the flow
3015                  *     have been dequeued, thus preserving in order delivery.
3016                  */
3017                 if (unlikely(tcpu != next_cpu) &&
3018                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3019                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3020                       rflow->last_qtail)) >= 0)) {
3021                         tcpu = next_cpu;
3022                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3023                 }
3024
3025                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3026                         *rflowp = rflow; /* *rflowp is written only on this path */
3027                         cpu = tcpu;
3028                         goto done;
3029                 }
3030         }
3031
3032         if (map) {
3033                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; /* scales rxhash to an index below map->len (assuming a 32-bit rxhash) */
3034
3035                 if (cpu_online(tcpu)) {
3036                         cpu = tcpu;
3037                         goto done;
3038                 }
3039         }
3040
3041 done:
3042         return cpu;
3043 }
3044
3045 #ifdef CONFIG_RFS_ACCEL
3046
3047 /**
3048  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3049  * @dev: Device on which the filter was set
3050  * @rxq_index: RX queue index
3051  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3052  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3053  *
3054  * Drivers that implement ndo_rx_flow_steer() should periodically call
3055  * this function for each installed filter and remove the filters for
3056  * which it returns %true.
3057  */
3058 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3059                          u32 flow_id, u16 filter_id)
3060 {
3061         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3062         struct rps_dev_flow_table *flow_table;
3063         struct rps_dev_flow *rflow;
3064         bool expire = true; /* default answer: the filter may be removed */
3065         int cpu;
3066
3067         rcu_read_lock();
3068         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3069         if (flow_table && flow_id <= flow_table->mask) { /* flow_id must index inside the table */
3070                 rflow = &flow_table->flows[flow_id];
3071                 cpu = ACCESS_ONCE(rflow->cpu); /* read once; the value is used twice below */
3072                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3073                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3074                            rflow->last_qtail) <
3075                      (int)(10 * flow_table->mask))) /* head advanced by less than 10 * mask since last_qtail */
3076                         expire = false;
3077         }
3078         rcu_read_unlock();
3079         return expire;
3080 }
3081 EXPORT_SYMBOL(rps_may_expire_flow);
3082
3083 #endif /* CONFIG_RFS_ACCEL */
3084
3085 /* Called from hardirq (IPI) context */
3086 static void rps_trigger_softirq(void *data)
3087 {
3088         struct softnet_data *sd = data; /* data is the softnet_data whose backlog should be polled */
3089
3090         ____napi_schedule(sd, &sd->backlog);
3091         sd->received_rps++; /* counter bumped once per invocation */
3092 }
3093
3094 #endif /* CONFIG_RPS */
3095
3096 /*
3097  * Check if this softnet_data structure belongs to another cpu
3098  * If yes, queue it to our IPI list and return 1
3099  * If no, return 0
3100  */
3101 static int rps_ipi_queued(struct softnet_data *sd)
3102 {
3103 #ifdef CONFIG_RPS
3104         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3105
3106         if (sd != mysd) {
3107                 sd->rps_ipi_next = mysd->rps_ipi_list; /* push sd onto this cpu's singly linked rps_ipi_list */
3108                 mysd->rps_ipi_list = sd;
3109
3110                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3111                 return 1;
3112         }
3113 #endif /* CONFIG_RPS */
3114         return 0; /* sd is this cpu's own, or RPS is compiled out */
3115 }
3116
3117 /*
3118  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3119  * queue (may be a remote CPU queue).
3120  */
3121 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3122                               unsigned int *qtail)
3123 {
3124         struct softnet_data *sd;
3125         unsigned long flags;
3126
3127         sd = &per_cpu(softnet_data, cpu);
3128
3129         local_irq_save(flags);
3130
3131         rps_lock(sd);
3132         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { /* queue length is still within the limit */
3133                 if (skb_queue_len(&sd->input_pkt_queue)) { /* non-empty queue: append without scheduling */
3134 enqueue:
3135                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3136                         input_queue_tail_incr_save(sd, qtail);
3137                         rps_unlock(sd);
3138                         local_irq_restore(flags);
3139                         return NET_RX_SUCCESS;
3140                 }
3141
3142                 /* Schedule NAPI for backlog device
3143                  * We can use non atomic operation since we own the queue lock
3144                  */
3145                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3146                         if (!rps_ipi_queued(sd)) /* 0 means sd is local: schedule it directly */
3147                                 ____napi_schedule(sd, &sd->backlog);
3148                 }
3149                 goto enqueue; /* jumps into the branch above to queue the skb */
3150         }
3151
3152         sd->dropped++; /* backlog over the limit: count and drop */
3153         rps_unlock(sd);
3154
3155         local_irq_restore(flags);
3156
3157         atomic_long_inc(&skb->dev->rx_dropped);
3158         kfree_skb(skb);
3159         return NET_RX_DROP;
3160 }
3161
3162 /**
3163  *      netif_rx        -       post buffer to the network code
3164  *      @skb: buffer to post
3165  *
3166  *      This function receives a packet from a device driver and queues it for
3167  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3168  *      may be dropped during processing for congestion control or by the
3169  *      protocol layers.
3170  *
3171  *      return values:
3172  *      NET_RX_SUCCESS  (no congestion)
3173  *      NET_RX_DROP     (packet was dropped)
3174  *
3175  */
3176
3177 int netif_rx(struct sk_buff *skb)
3178 {
3179         int ret;
3180
3181         /* if netpoll wants it, pretend we never saw it */
3182         if (netpoll_rx(skb))
3183                 return NET_RX_DROP;
3184
3185         net_timestamp_check(netdev_tstamp_prequeue, skb);
3186
3187         trace_netif_rx(skb);
3188 #ifdef CONFIG_RPS
3189         if (static_key_false(&rps_needed)) {
3190                 struct rps_dev_flow voidflow, *rflow = &voidflow; /* dummy flow keeps &rflow->last_qtail valid when get_rps_cpu() sets none */
3191                 int cpu;
3192
3193                 preempt_disable();
3194                 rcu_read_lock();
3195
3196                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3197                 if (cpu < 0) /* no RPS target: queue on the local cpu */
3198                         cpu = smp_processor_id();
3199
3200                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3201
3202                 rcu_read_unlock();
3203                 preempt_enable();
3204         } else
3205 #endif
3206         {
3207                 unsigned int qtail; /* passed to enqueue_to_backlog() and not read afterwards */
3208                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3209                 put_cpu();
3210         }
3211         return ret;
3212 }
3213 EXPORT_SYMBOL(netif_rx);
3214
/*
 * netif_rx_ni - netif_rx() followed by processing of pending softirqs
 *
 * Calls netif_rx() with preemption disabled and, if softirqs are pending
 * afterwards, runs them with do_softirq().  Returns netif_rx()'s result.
 */
3215 int netif_rx_ni(struct sk_buff *skb)
3216 {
3217         int err;
3218
3219         preempt_disable();
3220         err = netif_rx(skb);
3221         if (local_softirq_pending())
3222                 do_softirq();
3223         preempt_enable();
3224
3225         return err;
3226 }
3227 EXPORT_SYMBOL(netif_rx_ni);
3228
/*
 * net_tx_action - handler with the softirq_action signature
 *
 * Frees the skbs collected on this cpu's completion_queue, then runs or
 * reschedules every qdisc found on this cpu's output_queue.  Presumably
 * registered for NET_TX_SOFTIRQ; the registration is not in this part of
 * the file.
 */
3229 static void net_tx_action(struct softirq_action *h)
3230 {
3231         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3232
3233         if (sd->completion_queue) {
3234                 struct sk_buff *clist;
3235
3236                 local_irq_disable(); /* detach the whole list with irqs off */
3237                 clist = sd->completion_queue;
3238                 sd->completion_queue = NULL;
3239                 local_irq_enable();
3240
3241                 while (clist) {
3242                         struct sk_buff *skb = clist;
3243                         clist = clist->next;
3244
3245                         WARN_ON(atomic_read(&skb->users)); /* skbs on this list are expected to have a zero user count */
3246                         trace_kfree_skb(skb, net_tx_action);
3247                         __kfree_skb(skb);
3248                 }
3249         }
3250
3251         if (sd->output_queue) {
3252                 struct Qdisc *head;
3253
3254                 local_irq_disable(); /* same detach pattern; the tail pointer is reset too */
3255                 head = sd->output_queue;
3256                 sd->output_queue = NULL;
3257                 sd->output_queue_tailp = &sd->output_queue;
3258                 local_irq_enable();
3259
3260                 while (head) {
3261                         struct Qdisc *q = head;
3262                         spinlock_t *root_lock;
3263
3264                         head = head->next_sched;
3265
3266                         root_lock = qdisc_lock(q);
3267                         if (spin_trylock(root_lock)) { /* got the root lock: clear SCHED and run the qdisc */
3268                                 smp_mb__before_clear_bit();
3269                                 clear_bit(__QDISC_STATE_SCHED,
3270                                           &q->state);
3271                                 qdisc_run(q);
3272                                 spin_unlock(root_lock);
3273                         } else {
3274                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3275                                               &q->state)) { /* lock busy, qdisc still active: try again later */
3276                                         __netif_reschedule(q);
3277                                 } else { /* deactivated qdisc: just clear the SCHED bit */
3278                                         smp_mb__before_clear_bit();
3279                                         clear_bit(__QDISC_STATE_SCHED,
3280                                                   &q->state);
3281                                 }
3282                         }
3283                 }
3284         }
3285 }
3286
3287 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3288     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3289 /* This hook is defined here for ATM LANE */
3290 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3291                              unsigned char *addr) __read_mostly; /* NULL until assigned; no assignment is visible in this part of the file */
3292 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3293 #endif
3294
3295 #ifdef CONFIG_NET_CLS_ACT
3296 /* TODO: Maybe we should just force sch_ingress to be compiled in
3297  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3298  * a compare and 2 stores extra right now if we dont have it on
3299  * but have CONFIG_NET_CLS_ACT
3300  * NOTE: This doesn't stop any functionality; if you dont have
3301  * the ingress scheduler, you just can't add policies on ingress.
3302  *
3303  */
3304 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3305 {
3306         struct net_device *dev = skb->dev;
3307         u32 ttl = G_TC_RTTL(skb->tc_verd); /* counter kept in tc_verd, limited by MAX_RED_LOOP below */
3308         int result = TC_ACT_OK; /* returned unchanged if nothing is enqueued */
3309         struct Qdisc *q;
3310
3311         if (unlikely(MAX_RED_LOOP < ttl++)) { /* compares the old value, then increments */
3312                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3313                                      skb->skb_iif, dev->ifindex);
3314                 return TC_ACT_SHOT;
3315         }
3316
3317         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); /* store the incremented counter back */
3318         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3319
3320         q = rxq->qdisc;
3321         if (q != &noop_qdisc) {
3322                 spin_lock(qdisc_lock(q));
3323                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3324                         result = qdisc_enqueue_root(skb, q);
3325                 spin_unlock(qdisc_lock(q));
3326         }
3327
3328         return result;
3329 }
3330
/*
 * handle_ing - run the ingress qdisc, if one is attached, on a received skb
 *
 * A pending *pt_prev delivery is flushed before filtering.  Returns NULL
 * when ing_filter() yields TC_ACT_SHOT or TC_ACT_STOLEN (the skb is freed
 * here); otherwise returns the skb with tc_verd cleared.
 */
3331 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3332                                          struct packet_type **pt_prev,
3333                                          int *ret, struct net_device *orig_dev)
3334 {
3335         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3336
3337         if (!rxq || rxq->qdisc == &noop_qdisc) /* no ingress queue, or only the noop qdisc */
3338                 goto out;
3339
3340         if (*pt_prev) {
3341                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3342                 *pt_prev = NULL;
3343         }
3344
3345         switch (ing_filter(skb, rxq)) {
3346         case TC_ACT_SHOT:
3347         case TC_ACT_STOLEN:
3348                 kfree_skb(skb);
3349                 return NULL;
3350         }
3351
3352 out:
3353         skb->tc_verd = 0;
3354         return skb;
3355 }
3356 #endif
3356 #endif
3357
3358 /**
3359  *      netdev_rx_handler_register - register receive handler
3360  *      @dev: device to register a handler for
3361  *      @rx_handler: receive handler to register
3362  *      @rx_handler_data: data pointer that is used by rx handler
3363  *
3364  *      Register a receive handler for a device. This handler will then be
3365  *      called from __netif_receive_skb. A negative errno code is returned
3366  *      on a failure.
3367  *
3368  *      The caller must hold the rtnl_mutex.
3369  *
3370  *      For a general description of rx_handler, see enum rx_handler_result.
3371  */
3372 int netdev_rx_handler_register(struct net_device *dev,
3373                                rx_handler_func_t *rx_handler,
3374                                void *rx_handler_data)
3375 {
3376         ASSERT_RTNL();
3377
3378         if (dev->rx_handler) /* a handler is already installed on this device */
3379                 return -EBUSY;
3380
3381         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); /* data is published before the handler pointer */
3382         rcu_assign_pointer(dev->rx_handler, rx_handler);
3383
3384         return 0;
3385 }
3386 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3387
3388 /**
3389  *      netdev_rx_handler_unregister - unregister receive handler
3390  *      @dev: device to unregister a handler from
3391  *
3392  *      Unregister a receive hander from a device.
3393  *
3394  *      The caller must hold the rtnl_mutex.
3395  */
3396 void netdev_rx_handler_unregister(struct net_device *dev)
3397 {
3398
3399         ASSERT_RTNL();
3400         RCU_INIT_POINTER(dev->rx_handler, NULL);
3401         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3402 }
3403 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3404
3405 /*
3406  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3407  * the special handling of PFMEMALLOC skbs.
3408  */
3409 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3410 {
3411         switch (skb->protocol) {
3412         case __constant_htons(ETH_P_ARP):
3413         case __constant_htons(ETH_P_IP):
3414         case __constant_htons(ETH_P_IPV6):
3415         case __constant_htons(ETH_P_8021Q):
3416                 return true;
3417         default:
3418                 return false;
3419         }
3420 }
3421
/*
 * __netif_receive_skb - core of the receive path for one skb
 *
 * Walks the skb through, in order: the netpoll check, VLAN untagging, the
 * ptype_all handlers, ingress filtering (CONFIG_NET_CLS_ACT), VLAN
 * reception, the device's rx_handler, and finally the handlers in
 * ptype_base that match skb->protocol.
 *
 * Delivery is deferred by one handler: a matching handler is remembered in
 * pt_prev and only handed the skb through deliver_skb() once the next match
 * (or the next processing stage) comes up.  The last match is invoked
 * directly through pt_prev->func().
 *
 * Returns the result of the last delivery, or NET_RX_DROP if the skb was
 * dropped or nothing took it.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *null_or_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;
        /* Saved so the PF_MEMALLOC bit can be restored on the way out. */
        unsigned long pflags = current->flags;

        net_timestamp_check(!netdev_tstamp_prequeue, skb);

        trace_netif_receive_skb(skb);

        /*
         * PFMEMALLOC skbs are special, they should
         * - be delivered to SOCK_MEMALLOC sockets only
         * - stay away from userspace
         * - have bounded memory usage
         *
         * Use PF_MEMALLOC as this saves us from propagating the allocation
         * context down to all allocation sites.
         */
        if (sk_memalloc_socks() && skb_pfmemalloc(skb))
                current->flags |= PF_MEMALLOC;

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                goto out;

        /* Remember the original device; skb->dev may change between rounds. */
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

        rcu_read_lock();

        /* Re-entered when VLAN reception or an rx_handler asks for another pass. */
another_round:
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
                skb = vlan_untag(skb);
                if (unlikely(!skb))
                        goto unlock;
        }

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip both the taps and ingress filtering. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* PFMEMALLOC skbs are not shown to the ptype_all handlers. */
        if (sk_memalloc_socks() && skb_pfmemalloc(skb))
                goto skip_taps;

        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
        /* A NULL return means the ingress filter consumed (freed) the skb. */
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto unlock;
ncls:
#endif

        /* PFMEMALLOC skbs of a protocol without special handling are dropped. */
        if (sk_memalloc_socks() && skb_pfmemalloc(skb)
                                && !skb_pfmemalloc_protocol(skb))
                goto drop;

        if (vlan_tx_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto unlock;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        goto unlock;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (vlan_tx_nonzero_tag_present(skb))
                skb->pkt_type = PACKET_OTHERHOST;

        /* deliver only exact match when indicated */
        null_or_dev = deliver_exact ? skb->dev : NULL;

        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* The last matching handler gets the skb itself. */
                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
                        goto drop;
                else
                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                /* Also reached via "goto drop": count the drop and free the skb. */
drop:
                atomic_long_inc(&skb->dev->rx_dropped);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

unlock:
        rcu_read_unlock();
out:
        /* Put the PF_MEMALLOC bit back to the value it had on entry. */
        tsk_restore_flags(current, pflags, PF_MEMALLOC);
        return ret;
}
3575
3576 /**
3577  *      netif_receive_skb - process receive buffer from network
3578  *      @skb: buffer to process
3579  *
3580  *      netif_receive_skb() is the main receive data processing function.
3581  *      It always succeeds. The buffer may be dropped during processing
3582  *      for congestion control or by the protocol layers.
3583  *
3584  *      This function may only be called from softirq context and interrupts
3585  *      should be enabled.
3586  *
3587  *      Return values (usually ignored):
3588  *      NET_RX_SUCCESS: no congestion
3589  *      NET_RX_DROP: packet was dropped
3590  */
3591 int netif_receive_skb(struct sk_buff *skb)
3592 {
3593         net_timestamp_check(netdev_tstamp_prequeue, skb);
3594
3595         if (skb_defer_rx_timestamp(skb))
3596                 return NET_RX_SUCCESS;
3597
3598 #ifdef CONFIG_RPS
3599         if (static_key_false(&rps_needed)) {
3600                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3601                 int cpu, ret;
3602
3603                 rcu_read_lock();
3604
3605                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3606
3607                 if (cpu >= 0) {
3608                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3609                         rcu_read_unlock();
3610                         return ret;
3611                 }
3612                 rcu_read_unlock();
3613         }
3614 #endif
3615         return __netif_receive_skb(skb);
3616 }
3617 EXPORT_SYMBOL(netif_receive_skb);
3618
3619 /* Network device is going away, flush any packets still pending
3620  * Called with irqs disabled.
3621  */
3622 static void flush_backlog(void *arg)
3623 {
3624         struct net_device *dev = arg;
3625         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3626         struct sk_buff *skb, *tmp;
3627
3628         rps_lock(sd);
3629         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3630                 if (skb->dev == dev) {
3631                         __skb_unlink(skb, &sd->input_pkt_queue);
3632                         kfree_skb(skb);
3633                         input_queue_head_incr(sd);
3634                 }
3635         }
3636         rps_unlock(sd);
3637
3638         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3639                 if (skb->dev == dev) {
3640                         __skb_unlink(skb, &sd->process_queue);
3641                         kfree_skb(skb);
3642                         input_queue_head_incr(sd);
3643                 }
3644         }
3645 }
3646
3647 static int napi_gro_complete(struct sk_buff *skb)
3648 {
3649         struct packet_offload *ptype;
3650         __be16 type = skb->protocol;
3651         struct list_head *head = &offload_base;
3652         int err = -ENOENT;
3653
3654         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3655
3656         if (NAPI_GRO_CB(skb)->count == 1) {
3657                 skb_shinfo(skb)->gso_size = 0;
3658                 goto out;
3659         }
3660
3661         rcu_read_lock();
3662         list_for_each_entry_rcu(ptype, head, list) {
3663                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3664                         continue;
3665
3666                 err = ptype->callbacks.gro_complete(skb);
3667                 break;
3668         }
3669         rcu_read_unlock();
3670
3671         if (err) {
3672                 WARN_ON(&ptype->list == head);
3673                 kfree_skb(skb);
3674                 return NET_RX_SUCCESS;
3675         }
3676
3677 out:
3678         return netif_receive_skb(skb);
3679 }
3680
3681 /* napi->gro_list contains packets ordered by age.
3682  * youngest packets at the head of it.
3683  * Complete skbs in reverse order to reduce latencies.
3684  */
3685 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3686 {
3687         struct sk_buff *skb, *prev = NULL;
3688
3689         /* scan list and build reverse chain */
3690         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3691                 skb->prev = prev;
3692                 prev = skb;
3693         }
3694
3695         for (skb = prev; skb; skb = prev) {
3696                 skb->next = NULL;
3697
3698                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3699                         return;
3700
3701                 prev = skb->prev;
3702                 napi_gro_complete(skb);
3703                 napi->gro_count--;
3704         }
3705
3706         napi->gro_list = NULL;
3707 }
3708 EXPORT_SYMBOL(napi_gro_flush);
3709
3710 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3711 {
3712         struct sk_buff *p;
3713         unsigned int maclen = skb->dev->hard_header_len;
3714
3715         for (p = napi->gro_list; p; p = p->next) {
3716                 unsigned long diffs;
3717
3718                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3719                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3720                 if (maclen == ETH_HLEN)
3721                         diffs |= compare_ether_header(skb_mac_header(p),
3722                                                       skb_gro_mac_header(skb));
3723                 else if (!diffs)
3724                         diffs = memcmp(skb_mac_header(p),
3725                                        skb_gro_mac_header(skb),
3726                                        maclen);
3727                 NAPI_GRO_CB(p)->same_flow = !diffs;
3728                 NAPI_GRO_CB(p)->flush = 0;
3729         }
3730 }
3731
/*
 * dev_gro_receive - offer @skb to the GRO handler for its protocol
 *
 * Returns GRO_NORMAL when GRO does not apply (the device lacks
 * NETIF_F_GRO, netpoll rx is on, the skb is already GSO or has a frag
 * list, no offload handler matches, the handler asked for a flush, or
 * MAX_GRO_SKBS packets are already held).  Returns GRO_MERGED or
 * GRO_MERGED_FREE when the handler reported the skb as part of an existing
 * flow, and GRO_HELD when the skb was added to napi->gro_list.
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_offload *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &offload_base;
        int same_flow;
        int mac_len;
        enum gro_result ret;

        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        gro_list_prepare(napi, skb);

        rcu_read_lock();
        /* Only the first handler with a matching type and a gro_receive hook runs. */
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || !ptype->callbacks.gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The walk ran off the end of the list: no handler took the skb. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* The handler pointed at a held skb: unlink it and complete it now. */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* Start holding this skb at the head of the GRO list. */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        NAPI_GRO_CB(skb)->age = jiffies;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

        /*
         * Reached for GRO_HELD and, via "normal", for GRO_NORMAL.  If the
         * GRO offset lies beyond the linear area, copy the missing bytes
         * from frag0 into the tail and shrink frags[0] by the same amount.
         */
pull:
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

                /* frags[0] is now empty: release it and shift the rest down. */
                if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
                        skb_frag_unref(skb, 0);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
3825
3826
3827 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3828 {
3829         switch (ret) {
3830         case GRO_NORMAL:
3831                 if (netif_receive_skb(skb))
3832                         ret = GRO_DROP;
3833                 break;
3834
3835         case GRO_DROP:
3836                 kfree_skb(skb);
3837                 break;
3838
3839         case GRO_MERGED_FREE:
3840                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3841                         kmem_cache_free(skbuff_head_cache, skb);
3842                 else
3843                         __kfree_skb(skb);
3844                 break;
3845
3846         case GRO_HELD:
3847         case GRO_MERGED:
3848                 break;
3849         }
3850
3851         return ret;
3852 }
3853
3854 static void skb_gro_reset_offset(struct sk_buff *skb)
3855 {
3856         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3857         const skb_frag_t *frag0 = &pinfo->frags[0];
3858
3859         NAPI_GRO_CB(skb)->data_offset = 0;
3860         NAPI_GRO_CB(skb)->frag0 = NULL;
3861         NAPI_GRO_CB(skb)->frag0_len = 0;
3862
3863         if (skb->mac_header == skb->tail &&
3864             pinfo->nr_frags &&
3865             !PageHighMem(skb_frag_page(frag0))) {
3866                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3867                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3868         }
3869 }
3870
3871 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3872 {
3873         skb_gro_reset_offset(skb);
3874
3875         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3876 }
3877 EXPORT_SYMBOL(napi_gro_receive);
3878
3879 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3880 {
3881         __skb_pull(skb, skb_headlen(skb));
3882         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3883         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3884         skb->vlan_tci = 0;
3885         skb->dev = napi->dev;
3886         skb->skb_iif = 0;
3887
3888         napi->skb = skb;
3889 }
3890
3891 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3892 {
3893         struct sk_buff *skb = napi->skb;
3894
3895         if (!skb) {
3896                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3897                 if (skb)
3898                         napi->skb = skb;
3899         }
3900         return skb;
3901 }
3902 EXPORT_SYMBOL(napi_get_frags);
3903
3904 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3905                                gro_result_t ret)
3906 {
3907         switch (ret) {
3908         case GRO_NORMAL:
3909         case GRO_HELD:
3910                 skb->protocol = eth_type_trans(skb, skb->dev);
3911
3912                 if (ret == GRO_HELD)
3913                         skb_gro_pull(skb, -ETH_HLEN);
3914                 else if (netif_receive_skb(skb))
3915                         ret = GRO_DROP;
3916                 break;
3917
3918         case GRO_DROP:
3919         case GRO_MERGED_FREE:
3920                 napi_reuse_skb(napi, skb);
3921                 break;
3922
3923         case GRO_MERGED:
3924                 break;
3925         }
3926
3927         return ret;
3928 }
3929
3930 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3931 {
3932         struct sk_buff *skb = napi->skb;
3933         struct ethhdr *eth;
3934         unsigned int hlen;
3935         unsigned int off;
3936
3937         napi->skb = NULL;
3938
3939         skb_reset_mac_header(skb);
3940         skb_gro_reset_offset(skb);
3941
3942         off = skb_gro_offset(skb);
3943         hlen = off + sizeof(*eth);
3944         eth = skb_gro_header_fast(skb, off);
3945         if (skb_gro_header_hard(skb, hlen)) {
3946                 eth = skb_gro_header_slow(skb, hlen, off);
3947                 if (unlikely(!eth)) {
3948                         napi_reuse_skb(napi, skb);
3949                         skb = NULL;
3950                         goto out;
3951                 }
3952         }
3953
3954         skb_gro_pull(skb, sizeof(*eth));
3955
3956         /*
3957          * This works because the only protocols we care about don't require
3958          * special handling.  We'll fix it up properly at the end.
3959          */
3960         skb->protocol = eth->h_proto;
3961
3962 out:
3963         return skb;
3964 }
3965
3966 gro_result_t napi_gro_frags(struct napi_struct *napi)
3967 {
3968         struct sk_buff *skb = napi_frags_skb(napi);
3969
3970         if (!skb)
3971                 return GRO_DROP;
3972
3973         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3974 }
3975 EXPORT_SYMBOL(napi_gro_frags);
3976
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        if (!pending) {
                local_irq_enable();
                return;
        }

        /* Detach the list while irqs are still off, then walk it. */
        sd->rps_ipi_list = NULL;

        local_irq_enable();

        /* Send pending IPI's to kick RPS processing on remote cpus. */
        while (pending) {
                struct softnet_data *following = pending->rps_ipi_next;

                if (cpu_online(pending->cpu))
                        __smp_call_function_single(pending->cpu,
                                                   &pending->csd, 0);
                pending = following;
        }
#else
        local_irq_enable();
#endif
}
4004
/*
 * process_backlog - NAPI poll routine of the per-CPU backlog
 * @napi: the backlog napi instance embedded in a struct softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Packets are taken from sd->process_queue and passed to
 * __netif_receive_skb() with irqs enabled.  When process_queue is empty,
 * sd->input_pkt_queue is spliced onto it under rps_lock().  If fewer
 * packets were spliced than the quota still allows, the napi instance is
 * removed from the poll list and the quota is lowered so that the loop ends
 * once those packets are processed.
 *
 * Returns the number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
        /* Check if we have pending ipi, its better to send them now,
         * not waiting net_rx_action() end.
         */
        if (sd->rps_ipi_list) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }
#endif
        napi->weight = weight_p;
        local_irq_disable();
        while (work < quota) {
                struct sk_buff *skb;
                unsigned int qlen;

                /* irqs are off around the dequeue, on while the skb is processed. */
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_irq_enable();
                        __netif_receive_skb(skb);
                        local_irq_disable();
                        input_queue_head_incr(sd);
                        if (++work >= quota) {
                                local_irq_enable();
                                return work;
                        }
                }

                /* process_queue is empty: refill it from input_pkt_queue. */
                rps_lock(sd);
                qlen = skb_queue_len(&sd->input_pkt_queue);
                if (qlen)
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);

                if (qlen < quota - work) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * only current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set on backlog.
                         * we can use a plain write instead of clear_bit(),
                         * and we dont need an smp_mb() memory barrier.
                         */
                        list_del(&napi->poll_list);
                        napi->state = 0;

                        /* Stop once the packets just spliced have been handled. */
                        quota = work + qlen;
                }
                rps_unlock(sd);
        }
        local_irq_enable();

        return work;
}
4061
4062 /**
4063  * __napi_schedule - schedule for receive
4064  * @n: entry to schedule
4065  *
4066  * The entry's receive function will be scheduled to run
4067  */
4068 void __napi_schedule(struct napi_struct *n)
4069 {
4070         unsigned long flags;
4071
4072         local_irq_save(flags);
4073         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4074         local_irq_restore(flags);
4075 }
4076 EXPORT_SYMBOL(__napi_schedule);
4077
4078 void __napi_complete(struct napi_struct *n)
4079 {
4080         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4081         BUG_ON(n->gro_list);
4082
4083         list_del(&n->poll_list);
4084         smp_mb__before_clear_bit();
4085         clear_bit(NAPI_STATE_SCHED, &n->state);
4086 }
4087 EXPORT_SYMBOL(__napi_complete);
4088
4089 void napi_complete(struct napi_struct *n)
4090 {
4091         unsigned long flags;
4092
4093         /*
4094          * don't let napi dequeue from the cpu poll list
4095          * just in case its running on a different cpu
4096          */
4097         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4098                 return;
4099
4100         napi_gro_flush(n, false);
4101         local_irq_save(flags);
4102         __napi_complete(n);
4103         local_irq_restore(flags);
4104 }
4105 EXPORT_SYMBOL(napi_complete);
4106
4107 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4108                     int (*poll)(struct napi_struct *, int), int weight)
4109 {
4110         INIT_LIST_HEAD(&napi->poll_list);
4111         napi->gro_count = 0;
4112         napi->gro_list = NULL;
4113         napi->skb = NULL;
4114         napi->poll = poll;
4115         napi->weight = weight;
4116         list_add(&napi->dev_list, &dev->napi_list);
4117         napi->dev = dev;
4118 #ifdef CONFIG_NETPOLL
4119         spin_lock_init(&napi->poll_lock);
4120         napi->poll_owner = -1;
4121 #endif
4122         set_bit(NAPI_STATE_SCHED, &napi->state);
4123 }
4124 EXPORT_SYMBOL(netif_napi_add);
4125
4126 void netif_napi_del(struct napi_struct *napi)
4127 {
4128         struct sk_buff *skb, *next;
4129
4130         list_del_init(&napi->dev_list);
4131         napi_free_frags(napi);
4132
4133         for (skb = napi->gro_list; skb; skb = next) {
4134                 next = skb->next;
4135                 skb->next = NULL;
4136                 kfree_skb(skb);
4137         }
4138
4139         napi->gro_list = NULL;
4140         napi->gro_count = 0;
4141 }
4142 EXPORT_SYMBOL(netif_napi_del);
4143
/*
 * net_rx_action - NET_RX softirq handler
 *
 * Polls the napi instances on this CPU's poll_list until the list is
 * empty, the budget (netdev_budget) is used up, or more than two jiffies
 * have passed.  In the latter two cases time_squeeze is incremented and
 * the softirq is raised again.  Pending RPS IPIs are sent before returning.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = &__get_cpu_var(softnet_data);
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        void *have;

        local_irq_disable();

        while (!list_empty(&sd->poll_list)) {
                struct napi_struct *n;
                int work, weight;

                /* If softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
                        goto softnet_break;

                local_irq_enable();

                /* Even though interrupts have been re-enabled, this
                 * access is safe because interrupts can only add new
                 * entries to the tail of this list, and only ->poll()
                 * calls can remove this head entry from the list.
                 */
                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

                have = netpoll_poll_lock(n);

                weight = n->weight;

                /* This NAPI_STATE_SCHED test is for avoiding a race
                 * with netpoll's poll_napi().  Only the entity which
                 * obtains the lock and sees NAPI_STATE_SCHED set will
                 * actually make the ->poll() call.  Therefore we avoid
                 * accidentally calling ->poll() when NAPI is not scheduled.
                 */
                work = 0;
                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                        work = n->poll(n, weight);
                        trace_napi_poll(n);
                }

                /* A poll routine must not report more work than its weight. */
                WARN_ON_ONCE(work > weight);

                budget -= work;

                local_irq_disable();

                /* Drivers must not modify the NAPI state if they
                 * consume the entire weight.  In such cases this code
                 * still "owns" the NAPI instance and therefore can
                 * move the instance around on the list at-will.
                 */
                if (unlikely(work == weight)) {
                        if (unlikely(napi_disable_pending(n))) {
                                /* A disable is pending: complete instead of requeueing. */
                                local_irq_enable();
                                napi_complete(n);
                                local_irq_disable();
                        } else {
                                if (n->gro_list) {
                                        /* flush too old packets
                                         * If HZ < 1000, flush all packets.
                                         */
                                        local_irq_enable();
                                        napi_gro_flush(n, HZ >= 1000);
                                        local_irq_disable();
                                }
                                /* Requeue at the tail of the poll list. */
                                list_move_tail(&n->poll_list, &sd->poll_list);
                        }
                }

                netpoll_poll_unlock(have);
        }
out:
        /* Entered with irqs disabled; this call re-enables them. */
        net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
         * any pending DMA copies to hardware
         */
        dma_issue_pending_all();
#endif

        return;

softnet_break:
        /* Out of budget or time: account for it and ask to be run again. */
        sd->time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
4238
4239 static gifconf_func_t *gifconf_list[NPROTO];
4240
4241 /**
4242  *      register_gifconf        -       register a SIOCGIF handler
4243  *      @family: Address family
4244  *      @gifconf: Function handler
4245  *
4246  *      Register protocol dependent address dumping routines. The handler
4247  *      that is passed must not be freed or reused until it has been replaced
4248  *      by another handler.
4249  */
4250 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4251 {
4252         if (family >= NPROTO)
4253                 return -EINVAL;
4254         gifconf_list[family] = gifconf;
4255         return 0;
4256 }
4257 EXPORT_SYMBOL(register_gifconf);
4258
4259
4260 /*
4261  *      Map an interface index to its name (SIOCGIFNAME)
4262  */
4263
4264 /*
4265  *      We need this ioctl for efficient implementation of the
4266  *      if_indextoname() function required by the IPv6 API.  Without
4267  *      it, we would have to search all the interfaces to find a
4268  *      match.  --pb
4269  */
4270
4271 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4272 {
4273         struct net_device *dev;
4274         struct ifreq ifr;
4275         unsigned seq;
4276
4277         /*
4278          *      Fetch the caller's info block.
4279          */
4280
4281         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4282                 return -EFAULT;
4283
4284 retry:
4285         seq = read_seqcount_begin(&devnet_rename_seq);
4286         rcu_read_lock();
4287         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4288         if (!dev) {
4289                 rcu_read_unlock();
4290                 return -ENODEV;
4291         }
4292
4293         strcpy(ifr.ifr_name, dev->name);
4294         rcu_read_unlock();
4295         if (read_seqcount_retry(&devnet_rename_seq, seq))
4296                 goto retry;
4297
4298         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4299                 return -EFAULT;
4300         return 0;
4301 }
4302
4303 /*
4304  *      Perform a SIOCGIFCONF call. This structure will change
4305  *      size eventually, and there is nothing I can do about it.
4306  *      Thus we will need a 'compatibility mode'.
4307  */
4308
4309 static int dev_ifconf(struct net *net, char __user *arg)
4310 {
4311         struct ifconf ifc;
4312         struct net_device *dev;
4313         char __user *pos;
4314         int len;
4315         int total;
4316         int i;
4317
4318         /*
4319          *      Fetch the caller's info block.
4320          */
4321
4322         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4323                 return -EFAULT;
4324
4325         pos = ifc.ifc_buf;
4326         len = ifc.ifc_len;
4327
4328         /*
4329          *      Loop over the interfaces, and write an info block for each.
4330          */
4331
4332         total = 0;
4333         for_each_netdev(net, dev) {
4334                 for (i = 0; i < NPROTO; i++) {
4335                         if (gifconf_list[i]) {
4336                                 int done;
4337                                 if (!pos)
4338                                         done = gifconf_list[i](dev, NULL, 0);
4339                                 else
4340                                         done = gifconf_list[i](dev, pos + total,
4341                                                                len - total);
4342                                 if (done < 0)
4343                                         return -EFAULT;
4344                                 total += done;
4345                         }
4346                 }
4347         }
4348
4349         /*
4350          *      All done.  Write the updated control block back to the caller.
4351          */
4352         ifc.ifc_len = total;
4353
4354         /*
4355          *      Both BSD and Solaris return 0 here, so we do too.
4356          */
4357         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4358 }
4359
4360 #ifdef CONFIG_PROC_FS
4361
/*
 * A seq_file position for the device listing packs two values: the
 * index of a dev_name_head hash bucket in the bits above BUCKET_SPACE,
 * and a 1-based offset inside that bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
4367
4368 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4369 {
4370         struct net *net = seq_file_net(seq);
4371         struct net_device *dev;
4372         struct hlist_node *p;
4373         struct hlist_head *h;
4374         unsigned int count = 0, offset = get_offset(*pos);
4375
4376         h = &net->dev_name_head[get_bucket(*pos)];
4377         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4378                 if (++count == offset)
4379                         return dev;
4380         }
4381
4382         return NULL;
4383 }
4384
4385 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4386 {
4387         struct net_device *dev;
4388         unsigned int bucket;
4389
4390         do {
4391                 dev = dev_from_same_bucket(seq, pos);
4392                 if (dev)
4393                         return dev;
4394
4395                 bucket = get_bucket(*pos) + 1;
4396                 *pos = set_bucket_offset(bucket, 1);
4397         } while (bucket < NETDEV_HASHENTRIES);
4398
4399         return NULL;
4400 }
4401
4402 /*
4403  *      This is invoked by the /proc filesystem handler to display a device
4404  *      in detail.
4405  */
4406 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4407         __acquires(RCU)
4408 {
4409         rcu_read_lock();
4410         if (!*pos)
4411                 return SEQ_START_TOKEN;
4412
4413         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4414                 return NULL;
4415
4416         return dev_from_bucket(seq, pos);
4417 }
4418
4419 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4420 {
4421         ++*pos;
4422         return dev_from_bucket(seq, pos);
4423 }
4424
/*
 * End of a device listing pass: release the RCU read lock (annotated as
 * __releases(RCU)).  Neither @seq nor @v is used.
 */
void dev_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
4430
4431 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4432 {
4433         struct rtnl_link_stats64 temp;
4434         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4435
4436         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4437                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4438                    dev->name, stats->rx_bytes, stats->rx_packets,
4439                    stats->rx_errors,
4440                    stats->rx_dropped + stats->rx_missed_errors,
4441                    stats->rx_fifo_errors,
4442                    stats->rx_length_errors + stats->rx_over_errors +
4443                     stats->rx_crc_errors + stats->rx_frame_errors,
4444                    stats->rx_compressed, stats->multicast,
4445                    stats->tx_bytes, stats->tx_packets,
4446                    stats->tx_errors, stats->tx_dropped,
4447                    stats->tx_fifo_errors, stats->collisions,
4448                    stats->tx_carrier_errors +
4449                     stats->tx_aborted_errors +
4450                     stats->tx_window_errors +
4451                     stats->tx_heartbeat_errors,
4452                    stats->tx_compressed);
4453 }
4454
4455 /*
4456  *      Called from the PROCfs module. This now uses the new arbitrary sized
4457  *      /proc/net interface to create /proc/net/dev
4458  */
4459 static int dev_seq_show(struct seq_file *seq, void *v)
4460 {
4461         if (v == SEQ_START_TOKEN)
4462                 seq_puts(seq, "Inter-|   Receive                            "
4463                               "                    |  Transmit\n"
4464                               " face |bytes    packets errs drop fifo frame "
4465                               "compressed multicast|bytes    packets errs "
4466                               "drop fifo colls carrier compressed\n");
4467         else
4468                 dev_seq_printf_stats(seq, v);
4469         return 0;
4470 }
4471
4472 static struct softnet_data *softnet_get_online(loff_t *pos)
4473 {
4474         struct softnet_data *sd = NULL;
4475
4476         while (*pos < nr_cpu_ids)
4477                 if (cpu_online(*pos)) {
4478                         sd = &per_cpu(softnet_data, *pos);
4479                         break;
4480                 } else
4481                         ++*pos;
4482         return sd;
4483 }
4484
/*
 * Begin a pass over the per-CPU softnet data: return whatever
 * softnet_get_online() finds at or after *pos.  @seq is unused.
 */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
        return softnet_get_online(pos);
}
4489
4490 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4491 {
4492         ++*pos;
4493         return softnet_get_online(pos);
4494 }
4495
/* Intentionally empty: this callback has nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4499
4500 static int softnet_seq_show(struct seq_file *seq, void *v)
4501 {
4502         struct softnet_data *sd = v;
4503
4504         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4505                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4506                    0, 0, 0, 0, /* was fastroute */
4507                    sd->cpu_collision, sd->received_rps);
4508         return 0;
4509 }
4510
/* seq_file iterator callbacks used by dev_seq_open() for the device list. */
static const struct seq_operations dev_seq_ops = {
        .start = dev_seq_start,
        .next  = dev_seq_next,
        .stop  = dev_seq_stop,
        .show  = dev_seq_show,
};
4517
/*
 * Open handler for the device list file: hands dev_seq_ops and the size
 * of struct seq_net_private to seq_open_net() and returns its result.
 */
static int dev_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &dev_seq_ops,
                            sizeof(struct seq_net_private));
}
4523
/*
 * File operations registered under the name "dev" by dev_proc_net_init().
 * Opened with seq_open_net(), so release must be seq_release_net().
 */
static const struct file_operations dev_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = dev_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
4531
/* seq_file iterator callbacks used by softnet_seq_open(), one step per online CPU. */
static const struct seq_operations softnet_seq_ops = {
        .start = softnet_seq_start,
        .next  = softnet_seq_next,
        .stop  = softnet_seq_stop,
        .show  = softnet_seq_show,
};
4538
/*
 * Open handler for the softnet statistics file: plain seq_open() with
 * softnet_seq_ops.  @inode is unused.
 */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &softnet_seq_ops);
}
4543
/*
 * File operations registered under the name "softnet_stat" by
 * dev_proc_net_init().  Opened with seq_open(), released with
 * seq_release().
 */
static const struct file_operations softnet_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = softnet_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
4551
4552 static void *ptype_get_idx(loff_t pos)
4553 {
4554         struct packet_type *pt = NULL;
4555         loff_t i = 0;
4556         int t;
4557
4558         list_for_each_entry_rcu(pt, &ptype_all, list) {
4559                 if (i == pos)
4560                         return pt;
4561                 ++i;
4562         }
4563
4564         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4565                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4566                         if (i == pos)
4567                                 return pt;
4568                         ++i;
4569                 }
4570         }
4571         return NULL;
4572 }
4573
4574 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4575         __acquires(RCU)
4576 {
4577         rcu_read_lock();
4578         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4579 }
4580
4581 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4582 {
4583         struct packet_type *pt;
4584         struct list_head *nxt;
4585         int hash;
4586
4587         ++*pos;
4588         if (v == SEQ_START_TOKEN)
4589                 return ptype_get_idx(0);
4590
4591         pt = v;
4592         nxt = pt->list.next;
4593         if (pt->type == htons(ETH_P_ALL)) {
4594                 if (nxt != &ptype_all)
4595                         goto found;
4596                 hash = 0;
4597                 nxt = ptype_base[0].next;
4598         } else
4599                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4600
4601         while (nxt == &ptype_base[hash]) {
4602                 if (++hash >= PTYPE_HASH_SIZE)
4603                         return NULL;
4604                 nxt = ptype_base[hash].next;
4605         }
4606 found:
4607         return list_entry(nxt, struct packet_type, list);
4608 }
4609
/*
 * End of a packet-type listing pass: release the RCU read lock
 * (annotated as __releases(RCU)).  Neither @seq nor @v is used.
 */
static void ptype_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
4615
4616 static int ptype_seq_show(struct seq_file *seq, void *v)
4617 {
4618         struct packet_type *pt = v;
4619
4620         if (v == SEQ_START_TOKEN)
4621                 seq_puts(seq, "Type Device      Function\n");
4622         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4623                 if (pt->type == htons(ETH_P_ALL))
4624                         seq_puts(seq, "ALL ");
4625                 else
4626                         seq_printf(seq, "%04x", ntohs(pt->type));
4627
4628                 seq_printf(seq, " %-8s %pF\n",
4629                            pt->dev ? pt->dev->name : "", pt->func);
4630         }
4631
4632         return 0;
4633 }
4634
/* seq_file iterator callbacks used by ptype_seq_open() for the packet-type list. */
static const struct seq_operations ptype_seq_ops = {
        .start = ptype_seq_start,
        .next  = ptype_seq_next,
        .stop  = ptype_seq_stop,
        .show  = ptype_seq_show,
};
4641
/*
 * Open handler for the packet-type file: hands ptype_seq_ops and the size
 * of struct seq_net_private to seq_open_net() and returns its result.
 */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &ptype_seq_ops,
                        sizeof(struct seq_net_private));
}
4647
/*
 * File operations registered under the name "ptype" by
 * dev_proc_net_init().  Opened with seq_open_net(), so release must be
 * seq_release_net().
 */
static const struct file_operations ptype_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = ptype_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
4655
4656
4657 static int __net_init dev_proc_net_init(struct net *net)
4658 {
4659         int rc = -ENOMEM;
4660
4661         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4662                 goto out;
4663         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4664                 goto out_dev;
4665         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4666                 goto out_softnet;
4667
4668         if (wext_proc_init(net))
4669                 goto out_ptype;
4670         rc = 0;
4671 out:
4672         return rc;
4673 out_ptype:
4674         proc_net_remove(net, "ptype");
4675 out_softnet:
4676         proc_net_remove(net, "softnet_stat");
4677 out_dev:
4678         proc_net_remove(net, "dev");
4679         goto out;
4680 }
4681
/*
 * Per-namespace teardown, the mirror image of dev_proc_net_init(): undo
 * the wext proc setup first, then remove the three proc entries in the
 * reverse of their creation order.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
        wext_proc_exit(net);

        proc_net_remove(net, "ptype");
        proc_net_remove(net, "softnet_stat");
        proc_net_remove(net, "dev");
}
4690
/* Per-network-namespace init/exit hooks, registered by dev_proc_init(). */
static struct pernet_operations __net_initdata dev_proc_ops = {
        .init = dev_proc_net_init,
        .exit = dev_proc_net_exit,
};
4695
/*
 * Register dev_proc_ops as a pernet subsystem and return the result of
 * register_pernet_subsys().  Without CONFIG_PROC_FS this is replaced by a
 * macro that evaluates to 0.
 */
static int __init dev_proc_init(void)
{
        return register_pernet_subsys(&dev_proc_ops);
}
4700 #else
4701 #define dev_proc_init() 0
4702 #endif  /* CONFIG_PROC_FS */
4703
4704
/*
 * One link from a device to a device stacked directly on top of it.
 * Instances are allocated by __netdev_upper_dev_link() and freed through
 * kfree_rcu() by netdev_upper_dev_unlink().
 */
struct netdev_upper {
        struct net_device *dev;         /* the upper device; a reference is held on it */
        bool master;                    /* true if this is the master upper link */
        struct list_head list;          /* entry in the lower device's upper_dev_list */
        struct rcu_head rcu;            /* used by kfree_rcu() on unlink */
        struct list_head search_list;   /* scratch linkage for __netdev_search_upper_dev(); empty when idle */
};
4712
4713 static void __append_search_uppers(struct list_head *search_list,
4714                                    struct net_device *dev)
4715 {
4716         struct netdev_upper *upper;
4717
4718         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4719                 /* check if this upper is not already in search list */
4720                 if (list_empty(&upper->search_list))
4721                         list_add_tail(&upper->search_list, search_list);
4722         }
4723 }
4724
4725 static bool __netdev_search_upper_dev(struct net_device *dev,
4726                                       struct net_device *upper_dev)
4727 {
4728         LIST_HEAD(search_list);
4729         struct netdev_upper *upper;
4730         struct netdev_upper *tmp;
4731         bool ret = false;
4732
4733         __append_search_uppers(&search_list, dev);
4734         list_for_each_entry(upper, &search_list, search_list) {
4735                 if (upper->dev == upper_dev) {
4736                         ret = true;
4737                         break;
4738                 }
4739                 __append_search_uppers(&search_list, upper->dev);
4740         }
4741         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4742                 INIT_LIST_HEAD(&upper->search_list);
4743         return ret;
4744 }
4745
4746 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4747                                                 struct net_device *upper_dev)
4748 {
4749         struct netdev_upper *upper;
4750
4751         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4752                 if (upper->dev == upper_dev)
4753                         return upper;
4754         }
4755         return NULL;
4756 }
4757
4758 /**
4759  * netdev_has_upper_dev - Check if device is linked to an upper device
4760  * @dev: device
4761  * @upper_dev: upper device to check
4762  *
4763  * Find out if a device is linked to specified upper device and return true
4764  * in case it is. Note that this checks only immediate upper device,
4765  * not through a complete stack of devices. The caller must hold the RTNL lock.
4766  */
4767 bool netdev_has_upper_dev(struct net_device *dev,
4768                           struct net_device *upper_dev)
4769 {
4770         ASSERT_RTNL();
4771
4772         return __netdev_find_upper(dev, upper_dev);
4773 }
4774 EXPORT_SYMBOL(netdev_has_upper_dev);
4775
4776 /**
4777  * netdev_has_any_upper_dev - Check if device is linked to some device
4778  * @dev: device
4779  *
4780  * Find out if a device is linked to an upper device and return true in case
4781  * it is. The caller must hold the RTNL lock.
4782  */
4783 bool netdev_has_any_upper_dev(struct net_device *dev)
4784 {
4785         ASSERT_RTNL();
4786
4787         return !list_empty(&dev->upper_dev_list);
4788 }
4789 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4790
4791 /**
4792  * netdev_master_upper_dev_get - Get master upper device
4793  * @dev: device
4794  *
4795  * Find a master upper device and return pointer to it or NULL in case
4796  * it's not there. The caller must hold the RTNL lock.
4797  */
4798 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4799 {
4800         struct netdev_upper *upper;
4801
4802         ASSERT_RTNL();
4803
4804         if (list_empty(&dev->upper_dev_list))
4805                 return NULL;
4806
4807         upper = list_first_entry(&dev->upper_dev_list,
4808                                  struct netdev_upper, list);
4809         if (likely(upper->master))
4810                 return upper->dev;
4811         return NULL;
4812 }
4813 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4814
4815 /**
4816  * netdev_master_upper_dev_get_rcu - Get master upper device
4817  * @dev: device
4818  *
4819  * Find a master upper device and return pointer to it or NULL in case
4820  * it's not there. The caller must hold the RCU read lock.
4821  */
4822 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4823 {
4824         struct netdev_upper *upper;
4825
4826         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4827                                        struct netdev_upper, list);
4828         if (upper && likely(upper->master))
4829                 return upper->dev;
4830         return NULL;
4831 }
4832 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4833
4834 static int __netdev_upper_dev_link(struct net_device *dev,
4835                                    struct net_device *upper_dev, bool master)
4836 {
4837         struct netdev_upper *upper;
4838
4839         ASSERT_RTNL();
4840
4841         if (dev == upper_dev)
4842                 return -EBUSY;
4843
4844         /* To prevent loops, check if dev is not upper device to upper_dev. */
4845         if (__netdev_search_upper_dev(upper_dev, dev))
4846                 return -EBUSY;
4847
4848         if (__netdev_find_upper(dev, upper_dev))
4849                 return -EEXIST;
4850
4851         if (master && netdev_master_upper_dev_get(dev))
4852                 return -EBUSY;
4853
4854         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4855         if (!upper)
4856                 return -ENOMEM;
4857
4858         upper->dev = upper_dev;
4859         upper->master = master;
4860         INIT_LIST_HEAD(&upper->search_list);
4861
4862         /* Ensure that master upper link is always the first item in list. */
4863         if (master)
4864                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4865         else
4866                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4867         dev_hold(upper_dev);
4868
4869         return 0;
4870 }
4871
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 *
 * This is __netdev_upper_dev_link() with the master flag set to false.
 */
int netdev_upper_dev_link(struct net_device *dev,
                          struct net_device *upper_dev)
{
        return __netdev_upper_dev_link(dev, upper_dev, false);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
4888
/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 *
 * This is __netdev_upper_dev_link() with the master flag set to true.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev)
{
        return __netdev_upper_dev_link(dev, upper_dev, true);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
4906
4907 /**
4908  * netdev_upper_dev_unlink - Removes a link to upper device
4909  * @dev: device
4910  * @upper_dev: new upper device
4911  *
4912  * Removes a link to device which is upper to this one. The caller must hold
4913  * the RTNL lock.
4914  */
4915 void netdev_upper_dev_unlink(struct net_device *dev,
4916                              struct net_device *upper_dev)
4917 {
4918         struct netdev_upper *upper;
4919
4920         ASSERT_RTNL();
4921
4922         upper = __netdev_find_upper(dev, upper_dev);
4923         if (!upper)
4924                 return;
4925         list_del_rcu(&upper->list);
4926         dev_put(upper_dev);
4927         kfree_rcu(upper, rcu);
4928 }
4929 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4930
4931 static void dev_change_rx_flags(struct net_device *dev, int flags)
4932 {
4933         const struct net_device_ops *ops = dev->netdev_ops;
4934
4935         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4936                 ops->ndo_change_rx_flags(dev, flags);
4937 }
4938
4939 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4940 {
4941         unsigned int old_flags = dev->flags;
4942         kuid_t uid;
4943         kgid_t gid;
4944
4945         ASSERT_RTNL();
4946
4947         dev->flags |= IFF_PROMISC;
4948         dev->promiscuity += inc;
4949         if (dev->promiscuity == 0) {
4950                 /*
4951                  * Avoid overflow.
4952                  * If inc causes overflow, untouch promisc and return error.
4953                  */
4954                 if (inc < 0)
4955                         dev->flags &= ~IFF_PROMISC;
4956                 else {
4957                         dev->promiscuity -= inc;
4958                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4959                                 dev->name);
4960                         return -EOVERFLOW;
4961                 }
4962         }
4963         if (dev->flags != old_flags) {
4964                 pr_info("device %s %s promiscuous mode\n",
4965                         dev->name,
4966                         dev->flags & IFF_PROMISC ? "entered" : "left");
4967                 if (audit_enabled) {
4968                         current_uid_gid(&uid, &gid);
4969                         audit_log(current->audit_context, GFP_ATOMIC,
4970                                 AUDIT_ANOM_PROMISCUOUS,
4971                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4972                                 dev->name, (dev->flags & IFF_PROMISC),
4973                                 (old_flags & IFF_PROMISC),
4974                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4975                                 from_kuid(&init_user_ns, uid),
4976                                 from_kgid(&init_user_ns, gid),
4977                                 audit_get_sessionid(current));
4978                 }
4979
4980                 dev_change_rx_flags(dev, IFF_PROMISC);
4981         }
4982         return 0;
4983 }
4984
4985 /**
4986  *      dev_set_promiscuity     - update promiscuity count on a device
4987  *      @dev: device
4988  *      @inc: modifier
4989  *
4990  *      Add or remove promiscuity from a device. While the count in the device
4991  *      remains above zero the interface remains promiscuous. Once it hits zero
4992  *      the device reverts back to normal filtering operation. A negative inc
4993  *      value is used to drop promiscuity on the device.
4994  *      Return 0 if successful or a negative errno code on error.
4995  */
4996 int dev_set_promiscuity(struct net_device *dev, int inc)
4997 {
4998         unsigned int old_flags = dev->flags;
4999         int err;
5000
5001         err = __dev_set_promiscuity(dev, inc);
5002         if (err < 0)
5003                 return err;
5004         if (dev->flags != old_flags)
5005                 dev_set_rx_mode(dev);
5006         return err;
5007 }
5008 EXPORT_SYMBOL(dev_set_promiscuity);
5009
5010 /**
5011  *      dev_set_allmulti        - update allmulti count on a device
5012  *      @dev: device
5013  *      @inc: modifier
5014  *
5015  *      Add or remove reception of all multicast frames to a device. While the
5016  *      count in the device remains above zero the interface remains listening
5017  *      to all interfaces. Once it hits zero the device reverts back to normal
5018  *      filtering operation. A negative @inc value is used to drop the counter
5019  *      when releasing a resource needing all multicasts.
5020  *      Return 0 if successful or a negative errno code on error.
5021  */
5022
5023 int dev_set_allmulti(struct net_device *dev, int inc)
5024 {
5025         unsigned int old_flags = dev->flags;
5026
5027         ASSERT_RTNL();
5028
5029         dev->flags |= IFF_ALLMULTI;
5030         dev->allmulti += inc;
5031         if (dev->allmulti == 0) {
5032                 /*
5033                  * Avoid overflow.
5034                  * If inc causes overflow, untouch allmulti and return error.
5035                  */
5036                 if (inc < 0)
5037                         dev->flags &= ~IFF_ALLMULTI;
5038                 else {
5039                         dev->allmulti -= inc;
5040                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5041                                 dev->name);
5042                         return -EOVERFLOW;
5043                 }
5044         }
5045         if (dev->flags ^ old_flags) {
5046                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5047                 dev_set_rx_mode(dev);
5048         }
5049         return 0;
5050 }
5051 EXPORT_SYMBOL(dev_set_allmulti);
5052
5053 /*
5054  *      Upload unicast and multicast address lists to device and
5055  *      configure RX filtering. When the device doesn't support unicast
5056  *      filtering it is put in promiscuous mode while unicast addresses
5057  *      are present.
5058  */
5059 void __dev_set_rx_mode(struct net_device *dev)
5060 {
5061         const struct net_device_ops *ops = dev->netdev_ops;
5062
5063         /* dev_open will call this function so the list will stay sane. */
5064         if (!(dev->flags&IFF_UP))
5065                 return;
5066
5067         if (!netif_device_present(dev))
5068                 return;
5069
5070         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5071                 /* Unicast addresses changes may only happen under the rtnl,
5072                  * therefore calling __dev_set_promiscuity here is safe.
5073                  */
5074                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5075                         __dev_set_promiscuity(dev, 1);
5076                         dev->uc_promisc = true;
5077                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5078                         __dev_set_promiscuity(dev, -1);
5079                         dev->uc_promisc = false;
5080                 }
5081         }
5082
5083         if (ops->ndo_set_rx_mode)
5084                 ops->ndo_set_rx_mode(dev);
5085 }
5086
/*
 * Locked wrapper: run __dev_set_rx_mode() on @dev between
 * netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
5093
5094 /**
5095  *      dev_get_flags - get flags reported to userspace
5096  *      @dev: device
5097  *
5098  *      Get the combination of flag bits exported through APIs to userspace.
5099  */
5100 unsigned int dev_get_flags(const struct net_device *dev)
5101 {
5102         unsigned int flags;
5103
5104         flags = (dev->flags & ~(IFF_PROMISC |
5105                                 IFF_ALLMULTI |
5106                                 IFF_RUNNING |
5107                                 IFF_LOWER_UP |
5108                                 IFF_DORMANT)) |
5109                 (dev->gflags & (IFF_PROMISC |
5110                                 IFF_ALLMULTI));
5111
5112         if (netif_running(dev)) {
5113                 if (netif_oper_up(dev))
5114                         flags |= IFF_RUNNING;
5115                 if (netif_carrier_ok(dev))
5116                         flags |= IFF_LOWER_UP;
5117                 if (netif_dormant(dev))
5118                         flags |= IFF_DORMANT;
5119         }
5120
5121         return flags;
5122 }
5123 EXPORT_SYMBOL(dev_get_flags);
5124
5125 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5126 {
5127         unsigned int old_flags = dev->flags;
5128         int ret;
5129
5130         ASSERT_RTNL();
5131
5132         /*
5133          *      Set the flags on our device.
5134          */
5135
5136         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5137                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5138                                IFF_AUTOMEDIA)) |
5139                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5140                                     IFF_ALLMULTI));
5141
5142         /*
5143          *      Load in the correct multicast list now the flags have changed.
5144          */
5145
5146         if ((old_flags ^ flags) & IFF_MULTICAST)
5147                 dev_change_rx_flags(dev, IFF_MULTICAST);
5148
5149         dev_set_rx_mode(dev);
5150
5151         /*
5152          *      Have we downed the interface. We handle IFF_UP ourselves
5153          *      according to user attempts to set it, rather than blindly
5154          *      setting it.
5155          */
5156
5157         ret = 0;
5158         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5159                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5160
5161                 if (!ret)
5162                         dev_set_rx_mode(dev);
5163         }
5164
5165         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5166                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5167
5168                 dev->gflags ^= IFF_PROMISC;
5169                 dev_set_promiscuity(dev, inc);
5170         }
5171
5172         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5173            is important. Some (broken) drivers set IFF_PROMISC, when
5174            IFF_ALLMULTI is requested not asking us and not reporting.
5175          */
5176         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5177                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5178
5179                 dev->gflags ^= IFF_ALLMULTI;
5180                 dev_set_allmulti(dev, inc);
5181         }
5182
5183         return ret;
5184 }
5185
5186 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5187 {
5188         unsigned int changes = dev->flags ^ old_flags;
5189
5190         if (changes & IFF_UP) {
5191                 if (dev->flags & IFF_UP)
5192                         call_netdevice_notifiers(NETDEV_UP, dev);
5193                 else
5194                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5195         }
5196
5197         if (dev->flags & IFF_UP &&
5198             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5199                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5200 }
5201
5202 /**
5203  *      dev_change_flags - change device settings
5204  *      @dev: device
5205  *      @flags: device state flags
5206  *
5207  *      Change settings on device based state flags. The flags are
5208  *      in the userspace exported format.
5209  */
5210 int dev_change_flags(struct net_device *dev, unsigned int flags)
5211 {
5212         int ret;
5213         unsigned int changes, old_flags = dev->flags;
5214
5215         ret = __dev_change_flags(dev, flags);
5216         if (ret < 0)
5217                 return ret;
5218
5219         changes = old_flags ^ dev->flags;
5220         if (changes)
5221                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5222
5223         __dev_notify_flags(dev, old_flags);
5224         return ret;
5225 }
5226 EXPORT_SYMBOL(dev_change_flags);
5227
5228 /**
5229  *      dev_set_mtu - Change maximum transfer unit
5230  *      @dev: device
5231  *      @new_mtu: new transfer unit
5232  *
5233  *      Change the maximum transfer size of the network device.
5234  */
5235 int dev_set_mtu(struct net_device *dev, int new_mtu)
5236 {
5237         const struct net_device_ops *ops = dev->netdev_ops;
5238         int err;
5239
5240         if (new_mtu == dev->mtu)
5241                 return 0;
5242
5243         /*      MTU must be positive.    */
5244         if (new_mtu < 0)
5245                 return -EINVAL;
5246
5247         if (!netif_device_present(dev))
5248                 return -ENODEV;
5249
5250         err = 0;
5251         if (ops->ndo_change_mtu)
5252                 err = ops->ndo_change_mtu(dev, new_mtu);
5253         else
5254                 dev->mtu = new_mtu;
5255
5256         if (!err)
5257                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5258         return err;
5259 }
5260 EXPORT_SYMBOL(dev_set_mtu);
5261
5262 /**
5263  *      dev_set_group - Change group this device belongs to
5264  *      @dev: device
5265  *      @new_group: group this device should belong to
5266  */
5267 void dev_set_group(struct net_device *dev, int new_group)
5268 {
5269         dev->group = new_group;
5270 }
5271 EXPORT_SYMBOL(dev_set_group);
5272
5273 /**
5274  *      dev_set_mac_address - Change Media Access Control Address
5275  *      @dev: device
5276  *      @sa: new address
5277  *
5278  *      Change the hardware (MAC) address of the device
5279  */
5280 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5281 {
5282         const struct net_device_ops *ops = dev->netdev_ops;
5283         int err;
5284
5285         if (!ops->ndo_set_mac_address)
5286                 return -EOPNOTSUPP;
5287         if (sa->sa_family != dev->type)
5288                 return -EINVAL;
5289         if (!netif_device_present(dev))
5290                 return -ENODEV;
5291         err = ops->ndo_set_mac_address(dev, sa);
5292         if (err)
5293                 return err;
5294         dev->addr_assign_type = NET_ADDR_SET;
5295         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5296         add_device_randomness(dev->dev_addr, dev->addr_len);
5297         return 0;
5298 }
5299 EXPORT_SYMBOL(dev_set_mac_address);
5300
5301 /**
5302  *      dev_change_carrier - Change device carrier
5303  *      @dev: device
5304  *      @new_carries: new value
5305  *
5306  *      Change device carrier
5307  */
5308 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5309 {
5310         const struct net_device_ops *ops = dev->netdev_ops;
5311
5312         if (!ops->ndo_change_carrier)
5313                 return -EOPNOTSUPP;
5314         if (!netif_device_present(dev))
5315                 return -ENODEV;
5316         return ops->ndo_change_carrier(dev, new_carrier);
5317 }
5318 EXPORT_SYMBOL(dev_change_carrier);
5319
5320 /*
5321  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5322  */
5323 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5324 {
5325         int err;
5326         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5327
5328         if (!dev)
5329                 return -ENODEV;
5330
5331         switch (cmd) {
5332         case SIOCGIFFLAGS:      /* Get interface flags */
5333                 ifr->ifr_flags = (short) dev_get_flags(dev);
5334                 return 0;
5335
5336         case SIOCGIFMETRIC:     /* Get the metric on the interface
5337                                    (currently unused) */
5338                 ifr->ifr_metric = 0;
5339                 return 0;
5340
5341         case SIOCGIFMTU:        /* Get the MTU of a device */
5342                 ifr->ifr_mtu = dev->mtu;
5343                 return 0;
5344
5345         case SIOCGIFHWADDR:
5346                 if (!dev->addr_len)
5347                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5348                 else
5349                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5350                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5351                 ifr->ifr_hwaddr.sa_family = dev->type;
5352                 return 0;
5353
5354         case SIOCGIFSLAVE:
5355                 err = -EINVAL;
5356                 break;
5357
5358         case SIOCGIFMAP:
5359                 ifr->ifr_map.mem_start = dev->mem_start;
5360                 ifr->ifr_map.mem_end   = dev->mem_end;
5361                 ifr->ifr_map.base_addr = dev->base_addr;
5362                 ifr->ifr_map.irq       = dev->irq;
5363                 ifr->ifr_map.dma       = dev->dma;
5364                 ifr->ifr_map.port      = dev->if_port;
5365                 return 0;
5366
5367         case SIOCGIFINDEX:
5368                 ifr->ifr_ifindex = dev->ifindex;
5369                 return 0;
5370
5371         case SIOCGIFTXQLEN:
5372                 ifr->ifr_qlen = dev->tx_queue_len;
5373                 return 0;
5374
5375         default:
5376                 /* dev_ioctl() should ensure this case
5377                  * is never reached
5378                  */
5379                 WARN_ON(1);
5380                 err = -ENOTTY;
5381                 break;
5382
5383         }
5384         return err;
5385 }
5386
5387 /*
5388  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5389  */
5390 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5391 {
5392         int err;
5393         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5394         const struct net_device_ops *ops;
5395
5396         if (!dev)
5397                 return -ENODEV;
5398
5399         ops = dev->netdev_ops;
5400
5401         switch (cmd) {
5402         case SIOCSIFFLAGS:      /* Set interface flags */
5403                 return dev_change_flags(dev, ifr->ifr_flags);
5404
5405         case SIOCSIFMETRIC:     /* Set the metric on the interface
5406                                    (currently unused) */
5407                 return -EOPNOTSUPP;
5408
5409         case SIOCSIFMTU:        /* Set the MTU of a device */
5410                 return dev_set_mtu(dev, ifr->ifr_mtu);
5411
5412         case SIOCSIFHWADDR:
5413                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5414
5415         case SIOCSIFHWBROADCAST:
5416                 if (ifr->ifr_hwaddr.sa_family != dev->type)
5417                         return -EINVAL;
5418                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5419                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5420                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5421                 return 0;
5422
5423         case SIOCSIFMAP:
5424                 if (ops->ndo_set_config) {
5425                         if (!netif_device_present(dev))
5426                                 return -ENODEV;
5427                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5428                 }
5429                 return -EOPNOTSUPP;
5430
5431         case SIOCADDMULTI:
5432                 if (!ops->ndo_set_rx_mode ||
5433                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5434                         return -EINVAL;
5435                 if (!netif_device_present(dev))
5436                         return -ENODEV;
5437                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5438
5439         case SIOCDELMULTI:
5440                 if (!ops->ndo_set_rx_mode ||
5441                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5442                         return -EINVAL;
5443                 if (!netif_device_present(dev))
5444                         return -ENODEV;
5445                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5446
5447         case SIOCSIFTXQLEN:
5448                 if (ifr->ifr_qlen < 0)
5449                         return -EINVAL;
5450                 dev->tx_queue_len = ifr->ifr_qlen;
5451                 return 0;
5452
5453         case SIOCSIFNAME:
5454                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5455                 return dev_change_name(dev, ifr->ifr_newname);
5456
5457         case SIOCSHWTSTAMP:
5458                 err = net_hwtstamp_validate(ifr);
5459                 if (err)
5460                         return err;
5461                 /* fall through */
5462
5463         /*
5464          *      Unknown or private ioctl
5465          */
5466         default:
5467                 if ((cmd >= SIOCDEVPRIVATE &&
5468                     cmd <= SIOCDEVPRIVATE + 15) ||
5469                     cmd == SIOCBONDENSLAVE ||
5470                     cmd == SIOCBONDRELEASE ||
5471                     cmd == SIOCBONDSETHWADDR ||
5472                     cmd == SIOCBONDSLAVEINFOQUERY ||
5473                     cmd == SIOCBONDINFOQUERY ||
5474                     cmd == SIOCBONDCHANGEACTIVE ||
5475                     cmd == SIOCGMIIPHY ||
5476                     cmd == SIOCGMIIREG ||
5477                     cmd == SIOCSMIIREG ||
5478                     cmd == SIOCBRADDIF ||
5479                     cmd == SIOCBRDELIF ||
5480                     cmd == SIOCSHWTSTAMP ||
5481                     cmd == SIOCWANDEV) {
5482                         err = -EOPNOTSUPP;
5483                         if (ops->ndo_do_ioctl) {
5484                                 if (netif_device_present(dev))
5485                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5486                                 else
5487                                         err = -ENODEV;
5488                         }
5489                 } else
5490                         err = -EINVAL;
5491
5492         }
5493         return err;
5494 }
5495
5496 /*
5497  *      This function handles all "interface"-type I/O control requests. The actual
5498  *      'doing' part of this is dev_ifsioc above.
5499  */
5500
5501 /**
5502  *      dev_ioctl       -       network device ioctl
5503  *      @net: the applicable net namespace
5504  *      @cmd: command to issue
5505  *      @arg: pointer to a struct ifreq in user space
5506  *
5507  *      Issue ioctl functions to devices. This is normally called by the
5508  *      user space syscall interfaces but can sometimes be useful for
5509  *      other purposes. The return value is the return from the syscall if
5510  *      positive or a negative errno code on error.
5511  */
5512
5513 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5514 {
5515         struct ifreq ifr;
5516         int ret;
5517         char *colon;
5518
5519         /* One special case: SIOCGIFCONF takes ifconf argument
5520            and requires shared lock, because it sleeps writing
5521            to user space.
5522          */
5523
5524         if (cmd == SIOCGIFCONF) {
5525                 rtnl_lock();
5526                 ret = dev_ifconf(net, (char __user *) arg);
5527                 rtnl_unlock();
5528                 return ret;
5529         }
5530         if (cmd == SIOCGIFNAME)
5531                 return dev_ifname(net, (struct ifreq __user *)arg);
5532
5533         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5534                 return -EFAULT;
5535
5536         ifr.ifr_name[IFNAMSIZ-1] = 0;
5537
5538         colon = strchr(ifr.ifr_name, ':');
5539         if (colon)
5540                 *colon = 0;
5541
5542         /*
5543          *      See which interface the caller is talking about.
5544          */
5545
5546         switch (cmd) {
5547         /*
5548          *      These ioctl calls:
5549          *      - can be done by all.
5550          *      - atomic and do not require locking.
5551          *      - return a value
5552          */
5553         case SIOCGIFFLAGS:
5554         case SIOCGIFMETRIC:
5555         case SIOCGIFMTU:
5556         case SIOCGIFHWADDR:
5557         case SIOCGIFSLAVE:
5558         case SIOCGIFMAP:
5559         case SIOCGIFINDEX:
5560         case SIOCGIFTXQLEN:
5561                 dev_load(net, ifr.ifr_name);
5562                 rcu_read_lock();
5563                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5564                 rcu_read_unlock();
5565                 if (!ret) {
5566                         if (colon)
5567                                 *colon = ':';
5568                         if (copy_to_user(arg, &ifr,
5569                                          sizeof(struct ifreq)))
5570                                 ret = -EFAULT;
5571                 }
5572                 return ret;
5573
5574         case SIOCETHTOOL:
5575                 dev_load(net, ifr.ifr_name);
5576                 rtnl_lock();
5577                 ret = dev_ethtool(net, &ifr);
5578                 rtnl_unlock();
5579                 if (!ret) {
5580                         if (colon)
5581                                 *colon = ':';
5582                         if (copy_to_user(arg, &ifr,
5583                                          sizeof(struct ifreq)))
5584                                 ret = -EFAULT;
5585                 }
5586                 return ret;
5587
5588         /*
5589          *      These ioctl calls:
5590          *      - require superuser power.
5591          *      - require strict serialization.
5592          *      - return a value
5593          */
5594         case SIOCGMIIPHY:
5595         case SIOCGMIIREG:
5596         case SIOCSIFNAME:
5597                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5598                         return -EPERM;
5599                 dev_load(net, ifr.ifr_name);
5600                 rtnl_lock();
5601                 ret = dev_ifsioc(net, &ifr, cmd);
5602                 rtnl_unlock();
5603                 if (!ret) {
5604                         if (colon)
5605                                 *colon = ':';
5606                         if (copy_to_user(arg, &ifr,
5607                                          sizeof(struct ifreq)))
5608                                 ret = -EFAULT;
5609                 }
5610                 return ret;
5611
5612         /*
5613          *      These ioctl calls:
5614          *      - require superuser power.
5615          *      - require strict serialization.
5616          *      - do not return a value
5617          */
5618         case SIOCSIFMAP:
5619         case SIOCSIFTXQLEN:
5620                 if (!capable(CAP_NET_ADMIN))
5621                         return -EPERM;
5622                 /* fall through */
5623         /*
5624          *      These ioctl calls:
5625          *      - require local superuser power.
5626          *      - require strict serialization.
5627          *      - do not return a value
5628          */
5629         case SIOCSIFFLAGS:
5630         case SIOCSIFMETRIC:
5631         case SIOCSIFMTU:
5632         case SIOCSIFHWADDR:
5633         case SIOCSIFSLAVE:
5634         case SIOCADDMULTI:
5635         case SIOCDELMULTI:
5636         case SIOCSIFHWBROADCAST:
5637         case SIOCSMIIREG:
5638         case SIOCBONDENSLAVE:
5639         case SIOCBONDRELEASE:
5640         case SIOCBONDSETHWADDR:
5641         case SIOCBONDCHANGEACTIVE:
5642         case SIOCBRADDIF:
5643         case SIOCBRDELIF:
5644         case SIOCSHWTSTAMP:
5645                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5646                         return -EPERM;
5647                 /* fall through */
5648         case SIOCBONDSLAVEINFOQUERY:
5649         case SIOCBONDINFOQUERY:
5650                 dev_load(net, ifr.ifr_name);
5651                 rtnl_lock();
5652                 ret = dev_ifsioc(net, &ifr, cmd);
5653                 rtnl_unlock();
5654                 return ret;
5655
5656         case SIOCGIFMEM:
5657                 /* Get the per device memory space. We can add this but
5658                  * currently do not support it */
5659         case SIOCSIFMEM:
5660                 /* Set the per device memory buffer space.
5661                  * Not applicable in our case */
5662         case SIOCSIFLINK:
5663                 return -ENOTTY;
5664
5665         /*
5666          *      Unknown or private ioctl.
5667          */
5668         default:
5669                 if (cmd == SIOCWANDEV ||
5670                     (cmd >= SIOCDEVPRIVATE &&
5671                      cmd <= SIOCDEVPRIVATE + 15)) {
5672                         dev_load(net, ifr.ifr_name);
5673                         rtnl_lock();
5674                         ret = dev_ifsioc(net, &ifr, cmd);
5675                         rtnl_unlock();
5676                         if (!ret && copy_to_user(arg, &ifr,
5677                                                  sizeof(struct ifreq)))
5678                                 ret = -EFAULT;
5679                         return ret;
5680                 }
5681                 /* Take care of Wireless Extensions */
5682                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5683                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5684                 return -ENOTTY;
5685         }
5686 }
5687
5688
5689 /**
5690  *      dev_new_index   -       allocate an ifindex
5691  *      @net: the applicable net namespace
5692  *
5693  *      Returns a suitable unique value for a new device interface
5694  *      number.  The caller must hold the rtnl semaphore or the
5695  *      dev_base_lock to be sure it remains unique.
5696  */
5697 static int dev_new_index(struct net *net)
5698 {
5699         int ifindex = net->ifindex;
5700         for (;;) {
5701                 if (++ifindex <= 0)
5702                         ifindex = 1;
5703                 if (!__dev_get_by_index(net, ifindex))
5704                         return net->ifindex = ifindex;
5705         }
5706 }
5707
5708 /* Delayed registration/unregisteration */
5709 static LIST_HEAD(net_todo_list);
5710
5711 static void net_set_todo(struct net_device *dev)
5712 {
5713         list_add_tail(&dev->todo_list, &net_todo_list);
5714 }
5715
5716 static void rollback_registered_many(struct list_head *head)
5717 {
5718         struct net_device *dev, *tmp;
5719
5720         BUG_ON(dev_boot_phase);
5721         ASSERT_RTNL();
5722
5723         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5724                 /* Some devices call without registering
5725                  * for initialization unwind. Remove those
5726                  * devices and proceed with the remaining.
5727                  */
5728                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5729                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5730                                  dev->name, dev);
5731
5732                         WARN_ON(1);
5733                         list_del(&dev->unreg_list);
5734                         continue;
5735                 }
5736                 dev->dismantle = true;
5737                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5738         }
5739
5740         /* If device is running, close it first. */
5741         dev_close_many(head);
5742
5743         list_for_each_entry(dev, head, unreg_list) {
5744                 /* And unlink it from device chain. */
5745                 unlist_netdevice(dev);
5746
5747                 dev->reg_state = NETREG_UNREGISTERING;
5748         }
5749
5750         synchronize_net();
5751
5752         list_for_each_entry(dev, head, unreg_list) {
5753                 /* Shutdown queueing discipline. */
5754                 dev_shutdown(dev);
5755
5756
5757                 /* Notify protocols, that we are about to destroy
5758                    this device. They should clean all the things.
5759                 */
5760                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5761
5762                 if (!dev->rtnl_link_ops ||
5763                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5764                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5765
5766                 /*
5767                  *      Flush the unicast and multicast chains
5768                  */
5769                 dev_uc_flush(dev);
5770                 dev_mc_flush(dev);
5771
5772                 if (dev->netdev_ops->ndo_uninit)
5773                         dev->netdev_ops->ndo_uninit(dev);
5774
5775                 /* Notifier chain MUST detach us all upper devices. */
5776                 WARN_ON(netdev_has_any_upper_dev(dev));
5777
5778                 /* Remove entries from kobject tree */
5779                 netdev_unregister_kobject(dev);
5780 #ifdef CONFIG_XPS
5781                 /* Remove XPS queueing entries */
5782                 netif_reset_xps_queues_gt(dev, 0);
5783 #endif
5784         }
5785
5786         synchronize_net();
5787
5788         list_for_each_entry(dev, head, unreg_list)
5789                 dev_put(dev);
5790 }
5791
5792 static void rollback_registered(struct net_device *dev)
5793 {
5794         LIST_HEAD(single);
5795
5796         list_add(&dev->unreg_list, &single);
5797         rollback_registered_many(&single);
5798         list_del(&single);
5799 }
5800
5801 static netdev_features_t netdev_fix_features(struct net_device *dev,
5802         netdev_features_t features)
5803 {
5804         /* Fix illegal checksum combinations */
5805         if ((features & NETIF_F_HW_CSUM) &&
5806             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5807                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5808                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5809         }
5810
5811         /* Fix illegal SG+CSUM combinations. */
5812         if ((features & NETIF_F_SG) &&
5813             !(features & NETIF_F_ALL_CSUM)) {
5814                 netdev_dbg(dev,
5815                         "Dropping NETIF_F_SG since no checksum feature.\n");
5816                 features &= ~NETIF_F_SG;
5817         }
5818
5819         /* TSO requires that SG is present as well. */
5820         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5821                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5822                 features &= ~NETIF_F_ALL_TSO;
5823         }
5824
5825         /* TSO ECN requires that TSO is present as well. */
5826         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5827                 features &= ~NETIF_F_TSO_ECN;
5828
5829         /* Software GSO depends on SG. */
5830         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5831                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5832                 features &= ~NETIF_F_GSO;
5833         }
5834
5835         /* UFO needs SG and checksumming */
5836         if (features & NETIF_F_UFO) {
5837                 /* maybe split UFO into V4 and V6? */
5838                 if (!((features & NETIF_F_GEN_CSUM) ||
5839                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5840                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5841                         netdev_dbg(dev,
5842                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5843                         features &= ~NETIF_F_UFO;
5844                 }
5845
5846                 if (!(features & NETIF_F_SG)) {
5847                         netdev_dbg(dev,
5848                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5849                         features &= ~NETIF_F_UFO;
5850                 }
5851         }
5852
5853         return features;
5854 }
5855
5856 int __netdev_update_features(struct net_device *dev)
5857 {
5858         netdev_features_t features;
5859         int err = 0;
5860
5861         ASSERT_RTNL();
5862
5863         features = netdev_get_wanted_features(dev);
5864
5865         if (dev->netdev_ops->ndo_fix_features)
5866                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5867
5868         /* driver might be less strict about feature dependencies */
5869         features = netdev_fix_features(dev, features);
5870
5871         if (dev->features == features)
5872                 return 0;
5873
5874         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5875                 &dev->features, &features);
5876
5877         if (dev->netdev_ops->ndo_set_features)
5878                 err = dev->netdev_ops->ndo_set_features(dev, features);
5879
5880         if (unlikely(err < 0)) {
5881                 netdev_err(dev,
5882                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5883                         err, &features, &dev->features);
5884                 return -1;
5885         }
5886
5887         if (!err)
5888                 dev->features = features;
5889
5890         return 1;
5891 }
5892
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate dev->features set and send notifications if it
 *      has changed. Should be called after driver or hardware dependent
 *      conditions might have changed that influence the features.
 *
 *      The notification is sent for any non-zero result of
 *      __netdev_update_features(), which includes its -1 error result.
 */
void netdev_update_features(struct net_device *dev)
{
        if (!__netdev_update_features(dev))
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5907
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate dev->features set and send notifications even
 *      if they have not changed. Should be called instead of
 *      netdev_update_features() if also dev->vlan_features might
 *      have changed to allow the changes to be propagated to stacked
 *      VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is deliberately ignored: notify unconditionally. */
        __netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5924
5925 /**
5926  *      netif_stacked_transfer_operstate -      transfer operstate
5927  *      @rootdev: the root or lower level device to transfer state from
5928  *      @dev: the device to transfer operstate to
5929  *
5930  *      Transfer operational state from root to device. This is normally
5931  *      called when a stacking relationship exists between the root
5932  *      device and the device(a leaf device).
5933  */
5934 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5935                                         struct net_device *dev)
5936 {
5937         if (rootdev->operstate == IF_OPER_DORMANT)
5938                 netif_dormant_on(dev);
5939         else
5940                 netif_dormant_off(dev);
5941
5942         if (netif_carrier_ok(rootdev)) {
5943                 if (!netif_carrier_ok(dev))
5944                         netif_carrier_on(dev);
5945         } else {
5946                 if (netif_carrier_ok(dev))
5947                         netif_carrier_off(dev);
5948         }
5949 }
5950 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5951
#ifdef CONFIG_RPS
/*
 *      Allocate the zeroed array of dev->num_rx_queues receive queues,
 *      publish it as dev->_rx and point every entry back at @dev.
 *      Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nqueues < 1);

        queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (idx = 0; idx < nqueues; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
5971
5972 static void netdev_init_one_queue(struct net_device *dev,
5973                                   struct netdev_queue *queue, void *_unused)
5974 {
5975         /* Initialize queue lock */
5976         spin_lock_init(&queue->_xmit_lock);
5977         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5978         queue->xmit_lock_owner = -1;
5979         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5980         queue->dev = dev;
5981 #ifdef CONFIG_BQL
5982         dql_init(&queue->dql, HZ);
5983 #endif
5984 }
5985
5986 static int netif_alloc_netdev_queues(struct net_device *dev)
5987 {
5988         unsigned int count = dev->num_tx_queues;
5989         struct netdev_queue *tx;
5990
5991         BUG_ON(count < 1);
5992
5993         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5994         if (!tx)
5995                 return -ENOMEM;
5996
5997         dev->_tx = tx;
5998
5999         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6000         spin_lock_init(&dev->tx_global_lock);
6001
6002         return 0;
6003 }
6004
6005 /**
6006  *      register_netdevice      - register a network device
6007  *      @dev: device to register
6008  *
6009  *      Take a completed network device structure and add it to the kernel
6010  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6011  *      chain. 0 is returned on success. A negative errno code is returned
6012  *      on a failure to set up the device, or if the name is a duplicate.
6013  *
6014  *      Callers must hold the rtnl semaphore. You may want
6015  *      register_netdev() instead of this.
6016  *
6017  *      BUGS:
6018  *      The locking appears insufficient to guarantee two parallel registers
6019  *      will not get the same name.
6020  */
6021
6022 int register_netdevice(struct net_device *dev)
6023 {
6024         int ret;
6025         struct net *net = dev_net(dev);
6026
6027         BUG_ON(dev_boot_phase);
6028         ASSERT_RTNL();
6029
6030         might_sleep();
6031
6032         /* When net_device's are persistent, this will be fatal. */
6033         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6034         BUG_ON(!net);
6035
6036         spin_lock_init(&dev->addr_list_lock);
6037         netdev_set_addr_lockdep_class(dev);
6038
6039         dev->iflink = -1;
6040
6041         ret = dev_get_valid_name(net, dev, dev->name);
6042         if (ret < 0)
6043                 goto out;
6044
6045         /* Init, if this function is available */
6046         if (dev->netdev_ops->ndo_init) {
6047                 ret = dev->netdev_ops->ndo_init(dev);
6048                 if (ret) {
6049                         if (ret > 0)
6050                                 ret = -EIO;
6051                         goto out;
6052                 }
6053         }
6054
6055         if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
6056             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6057              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6058                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6059                 ret = -EINVAL;
6060                 goto err_uninit;
6061         }
6062
6063         ret = -EBUSY;
6064         if (!dev->ifindex)
6065                 dev->ifindex = dev_new_index(net);
6066         else if (__dev_get_by_index(net, dev->ifindex))
6067                 goto err_uninit;
6068
6069         if (dev->iflink == -1)
6070                 dev->iflink = dev->ifindex;
6071
6072         /* Transfer changeable features to wanted_features and enable
6073          * software offloads (GSO and GRO).
6074          */
6075         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6076         dev->features |= NETIF_F_SOFT_FEATURES;
6077         dev->wanted_features = dev->features & dev->hw_features;
6078
6079         /* Turn on no cache copy if HW is doing checksum */
6080         if (!(dev->flags & IFF_LOOPBACK)) {
6081                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6082                 if (dev->features & NETIF_F_ALL_CSUM) {
6083                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
6084                         dev->features |= NETIF_F_NOCACHE_COPY;
6085                 }
6086         }
6087
6088         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6089          */
6090         dev->vlan_features |= NETIF_F_HIGHDMA;
6091
6092         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6093         ret = notifier_to_errno(ret);
6094         if (ret)
6095                 goto err_uninit;
6096
6097         ret = netdev_register_kobject(dev);
6098         if (ret)
6099                 goto err_uninit;
6100         dev->reg_state = NETREG_REGISTERED;
6101
6102         __netdev_update_features(dev);
6103
6104         /*
6105          *      Default initial state at registry is that the
6106          *      device is present.
6107          */
6108
6109         set_bit(__LINK_STATE_PRESENT, &dev->state);
6110
6111         linkwatch_init_dev(dev);
6112
6113         dev_init_scheduler(dev);
6114         dev_hold(dev);
6115         list_netdevice(dev);
6116         add_device_randomness(dev->dev_addr, dev->addr_len);
6117
6118         /* If the device has permanent device address, driver should
6119          * set dev_addr and also addr_assign_type should be set to
6120          * NET_ADDR_PERM (default value).
6121          */
6122         if (dev->addr_assign_type == NET_ADDR_PERM)
6123                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6124
6125         /* Notify protocols, that a new device appeared. */
6126         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6127         ret = notifier_to_errno(ret);
6128         if (ret) {
6129                 rollback_registered(dev);
6130                 dev->reg_state = NETREG_UNREGISTERED;
6131         }
6132         /*
6133          *      Prevent userspace races by waiting until the network
6134          *      device is fully setup before sending notifications.
6135          */
6136         if (!dev->rtnl_link_ops ||
6137             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6138                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6139
6140 out:
6141         return ret;
6142
6143 err_uninit:
6144         if (dev->netdev_ops->ndo_uninit)
6145                 dev->netdev_ops->ndo_uninit(dev);
6146         goto out;
6147 }
6148 EXPORT_SYMBOL(register_netdevice);
6149
6150 /**
6151  *      init_dummy_netdev       - init a dummy network device for NAPI
6152  *      @dev: device to init
6153  *
6154  *      This takes a network device structure and initialize the minimum
6155  *      amount of fields so it can be used to schedule NAPI polls without
6156  *      registering a full blown interface. This is to be used by drivers
6157  *      that need to tie several hardware interfaces to a single NAPI
6158  *      poll scheduler due to HW limitations.
6159  */
6160 int init_dummy_netdev(struct net_device *dev)
6161 {
6162         /* Clear everything. Note we don't initialize spinlocks
6163          * are they aren't supposed to be taken by any of the
6164          * NAPI code and this dummy netdev is supposed to be
6165          * only ever used for NAPI polls
6166          */
6167         memset(dev, 0, sizeof(struct net_device));
6168
6169         /* make sure we BUG if trying to hit standard
6170          * register/unregister code path
6171          */
6172         dev->reg_state = NETREG_DUMMY;
6173
6174         /* NAPI wants this */
6175         INIT_LIST_HEAD(&dev->napi_list);
6176
6177         /* a dummy interface is started by default */
6178         set_bit(__LINK_STATE_PRESENT, &dev->state);
6179         set_bit(__LINK_STATE_START, &dev->state);
6180
6181         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6182          * because users of this 'device' dont need to change
6183          * its refcount.
6184          */
6185
6186         return 0;
6187 }
6188 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6189
6190
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked convenience wrapper: takes the rtnl semaphore, hands @dev
 *	to register_netdevice() and releases the semaphore again.  The
 *	device name is validated by register_netdevice() itself.
 *
 *	Returns whatever register_netdevice() returned: 0 on success or
 *	a negative errno code on failure.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
6214
6215 int netdev_refcnt_read(const struct net_device *dev)
6216 {
6217         int i, refcnt = 0;
6218
6219         for_each_possible_cpu(i)
6220                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6221         return refcnt;
6222 }
6223 EXPORT_SYMBOL(netdev_refcnt_read);
6224
6225 /**
6226  * netdev_wait_allrefs - wait until all references are gone.
6227  * @dev: target net_device
6228  *
6229  * This is called when unregistering network devices.
6230  *
6231  * Any protocol or device that holds a reference should register
6232  * for netdevice notification, and cleanup and put back the
6233  * reference if they receive an UNREGISTER event.
6234  * We can get stuck here if buggy protocols don't correctly
6235  * call dev_put.
6236  */
6237 static void netdev_wait_allrefs(struct net_device *dev)
6238 {
6239         unsigned long rebroadcast_time, warning_time;
6240         int refcnt;
6241
6242         linkwatch_forget_dev(dev);
6243
6244         rebroadcast_time = warning_time = jiffies;
6245         refcnt = netdev_refcnt_read(dev);
6246
6247         while (refcnt != 0) {
6248                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6249                         rtnl_lock();
6250
6251                         /* Rebroadcast unregister notification */
6252                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6253
6254                         __rtnl_unlock();
6255                         rcu_barrier();
6256                         rtnl_lock();
6257
6258                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6259                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6260                                      &dev->state)) {
6261                                 /* We must not have linkwatch events
6262                                  * pending on unregister. If this
6263                                  * happens, we simply run the queue
6264                                  * unscheduled, resulting in a noop
6265                                  * for this device.
6266                                  */
6267                                 linkwatch_run_queue();
6268                         }
6269
6270                         __rtnl_unlock();
6271
6272                         rebroadcast_time = jiffies;
6273                 }
6274
6275                 msleep(250);
6276
6277                 refcnt = netdev_refcnt_read(dev);
6278
6279                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6280                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6281                                  dev->name, refcnt);
6282                         warning_time = jiffies;
6283                 }
6284         }
6285 }
6286
6287 /* The sequence is:
6288  *
6289  *      rtnl_lock();
6290  *      ...
6291  *      register_netdevice(x1);
6292  *      register_netdevice(x2);
6293  *      ...
6294  *      unregister_netdevice(y1);
6295  *      unregister_netdevice(y2);
6296  *      ...
6297  *      rtnl_unlock();
6298  *      free_netdev(y1);
6299  *      free_netdev(y2);
6300  *
6301  * We are invoked by rtnl_unlock().
6302  * This allows us to deal with problems:
6303  * 1) We can delete sysfs objects which invoke hotplug
6304  *    without deadlocking with linkwatch via keventd.
6305  * 2) Since we run with the RTNL semaphore not held, we can sleep
6306  *    safely in order to wait for the netdev refcnt to drop to zero.
6307  *
6308  * We must not return until all unregister events added during
6309  * the interval the lock was held have been completed.
6310  */
6311 void netdev_run_todo(void)
6312 {
6313         struct list_head list;
6314
6315         /* Snapshot list, allow later requests */
6316         list_replace_init(&net_todo_list, &list);
6317
6318         __rtnl_unlock();
6319
6320
6321         /* Wait for rcu callbacks to finish before next phase */
6322         if (!list_empty(&list))
6323                 rcu_barrier();
6324
6325         while (!list_empty(&list)) {
6326                 struct net_device *dev
6327                         = list_first_entry(&list, struct net_device, todo_list);
6328                 list_del(&dev->todo_list);
6329
6330                 rtnl_lock();
6331                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6332                 __rtnl_unlock();
6333
6334                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6335                         pr_err("network todo '%s' but state %d\n",
6336                                dev->name, dev->reg_state);
6337                         dump_stack();
6338                         continue;
6339                 }
6340
6341                 dev->reg_state = NETREG_UNREGISTERED;
6342
6343                 on_each_cpu(flush_backlog, dev, 1);
6344
6345                 netdev_wait_allrefs(dev);
6346
6347                 /* paranoia */
6348                 BUG_ON(netdev_refcnt_read(dev));
6349                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6350                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6351                 WARN_ON(dev->dn_ptr);
6352
6353                 if (dev->destructor)
6354                         dev->destructor(dev);
6355
6356                 /* Free network device */
6357                 kobject_put(&dev->dev.kobj);
6358         }
6359 }
6360
6361 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6362  * fields in the same order, with only the type differing.
6363  */
6364 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6365                              const struct net_device_stats *netdev_stats)
6366 {
6367 #if BITS_PER_LONG == 64
6368         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6369         memcpy(stats64, netdev_stats, sizeof(*stats64));
6370 #else
6371         size_t i, n = sizeof(*stats64) / sizeof(u64);
6372         const unsigned long *src = (const unsigned long *)netdev_stats;
6373         u64 *dst = (u64 *)stats64;
6374
6375         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6376                      sizeof(*stats64) / sizeof(u64));
6377         for (i = 0; i < n; i++)
6378                 dst[i] = src[i];
6379 #endif
6380 }
6381 EXPORT_SYMBOL(netdev_stats_to_stats64);
6382
6383 /**
6384  *      dev_get_stats   - get network device statistics
6385  *      @dev: device to get statistics from
6386  *      @storage: place to store stats
6387  *
6388  *      Get network statistics from device. Return @storage.
6389  *      The device driver may provide its own method by setting
6390  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6391  *      otherwise the internal statistics structure is used.
6392  */
6393 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6394                                         struct rtnl_link_stats64 *storage)
6395 {
6396         const struct net_device_ops *ops = dev->netdev_ops;
6397
6398         if (ops->ndo_get_stats64) {
6399                 memset(storage, 0, sizeof(*storage));
6400                 ops->ndo_get_stats64(dev, storage);
6401         } else if (ops->ndo_get_stats) {
6402                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6403         } else {
6404                 netdev_stats_to_stats64(storage, &dev->stats);
6405         }
6406         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6407         return storage;
6408 }
6409 EXPORT_SYMBOL(dev_get_stats);
6410
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT enabled a
 * missing queue is allocated, attached to noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6428
6429 static const struct ethtool_ops default_ethtool_ops;
6430
6431 void netdev_set_default_ethtool_ops(struct net_device *dev,
6432                                     const struct ethtool_ops *ops)
6433 {
6434         if (dev->ethtool_ops == &default_ethtool_ops)
6435                 dev->ethtool_ops = ops;
6436 }
6437 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6438
6439 /**
6440  *      alloc_netdev_mqs - allocate network device
6441  *      @sizeof_priv:   size of private data to allocate space for
6442  *      @name:          device name format string
6443  *      @setup:         callback to initialize device
6444  *      @txqs:          the number of TX subqueues to allocate
6445  *      @rxqs:          the number of RX subqueues to allocate
6446  *
6447  *      Allocates a struct net_device with private data area for driver use
6448  *      and performs basic initialization.  Also allocates subquue structs
6449  *      for each queue on the device.
6450  */
6451 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6452                 void (*setup)(struct net_device *),
6453                 unsigned int txqs, unsigned int rxqs)
6454 {
6455         struct net_device *dev;
6456         size_t alloc_size;
6457         struct net_device *p;
6458
6459         BUG_ON(strlen(name) >= sizeof(dev->name));
6460
6461         if (txqs < 1) {
6462                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6463                 return NULL;
6464         }
6465
6466 #ifdef CONFIG_RPS
6467         if (rxqs < 1) {
6468                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6469                 return NULL;
6470         }
6471 #endif
6472
6473         alloc_size = sizeof(struct net_device);
6474         if (sizeof_priv) {
6475                 /* ensure 32-byte alignment of private area */
6476                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6477                 alloc_size += sizeof_priv;
6478         }
6479         /* ensure 32-byte alignment of whole construct */
6480         alloc_size += NETDEV_ALIGN - 1;
6481
6482         p = kzalloc(alloc_size, GFP_KERNEL);
6483         if (!p)
6484                 return NULL;
6485
6486         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6487         dev->padded = (char *)dev - (char *)p;
6488
6489         dev->pcpu_refcnt = alloc_percpu(int);
6490         if (!dev->pcpu_refcnt)
6491                 goto free_p;
6492
6493         if (dev_addr_init(dev))
6494                 goto free_pcpu;
6495
6496         dev_mc_init(dev);
6497         dev_uc_init(dev);
6498
6499         dev_net_set(dev, &init_net);
6500
6501         dev->gso_max_size = GSO_MAX_SIZE;
6502         dev->gso_max_segs = GSO_MAX_SEGS;
6503
6504         INIT_LIST_HEAD(&dev->napi_list);
6505         INIT_LIST_HEAD(&dev->unreg_list);
6506         INIT_LIST_HEAD(&dev->link_watch_list);
6507         INIT_LIST_HEAD(&dev->upper_dev_list);
6508         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6509         setup(dev);
6510
6511         dev->num_tx_queues = txqs;
6512         dev->real_num_tx_queues = txqs;
6513         if (netif_alloc_netdev_queues(dev))
6514                 goto free_all;
6515
6516 #ifdef CONFIG_RPS
6517         dev->num_rx_queues = rxqs;
6518         dev->real_num_rx_queues = rxqs;
6519         if (netif_alloc_rx_queues(dev))
6520                 goto free_all;
6521 #endif
6522
6523         strcpy(dev->name, name);
6524         dev->group = INIT_NETDEV_GROUP;
6525         if (!dev->ethtool_ops)
6526                 dev->ethtool_ops = &default_ethtool_ops;
6527         return dev;
6528
6529 free_all:
6530         free_netdev(dev);
6531         return NULL;
6532
6533 free_pcpu:
6534         free_percpu(dev->pcpu_refcnt);
6535         kfree(dev->_tx);
6536 #ifdef CONFIG_RPS
6537         kfree(dev->_rx);
6538 #endif
6539
6540 free_p:
6541         kfree(p);
6542         return NULL;
6543 }
6544 EXPORT_SYMBOL(alloc_netdev_mqs);
6545
6546 /**
6547  *      free_netdev - free network device
6548  *      @dev: device
6549  *
6550  *      This function does the last stage of destroying an allocated device
6551  *      interface. The reference to the device object is released.
6552  *      If this is the last reference then it will be freed.
6553  */
6554 void free_netdev(struct net_device *dev)
6555 {
6556         struct napi_struct *p, *n;
6557
6558         release_net(dev_net(dev));
6559
6560         kfree(dev->_tx);
6561 #ifdef CONFIG_RPS
6562         kfree(dev->_rx);
6563 #endif
6564
6565         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6566
6567         /* Flush device addresses */
6568         dev_addr_flush(dev);
6569
6570         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6571                 netif_napi_del(p);
6572
6573         free_percpu(dev->pcpu_refcnt);
6574         dev->pcpu_refcnt = NULL;
6575
6576         /*  Compatibility with error handling in drivers */
6577         if (dev->reg_state == NETREG_UNINITIALIZED) {
6578                 kfree((char *)dev - dev->padded);
6579                 return;
6580         }
6581
6582         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6583         dev->reg_state = NETREG_RELEASED;
6584
6585         /* will free via device release */
6586         put_device(&dev->dev);
6587 }
6588 EXPORT_SYMBOL(free_netdev);
6589
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when the rtnl lock is
 *	held, the normal one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6605
6606 /**
6607  *      unregister_netdevice_queue - remove device from the kernel
6608  *      @dev: device
6609  *      @head: list
6610  *
6611  *      This function shuts down a device interface and removes it
6612  *      from the kernel tables.
6613  *      If head not NULL, device is queued to be unregistered later.
6614  *
6615  *      Callers must hold the rtnl semaphore.  You may want
6616  *      unregister_netdev() instead of this.
6617  */
6618
6619 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6620 {
6621         ASSERT_RTNL();
6622
6623         if (head) {
6624                 list_move_tail(&dev->unreg_list, head);
6625         } else {
6626                 rollback_registered(dev);
6627                 /* Finish processing unregister after unlock */
6628                 net_set_todo(dev);
6629         }
6630 }
6631 EXPORT_SYMBOL(unregister_netdevice_queue);
6632
6633 /**
6634  *      unregister_netdevice_many - unregister many devices
6635  *      @head: list of devices
6636  */
6637 void unregister_netdevice_many(struct list_head *head)
6638 {
6639         struct net_device *dev;
6640
6641         if (!list_empty(head)) {
6642                 rollback_registered_many(head);
6643                 list_for_each_entry(dev, head, unreg_list)
6644                         net_set_todo(dev);
6645         }
6646 }
6647 EXPORT_SYMBOL(unregister_netdevice_many);
6648
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Locked convenience wrapper: calls unregister_netdevice() with
 *	the rtnl semaphore held.  In general this is the function to
 *	use, rather than unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6667
6668 /**
6669  *      dev_change_net_namespace - move device to different nethost namespace
6670  *      @dev: device
6671  *      @net: network namespace
6672  *      @pat: If not NULL name pattern to try if the current device name
6673  *            is already taken in the destination network namespace.
6674  *
6675  *      This function shuts down a device interface and moves it
6676  *      to a new network namespace. On success 0 is returned, on
6677  *      a failure a netagive errno code is returned.
6678  *
6679  *      Callers must hold the rtnl semaphore.
6680  */
6681
6682 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6683 {
6684         int err;
6685
6686         ASSERT_RTNL();
6687
6688         /* Don't allow namespace local devices to be moved. */
6689         err = -EINVAL;
6690         if (dev->features & NETIF_F_NETNS_LOCAL)
6691                 goto out;
6692
6693         /* Ensure the device has been registrered */
6694         if (dev->reg_state != NETREG_REGISTERED)
6695                 goto out;
6696
6697         /* Get out if there is nothing todo */
6698         err = 0;
6699         if (net_eq(dev_net(dev), net))
6700                 goto out;
6701
6702         /* Pick the destination device name, and ensure
6703          * we can use it in the destination network namespace.
6704          */
6705         err = -EEXIST;
6706         if (__dev_get_by_name(net, dev->name)) {
6707                 /* We get here if we can't use the current device name */
6708                 if (!pat)
6709                         goto out;
6710                 if (dev_get_valid_name(net, dev, pat) < 0)
6711                         goto out;
6712         }
6713
6714         /*
6715          * And now a mini version of register_netdevice unregister_netdevice.
6716          */
6717
6718         /* If device is running close it first. */
6719         dev_close(dev);
6720
6721         /* And unlink it from device chain */
6722         err = -ENODEV;
6723         unlist_netdevice(dev);
6724
6725         synchronize_net();
6726
6727         /* Shutdown queueing discipline. */
6728         dev_shutdown(dev);
6729
6730         /* Notify protocols, that we are about to destroy
6731            this device. They should clean all the things.
6732
6733            Note that dev->reg_state stays at NETREG_REGISTERED.
6734            This is wanted because this way 8021q and macvlan know
6735            the device is just moving and can keep their slaves up.
6736         */
6737         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6738         rcu_barrier();
6739         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6740         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6741
6742         /*
6743          *      Flush the unicast and multicast chains
6744          */
6745         dev_uc_flush(dev);
6746         dev_mc_flush(dev);
6747
6748         /* Send a netdev-removed uevent to the old namespace */
6749         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6750
6751         /* Actually switch the network namespace */
6752         dev_net_set(dev, net);
6753
6754         /* If there is an ifindex conflict assign a new one */
6755         if (__dev_get_by_index(net, dev->ifindex)) {
6756                 int iflink = (dev->iflink == dev->ifindex);
6757                 dev->ifindex = dev_new_index(net);
6758                 if (iflink)
6759                         dev->iflink = dev->ifindex;
6760         }
6761
6762         /* Send a netdev-add uevent to the new namespace */
6763         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6764
6765         /* Fixup kobjects */
6766         err = device_rename(&dev->dev, dev->name);
6767         WARN_ON(err);
6768
6769         /* Add the device back in the hashes */
6770         list_netdevice(dev);
6771
6772         /* Notify protocols, that a new device appeared. */
6773         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6774
6775         /*
6776          *      Prevent userspace races by waiting until the network
6777          *      device is fully setup before sending notifications.
6778          */
6779         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6780
6781         synchronize_net();
6782         err = 0;
6783 out:
6784         return err;
6785 }
6786 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6787
6788 static int dev_cpu_callback(struct notifier_block *nfb,
6789                             unsigned long action,
6790                             void *ocpu)
6791 {
6792         struct sk_buff **list_skb;
6793         struct sk_buff *skb;
6794         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6795         struct softnet_data *sd, *oldsd;
6796
6797         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6798                 return NOTIFY_OK;
6799
6800         local_irq_disable();
6801         cpu = smp_processor_id();
6802         sd = &per_cpu(softnet_data, cpu);
6803         oldsd = &per_cpu(softnet_data, oldcpu);
6804
6805         /* Find end of our completion_queue. */
6806         list_skb = &sd->completion_queue;
6807         while (*list_skb)
6808                 list_skb = &(*list_skb)->next;
6809         /* Append completion queue from offline CPU. */
6810         *list_skb = oldsd->completion_queue;
6811         oldsd->completion_queue = NULL;
6812
6813         /* Append output queue from offline CPU. */
6814         if (oldsd->output_queue) {
6815                 *sd->output_queue_tailp = oldsd->output_queue;
6816                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6817                 oldsd->output_queue = NULL;
6818                 oldsd->output_queue_tailp = &oldsd->output_queue;
6819         }
6820         /* Append NAPI poll list from offline CPU. */
6821         if (!list_empty(&oldsd->poll_list)) {
6822                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6823                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6824         }
6825
6826         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6827         local_irq_enable();
6828
6829         /* Process offline CPU's input_pkt_queue */
6830         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6831                 netif_rx(skb);
6832                 input_queue_head_incr(oldsd);
6833         }
6834         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6835                 netif_rx(skb);
6836                 input_queue_head_incr(oldsd);
6837         }
6838
6839         return NOTIFY_OK;
6840 }
6841
6842
6843 /**
6844  *      netdev_increment_features - increment feature set by one
6845  *      @all: current feature set
6846  *      @one: new feature set
6847  *      @mask: mask feature set
6848  *
6849  *      Computes a new feature set after adding a device with feature set
6850  *      @one to the master device with current feature set @all.  Will not
6851  *      enable anything that is off in @mask. Returns the new feature set.
6852  */
6853 netdev_features_t netdev_increment_features(netdev_features_t all,
6854         netdev_features_t one, netdev_features_t mask)
6855 {
6856         if (mask & NETIF_F_GEN_CSUM)
6857                 mask |= NETIF_F_ALL_CSUM;
6858         mask |= NETIF_F_VLAN_CHALLENGED;
6859
6860         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6861         all &= one | ~NETIF_F_ALL_FOR_ALL;
6862
6863         /* If one device supports hw checksumming, set for all. */
6864         if (all & NETIF_F_GEN_CSUM)
6865                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6866
6867         return all;
6868 }
6869 EXPORT_SYMBOL(netdev_increment_features);
6870
6871 static struct hlist_head *netdev_create_hash(void)
6872 {
6873         int i;
6874         struct hlist_head *hash;
6875
6876         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6877         if (hash != NULL)
6878                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6879                         INIT_HLIST_HEAD(&hash[i]);
6880
6881         return hash;
6882 }
6883
6884 /* Initialize per network namespace state */
6885 static int __net_init netdev_init(struct net *net)
6886 {
6887         if (net != &init_net)
6888                 INIT_LIST_HEAD(&net->dev_base_head);
6889
6890         net->dev_name_head = netdev_create_hash();
6891         if (net->dev_name_head == NULL)
6892                 goto err_name;
6893
6894         net->dev_index_head = netdev_create_hash();
6895         if (net->dev_index_head == NULL)
6896                 goto err_idx;
6897
6898         return 0;
6899
6900 err_idx:
6901         kfree(net->dev_name_head);
6902 err_name:
6903         return -ENOMEM;
6904 }
6905
6906 /**
6907  *      netdev_drivername - network driver for the device
6908  *      @dev: network device
6909  *
6910  *      Determine network driver for device.
6911  */
6912 const char *netdev_drivername(const struct net_device *dev)
6913 {
6914         const struct device_driver *driver;
6915         const struct device *parent;
6916         const char *empty = "";
6917
6918         parent = dev->dev.parent;
6919         if (!parent)
6920                 return empty;
6921
6922         driver = parent->driver;
6923         if (driver && driver->name)
6924                 return driver->name;
6925         return empty;
6926 }
6927
6928 static int __netdev_printk(const char *level, const struct net_device *dev,
6929                            struct va_format *vaf)
6930 {
6931         int r;
6932
6933         if (dev && dev->dev.parent) {
6934                 r = dev_printk_emit(level[1] - '0',
6935                                     dev->dev.parent,
6936                                     "%s %s %s: %pV",
6937                                     dev_driver_string(dev->dev.parent),
6938                                     dev_name(dev->dev.parent),
6939                                     netdev_name(dev), vaf);
6940         } else if (dev) {
6941                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6942         } else {
6943                 r = printk("%s(NULL net_device): %pV", level, vaf);
6944         }
6945
6946         return r;
6947 }
6948
6949 int netdev_printk(const char *level, const struct net_device *dev,
6950                   const char *format, ...)
6951 {
6952         struct va_format vaf;
6953         va_list args;
6954         int r;
6955
6956         va_start(args, format);
6957
6958         vaf.fmt = format;
6959         vaf.va = &args;
6960
6961         r = __netdev_printk(level, dev, &vaf);
6962
6963         va_end(args);
6964
6965         return r;
6966 }
6967 EXPORT_SYMBOL(netdev_printk);
6968
6969 #define define_netdev_printk_level(func, level)                 \
6970 int func(const struct net_device *dev, const char *fmt, ...)    \
6971 {                                                               \
6972         int r;                                                  \
6973         struct va_format vaf;                                   \
6974         va_list args;                                           \
6975                                                                 \
6976         va_start(args, fmt);                                    \
6977                                                                 \
6978         vaf.fmt = fmt;                                          \
6979         vaf.va = &args;                                         \
6980                                                                 \
6981         r = __netdev_printk(level, dev, &vaf);                  \
6982                                                                 \
6983         va_end(args);                                           \
6984                                                                 \
6985         return r;                                               \
6986 }                                                               \
6987 EXPORT_SYMBOL(func);
6988
6989 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6990 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6991 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6992 define_netdev_printk_level(netdev_err, KERN_ERR);
6993 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6994 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6995 define_netdev_printk_level(netdev_info, KERN_INFO);
6996
6997 static void __net_exit netdev_exit(struct net *net)
6998 {
6999         kfree(net->dev_name_head);
7000         kfree(net->dev_index_head);
7001 }
7002
7003 static struct pernet_operations __net_initdata netdev_net_ops = {
7004         .init = netdev_init,
7005         .exit = netdev_exit,
7006 };
7007
7008 static void __net_exit default_device_exit(struct net *net)
7009 {
7010         struct net_device *dev, *aux;
7011         /*
7012          * Push all migratable network devices back to the
7013          * initial network namespace
7014          */
7015         rtnl_lock();
7016         for_each_netdev_safe(net, dev, aux) {
7017                 int err;
7018                 char fb_name[IFNAMSIZ];
7019
7020                 /* Ignore unmoveable devices (i.e. loopback) */
7021                 if (dev->features & NETIF_F_NETNS_LOCAL)
7022                         continue;
7023
7024                 /* Leave virtual devices for the generic cleanup */
7025                 if (dev->rtnl_link_ops)
7026                         continue;
7027
7028                 /* Push remaining network devices to init_net */
7029                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7030                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7031                 if (err) {
7032                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7033                                  __func__, dev->name, err);
7034                         BUG();
7035                 }
7036         }
7037         rtnl_unlock();
7038 }
7039
7040 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7041 {
7042         /* At exit all network devices most be removed from a network
7043          * namespace.  Do this in the reverse order of registration.
7044          * Do this across as many network namespaces as possible to
7045          * improve batching efficiency.
7046          */
7047         struct net_device *dev;
7048         struct net *net;
7049         LIST_HEAD(dev_kill_list);
7050
7051         rtnl_lock();
7052         list_for_each_entry(net, net_list, exit_list) {
7053                 for_each_netdev_reverse(net, dev) {
7054                         if (dev->rtnl_link_ops)
7055                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7056                         else
7057                                 unregister_netdevice_queue(dev, &dev_kill_list);
7058                 }
7059         }
7060         unregister_netdevice_many(&dev_kill_list);
7061         list_del(&dev_kill_list);
7062         rtnl_unlock();
7063 }
7064
7065 static struct pernet_operations __net_initdata default_device_ops = {
7066         .exit = default_device_exit,
7067         .exit_batch = default_device_exit_batch,
7068 };
7069
7070 /*
7071  *      Initialize the DEV module. At boot time this walks the device list and
7072  *      unhooks any devices that fail to initialise (normally hardware not
7073  *      present) and leaves us with a valid list of present and active devices.
7074  *
7075  */
7076
7077 /*
7078  *       This is called single threaded during boot, so no need
7079  *       to take the rtnl semaphore.
7080  */
7081 static int __init net_dev_init(void)
7082 {
7083         int i, rc = -ENOMEM;
7084
7085         BUG_ON(!dev_boot_phase);
7086
7087         if (dev_proc_init())
7088                 goto out;
7089
7090         if (netdev_kobject_init())
7091                 goto out;
7092
7093         INIT_LIST_HEAD(&ptype_all);
7094         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7095                 INIT_LIST_HEAD(&ptype_base[i]);
7096
7097         INIT_LIST_HEAD(&offload_base);
7098
7099         if (register_pernet_subsys(&netdev_net_ops))
7100                 goto out;
7101
7102         /*
7103          *      Initialise the packet receive queues.
7104          */
7105
7106         for_each_possible_cpu(i) {
7107                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7108
7109                 memset(sd, 0, sizeof(*sd));
7110                 skb_queue_head_init(&sd->input_pkt_queue);
7111                 skb_queue_head_init(&sd->process_queue);
7112                 sd->completion_queue = NULL;
7113                 INIT_LIST_HEAD(&sd->poll_list);
7114                 sd->output_queue = NULL;
7115                 sd->output_queue_tailp = &sd->output_queue;
7116 #ifdef CONFIG_RPS
7117                 sd->csd.func = rps_trigger_softirq;
7118                 sd->csd.info = sd;
7119                 sd->csd.flags = 0;
7120                 sd->cpu = i;
7121 #endif
7122
7123                 sd->backlog.poll = process_backlog;
7124                 sd->backlog.weight = weight_p;
7125                 sd->backlog.gro_list = NULL;
7126                 sd->backlog.gro_count = 0;
7127         }
7128
7129         dev_boot_phase = 0;
7130
7131         /* The loopback device is special if any other network devices
7132          * is present in a network namespace the loopback device must
7133          * be present. Since we now dynamically allocate and free the
7134          * loopback device ensure this invariant is maintained by
7135          * keeping the loopback device as the first device on the
7136          * list of network devices.  Ensuring the loopback devices
7137          * is the first device that appears and the last network device
7138          * that disappears.
7139          */
7140         if (register_pernet_device(&loopback_net_ops))
7141                 goto out;
7142
7143         if (register_pernet_device(&default_device_ops))
7144                 goto out;
7145
7146         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7147         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7148
7149         hotcpu_notifier(dev_cpu_callback, 0);
7150         dst_init();
7151         dev_mcast_init();
7152         rc = 0;
7153 out:
7154         return rc;
7155 }
7156
7157 subsys_initcall(net_dev_init);