[MYRI10GE]: Use LRO.

diff --git a/net/core/dev.c b/net/core/dev.c
index 3ba63aaa3001737adc66d7d9feba6317e5674aeb..29cf00c5d865df30b30ef1407299631ebc24c109 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -220,7 +220,8 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
  *     Device drivers call our routines to queue packets here. We empty the
  *     queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
+
+DEFINE_PER_CPU(struct softnet_data, softnet_data);
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -817,7 +818,9 @@ int dev_alloc_name(struct net_device *dev, const char *name)
  */
 int dev_change_name(struct net_device *dev, char *newname)
 {
+       char oldname[IFNAMSIZ];
        int err = 0;
+       int ret;
 
        ASSERT_RTNL();
 
@@ -827,6 +830,8 @@ int dev_change_name(struct net_device *dev, char *newname)
        if (!dev_valid_name(newname))
                return -EINVAL;
 
+       memcpy(oldname, dev->name, IFNAMSIZ);
+
        if (strchr(newname, '%')) {
                err = dev_alloc_name(dev, newname);
                if (err < 0)
@@ -838,10 +843,28 @@ int dev_change_name(struct net_device *dev, char *newname)
        else
                strlcpy(dev->name, newname, IFNAMSIZ);
 
+rollback:
        device_rename(&dev->dev, dev->name);
+
+       write_lock_bh(&dev_base_lock);
        hlist_del(&dev->name_hlist);
        hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
-       raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+       write_unlock_bh(&dev_base_lock);
+
+       ret = raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+       ret = notifier_to_errno(ret);
+
+       if (ret) {
+               if (err) {
+                       printk(KERN_ERR
+                              "%s: name change rollback failed: %d.\n",
+                              dev->name, ret);
+               } else {
+                       err = ret;
+                       memcpy(dev->name, oldname, IFNAMSIZ);
+                       goto rollback;
+               }
+       }
 
        return err;
 }
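
The rewritten dev_change_name() keeps a copy of the old name and, if the NETDEV_CHANGENAME notifier chain vetoes the rename, restores it and replays the notification exactly once; the err check stops a second failure from looping forever. Below is a minimal userspace sketch of this single-retry rollback pattern — notify_change() and the device names are hypothetical stand-ins, not kernel APIs:

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16

/* Hypothetical stand-in for the notifier chain: rejects "bad0". */
static int notify_change(const char *name)
{
	return strcmp(name, "bad0") == 0 ? -1 : 0;
}

static int change_name(char *cur, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;

	memcpy(oldname, cur, IFNAMSIZ);
	snprintf(cur, IFNAMSIZ, "%s", newname);

rollback:
	ret = notify_change(cur);
	if (ret) {
		if (err) {
			/* The rollback notification failed too. */
			fprintf(stderr, "%s: rollback failed: %d\n", cur, ret);
		} else {
			/* First failure: restore old name, notify once more. */
			err = ret;
			memcpy(cur, oldname, IFNAMSIZ);
			goto rollback;
		}
	}
	return err;
}

int main(void)
{
	char name[IFNAMSIZ] = "eth0";

	/* Prints "rename -> -1, name is now eth0". */
	printf("rename -> %d, name is now %s\n",
	       change_name(name, "bad0"), name);
	return 0;
}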
@@ -996,16 +1019,12 @@ int dev_close(struct net_device *dev)
        clear_bit(__LINK_STATE_START, &dev->state);
 
        /* Synchronize to scheduled poll. We cannot touch poll list,
-        * it can be even on different cpu. So just clear netif_running(),
-        * and wait when poll really will happen. Actually, the best place
-        * for this is inside dev->stop() after device stopped its irq
-        * engine, but this requires more changes in devices. */
-
+        * it can even be on a different cpu. So just clear netif_running().
+        *
+        * dev->stop() will invoke napi_disable() on all of its
+        * napi_struct instances on this device.
+        */
        smp_mb__after_clear_bit(); /* Commit netif_running(). */
-       while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
-               /* No hurry. */
-               msleep(1);
-       }
 
        /*
         *      Call the device specific close. This cannot fail.
@@ -1054,20 +1073,43 @@ int dev_close(struct net_device *dev)
 int register_netdevice_notifier(struct notifier_block *nb)
 {
        struct net_device *dev;
+       struct net_device *last;
        int err;
 
        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
-       if (!err) {
-               for_each_netdev(dev) {
-                       nb->notifier_call(nb, NETDEV_REGISTER, dev);
+       if (err)
+               goto unlock;
 
-                       if (dev->flags & IFF_UP)
-                               nb->notifier_call(nb, NETDEV_UP, dev);
-               }
+       for_each_netdev(dev) {
+               err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
+               err = notifier_to_errno(err);
+               if (err)
+                       goto rollback;
+
+               if (!(dev->flags & IFF_UP))
+                       continue;
+
+               nb->notifier_call(nb, NETDEV_UP, dev);
        }
+
+unlock:
        rtnl_unlock();
        return err;
+
+rollback:
+       last = dev;
+       for_each_netdev(dev) {
+               if (dev == last)
+                       break;
+
+               if (dev->flags & IFF_UP) {
+                       nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
+                       nb->notifier_call(nb, NETDEV_DOWN, dev);
+               }
+               nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+       }
+       goto unlock;
 }
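
register_netdevice_notifier() now propagates a veto from the new subscriber: if NETDEV_REGISTER fails for some device, every device already notified is walked again and sent the matching teardown events before the error is returned. A compact userspace sketch of the same unwind-to-the-failure-point loop — the callbacks and the failing index are hypothetical:

#include <stdio.h>

#define NDEV 4

/* Hypothetical per-device callback: device 2 rejects registration. */
static int do_register(int dev)
{
	return dev == 2 ? -1 : 0;
}

static void do_unregister(int dev)
{
	printf("unwinding device %d\n", dev);
}

static int register_all(void)
{
	int dev, last, err;

	for (dev = 0; dev < NDEV; dev++) {
		err = do_register(dev);
		if (err)
			goto rollback;
	}
	return 0;

rollback:
	/* Undo only the devices that were notified before the failure. */
	last = dev;
	for (dev = 0; dev < NDEV; dev++) {
		if (dev == last)
			break;
		do_unregister(dev);
	}
	return err;
}

int main(void)
{
	/* Unwinds devices 0 and 1, then returns -1. */
	printf("register_all() -> %d\n", register_all());
	return 0;
}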
 
 /**
@@ -1188,21 +1230,21 @@ void __netif_schedule(struct net_device *dev)
 }
 EXPORT_SYMBOL(__netif_schedule);
 
-void __netif_rx_schedule(struct net_device *dev)
+void dev_kfree_skb_irq(struct sk_buff *skb)
 {
-       unsigned long flags;
+       if (atomic_dec_and_test(&skb->users)) {
+               struct softnet_data *sd;
+               unsigned long flags;
 
-       local_irq_save(flags);
-       dev_hold(dev);
-       list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
-       if (dev->quota < 0)
-               dev->quota += dev->weight;
-       else
-               dev->quota = dev->weight;
-       __raise_softirq_irqoff(NET_RX_SOFTIRQ);
-       local_irq_restore(flags);
+               local_irq_save(flags);
+               sd = &__get_cpu_var(softnet_data);
+               skb->next = sd->completion_queue;
+               sd->completion_queue = skb;
+               raise_softirq_irqoff(NET_TX_SOFTIRQ);
+               local_irq_restore(flags);
+       }
 }
-EXPORT_SYMBOL(__netif_rx_schedule);
+EXPORT_SYMBOL(dev_kfree_skb_irq);
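
dev_kfree_skb_irq() cannot free an skb from hard-IRQ context, so it chains the buffer onto the per-CPU completion_queue and raises NET_TX_SOFTIRQ; net_tx_action() later detaches the whole list and does the actual freeing. A userspace sketch of that push-now, drain-later pattern — struct buf and the queue here are illustrative, not kernel types:

#include <stdio.h>
#include <stdlib.h>

struct buf {
	struct buf *next;
	int id;
};

/* Stand-in for the per-CPU softnet_data completion queue. */
static struct buf *completion_queue;

/* "IRQ" side: O(1) push; per-CPU with IRQs off needs no lock. */
static void buf_free_deferred(struct buf *b)
{
	b->next = completion_queue;
	completion_queue = b;
	/* the kernel would raise_softirq_irqoff(NET_TX_SOFTIRQ) here */
}

/* "Softirq" side: detach the list in one step, then free at leisure. */
static void drain_completion_queue(void)
{
	struct buf *list = completion_queue;

	completion_queue = NULL;
	while (list) {
		struct buf *next = list->next;

		printf("freeing buf %d\n", list->id);
		free(list);
		list = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct buf *b = malloc(sizeof(*b));

		b->id = i;
		buf_free_deferred(b);
	}
	drain_completion_queue();
	return 0;
}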
 
 void dev_kfree_skb_any(struct sk_buff *skb)
 {
@@ -1214,7 +1256,12 @@ void dev_kfree_skb_any(struct sk_buff *skb)
 EXPORT_SYMBOL(dev_kfree_skb_any);
 
 
-/* Hot-plugging. */
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Mark device as removed from the system and therefore no longer available.
+ */
 void netif_device_detach(struct net_device *dev)
 {
        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
@@ -1224,6 +1271,12 @@ void netif_device_detach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_device_detach);
 
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Mark device as attached to the system and restart if needed.
+ */
 void netif_device_attach(struct net_device *dev)
 {
        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
@@ -1685,7 +1738,7 @@ enqueue:
                        return NET_RX_SUCCESS;
                }
 
-               netif_rx_schedule(&queue->backlog_dev);
+               napi_schedule(&queue->backlog);
                goto enqueue;
        }
 
@@ -1726,6 +1779,7 @@ static inline struct net_device *skb_bond(struct sk_buff *skb)
        return dev;
 }
 
+
 static void net_tx_action(struct softirq_action *h)
 {
        struct softnet_data *sd = &__get_cpu_var(softnet_data);
@@ -1882,7 +1936,7 @@ int netif_receive_skb(struct sk_buff *skb)
        __be16 type;
 
        /* if we've gotten here through NAPI, check netpoll */
-       if (skb->dev->poll && netpoll_rx(skb))
+       if (netpoll_receive_skb(skb))
                return NET_RX_DROP;
 
        if (!skb->tstamp.tv64)
@@ -1972,22 +2026,25 @@ out:
        return ret;
 }
 
-static int process_backlog(struct net_device *backlog_dev, int *budget)
+static int process_backlog(struct napi_struct *napi, int quota)
 {
        int work = 0;
-       int quota = min(backlog_dev->quota, *budget);
        struct softnet_data *queue = &__get_cpu_var(softnet_data);
        unsigned long start_time = jiffies;
 
-       backlog_dev->weight = weight_p;
-       for (;;) {
+       napi->weight = weight_p;
+       do {
                struct sk_buff *skb;
                struct net_device *dev;
 
                local_irq_disable();
                skb = __skb_dequeue(&queue->input_pkt_queue);
-               if (!skb)
-                       goto job_done;
+               if (!skb) {
+                       __napi_complete(napi);
+                       local_irq_enable();
+                       break;
+               }
+
                local_irq_enable();
 
                dev = skb->dev;
@@ -1995,67 +2052,86 @@ static int process_backlog(struct net_device *backlog_dev, int *budget)
                netif_receive_skb(skb);
 
                dev_put(dev);
+       } while (++work < quota && jiffies == start_time);
 
-               work++;
-
-               if (work >= quota || jiffies - start_time > 1)
-                       break;
-
-       }
-
-       backlog_dev->quota -= work;
-       *budget -= work;
-       return -1;
-
-job_done:
-       backlog_dev->quota -= work;
-       *budget -= work;
+       return work;
+}
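
The rewritten process_backlog() doubles as the template for any ->poll() method under the napi_struct API: consume at most quota packets, call __napi_complete() only if the queue drains before the quota runs out, and report the work actually done. A minimal userspace sketch of that contract — my_poll() and the fake queue are illustrative only:

#include <stdio.h>

/* Fake RX queue: pretend five packets are pending. */
static int pending = 5;

static int dequeue_one(void)
{
	return pending > 0 ? pending-- : 0;
}

/* Minimal ->poll() in the new style: return the work done, and only
 * "complete" (re-enable device interrupts, in a real driver) when the
 * queue drained before the quota was used up. */
static int my_poll(int quota)
{
	int work = 0;

	do {
		if (!dequeue_one()) {
			printf("  complete: queue drained\n");
			break;
		}
		/* ... hand the packet to netif_receive_skb() ... */
	} while (++work < quota);

	return work;
}

int main(void)
{
	printf("work = %d\n", my_poll(4));	/* full quota consumed */
	printf("work = %d\n", my_poll(4));	/* drains and completes */
	return 0;
}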
 
-       list_del(&backlog_dev->poll_list);
-       smp_mb__before_clear_bit();
-       netif_poll_enable(backlog_dev);
+/**
+ * __napi_schedule - schedule for receive
+ * @n: entry to schedule
+ *
+ * The entry's receive function will be scheduled to run.
+ */
+void fastcall __napi_schedule(struct napi_struct *n)
+{
+       unsigned long flags;
 
-       local_irq_enable();
-       return 0;
+       local_irq_save(flags);
+       list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
+       __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+       local_irq_restore(flags);
 }
+EXPORT_SYMBOL(__napi_schedule);
+
 
 static void net_rx_action(struct softirq_action *h)
 {
-       struct softnet_data *queue = &__get_cpu_var(softnet_data);
+       struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
        unsigned long start_time = jiffies;
        int budget = netdev_budget;
        void *have;
 
        local_irq_disable();
 
-       while (!list_empty(&queue->poll_list)) {
-               struct net_device *dev;
+       while (!list_empty(list)) {
+               struct napi_struct *n;
+               int work, weight;
 
-               if (budget <= 0 || jiffies - start_time > 1)
+               /* If the softirq window is exhausted then punt.
+                *
+                * Note that this is a slight policy change from the
+                * previous NAPI code, which would allow up to 2
+                * jiffies to pass before breaking out.  The test
+                * used to be "jiffies - start_time > 1".
+                */
+               if (unlikely(budget <= 0 || jiffies != start_time))
                        goto softnet_break;
 
                local_irq_enable();
 
-               dev = list_entry(queue->poll_list.next,
-                                struct net_device, poll_list);
-               have = netpoll_poll_lock(dev);
+               /* Even though interrupts have been re-enabled, this
+                * access is safe because interrupts can only add new
+                * entries to the tail of this list, and only ->poll()
+                * calls can remove this head entry from the list.
+                */
+               n = list_entry(list->next, struct napi_struct, poll_list);
 
-               if (dev->quota <= 0 || dev->poll(dev, &budget)) {
-                       netpoll_poll_unlock(have);
-                       local_irq_disable();
-                       list_move_tail(&dev->poll_list, &queue->poll_list);
-                       if (dev->quota < 0)
-                               dev->quota += dev->weight;
-                       else
-                               dev->quota = dev->weight;
-               } else {
-                       netpoll_poll_unlock(have);
-                       dev_put(dev);
-                       local_irq_disable();
-               }
+               have = netpoll_poll_lock(n);
+
+               weight = n->weight;
+
+               work = n->poll(n, weight);
+
+               WARN_ON_ONCE(work > weight);
+
+               budget -= work;
+
+               local_irq_disable();
+
+               /* Drivers must not modify the NAPI state if they
+                * consume the entire weight.  In such cases this code
+                * still "owns" the NAPI instance and therefore can
+                * move the instance around on the list at-will.
+                */
+               if (unlikely(work == weight))
+                       list_move_tail(&n->poll_list, list);
+
+               netpoll_poll_unlock(have);
        }
 out:
        local_irq_enable();
+
 #ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
@@ -2070,6 +2146,7 @@ out:
                }
        }
 #endif
+
        return;
 
 softnet_break:
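
net_rx_action() now does the accounting itself: each ->poll() returns its work, the global budget shrinks by that amount, and an instance that consumed its entire weight is rotated to the tail so other devices get a turn. The sketch below models that budget/weight arithmetic with made-up numbers; the real code also removes drained instances from the list via __napi_complete(), which this flat round-robin elides:

#include <stdio.h>

#define NDEV 3

int main(void)
{
	int backlog[NDEV] = { 7, 1, 4 };	/* pending packets per device */
	int weight = 3;				/* per-device quota per poll */
	int budget = 10;			/* global quota per softirq run */
	int i = 0;

	while (budget > 0 && (backlog[0] || backlog[1] || backlog[2])) {
		/* poll: never hand back more work than the weight */
		int work = backlog[i] < weight ? backlog[i] : weight;

		backlog[i] -= work;
		budget -= work;
		printf("dev %d: work=%d left=%d budget=%d\n",
		       i, work, backlog[i], budget);
		i = (i + 1) % NDEV;		/* rotate to the tail */
	}
	return 0;
}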
@@ -2629,7 +2706,7 @@ void __dev_set_rx_mode(struct net_device *dev)
                return;
 
        if (!netif_device_present(dev))
-               return;
+               return;
 
        if (dev->set_rx_mode)
                dev->set_rx_mode(dev);
@@ -2715,26 +2792,14 @@ int __dev_addr_add(struct dev_addr_list **list, int *count,
        return 0;
 }
 
-static void __dev_addr_discard(struct dev_addr_list **list)
-{
-       struct dev_addr_list *tmp;
-
-       while (*list != NULL) {
-               tmp = *list;
-               *list = tmp->next;
-               if (tmp->da_users > tmp->da_gusers)
-                       printk("__dev_addr_discard: address leakage! "
-                              "da_users=%d\n", tmp->da_users);
-               kfree(tmp);
-       }
-}
-
 /**
  *     dev_unicast_delete      - Release secondary unicast address.
  *     @dev: device
+ *     @addr: address to delete
+ *     @alen: length of @addr
  *
  *     Release reference to a secondary unicast address and remove it
- *     from the device if the reference count drop to zero.
+ *     from the device if the reference count drops to zero.
  *
  *     The caller must hold the rtnl_mutex.
  */
@@ -2756,6 +2821,8 @@ EXPORT_SYMBOL(dev_unicast_delete);
 /**
  *     dev_unicast_add         - add a secondary unicast address
  *     @dev: device
+ *     @addr: address to add
+ *     @alen: length of @addr
  *
  *     Add a secondary unicast address to the device or increase
  *     the reference count if it already exists.
@@ -2777,23 +2844,30 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen)
 }
 EXPORT_SYMBOL(dev_unicast_add);
 
-static void dev_unicast_discard(struct net_device *dev)
+static void __dev_addr_discard(struct dev_addr_list **list)
 {
-       netif_tx_lock_bh(dev);
-       __dev_addr_discard(&dev->uc_list);
-       dev->uc_count = 0;
-       netif_tx_unlock_bh(dev);
-}
+       struct dev_addr_list *tmp;
 
-/*
- *     Discard multicast list when a device is downed
- */
+       while (*list != NULL) {
+               tmp = *list;
+               *list = tmp->next;
+               if (tmp->da_users > tmp->da_gusers)
+                       printk("__dev_addr_discard: address leakage! "
+                              "da_users=%d\n", tmp->da_users);
+               kfree(tmp);
+       }
+}
 
-static void dev_mc_discard(struct net_device *dev)
+static void dev_addr_discard(struct net_device *dev)
 {
        netif_tx_lock_bh(dev);
+
+       __dev_addr_discard(&dev->uc_list);
+       dev->uc_count = 0;
+
        __dev_addr_discard(&dev->mc_list);
        dev->mc_count = 0;
+
        netif_tx_unlock_bh(dev);
 }
 
@@ -3340,7 +3414,7 @@ int register_netdevice(struct net_device *dev)
 
        if (!dev_valid_name(dev->name)) {
                ret = -EINVAL;
-               goto out;
+               goto err_uninit;
        }
 
        dev->ifindex = dev_new_index();
@@ -3354,7 +3428,7 @@ int register_netdevice(struct net_device *dev)
                        = hlist_entry(p, struct net_device, name_hlist);
                if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
                        ret = -EEXIST;
-                       goto out;
+                       goto err_uninit;
                }
        }
 
@@ -3414,7 +3488,7 @@ int register_netdevice(struct net_device *dev)
 
        ret = netdev_register_sysfs(dev);
        if (ret)
-               goto out;
+               goto err_uninit;
        dev->reg_state = NETREG_REGISTERED;
 
        /*
@@ -3433,12 +3507,18 @@ int register_netdevice(struct net_device *dev)
        write_unlock_bh(&dev_base_lock);
 
        /* Notify protocols, that a new device appeared. */
-       raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
-
-       ret = 0;
+       ret = raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
+       ret = notifier_to_errno(ret);
+       if (ret)
+               unregister_netdevice(dev);
 
 out:
        return ret;
+
+err_uninit:
+       if (dev->uninit)
+               dev->uninit(dev);
+       goto out;
 }
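
The new err_uninit label fixes a leak: once dev->init() has run, any later failure in register_netdevice() must call dev->uninit() before bailing out, which the old direct "goto out" skipped. A small sketch of the goto-based unwind idiom, with hypothetical stage functions:

#include <stdio.h>

static int dev_init(void)	{ printf("init\n"); return 0; }
static void dev_uninit(void)	{ printf("uninit\n"); }
static int validate_name(void)	{ return -22; /* say, -EINVAL */ }

static int register_dev(void)
{
	int ret;

	ret = dev_init();
	if (ret)
		goto out;		/* nothing to undo yet */

	ret = validate_name();
	if (ret)
		goto err_uninit;	/* init succeeded, must undo it */

out:
	return ret;

err_uninit:
	dev_uninit();
	goto out;
}

int main(void)
{
	printf("register_dev() -> %d\n", register_dev());
	return 0;
}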
 
 /**
@@ -3631,7 +3711,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
        /* ensure 32-byte alignment of both the device and private area */
        alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
-                    (sizeof(struct net_device_subqueue) * queue_count)) &
+                    (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
                     ~NETDEV_ALIGN_CONST;
        alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
 
@@ -3649,13 +3729,14 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
                dev->priv = ((char *)dev +
                             ((sizeof(struct net_device) +
                               (sizeof(struct net_device_subqueue) *
-                               queue_count) + NETDEV_ALIGN_CONST)
+                               (queue_count - 1)) + NETDEV_ALIGN_CONST)
                              & ~NETDEV_ALIGN_CONST));
        }
 
        dev->egress_subqueue_count = queue_count;
 
        dev->get_stats = internal_stats;
+       netpoll_netdev_init(dev);
        setup(dev);
        strcpy(dev->name, name);
        return dev;
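
The sizing fix reflects that struct net_device already ends with a one-element net_device_subqueue array, so only queue_count - 1 extra elements belong in the allocation. A userspace sketch of that trailing-array arithmetic — the structs are stand-ins, using the pre-C99 one-element-array idiom the kernel used at the time:

#include <stdio.h>
#include <stdlib.h>

struct subqueue {
	int len;
};

struct device {
	int ifindex;
	struct subqueue egress[1];	/* really queue_count entries */
};

static struct device *alloc_device(int queue_count)
{
	/* One subqueue already lives inside the struct, so append
	 * only queue_count - 1 more. */
	size_t size = sizeof(struct device) +
		      sizeof(struct subqueue) * (queue_count - 1);

	printf("allocating %zu bytes for %d queues\n", size, queue_count);
	return calloc(1, size);
}

int main(void)
{
	struct device *dev = alloc_device(4);

	for (int i = 0; i < 4; i++)
		dev->egress[i].len = i;	/* all four fit in the allocation */
	free(dev);
	return 0;
}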
@@ -3751,8 +3832,7 @@ void unregister_netdevice(struct net_device *dev)
        /*
         *      Flush the unicast and multicast chains
         */
-       dev_unicast_discard(dev);
-       dev_mc_discard(dev);
+       dev_addr_discard(dev);
 
        if (dev->uninit)
                dev->uninit(dev);
@@ -3838,9 +3918,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 
 #ifdef CONFIG_NET_DMA
 /**
- * net_dma_rebalance -
- * This is called when the number of channels allocated to the net_dma_client
- * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ * net_dma_rebalance - try to maintain one DMA channel per CPU
+ * @net_dma: DMA client and associated data (lock, channels, channel_mask)
+ *
+ * This is called when the number of channels allocated to the net_dma client
+ * changes.  The net_dma client tries to have one DMA channel per CPU.
  */
 
 static void net_dma_rebalance(struct net_dma *net_dma)
@@ -3877,7 +3959,7 @@ static void net_dma_rebalance(struct net_dma *net_dma)
  * netdev_dma_event - event callback for the net_dma_client
  * @client: should always be net_dma_client
  * @chan: DMA channel for the event
- * @event: event type
+ * @state: DMA state to be handled
  */
 static enum dma_state_client
 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
@@ -3944,6 +4026,45 @@ static int __init netdev_dma_register(void)
 static int __init netdev_dma_register(void) { return -ENODEV; }
 #endif /* CONFIG_NET_DMA */
 
+/**
+ *     netdev_compute_features - compute conjunction of two feature sets
+ *     @all: first feature set
+ *     @one: second feature set
+ *
+ *     Computes a new feature set after adding a device with feature set
+ *     @one to the master device with current feature set @all.  Returns
+ *     the new feature set.
+ */
+int netdev_compute_features(unsigned long all, unsigned long one)
+{
+       /* if device needs checksumming, downgrade to hw checksumming */
+       if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
+               all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
+
+       /* if device can't do all checksum, downgrade to ipv4/ipv6 */
+       if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
+               all ^= NETIF_F_HW_CSUM
+                       | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+       if (one & NETIF_F_GSO)
+               one |= NETIF_F_GSO_SOFTWARE;
+       one |= NETIF_F_GSO;
+
+       /* If even one device supports robust GSO, enable it for all. */
+       if (one & NETIF_F_GSO_ROBUST)
+               all |= NETIF_F_GSO_ROBUST;
+
+       all &= one | NETIF_F_LLTX;
+
+       if (!(all & NETIF_F_ALL_CSUM))
+               all &= ~NETIF_F_SG;
+       if (!(all & NETIF_F_SG))
+               all &= ~NETIF_F_GSO_MASK;
+
+       return all;
+}
+EXPORT_SYMBOL(netdev_compute_features);
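
Because netdev_compute_features() is pure bit arithmetic, its behaviour is easy to check outside the kernel. The demo below copies the logic verbatim but substitutes made-up flag values; the real NETIF_F_* constants and masks come from linux/netdevice.h:

#include <stdio.h>

/* Hypothetical flag values for illustration only. */
#define NETIF_F_SG		0x001
#define NETIF_F_IP_CSUM		0x002
#define NETIF_F_NO_CSUM		0x004
#define NETIF_F_HW_CSUM		0x008
#define NETIF_F_IPV6_CSUM	0x010
#define NETIF_F_LLTX		0x020
#define NETIF_F_GSO		0x040
#define NETIF_F_GSO_ROBUST	0x080
#define NETIF_F_GSO_SOFTWARE	0x100
#define NETIF_F_ALL_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM | \
				 NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)
#define NETIF_F_GSO_MASK	(NETIF_F_GSO | NETIF_F_GSO_SOFTWARE | \
				 NETIF_F_GSO_ROBUST)

static unsigned long compute_features(unsigned long all, unsigned long one)
{
	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
		all ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
	if (one & NETIF_F_GSO)
		one |= NETIF_F_GSO_SOFTWARE;
	one |= NETIF_F_GSO;
	if (one & NETIF_F_GSO_ROBUST)
		all |= NETIF_F_GSO_ROBUST;
	all &= one | NETIF_F_LLTX;
	if (!(all & NETIF_F_ALL_CSUM))
		all &= ~NETIF_F_SG;
	if (!(all & NETIF_F_SG))
		all &= ~NETIF_F_GSO_MASK;
	return all;
}

int main(void)
{
	/* Master claims everything; slave only does IPv4 csum + SG.
	 * Expect the result to drop to SG | IP_CSUM | GSO (0x43). */
	unsigned long all = NETIF_F_NO_CSUM | NETIF_F_SG | NETIF_F_GSO;
	unsigned long one = NETIF_F_IP_CSUM | NETIF_F_SG;

	printf("combined = %#lx\n", compute_features(all, one));
	return 0;
}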
+
 /*
  *     Initialize the DEV module. At boot time this walks the device list and
  *     unhooks any devices that fail to initialise (normally hardware not
@@ -3988,10 +4109,9 @@ static int __init net_dev_init(void)
                skb_queue_head_init(&queue->input_pkt_queue);
                queue->completion_queue = NULL;
                INIT_LIST_HEAD(&queue->poll_list);
-               set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
-               queue->backlog_dev.weight = weight_p;
-               queue->backlog_dev.poll = process_backlog;
-               atomic_set(&queue->backlog_dev.refcnt, 1);
+
+               queue->backlog.poll = process_backlog;
+               queue->backlog.weight = weight_p;
        }
 
        netdev_dma_register();