net/netfilter/ipvs/ip_vs_wrr.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * IPVS:        Weighted Round-Robin Scheduling module
   4  *
   5  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   6  *
   7  * Changes:
   8  *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
   9  *     Wensong Zhang            :     changed some comestics things for debugging
  10  *     Wensong Zhang            :     changed for the d-linked destination list
  11  *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
  12  *     Julian Anastasov         :     fixed the bug of returning destination
  13  *                                    with weight 0 when all weights are zero
  14  */
  15
  16 #define KMSG_COMPONENT "IPVS"
  17 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  18
  19 #include <linux/module.h>
  20 #include <linux/kernel.h>
  21 #include <linux/slab.h>
  22 #include <linux/net.h>
  23 #include <linux/gcd.h>
  24
  25 #include <net/ip_vs.h>
  26
  27 /* The WRR algorithm depends on some caclulations:
  28  * - mw: maximum weight
  29  * - di: weight step, greatest common divisor from all weights
  30  * - cw: current required weight
  31  * As result, all weights are in the [di..mw] range with a step=di.
  32  *
  33  * First, we start with cw = mw and select dests with weight >= cw.
  34  * Then cw is reduced with di and all dests are checked again.
  35  * Last pass should be with cw = di. We have mw/di passes in total:
  36  *
  37  * pass 1: cw = max weight
  38  * pass 2: cw = max weight - di
  39  * pass 3: cw = max weight - 2 * di
  40  * ...
  41  * last pass: cw = di
  42  *
  43  * Weights are supposed to be >= di but we run in parallel with
  44  * weight changes, it is possible some dest weight to be reduced
  45  * below di, bad if it is the only available dest.
  46  *
  47  * So, we modify how mw is calculated, now it is reduced with (di - 1),
  48  * so that last cw is 1 to catch such dests with weight below di:
  49  * pass 1: cw = max weight - (di - 1)
  50  * pass 2: cw = max weight - di - (di - 1)
  51  * pass 3: cw = max weight - 2 * di - (di - 1)
  52  * ...
  53  * last pass: cw = 1
  54  *
  55  */
  56
  57 /*
  58  * current destination pointer for weighted round-robin scheduling
  59  */
  60 struct ip_vs_wrr_mark {
  61         struct ip_vs_dest *cl;  /* current dest or head */
  62         int cw;                 /* current weight */
  63         int mw;                 /* maximum weight */
  64         int di;                 /* decreasing interval */
  65         struct rcu_head         rcu_head;
  66 };
  67
  68
  69 static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
  70 {
  71         struct ip_vs_dest *dest;
  72         int weight;
  73         int g = 0;
  74
  75         list_for_each_entry(dest, &svc->destinations, n_list) {
  76                 weight = atomic_read(&dest->weight);
  77                 if (weight > 0) {
  78                         if (g > 0)
  79                                 g = gcd(weight, g);
  80                         else
  81                                 g = weight;
  82                 }
  83         }
  84         return g ? g : 1;
  85 }
  86
  87
  88 /*
  89  *    Get the maximum weight of the service destinations.
  90  */
  91 static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
  92 {
  93         struct ip_vs_dest *dest;
  94         int new_weight, weight = 0;
  95
  96         list_for_each_entry(dest, &svc->destinations, n_list) {
  97                 new_weight = atomic_read(&dest->weight);
  98                 if (new_weight > weight)
  99                         weight = new_weight;
 100         }
 101
 102         return weight;
 103 }
 104
 105
 106 static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
 107 {
 108         struct ip_vs_wrr_mark *mark;
 109
 110         /*
 111          *    Allocate the mark variable for WRR scheduling
 112          */
 113         mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
 114         if (mark == NULL)
 115                 return -ENOMEM;
 116
 117         mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
 118         mark->di = ip_vs_wrr_gcd_weight(svc);
 119         mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
 120         mark->cw = mark->mw;
 121         svc->sched_data = mark;
 122
 123         return 0;
 124 }
 125
 126
 127 static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
 128 {
 129         struct ip_vs_wrr_mark *mark = svc->sched_data;
 130
 131         /*
 132          *    Release the mark variable
 133          */
 134         kfree_rcu(mark, rcu_head);
 135 }
 136
 137
 138 static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
 139                                   struct ip_vs_dest *dest)
 140 {
 141         struct ip_vs_wrr_mark *mark = svc->sched_data;
 142
 143         spin_lock_bh(&svc->sched_lock);
 144         mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
 145         mark->di = ip_vs_wrr_gcd_weight(svc);
 146         mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
 147         if (mark->cw > mark->mw || !mark->cw)
 148                 mark->cw = mark->mw;
 149         else if (mark->di > 1)
 150                 mark->cw = (mark->cw / mark->di) * mark->di + 1;
 151         spin_unlock_bh(&svc->sched_lock);
 152         return 0;
 153 }
 154
 155
 156 /*
 157  *    Weighted Round-Robin Scheduling
 158  */
 159 static struct ip_vs_dest *
 160 ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 161                    struct ip_vs_iphdr *iph)
 162 {
 163         struct ip_vs_dest *dest, *last, *stop = NULL;
 164         struct ip_vs_wrr_mark *mark = svc->sched_data;
 165         bool last_pass = false, restarted = false;
 166
 167         IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 168
 169         spin_lock_bh(&svc->sched_lock);
 170         dest = mark->cl;
 171         /* No available dests? */
 172         if (mark->mw == 0)
 173                 goto err_noavail;
 174         last = dest;
 175         /* Stop only after all dests were checked for weight >= 1 (last pass) */
 176         while (1) {
 177                 list_for_each_entry_continue_rcu(dest,
 178                                                  &svc->destinations,
 179                                                  n_list) {
 180                         if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
 181                             atomic_read(&dest->weight) >= mark->cw)
 182                                 goto found;
 183                         if (dest == stop)
 184                                 goto err_over;
 185                 }
 186                 mark->cw -= mark->di;
 187                 if (mark->cw <= 0) {
 188                         mark->cw = mark->mw;
 189                         /* Stop if we tried last pass from first dest:
 190                          * 1. last_pass: we started checks when cw > di but
 191                          *      then all dests were checked for w >= 1
 192                          * 2. last was head: the first and only traversal
 193                          *      was for weight >= 1, for all dests.
 194                          */
 195                         if (last_pass ||
 196                             &last->n_list == &svc->destinations)
 197                                 goto err_over;
 198                         restarted = true;
 199                 }
 200                 last_pass = mark->cw <= mark->di;
 201                 if (last_pass && restarted &&
 202                     &last->n_list != &svc->destinations) {
 203                         /* First traversal was for w >= 1 but only
 204                          * for dests after 'last', now do the same
 205                          * for all dests up to 'last'.
 206                          */
 207                         stop = last;
 208                 }
 209         }
 210
 211 found:
 212         IP_VS_DBG_BUF(6, "WRR: server %s:%u "
 213                       "activeconns %d refcnt %d weight %d\n",
 214                       IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
 215                       atomic_read(&dest->activeconns),
 216                       refcount_read(&dest->refcnt),
 217                       atomic_read(&dest->weight));
 218         mark->cl = dest;
 219
 220   out:
 221         spin_unlock_bh(&svc->sched_lock);
 222         return dest;
 223
 224 err_noavail:
 225         mark->cl = dest;
 226         dest = NULL;
 227         ip_vs_scheduler_err(svc, "no destination available");
 228         goto out;
 229
 230 err_over:
 231         mark->cl = dest;
 232         dest = NULL;
 233         ip_vs_scheduler_err(svc, "no destination available: "
 234                             "all destinations are overloaded");
 235         goto out;
 236 }
 237
 238
 239 static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
 240         .name =                 "wrr",
 241         .refcnt =               ATOMIC_INIT(0),
 242         .module =               THIS_MODULE,
 243         .n_list =               LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
 244         .init_service =         ip_vs_wrr_init_svc,
 245         .done_service =         ip_vs_wrr_done_svc,
 246         .add_dest =             ip_vs_wrr_dest_changed,
 247         .del_dest =             ip_vs_wrr_dest_changed,
 248         .upd_dest =             ip_vs_wrr_dest_changed,
 249         .schedule =             ip_vs_wrr_schedule,
 250 };
 251
 252 static int __init ip_vs_wrr_init(void)
 253 {
 254         return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
 255 }
 256
 257 static void __exit ip_vs_wrr_cleanup(void)
 258 {
 259         unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
 260         synchronize_rcu();
 261 }
 262
 263 module_init(ip_vs_wrr_init);
 264 module_exit(ip_vs_wrr_cleanup);
 265 MODULE_LICENSE("GPL");