/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive
	traffic into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- the maximal queue length per flow to 127 packets,
	- the max mtu to 2^18-1,
	- the max number of flows to 65408,
	- the number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
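
/* Worked example (editor's illustration, not in the original source):
 * with a typical Ethernet quantum of 1514 bytes,
 * SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190, and even a 64K
 * quantum scales to 8192, which fits comfortably in a 16 bit allot.
 */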

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to the slots[] array,
 * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to the dep[] array.
 */
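/* Encoding example (editor's illustration): index 5 denotes
 * q->slots[5].dep, while index SFQ_MAX_FLOWS + 3 denotes q->dep[3],
 * the head of the list of flows currently holding three packets.
 */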
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head entries are either in a sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}

/*
 * In order to be able to quickly rehash our queue when the timer changes
 * q->perturbation, we store flow_keys in skb->cb[]
 */
struct sfq_skb_cb {
	struct flow_keys	keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
	unsigned int hash;

	hash = jhash_3words((__force u32)keys->dst,
			    (__force u32)keys->src ^ keys->ip_proto,
			    (__force u32)keys->ports, q->perturbation);
	return hash & (q->divisor - 1);
}
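
/* Masking note (editor's illustration): sfq_change() enforces that
 * q->divisor is a power of two, so the bitwise AND above is an exact
 * modulo; e.g. the default divisor of 1024 masks with 0x3ff.
 */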

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl) {
		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
		return sfq_hash(q, skb) + 1;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, fl, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}
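
/* Return convention (editor's note): 0 means drop or bypass, while
 * 1..divisor selects a hash bucket; callers subtract one before
 * indexing ht[], which is why sfq_hash() + 1 is returned above.
 */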

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)
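
/* Usage sketch (editor's note): sfq_dec()/sfq_inc() below move a slot
 * between per-depth lists by unlinking it with sfq_unlink() and then
 * relinking it into dep[new qlen] with sfq_link().
 */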

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	/* if x was the only slot at the maximum depth, the maximum shrinks */
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions : might change if/when skb uses a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
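
/* Layout note (editor's observation): the slot itself doubles as the
 * list head; skblist_next/skblist_prev occupy the same positions as
 * skb->next/skb->prev, so casting the slot to (struct sk_buff *)
 * yields a circular list without a separate sentinel allocation.
 */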

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		kfree_skb(skb);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}
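
/* Policy note (editor's note): dropping from the longest slot bounds
 * the damage one heavy flow can do to the others; the d == 1 branch
 * only fires when every active flow holds exactly one packet.
 */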

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			sch->qstats.overlimits++;
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			sch->qstats.overlimits++;
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	sch->qstats.backlog += qdisc_pkt_len(skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * It might sound unfair for a new flow to wait behind old ones,
		 * but otherwise we could end up servicing only new flows and
		 * freezing the old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let the upper stack know */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}
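
/* Accounting sketch (editor's illustration, hypothetical numbers):
 * with quantum = 1514, scaled_quantum = 190. A slot sending two
 * 760-byte packets is charged SFQ_ALLOT_SIZE(760) = 95 twice; its
 * allot then reaches 0, so it yields and waits for the next round's
 * credit, giving deficit-round-robin fairness in bytes, not packets.
 */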

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to
 * change counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_decrease_qlen(sch, dropped);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}
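
/* Configuration sketch (editor's illustration, values hypothetical):
 * a command such as
 *	tc qdisc add dev eth0 root sfq perturb 10 limit 64 divisor 1024
 * arrives here as ctl->perturb_period = 10, ctl->limit = 64 and
 * ctl->divisor = 1024; head-drop and RED/ECN knobs travel in the
 * larger tc_sfq_qopt_v1 layout detected by the nla_len check above.
 */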

static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		sfq_destroy(sch);
		return -ENOMEM;
	}
	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}
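
/* Defaults recap (editor's note): with no options this yields 128
 * flows hashed over 1024 buckets, at most 127 packets queued in the
 * whole qdisc, and a per-round quantum of one device MTU.
 */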

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
					     unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");