/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/* Stochastic Fairness Queuing algorithm.
   =======================================

   Source:
   Paul E. McKenney "Stochastic Fairness Queuing",
   IEEE INFOCOM '90 Proceedings, San Francisco, 1990.

   Paul E. McKenney "Stochastic Fairness Queuing",
   "Interworking: Research and Experience", v.2, 1991, p.113-131.


   See also:
   M. Shreedhar and George Varghese "Efficient Fair
   Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


   This is not the thing that is usually called (W)FQ nowadays.
   It does not use any timestamp mechanism, but instead
   processes queues in round-robin order.

   ADVANTAGE:

   - It is very cheap. Both CPU and memory requirements are minimal.

   DRAWBACKS:

   - "Stochastic" -> It is not 100% fair.
     When hash collisions occur, several flows are considered as one.

   - "Round-robin" -> It introduces larger delays than virtual clock
     based schemes, and should not be used for isolating interactive
     traffic from non-interactive. This means that this scheduler
     should be used as a leaf of CBQ or P3, which put interactive
     traffic into a higher priority band.

   We still need true WFQ for the top level CSZ, but using WFQ
   for best-effort traffic is absolutely pointless:
   SFQ is superior for this purpose.

   IMPLEMENTATION:
   This implementation limits:
   - the maximal queue length per flow to 127 packets;
   - the max mtu to 2^18-1;
   - the number of flows to 65408;
   - the number of hash buckets to 65536.

   It is easy to increase these values, but not in flight.  */

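/* Example configuration (illustrative, not part of this file): SFQ is
 * typically attached with the tc utility, e.g.
 *
 *	tc qdisc add dev eth0 root handle 1: sfq perturb 10 quantum 1514 limit 64
 *
 * where 'perturb' rehashes flows every 10 seconds (see sfq_perturbation()),
 * 'quantum' is the per-round byte allotment (should be >= the link MTU),
 * and 'limit' caps the total number of queued packets.
 */
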
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
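
/* Worked example (illustrative): with the default MTU-sized quantum of
 * 1514 bytes, scaled_quantum = SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8)
 * = 190, and dequeuing a 100 byte packet debits SFQ_ALLOT_SIZE(100) = 13
 * units, so the 16 bit allot comfortably covers packets up to 64K.
 */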

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' into the slots[]
 * array, while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS +
 * SFQ_MAX_DEPTH] are 'pointers' into the dep[] array.
 */
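/* For instance, an sfq_index of 5 denotes q->slots[5], while an index of
 * SFQ_MAX_FLOWS + 3 denotes q->dep[3], the anchor of the list of slots
 * holding exactly 3 packets (see sfq_dep_head() below).
 */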
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head are either in an sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
}
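
/* Note: the '& (q->divisor - 1)' masking in sfq_hash() is why sfq_change()
 * requires the divisor to be a power of two; e.g. the default divisor of
 * 1024 yields the mask 0x3ff, selecting 10 bits of the perturbed flow hash.
 */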

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
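
/* Illustrative trace of the depth bookkeeping above: if slot x is the only
 * flow with 4 queued packets (so cur_depth == 4) and one of them leaves,
 * sfq_dec() unlinks x from the dep[4] chain, decrements qlen to 3, sees
 * that the chain became empty (n == p) and lowers cur_depth to 3 before
 * relinking x into dep[3]. This is what lets sfq_drop() find the longest
 * flow in O(1) via q->dep[q->cur_depth].next.
 */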

/* helper functions: might be changed when/if skbs use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
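
/* The casts to (struct sk_buff *) above make the slot itself act as the
 * sentinel of a circular doubly linked list: an empty queue is one whose
 * sentinel points to itself, as set up by slot_queue_init(). This
 * space-saving trick relies on skblist_next/skblist_prev being the first
 * two fields of struct sfq_slot, mirroring skb->next/skb->prev.
 */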

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
		qdisc_qstats_backlog_dec(sch, skb);
		kfree_skb(skb);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * It might sound unfair for a new flow to wait behind old ones,
		 * but we could end up servicing new flows only, and freeze old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet from
	 * this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let the upper stack know about it */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}
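
/* Byte-fairness example (illustrative): with a quantum of 1514 bytes
 * (scaled_quantum == 190), a flow of 1514 byte packets spends its whole
 * allot on a single packet per round, while a flow of 100 byte packets
 * (13 units each) sends about 15 packets per round, i.e. roughly the same
 * number of bytes. Flows thus share bandwidth by bytes, not packets.
 */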

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		dropped += sfq_drop(sch);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	err = tcf_block_get(&q->block, &q->filter_list);
	if (err)
		return err;

	setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
			       (unsigned long)sch);

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");