net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <linux/types.h>
  14 #include <linux/kernel.h>
  15 #include <linux/jiffies.h>
  16 #include <linux/string.h>
  17 #include <linux/in.h>
  18 #include <linux/errno.h>
  19 #include <linux/init.h>
  20 #include <linux/skbuff.h>
  21 #include <linux/siphash.h>
  22 #include <linux/slab.h>
  23 #include <linux/vmalloc.h>
  24 #include <net/netlink.h>
  25 #include <net/pkt_sched.h>
  26 #include <net/pkt_cls.h>
  27 #include <net/red.h>
  28
  29
  30 /*      Stochastic Fairness Queuing algorithm.
  31         =======================================
  32
  33         Source:
  34         Paul E. McKenney "Stochastic Fairness Queuing",
  35         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37         Paul E. McKenney "Stochastic Fairness Queuing",
  38         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41         See also:
  42         M. Shreedhar and George Varghese "Efficient Fair
  43         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46         This is not the thing that is usually called (W)FQ nowadays.
  47         It does not use any timestamp mechanism, but instead
  48         processes queues in round-robin order.
  49
  50         ADVANTAGE:
  51
  52         - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54         DRAWBACKS:
  55
  56         - "Stochastic" -> It is not 100% fair.
  57         When hash collisions occur, several flows are considered as one.
  58
  59         - "Round-robin" -> It introduces larger delays than virtual clock
  60         based schemes, and should not be used for isolating interactive
  61         traffic from non-interactive. It means, that this scheduler
  62         should be used as leaf of CBQ or P3, which put interactive traffic
  63         to higher priority band.
  64
  65         We still need true WFQ for top level CSZ, but using WFQ
  66         for the best effort traffic is absolutely pointless:
  67         SFQ is superior for this purpose.
  68
  69         IMPLEMENTATION:
  70         This implementation limits :
  71         - maximal queue length per flow to 127 packets.
  72         - max mtu to 2^18-1;
  73         - max 65408 flows,
  74         - number of hash buckets to 65536.
  75
  76         It is easy to increase these values, but not in flight.  */
  77
  78 #define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79 #define SFQ_DEFAULT_FLOWS       128
  80 #define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81 #define SFQ_EMPTY_SLOT          0xffff
  82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
  84 /* We use 16 bits to store allot, and want to handle packets up to 64K
  85  * Scale allot by 8 (1<<3) so that no overflow occurs.
  86  */
  87 #define SFQ_ALLOT_SHIFT         3
  88 #define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
  89
  90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91 typedef u16 sfq_index;
  92
  93 /*
  94  * We dont use pointers to save space.
  95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97  * are 'pointers' to dep[] array
  98  */
  99 struct sfq_head {
 100         sfq_index       next;
 101         sfq_index       prev;
 102 };
 103
 104 struct sfq_slot {
 105         struct sk_buff  *skblist_next;
 106         struct sk_buff  *skblist_prev;
 107         sfq_index       qlen; /* number of skbs in skblist */
 108         sfq_index       next; /* next slot in sfq RR chain */
 109         struct sfq_head dep; /* anchor in dep[] chains */
 110         unsigned short  hash; /* hash value (index in ht[]) */
 111         short           allot; /* credit for this slot */
 112
 113         unsigned int    backlog;
 114         struct red_vars vars;
 115 };
 116
 117 struct sfq_sched_data {
 118 /* frequently used fields */
 119         int             limit;          /* limit of total number of packets in this qdisc */
 120         unsigned int    divisor;        /* number of slots in hash table */
 121         u8              headdrop;
 122         u8              maxdepth;       /* limit of packets per flow */
 123
 124         siphash_key_t   perturbation;
 125         u8              cur_depth;      /* depth of longest slot */
 126         u8              flags;
 127         unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128         struct tcf_proto __rcu *filter_list;
 129         struct tcf_block *block;
 130         sfq_index       *ht;            /* Hash table ('divisor' slots) */
 131         struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 132
 133         struct red_parms *red_parms;
 134         struct tc_sfqred_stats stats;
 135         struct sfq_slot *tail;          /* current slot in round */
 136
 137         struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 138                                         /* Linked lists of slots, indexed by depth
 139                                          * dep[0] : list of unused flows
 140                                          * dep[1] : list of flows with 1 packet
 141                                          * dep[X] : list of flows with X packets
 142                                          */
 143
 144         unsigned int    maxflows;       /* number of flows in flows array */
 145         int             perturb_period;
 146         unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 147         struct timer_list perturb_timer;
 148         struct Qdisc    *sch;
 149 };
 150
 151 /*
 152  * sfq_head are either in a sfq_slot or in dep[] array
 153  */
 154 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 155 {
 156         if (val < SFQ_MAX_FLOWS)
 157                 return &q->slots[val].dep;
 158         return &q->dep[val - SFQ_MAX_FLOWS];
 159 }
 160
 161 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 162                              const struct sk_buff *skb)
 163 {
 164         return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
 165 }
 166
 167 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 168                                  int *qerr)
 169 {
 170         struct sfq_sched_data *q = qdisc_priv(sch);
 171         struct tcf_result res;
 172         struct tcf_proto *fl;
 173         int result;
 174
 175         if (TC_H_MAJ(skb->priority) == sch->handle &&
 176             TC_H_MIN(skb->priority) > 0 &&
 177             TC_H_MIN(skb->priority) <= q->divisor)
 178                 return TC_H_MIN(skb->priority);
 179
 180         fl = rcu_dereference_bh(q->filter_list);
 181         if (!fl)
 182                 return sfq_hash(q, skb) + 1;
 183
 184         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 185         result = tcf_classify(skb, fl, &res, false);
 186         if (result >= 0) {
 187 #ifdef CONFIG_NET_CLS_ACT
 188                 switch (result) {
 189                 case TC_ACT_STOLEN:
 190                 case TC_ACT_QUEUED:
 191                 case TC_ACT_TRAP:
 192                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 193                         /* fall through */
 194                 case TC_ACT_SHOT:
 195                         return 0;
 196                 }
 197 #endif
 198                 if (TC_H_MIN(res.classid) <= q->divisor)
 199                         return TC_H_MIN(res.classid);
 200         }
 201         return 0;
 202 }
 203
 204 /*
 205  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 206  */
 207 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 208 {
 209         sfq_index p, n;
 210         struct sfq_slot *slot = &q->slots[x];
 211         int qlen = slot->qlen;
 212
 213         p = qlen + SFQ_MAX_FLOWS;
 214         n = q->dep[qlen].next;
 215
 216         slot->dep.next = n;
 217         slot->dep.prev = p;
 218
 219         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 220         sfq_dep_head(q, n)->prev = x;
 221 }
 222
 223 #define sfq_unlink(q, x, n, p)                  \
 224         do {                                    \
 225                 n = q->slots[x].dep.next;       \
 226                 p = q->slots[x].dep.prev;       \
 227                 sfq_dep_head(q, p)->next = n;   \
 228                 sfq_dep_head(q, n)->prev = p;   \
 229         } while (0)
 230
 231
 232 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 233 {
 234         sfq_index p, n;
 235         int d;
 236
 237         sfq_unlink(q, x, n, p);
 238
 239         d = q->slots[x].qlen--;
 240         if (n == p && q->cur_depth == d)
 241                 q->cur_depth--;
 242         sfq_link(q, x);
 243 }
 244
 245 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 246 {
 247         sfq_index p, n;
 248         int d;
 249
 250         sfq_unlink(q, x, n, p);
 251
 252         d = ++q->slots[x].qlen;
 253         if (q->cur_depth < d)
 254                 q->cur_depth = d;
 255         sfq_link(q, x);
 256 }
 257
 258 /* helper functions : might be changed when/if skb use a standard list_head */
 259
 260 /* remove one skb from tail of slot queue */
 261 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 262 {
 263         struct sk_buff *skb = slot->skblist_prev;
 264
 265         slot->skblist_prev = skb->prev;
 266         skb->prev->next = (struct sk_buff *)slot;
 267         skb->next = skb->prev = NULL;
 268         return skb;
 269 }
 270
 271 /* remove one skb from head of slot queue */
 272 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 273 {
 274         struct sk_buff *skb = slot->skblist_next;
 275
 276         slot->skblist_next = skb->next;
 277         skb->next->prev = (struct sk_buff *)slot;
 278         skb->next = skb->prev = NULL;
 279         return skb;
 280 }
 281
 282 static inline void slot_queue_init(struct sfq_slot *slot)
 283 {
 284         memset(slot, 0, sizeof(*slot));
 285         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 286 }
 287
 288 /* add skb to slot queue (tail add) */
 289 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 290 {
 291         skb->prev = slot->skblist_prev;
 292         skb->next = (struct sk_buff *)slot;
 293         slot->skblist_prev->next = skb;
 294         slot->skblist_prev = skb;
 295 }
 296
 297 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
 298 {
 299         struct sfq_sched_data *q = qdisc_priv(sch);
 300         sfq_index x, d = q->cur_depth;
 301         struct sk_buff *skb;
 302         unsigned int len;
 303         struct sfq_slot *slot;
 304
 305         /* Queue is full! Find the longest slot and drop tail packet from it */
 306         if (d > 1) {
 307                 x = q->dep[d].next;
 308                 slot = &q->slots[x];
 309 drop:
 310                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 311                 len = qdisc_pkt_len(skb);
 312                 slot->backlog -= len;
 313                 sfq_dec(q, x);
 314                 sch->q.qlen--;
 315                 qdisc_qstats_backlog_dec(sch, skb);
 316                 qdisc_drop(skb, sch, to_free);
 317                 return len;
 318         }
 319
 320         if (d == 1) {
 321                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 322                 x = q->tail->next;
 323                 slot = &q->slots[x];
 324                 q->tail->next = slot->next;
 325                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 326                 goto drop;
 327         }
 328
 329         return 0;
 330 }
 331
 332 /* Is ECN parameter configured */
 333 static int sfq_prob_mark(const struct sfq_sched_data *q)
 334 {
 335         return q->flags & TC_RED_ECN;
 336 }
 337
 338 /* Should packets over max threshold just be marked */
 339 static int sfq_hard_mark(const struct sfq_sched_data *q)
 340 {
 341         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 342 }
 343
 344 static int sfq_headdrop(const struct sfq_sched_data *q)
 345 {
 346         return q->headdrop;
 347 }
 348
 349 static int
 350 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 351 {
 352         struct sfq_sched_data *q = qdisc_priv(sch);
 353         unsigned int hash, dropped;
 354         sfq_index x, qlen;
 355         struct sfq_slot *slot;
 356         int uninitialized_var(ret);
 357         struct sk_buff *head;
 358         int delta;
 359
 360         hash = sfq_classify(skb, sch, &ret);
 361         if (hash == 0) {
 362                 if (ret & __NET_XMIT_BYPASS)
 363                         qdisc_qstats_drop(sch);
 364                 __qdisc_drop(skb, to_free);
 365                 return ret;
 366         }
 367         hash--;
 368
 369         x = q->ht[hash];
 370         slot = &q->slots[x];
 371         if (x == SFQ_EMPTY_SLOT) {
 372                 x = q->dep[0].next; /* get a free slot */
 373                 if (x >= SFQ_MAX_FLOWS)
 374                         return qdisc_drop(skb, sch, to_free);
 375                 q->ht[hash] = x;
 376                 slot = &q->slots[x];
 377                 slot->hash = hash;
 378                 slot->backlog = 0; /* should already be 0 anyway... */
 379                 red_set_vars(&slot->vars);
 380                 goto enqueue;
 381         }
 382         if (q->red_parms) {
 383                 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 384                                                         &slot->vars,
 385                                                         slot->backlog);
 386                 switch (red_action(q->red_parms,
 387                                    &slot->vars,
 388                                    slot->vars.qavg)) {
 389                 case RED_DONT_MARK:
 390                         break;
 391
 392                 case RED_PROB_MARK:
 393                         qdisc_qstats_overlimit(sch);
 394                         if (sfq_prob_mark(q)) {
 395                                 /* We know we have at least one packet in queue */
 396                                 if (sfq_headdrop(q) &&
 397                                     INET_ECN_set_ce(slot->skblist_next)) {
 398                                         q->stats.prob_mark_head++;
 399                                         break;
 400                                 }
 401                                 if (INET_ECN_set_ce(skb)) {
 402                                         q->stats.prob_mark++;
 403                                         break;
 404                                 }
 405                         }
 406                         q->stats.prob_drop++;
 407                         goto congestion_drop;
 408
 409                 case RED_HARD_MARK:
 410                         qdisc_qstats_overlimit(sch);
 411                         if (sfq_hard_mark(q)) {
 412                                 /* We know we have at least one packet in queue */
 413                                 if (sfq_headdrop(q) &&
 414                                     INET_ECN_set_ce(slot->skblist_next)) {
 415                                         q->stats.forced_mark_head++;
 416                                         break;
 417                                 }
 418                                 if (INET_ECN_set_ce(skb)) {
 419                                         q->stats.forced_mark++;
 420                                         break;
 421                                 }
 422                         }
 423                         q->stats.forced_drop++;
 424                         goto congestion_drop;
 425                 }
 426         }
 427
 428         if (slot->qlen >= q->maxdepth) {
 429 congestion_drop:
 430                 if (!sfq_headdrop(q))
 431                         return qdisc_drop(skb, sch, to_free);
 432
 433                 /* We know we have at least one packet in queue */
 434                 head = slot_dequeue_head(slot);
 435                 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 436                 sch->qstats.backlog -= delta;
 437                 slot->backlog -= delta;
 438                 qdisc_drop(head, sch, to_free);
 439
 440                 slot_queue_add(slot, skb);
 441                 qdisc_tree_reduce_backlog(sch, 0, delta);
 442                 return NET_XMIT_CN;
 443         }
 444
 445 enqueue:
 446         qdisc_qstats_backlog_inc(sch, skb);
 447         slot->backlog += qdisc_pkt_len(skb);
 448         slot_queue_add(slot, skb);
 449         sfq_inc(q, x);
 450         if (slot->qlen == 1) {          /* The flow is new */
 451                 if (q->tail == NULL) {  /* It is the first flow */
 452                         slot->next = x;
 453                 } else {
 454                         slot->next = q->tail->next;
 455                         q->tail->next = x;
 456                 }
 457                 /* We put this flow at the end of our flow list.
 458                  * This might sound unfair for a new flow to wait after old ones,
 459                  * but we could endup servicing new flows only, and freeze old ones.
 460                  */
 461                 q->tail = slot;
 462                 /* We could use a bigger initial quantum for new flows */
 463                 slot->allot = q->scaled_quantum;
 464         }
 465         if (++sch->q.qlen <= q->limit)
 466                 return NET_XMIT_SUCCESS;
 467
 468         qlen = slot->qlen;
 469         dropped = sfq_drop(sch, to_free);
 470         /* Return Congestion Notification only if we dropped a packet
 471          * from this flow.
 472          */
 473         if (qlen != slot->qlen) {
 474                 qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
 475                 return NET_XMIT_CN;
 476         }
 477
 478         /* As we dropped a packet, better let upper stack know this */
 479         qdisc_tree_reduce_backlog(sch, 1, dropped);
 480         return NET_XMIT_SUCCESS;
 481 }
 482
 483 static struct sk_buff *
 484 sfq_dequeue(struct Qdisc *sch)
 485 {
 486         struct sfq_sched_data *q = qdisc_priv(sch);
 487         struct sk_buff *skb;
 488         sfq_index a, next_a;
 489         struct sfq_slot *slot;
 490
 491         /* No active slots */
 492         if (q->tail == NULL)
 493                 return NULL;
 494
 495 next_slot:
 496         a = q->tail->next;
 497         slot = &q->slots[a];
 498         if (slot->allot <= 0) {
 499                 q->tail = slot;
 500                 slot->allot += q->scaled_quantum;
 501                 goto next_slot;
 502         }
 503         skb = slot_dequeue_head(slot);
 504         sfq_dec(q, a);
 505         qdisc_bstats_update(sch, skb);
 506         sch->q.qlen--;
 507         qdisc_qstats_backlog_dec(sch, skb);
 508         slot->backlog -= qdisc_pkt_len(skb);
 509         /* Is the slot empty? */
 510         if (slot->qlen == 0) {
 511                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 512                 next_a = slot->next;
 513                 if (a == next_a) {
 514                         q->tail = NULL; /* no more active slots */
 515                         return skb;
 516                 }
 517                 q->tail->next = next_a;
 518         } else {
 519                 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 520         }
 521         return skb;
 522 }
 523
 524 static void
 525 sfq_reset(struct Qdisc *sch)
 526 {
 527         struct sk_buff *skb;
 528
 529         while ((skb = sfq_dequeue(sch)) != NULL)
 530                 rtnl_kfree_skbs(skb, skb);
 531 }
 532
 533 /*
 534  * When q->perturbation is changed, we rehash all queued skbs
 535  * to avoid OOO (Out Of Order) effects.
 536  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 537  * counters.
 538  */
 539 static void sfq_rehash(struct Qdisc *sch)
 540 {
 541         struct sfq_sched_data *q = qdisc_priv(sch);
 542         struct sk_buff *skb;
 543         int i;
 544         struct sfq_slot *slot;
 545         struct sk_buff_head list;
 546         int dropped = 0;
 547         unsigned int drop_len = 0;
 548
 549         __skb_queue_head_init(&list);
 550
 551         for (i = 0; i < q->maxflows; i++) {
 552                 slot = &q->slots[i];
 553                 if (!slot->qlen)
 554                         continue;
 555                 while (slot->qlen) {
 556                         skb = slot_dequeue_head(slot);
 557                         sfq_dec(q, i);
 558                         __skb_queue_tail(&list, skb);
 559                 }
 560                 slot->backlog = 0;
 561                 red_set_vars(&slot->vars);
 562                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 563         }
 564         q->tail = NULL;
 565
 566         while ((skb = __skb_dequeue(&list)) != NULL) {
 567                 unsigned int hash = sfq_hash(q, skb);
 568                 sfq_index x = q->ht[hash];
 569
 570                 slot = &q->slots[x];
 571                 if (x == SFQ_EMPTY_SLOT) {
 572                         x = q->dep[0].next; /* get a free slot */
 573                         if (x >= SFQ_MAX_FLOWS) {
 574 drop:
 575                                 qdisc_qstats_backlog_dec(sch, skb);
 576                                 drop_len += qdisc_pkt_len(skb);
 577                                 kfree_skb(skb);
 578                                 dropped++;
 579                                 continue;
 580                         }
 581                         q->ht[hash] = x;
 582                         slot = &q->slots[x];
 583                         slot->hash = hash;
 584                 }
 585                 if (slot->qlen >= q->maxdepth)
 586                         goto drop;
 587                 slot_queue_add(slot, skb);
 588                 if (q->red_parms)
 589                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 590                                                         &slot->vars,
 591                                                         slot->backlog);
 592                 slot->backlog += qdisc_pkt_len(skb);
 593                 sfq_inc(q, x);
 594                 if (slot->qlen == 1) {          /* The flow is new */
 595                         if (q->tail == NULL) {  /* It is the first flow */
 596                                 slot->next = x;
 597                         } else {
 598                                 slot->next = q->tail->next;
 599                                 q->tail->next = x;
 600                         }
 601                         q->tail = slot;
 602                         slot->allot = q->scaled_quantum;
 603                 }
 604         }
 605         sch->q.qlen -= dropped;
 606         qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 607 }
 608
 609 static void sfq_perturbation(struct timer_list *t)
 610 {
 611         struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
 612         struct Qdisc *sch = q->sch;
 613         spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 614         siphash_key_t nkey;
 615
 616         get_random_bytes(&nkey, sizeof(nkey));
 617         spin_lock(root_lock);
 618         q->perturbation = nkey;
 619         if (!q->filter_list && q->tail)
 620                 sfq_rehash(sch);
 621         spin_unlock(root_lock);
 622
 623         if (q->perturb_period)
 624                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 625 }
 626
 627 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 628 {
 629         struct sfq_sched_data *q = qdisc_priv(sch);
 630         struct tc_sfq_qopt *ctl = nla_data(opt);
 631         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 632         unsigned int qlen, dropped = 0;
 633         struct red_parms *p = NULL;
 634         struct sk_buff *to_free = NULL;
 635         struct sk_buff *tail = NULL;
 636
 637         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 638                 return -EINVAL;
 639         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 640                 ctl_v1 = nla_data(opt);
 641         if (ctl->divisor &&
 642             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 643                 return -EINVAL;
 644         if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
 645                                         ctl_v1->Wlog))
 646                 return -EINVAL;
 647         if (ctl_v1 && ctl_v1->qth_min) {
 648                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 649                 if (!p)
 650                         return -ENOMEM;
 651         }
 652         sch_tree_lock(sch);
 653         if (ctl->quantum) {
 654                 q->quantum = ctl->quantum;
 655                 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 656         }
 657         q->perturb_period = ctl->perturb_period * HZ;
 658         if (ctl->flows)
 659                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 660         if (ctl->divisor) {
 661                 q->divisor = ctl->divisor;
 662                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 663         }
 664         if (ctl_v1) {
 665                 if (ctl_v1->depth)
 666                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 667                 if (p) {
 668                         swap(q->red_parms, p);
 669                         red_set_parms(q->red_parms,
 670                                       ctl_v1->qth_min, ctl_v1->qth_max,
 671                                       ctl_v1->Wlog,
 672                                       ctl_v1->Plog, ctl_v1->Scell_log,
 673                                       NULL,
 674                                       ctl_v1->max_P);
 675                 }
 676                 q->flags = ctl_v1->flags;
 677                 q->headdrop = ctl_v1->headdrop;
 678         }
 679         if (ctl->limit) {
 680                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 681                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 682         }
 683
 684         qlen = sch->q.qlen;
 685         while (sch->q.qlen > q->limit) {
 686                 dropped += sfq_drop(sch, &to_free);
 687                 if (!tail)
 688                         tail = to_free;
 689         }
 690
 691         rtnl_kfree_skbs(to_free, tail);
 692         qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 693
 694         del_timer(&q->perturb_timer);
 695         if (q->perturb_period) {
 696                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 697                 get_random_bytes(&q->perturbation, sizeof(q->perturbation));
 698         }
 699         sch_tree_unlock(sch);
 700         kfree(p);
 701         return 0;
 702 }
 703
 704 static void *sfq_alloc(size_t sz)
 705 {
 706         return  kvmalloc(sz, GFP_KERNEL);
 707 }
 708
 709 static void sfq_free(void *addr)
 710 {
 711         kvfree(addr);
 712 }
 713
 714 static void sfq_destroy(struct Qdisc *sch)
 715 {
 716         struct sfq_sched_data *q = qdisc_priv(sch);
 717
 718         tcf_block_put(q->block);
 719         q->perturb_period = 0;
 720         del_timer_sync(&q->perturb_timer);
 721         sfq_free(q->ht);
 722         sfq_free(q->slots);
 723         kfree(q->red_parms);
 724 }
 725
 726 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 727 {
 728         struct sfq_sched_data *q = qdisc_priv(sch);
 729         int i;
 730         int err;
 731
 732         q->sch = sch;
 733         timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
 734
 735         err = tcf_block_get(&q->block, &q->filter_list, sch);
 736         if (err)
 737                 return err;
 738
 739         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 740                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 741                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 742         }
 743
 744         q->limit = SFQ_MAX_DEPTH;
 745         q->maxdepth = SFQ_MAX_DEPTH;
 746         q->cur_depth = 0;
 747         q->tail = NULL;
 748         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 749         q->maxflows = SFQ_DEFAULT_FLOWS;
 750         q->quantum = psched_mtu(qdisc_dev(sch));
 751         q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 752         q->perturb_period = 0;
 753         get_random_bytes(&q->perturbation, sizeof(q->perturbation));
 754
 755         if (opt) {
 756                 int err = sfq_change(sch, opt);
 757                 if (err)
 758                         return err;
 759         }
 760
 761         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 762         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 763         if (!q->ht || !q->slots) {
 764                 /* Note: sfq_destroy() will be called by our caller */
 765                 return -ENOMEM;
 766         }
 767
 768         for (i = 0; i < q->divisor; i++)
 769                 q->ht[i] = SFQ_EMPTY_SLOT;
 770
 771         for (i = 0; i < q->maxflows; i++) {
 772                 slot_queue_init(&q->slots[i]);
 773                 sfq_link(q, i);
 774         }
 775         if (q->limit >= 1)
 776                 sch->flags |= TCQ_F_CAN_BYPASS;
 777         else
 778                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 779         return 0;
 780 }
 781
 782 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 783 {
 784         struct sfq_sched_data *q = qdisc_priv(sch);
 785         unsigned char *b = skb_tail_pointer(skb);
 786         struct tc_sfq_qopt_v1 opt;
 787         struct red_parms *p = q->red_parms;
 788
 789         memset(&opt, 0, sizeof(opt));
 790         opt.v0.quantum  = q->quantum;
 791         opt.v0.perturb_period = q->perturb_period / HZ;
 792         opt.v0.limit    = q->limit;
 793         opt.v0.divisor  = q->divisor;
 794         opt.v0.flows    = q->maxflows;
 795         opt.depth       = q->maxdepth;
 796         opt.headdrop    = q->headdrop;
 797
 798         if (p) {
 799                 opt.qth_min     = p->qth_min >> p->Wlog;
 800                 opt.qth_max     = p->qth_max >> p->Wlog;
 801                 opt.Wlog        = p->Wlog;
 802                 opt.Plog        = p->Plog;
 803                 opt.Scell_log   = p->Scell_log;
 804                 opt.max_P       = p->max_P;
 805         }
 806         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 807         opt.flags       = q->flags;
 808
 809         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 810                 goto nla_put_failure;
 811
 812         return skb->len;
 813
 814 nla_put_failure:
 815         nlmsg_trim(skb, b);
 816         return -1;
 817 }
 818
 819 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 820 {
 821         return NULL;
 822 }
 823
 824 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
 825 {
 826         return 0;
 827 }
 828
 829 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 830                               u32 classid)
 831 {
 832         return 0;
 833 }
 834
 835 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
 836 {
 837 }
 838
 839 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
 840 {
 841         struct sfq_sched_data *q = qdisc_priv(sch);
 842
 843         if (cl)
 844                 return NULL;
 845         return q->block;
 846 }
 847
 848 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 849                           struct sk_buff *skb, struct tcmsg *tcm)
 850 {
 851         tcm->tcm_handle |= TC_H_MIN(cl);
 852         return 0;
 853 }
 854
 855 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 856                                 struct gnet_dump *d)
 857 {
 858         struct sfq_sched_data *q = qdisc_priv(sch);
 859         sfq_index idx = q->ht[cl - 1];
 860         struct gnet_stats_queue qs = { 0 };
 861         struct tc_sfq_xstats xstats = { 0 };
 862
 863         if (idx != SFQ_EMPTY_SLOT) {
 864                 const struct sfq_slot *slot = &q->slots[idx];
 865
 866                 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 867                 qs.qlen = slot->qlen;
 868                 qs.backlog = slot->backlog;
 869         }
 870         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
 871                 return -1;
 872         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 873 }
 874
 875 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 876 {
 877         struct sfq_sched_data *q = qdisc_priv(sch);
 878         unsigned int i;
 879
 880         if (arg->stop)
 881                 return;
 882
 883         for (i = 0; i < q->divisor; i++) {
 884                 if (q->ht[i] == SFQ_EMPTY_SLOT ||
 885                     arg->count < arg->skip) {
 886                         arg->count++;
 887                         continue;
 888                 }
 889                 if (arg->fn(sch, i + 1, arg) < 0) {
 890                         arg->stop = 1;
 891                         break;
 892                 }
 893                 arg->count++;
 894         }
 895 }
 896
 897 static const struct Qdisc_class_ops sfq_class_ops = {
 898         .leaf           =       sfq_leaf,
 899         .find           =       sfq_find,
 900         .tcf_block      =       sfq_tcf_block,
 901         .bind_tcf       =       sfq_bind,
 902         .unbind_tcf     =       sfq_unbind,
 903         .dump           =       sfq_dump_class,
 904         .dump_stats     =       sfq_dump_class_stats,
 905         .walk           =       sfq_walk,
 906 };
 907
 908 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 909         .cl_ops         =       &sfq_class_ops,
 910         .id             =       "sfq",
 911         .priv_size      =       sizeof(struct sfq_sched_data),
 912         .enqueue        =       sfq_enqueue,
 913         .dequeue        =       sfq_dequeue,
 914         .peek           =       qdisc_peek_dequeued,
 915         .init           =       sfq_init,
 916         .reset          =       sfq_reset,
 917         .destroy        =       sfq_destroy,
 918         .change         =       NULL,
 919         .dump           =       sfq_dump,
 920         .owner          =       THIS_MODULE,
 921 };
 922
 923 static int __init sfq_module_init(void)
 924 {
 925         return register_qdisc(&sfq_qdisc_ops);
 926 }
 927 static void __exit sfq_module_exit(void)
 928 {
 929         unregister_qdisc(&sfq_qdisc_ops);
 930 }
 931 module_init(sfq_module_init)
 932 module_exit(sfq_module_exit)
 933 MODULE_LICENSE("GPL");