1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22
23
24 /* Class-Based Queueing (CBQ) algorithm.
25 =======================================
26
27 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28 Management Models for Packet Networks",
29 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30
31 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32
33 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34 Parameters", 1996
35
36 [4] Sally Floyd and Michael Speer, "Experimental Results
37 for Class-Based Queueing", 1998, not published.
38
39 -----------------------------------------------------------------------
40
41 Algorithm skeleton was taken from NS simulator cbq.cc.
42 If someone wants to check this code against the LBL version,
43 he should take into account that ONLY the skeleton was borrowed;
44 the implementation is different. In particular:
45
46 --- The WRR algorithm is different. Our version looks more
47 reasonable (I hope) and works when quanta are allowed to be
48 less than MTU, which is always the case when real time classes
49 have small rates. Note that the statement in [3] is
50 incomplete; delay may actually be estimated even if the class
51 per-round allotment is less than MTU. Namely, if the per-round
52 allotment is W*r_i, and r_1+...+r_k = r < 1
53
54 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55
56 In the worst case we have IntServ estimate with D = W*r+k*MTU
57 and C = MTU*r. The proof (if correct at all) is trivial.
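(One reading of the symbols above, as an editorial gloss rather than part
of the original notes: W*r_i is the per-round allotment of class i in
bytes, r = r_1+...+r_k is the total fraction of the link bandwidth B
allocated to the k classes, and [x] presumably denotes rounding up, so
the bound is expressed in units of time.)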
58
59
60 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61 interpret some places, which look like wrong translations
62 from NS. Anyone is advised to find these differences
63 and explain to me why I am wrong 8).
64
65 --- Linux has no EOI event, so we cannot estimate true class
66 idle time. The workaround is to consider the next dequeue event
67 as a sign that the previous packet has finished. This is wrong because of
68 internal device queueing, but on a permanently loaded link it is true.
69 Moreover, combined with clock integrator, this scheme looks
70 very close to an ideal solution. */
71
72 struct cbq_sched_data;
73
74
75 struct cbq_class {
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
78
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83
84 u32 defmap;
85
86 /* Link-sharing scheduler parameters */
87 long maxidle; /* Class parameters: see below. */
88 long offtime;
89 long minidle;
90 u32 avpkt;
91 struct qdisc_rate_table *R_tab;
92
93 /* General scheduler (WRR) parameters */
94 long allot;
95 long quantum; /* Allotment per WRR round */
96 long weight; /* Relative allotment: see below */
97
98 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
99 struct cbq_class *split; /* Ptr to split node */
100 struct cbq_class *share; /* Ptr to LS parent in the class tree */
101 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
102 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
103 parent otherwise */
104 struct cbq_class *sibling; /* Sibling chain */
105 struct cbq_class *children; /* Pointer to children chain */
106
107 struct Qdisc *q; /* Elementary queueing discipline */
108
109
110 /* Variables */
111 unsigned char cpriority; /* Effective priority */
112 unsigned char delayed;
113 unsigned char level; /* level of the class in hierarchy:
114 0 for leaf classes, and maximal
115 level of children + 1 for nodes.
116 */
117
118 psched_time_t last; /* Last end of service */
119 psched_time_t undertime;
120 long avgidle;
121 long deficit; /* Saved deficit for WRR */
122 psched_time_t penalized;
123 struct gnet_stats_basic_packed bstats;
124 struct gnet_stats_queue qstats;
125 struct gnet_stats_rate_est64 rate_est;
126 struct tc_cbq_xstats xstats;
127
128 struct tcf_proto __rcu *filter_list;
129
130 int refcnt;
131 int filters;
132
133 struct cbq_class *defaults[TC_PRIO_MAX + 1];
134 };
135
136 struct cbq_sched_data {
137 struct Qdisc_class_hash clhash; /* Hash table of all classes */
138 int nclasses[TC_CBQ_MAXPRIO + 1];
139 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
140
141 struct cbq_class link;
142
143 unsigned int activemask;
144 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
145 with backlog */
146
147 #ifdef CONFIG_NET_CLS_ACT
148 struct cbq_class *rx_class;
149 #endif
150 struct cbq_class *tx_class;
151 struct cbq_class *tx_borrowed;
152 int tx_len;
153 psched_time_t now; /* Cached timestamp */
154 unsigned int pmask;
155
156 struct hrtimer delay_timer;
157 struct qdisc_watchdog watchdog; /* Watchdog timer,
158 started when CBQ has
159 backlog, but cannot
160 transmit just now */
161 psched_tdiff_t wd_expires;
162 int toplevel;
163 u32 hgenerator;
164 };
165
166
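/* L2T ("length to time"): transmission time of a len-byte packet at the
 * class rate, in psched ticks, looked up in the class rate table.
 */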
167 #define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
168
169 static inline struct cbq_class *
170 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
171 {
172 struct Qdisc_class_common *clc;
173
174 clc = qdisc_class_find(&q->clhash, classid);
175 if (clc == NULL)
176 return NULL;
177 return container_of(clc, struct cbq_class, common);
178 }
179
180 #ifdef CONFIG_NET_CLS_ACT
181
182 static struct cbq_class *
183 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
184 {
185 struct cbq_class *cl;
186
187 for (cl = this->tparent; cl; cl = cl->tparent) {
188 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
189
190 if (new != NULL && new != this)
191 return new;
192 }
193 return NULL;
194 }
195
196 #endif
197
198 /* Classify packet. The procedure is pretty complicated, but
199 * it allows us to combine link sharing and priority scheduling
200 * transparently.
201 *
202 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
203 * so that it resolves to split nodes. Then packets are classified
204 * by logical priority, or a more specific classifier may be attached
205 * to the split node.
206 */
207
208 static struct cbq_class *
209 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
210 {
211 struct cbq_sched_data *q = qdisc_priv(sch);
212 struct cbq_class *head = &q->link;
213 struct cbq_class **defmap;
214 struct cbq_class *cl = NULL;
215 u32 prio = skb->priority;
216 struct tcf_proto *fl;
217 struct tcf_result res;
218
219 /*
220 * Step 1. If skb->priority points to one of our classes, use it.
221 */
222 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
223 (cl = cbq_class_lookup(q, prio)) != NULL)
224 return cl;
225
226 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
227 for (;;) {
228 int result = 0;
229 defmap = head->defaults;
230
231 fl = rcu_dereference_bh(head->filter_list);
232 /*
233 * Step 2+n. Apply classifier.
234 */
235 result = tc_classify(skb, fl, &res, true);
236 if (!fl || result < 0)
237 goto fallback;
238
239 cl = (void *)res.class;
240 if (!cl) {
241 if (TC_H_MAJ(res.classid))
242 cl = cbq_class_lookup(q, res.classid);
243 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
244 cl = defmap[TC_PRIO_BESTEFFORT];
245
246 if (cl == NULL)
247 goto fallback;
248 }
249 if (cl->level >= head->level)
250 goto fallback;
251 #ifdef CONFIG_NET_CLS_ACT
252 switch (result) {
253 case TC_ACT_QUEUED:
254 case TC_ACT_STOLEN:
255 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
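/* fall through */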
256 case TC_ACT_SHOT:
257 return NULL;
258 case TC_ACT_RECLASSIFY:
259 return cbq_reclassify(skb, cl);
260 }
261 #endif
262 if (cl->level == 0)
263 return cl;
264
265 /*
266 * Step 3+n. If classifier selected a link sharing class,
267 * apply the agency-specific classifier.
268 * Repeat this procedure until we hit a leaf node.
269 */
270 head = cl;
271 }
272
273 fallback:
274 cl = head;
275
276 /*
277 * Step 4. No success...
278 */
279 if (TC_H_MAJ(prio) == 0 &&
280 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
281 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
282 return head;
283
284 return cl;
285 }
286
287 /*
288 * A packet has just been enqueued on an empty class.
289 * cbq_activate_class adds the class to the tail of the active class list
290 * of its priority band.
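 * q->active[prio] points at the tail of a circular, singly linked
 * next_alive list; the round-robin in cbq_dequeue_prio() starts from
 * tail->next_alive, i.e. the head.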
291 */
292
293 static inline void cbq_activate_class(struct cbq_class *cl)
294 {
295 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
296 int prio = cl->cpriority;
297 struct cbq_class *cl_tail;
298
299 cl_tail = q->active[prio];
300 q->active[prio] = cl;
301
302 if (cl_tail != NULL) {
303 cl->next_alive = cl_tail->next_alive;
304 cl_tail->next_alive = cl;
305 } else {
306 cl->next_alive = cl;
307 q->activemask |= (1<<prio);
308 }
309 }
310
311 /*
312 * Unlink class from active chain.
313 * Note that this same procedure is done directly in cbq_dequeue*
314 * during the round-robin procedure.
315 */
316
317 static void cbq_deactivate_class(struct cbq_class *this)
318 {
319 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
320 int prio = this->cpriority;
321 struct cbq_class *cl;
322 struct cbq_class *cl_prev = q->active[prio];
323
324 do {
325 cl = cl_prev->next_alive;
326 if (cl == this) {
327 cl_prev->next_alive = cl->next_alive;
328 cl->next_alive = NULL;
329
330 if (cl == q->active[prio]) {
331 q->active[prio] = cl_prev;
332 if (cl == q->active[prio]) {
333 q->active[prio] = NULL;
334 q->activemask &= ~(1<<prio);
335 return;
336 }
337 }
338 return;
339 }
340 } while ((cl_prev = cl) != q->active[prio]);
341 }
342
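/* Top-level sharing: on enqueue, lower q->toplevel to the level of the
 * first class on the borrow chain that is not currently regulated
 * (its undertime has already passed); cbq_under_limit() refuses to
 * borrow from ancestors above q->toplevel.
 */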
343 static void
344 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
345 {
346 int toplevel = q->toplevel;
347
348 if (toplevel > cl->level) {
349 psched_time_t now = psched_get_time();
350
351 do {
352 if (cl->undertime < now) {
353 q->toplevel = cl->level;
354 return;
355 }
356 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
357 }
358 }
359
360 static int
361 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
362 {
363 struct cbq_sched_data *q = qdisc_priv(sch);
364 int uninitialized_var(ret);
365 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
366
367 #ifdef CONFIG_NET_CLS_ACT
368 q->rx_class = cl;
369 #endif
370 if (cl == NULL) {
371 if (ret & __NET_XMIT_BYPASS)
372 qdisc_qstats_drop(sch);
373 kfree_skb(skb);
374 return ret;
375 }
376
377 ret = qdisc_enqueue(skb, cl->q);
378 if (ret == NET_XMIT_SUCCESS) {
379 sch->q.qlen++;
380 cbq_mark_toplevel(q, cl);
381 if (!cl->next_alive)
382 cbq_activate_class(cl);
383 return ret;
384 }
385
386 if (net_xmit_drop_count(ret)) {
387 qdisc_qstats_drop(sch);
388 cbq_mark_toplevel(q, cl);
389 cl->qstats.drops++;
390 }
391 return ret;
392 }
393
394 /* Overlimit action: penalize leaf class by adding offtime */
395 static void cbq_overlimit(struct cbq_class *cl)
396 {
397 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
398 psched_tdiff_t delay = cl->undertime - q->now;
399
400 if (!cl->delayed) {
401 delay += cl->offtime;
402
403 /*
404 * The class goes to sleep, so it will have no
405 * chance to work off avgidle. Let's forgive it 8)
406 *
407 * BTW cbq-2.0 has a bug in this
408 * place; apparently they forgot to shift it by cl->ewma_log.
409 */
410 if (cl->avgidle < 0)
411 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
412 if (cl->avgidle < cl->minidle)
413 cl->avgidle = cl->minidle;
414 if (delay <= 0)
415 delay = 1;
416 cl->undertime = q->now + delay;
417
418 cl->xstats.overactions++;
419 cl->delayed = 1;
420 }
421 if (q->wd_expires == 0 || q->wd_expires > delay)
422 q->wd_expires = delay;
423
424 /* Dirty work! We must schedule wakeups based on
425 * real available rate, rather than leaf rate,
426 * which may be tiny (even zero).
427 */
428 if (q->toplevel == TC_CBQ_MAXLEVEL) {
429 struct cbq_class *b;
430 psched_tdiff_t base_delay = q->wd_expires;
431
432 for (b = cl->borrow; b; b = b->borrow) {
433 delay = b->undertime - q->now;
434 if (delay < base_delay) {
435 if (delay <= 0)
436 delay = 1;
437 base_delay = delay;
438 }
439 }
440
441 q->wd_expires = base_delay;
442 }
443 }
444
445 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
446 psched_time_t now)
447 {
448 struct cbq_class *cl;
449 struct cbq_class *cl_prev = q->active[prio];
450 psched_time_t sched = now;
451
452 if (cl_prev == NULL)
453 return 0;
454
455 do {
456 cl = cl_prev->next_alive;
457 if (now - cl->penalized > 0) {
458 cl_prev->next_alive = cl->next_alive;
459 cl->next_alive = NULL;
460 cl->cpriority = cl->priority;
461 cl->delayed = 0;
462 cbq_activate_class(cl);
463
464 if (cl == q->active[prio]) {
465 q->active[prio] = cl_prev;
466 if (cl == q->active[prio]) {
467 q->active[prio] = NULL;
468 return 0;
469 }
470 }
471
472 cl = cl_prev->next_alive;
473 } else if (sched - cl->penalized > 0)
474 sched = cl->penalized;
475 } while ((cl_prev = cl) != q->active[prio]);
476
477 return sched - now;
478 }
479
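/* Delay-timer (hrtimer) callback: put classes whose penalty interval has
 * expired back into their normal priority band, re-arm the timer for the
 * earliest remaining penalty, and kick the qdisc so dequeue runs again.
 */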
480 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
481 {
482 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
483 delay_timer);
484 struct Qdisc *sch = q->watchdog.qdisc;
485 psched_time_t now;
486 psched_tdiff_t delay = 0;
487 unsigned int pmask;
488
489 now = psched_get_time();
490
491 pmask = q->pmask;
492 q->pmask = 0;
493
494 while (pmask) {
495 int prio = ffz(~pmask);
496 psched_tdiff_t tmp;
497
498 pmask &= ~(1<<prio);
499
500 tmp = cbq_undelay_prio(q, prio, now);
501 if (tmp > 0) {
502 q->pmask |= 1<<prio;
503 if (tmp < delay || delay == 0)
504 delay = tmp;
505 }
506 }
507
508 if (delay) {
509 ktime_t time;
510
511 time = ktime_set(0, 0);
512 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
513 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
514 }
515
516 __netif_schedule(qdisc_root(sch));
517 return HRTIMER_NORESTART;
518 }
519
520 /*
521 * This is a mission-critical procedure.
522 *
523 * We "regenerate" the toplevel cutoff if the transmitting class
524 * has backlog and is not regulated. It is not part of the
525 * original CBQ description, but it looks more reasonable.
526 * Probably it is wrong; this question needs further investigation.
527 */
528
529 static inline void
530 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
531 struct cbq_class *borrowed)
532 {
533 if (cl && q->toplevel >= borrowed->level) {
534 if (cl->q->q.qlen > 1) {
535 do {
536 if (borrowed->undertime == PSCHED_PASTPERFECT) {
537 q->toplevel = borrowed->level;
538 return;
539 }
540 } while ((borrowed = borrowed->borrow) != NULL);
541 }
542 #if 0
543 /* It is not necessary now. Uncommenting it
544 will save CPU cycles, but decrease fairness.
545 */
546 q->toplevel = TC_CBQ_MAXLEVEL;
547 #endif
548 }
549 }
550
551 static void
552 cbq_update(struct cbq_sched_data *q)
553 {
554 struct cbq_class *this = q->tx_class;
555 struct cbq_class *cl = this;
556 int len = q->tx_len;
557 psched_time_t now;
558
559 q->tx_class = NULL;
560 /* Time integrator. We calculate EOS time
561 * by adding expected packet transmission time.
562 */
563 now = q->now + L2T(&q->link, len);
564
565 for ( ; cl; cl = cl->share) {
566 long avgidle = cl->avgidle;
567 long idle;
568
569 cl->bstats.packets++;
570 cl->bstats.bytes += len;
571
572 /*
573 * (now - last) is total time between packet right edges.
574 * (last_pktlen/rate) is "virtual" busy time, so that
575 *
576 * idle = (now - last) - last_pktlen/rate
577 */
578
579 idle = now - cl->last;
580 if ((unsigned long)idle > 128*1024*1024) {
581 avgidle = cl->maxidle;
582 } else {
583 idle -= L2T(cl, len);
584
585 /* true_avgidle := (1-W)*true_avgidle + W*idle,
586 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
587 * cl->avgidle == true_avgidle/W,
588 * hence:
589 */
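/* Example: with ewma_log == 5 (so W == 1/32) the scaled update below
 * becomes avgidle += idle - (avgidle >> 5), which tracks
 * true_avgidle := (31*true_avgidle + idle)/32.
 */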
590 avgidle += idle - (avgidle>>cl->ewma_log);
591 }
592
593 if (avgidle <= 0) {
594 /* Overlimit or at-limit */
595
596 if (avgidle < cl->minidle)
597 avgidle = cl->minidle;
598
599 cl->avgidle = avgidle;
600
601 /* Calculate the expected time when this class
602 * will be allowed to send.
603 * It will occur when:
604 * (1-W)*true_avgidle + W*delay = 0, i.e.
605 * idle = (1/W - 1)*(-true_avgidle)
606 * or
607 * idle = (1 - W)*(-cl->avgidle);
608 */
609 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
610
611 /*
612 * That is not all.
613 * To maintain the rate allocated to the class,
614 * we add to undertime the virtual clock time
615 * necessary to complete the transmitted packet.
616 * (len/phys_bandwidth has already passed
617 * by the moment of cbq_update.)
618 */
619
620 idle -= L2T(&q->link, len);
621 idle += L2T(cl, len);
622
623 cl->undertime = now + idle;
624 } else {
625 /* Underlimit */
626
627 cl->undertime = PSCHED_PASTPERFECT;
628 if (avgidle > cl->maxidle)
629 cl->avgidle = cl->maxidle;
630 else
631 cl->avgidle = avgidle;
632 }
633 if ((s64)(now - cl->last) > 0)
634 cl->last = now;
635 }
636
637 cbq_update_toplevel(q, this, q->tx_borrowed);
638 }
639
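/* Return cl itself if it may send now, the first underlimit ancestor it
 * may borrow from otherwise, or NULL if nothing can be sent (either the
 * borrow chain is exhausted, in which case cbq_overlimit() penalizes the
 * class, or the top-level cutoff forbids borrowing).
 */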
640 static inline struct cbq_class *
641 cbq_under_limit(struct cbq_class *cl)
642 {
643 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
644 struct cbq_class *this_cl = cl;
645
646 if (cl->tparent == NULL)
647 return cl;
648
649 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
650 cl->delayed = 0;
651 return cl;
652 }
653
654 do {
655 /* This is a very suspicious place. Currently the overlimit
656 * action is generated for non-bounded classes
657 * only if the link is completely congested.
658 * Though it agrees with the ancestor-only paradigm,
659 * it looks very stupid. In particular,
660 * it means that this chunk of code will either
661 * never be called or result in strong amplification
662 * of burstiness. Dangerous, silly, and yet
663 * no other solution exists.
664 */
665 cl = cl->borrow;
666 if (!cl) {
667 this_cl->qstats.overlimits++;
668 cbq_overlimit(this_cl);
669 return NULL;
670 }
671 if (cl->level > q->toplevel)
672 return NULL;
673 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
674
675 cl->delayed = 0;
676 return cl;
677 }
678
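/* One round of deficit round-robin over the active classes of one
 * priority band: a class may send while its deficit is positive; an
 * exhausted class has its quantum added to the deficit and the scan
 * moves on to the next backlogged class.
 */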
679 static inline struct sk_buff *
680 cbq_dequeue_prio(struct Qdisc *sch, int prio)
681 {
682 struct cbq_sched_data *q = qdisc_priv(sch);
683 struct cbq_class *cl_tail, *cl_prev, *cl;
684 struct sk_buff *skb;
685 int deficit;
686
687 cl_tail = cl_prev = q->active[prio];
688 cl = cl_prev->next_alive;
689
690 do {
691 deficit = 0;
692
693 /* Start round */
694 do {
695 struct cbq_class *borrow = cl;
696
697 if (cl->q->q.qlen &&
698 (borrow = cbq_under_limit(cl)) == NULL)
699 goto skip_class;
700
701 if (cl->deficit <= 0) {
702 /* Class exhausted its allotment per
703 * this round. Switch to the next one.
704 */
705 deficit = 1;
706 cl->deficit += cl->quantum;
707 goto next_class;
708 }
709
710 skb = cl->q->dequeue(cl->q);
711
712 /* The class did not give us any skb :-(
713 * This can occur even if cl->q->q.qlen != 0,
714 * e.g. if cl->q == "tbf"
715 */
716 if (skb == NULL)
717 goto skip_class;
718
719 cl->deficit -= qdisc_pkt_len(skb);
720 q->tx_class = cl;
721 q->tx_borrowed = borrow;
722 if (borrow != cl) {
723 #ifndef CBQ_XSTATS_BORROWS_BYTES
724 borrow->xstats.borrows++;
725 cl->xstats.borrows++;
726 #else
727 borrow->xstats.borrows += qdisc_pkt_len(skb);
728 cl->xstats.borrows += qdisc_pkt_len(skb);
729 #endif
730 }
731 q->tx_len = qdisc_pkt_len(skb);
732
733 if (cl->deficit <= 0) {
734 q->active[prio] = cl;
735 cl = cl->next_alive;
736 cl->deficit += cl->quantum;
737 }
738 return skb;
739
740 skip_class:
741 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
742 /* Class is empty or penalized.
743 * Unlink it from active chain.
744 */
745 cl_prev->next_alive = cl->next_alive;
746 cl->next_alive = NULL;
747
748 /* Did cl_tail point to it? */
749 if (cl == cl_tail) {
750 /* Repair it! */
751 cl_tail = cl_prev;
752
753 /* Was it the last class in this band? */
754 if (cl == cl_tail) {
755 /* Kill the band! */
756 q->active[prio] = NULL;
757 q->activemask &= ~(1<<prio);
758 if (cl->q->q.qlen)
759 cbq_activate_class(cl);
760 return NULL;
761 }
762
763 q->active[prio] = cl_tail;
764 }
765 if (cl->q->q.qlen)
766 cbq_activate_class(cl);
767
768 cl = cl_prev;
769 }
770
771 next_class:
772 cl_prev = cl;
773 cl = cl->next_alive;
774 } while (cl_prev != cl_tail);
775 } while (deficit);
776
777 q->active[prio] = cl_prev;
778
779 return NULL;
780 }
781
782 static inline struct sk_buff *
783 cbq_dequeue_1(struct Qdisc *sch)
784 {
785 struct cbq_sched_data *q = qdisc_priv(sch);
786 struct sk_buff *skb;
787 unsigned int activemask;
788
789 activemask = q->activemask & 0xFF;
790 while (activemask) {
791 int prio = ffz(~activemask);
792 activemask &= ~(1<<prio);
793 skb = cbq_dequeue_prio(sch, prio);
794 if (skb)
795 return skb;
796 }
797 return NULL;
798 }
799
800 static struct sk_buff *
801 cbq_dequeue(struct Qdisc *sch)
802 {
803 struct sk_buff *skb;
804 struct cbq_sched_data *q = qdisc_priv(sch);
805 psched_time_t now;
806
807 now = psched_get_time();
808
809 if (q->tx_class)
810 cbq_update(q);
811
812 q->now = now;
813
814 for (;;) {
815 q->wd_expires = 0;
816
817 skb = cbq_dequeue_1(sch);
818 if (skb) {
819 qdisc_bstats_update(sch, skb);
820 sch->q.qlen--;
821 return skb;
822 }
823
824 /* All the classes are overlimit.
825 *
826 * This is possible if:
827 *
828 * 1. Scheduler is empty.
829 * 2. Toplevel cutoff inhibited borrowing.
830 * 3. Root class is overlimit.
831 *
832 * Reset the 2nd and 3rd conditions and retry.
833 *
834 * Note that NS and cbq-2.0 are buggy: peeking
835 * an arbitrary class is appropriate for ancestor-only
836 * sharing, but not for the toplevel algorithm.
837 *
838 * Our version is better but slower, because it requires
839 * two passes; this is unavoidable with top-level sharing.
840 */
841
842 if (q->toplevel == TC_CBQ_MAXLEVEL &&
843 q->link.undertime == PSCHED_PASTPERFECT)
844 break;
845
846 q->toplevel = TC_CBQ_MAXLEVEL;
847 q->link.undertime = PSCHED_PASTPERFECT;
848 }
849
850 /* No packets in the scheduler, or nobody wants to give them to us :-(
851 * Sigh... start the watchdog timer in the latter case.
852 */
853
854 if (sch->q.qlen) {
855 qdisc_qstats_overlimit(sch);
856 if (q->wd_expires)
857 qdisc_watchdog_schedule(&q->watchdog,
858 now + q->wd_expires);
859 }
860 return NULL;
861 }
862
863 /* CBQ class maintenance routines */
864
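/* Propagate level changes up the tree: every node on the path from
 * 'this' to the root gets level = (maximum level among its children) + 1.
 */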
865 static void cbq_adjust_levels(struct cbq_class *this)
866 {
867 if (this == NULL)
868 return;
869
870 do {
871 int level = 0;
872 struct cbq_class *cl;
873
874 cl = this->children;
875 if (cl) {
876 do {
877 if (cl->level > level)
878 level = cl->level;
879 } while ((cl = cl->sibling) != this->children);
880 }
881 this->level = level + 1;
882 } while ((this = this->tparent) != NULL);
883 }
884
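/* Recompute the WRR quanta of every class in one priority band:
 * quantum = allot * weight / (average weight of the band), where the
 * average weight is q->quanta[prio] / q->nclasses[prio].
 */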
885 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
886 {
887 struct cbq_class *cl;
888 unsigned int h;
889
890 if (q->quanta[prio] == 0)
891 return;
892
893 for (h = 0; h < q->clhash.hashsize; h++) {
894 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
895 /* BUGGGG... Beware! This expression suffers from
896 * arithmetic overflows!
897 */
898 if (cl->priority == prio) {
899 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
900 q->quanta[prio];
901 }
902 if (cl->quantum <= 0 ||
903 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
904 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
905 cl->common.classid, cl->quantum);
906 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
907 }
908 }
909 }
910 }
911
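/* Rebuild split->defaults[]: drop stale entries, then for each TC_PRIO_*
 * value pick, among the classes attached to this split node whose defmap
 * covers it, the one with the lowest level.
 */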
912 static void cbq_sync_defmap(struct cbq_class *cl)
913 {
914 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
915 struct cbq_class *split = cl->split;
916 unsigned int h;
917 int i;
918
919 if (split == NULL)
920 return;
921
922 for (i = 0; i <= TC_PRIO_MAX; i++) {
923 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
924 split->defaults[i] = NULL;
925 }
926
927 for (i = 0; i <= TC_PRIO_MAX; i++) {
928 int level = split->level;
929
930 if (split->defaults[i])
931 continue;
932
933 for (h = 0; h < q->clhash.hashsize; h++) {
934 struct cbq_class *c;
935
936 hlist_for_each_entry(c, &q->clhash.hash[h],
937 common.hnode) {
938 if (c->split == split && c->level < level &&
939 c->defmap & (1<<i)) {
940 split->defaults[i] = c;
941 level = c->level;
942 }
943 }
944 }
945 }
946 }
947
948 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
949 {
950 struct cbq_class *split = NULL;
951
952 if (splitid == 0) {
953 split = cl->split;
954 if (!split)
955 return;
956 splitid = split->common.classid;
957 }
958
959 if (split == NULL || split->common.classid != splitid) {
960 for (split = cl->tparent; split; split = split->tparent)
961 if (split->common.classid == splitid)
962 break;
963 }
964
965 if (split == NULL)
966 return;
967
968 if (cl->split != split) {
969 cl->defmap = 0;
970 cbq_sync_defmap(cl);
971 cl->split = split;
972 cl->defmap = def & mask;
973 } else
974 cl->defmap = (cl->defmap & ~mask) | (def & mask);
975
976 cbq_sync_defmap(cl);
977 }
978
979 static void cbq_unlink_class(struct cbq_class *this)
980 {
981 struct cbq_class *cl, **clp;
982 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
983
984 qdisc_class_hash_remove(&q->clhash, &this->common);
985
986 if (this->tparent) {
987 clp = &this->sibling;
988 cl = *clp;
989 do {
990 if (cl == this) {
991 *clp = cl->sibling;
992 break;
993 }
994 clp = &cl->sibling;
995 } while ((cl = *clp) != this->sibling);
996
997 if (this->tparent->children == this) {
998 this->tparent->children = this->sibling;
999 if (this->sibling == this)
1000 this->tparent->children = NULL;
1001 }
1002 } else {
1003 WARN_ON(this->sibling != this);
1004 }
1005 }
1006
1007 static void cbq_link_class(struct cbq_class *this)
1008 {
1009 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1010 struct cbq_class *parent = this->tparent;
1011
1012 this->sibling = this;
1013 qdisc_class_hash_insert(&q->clhash, &this->common);
1014
1015 if (parent == NULL)
1016 return;
1017
1018 if (parent->children == NULL) {
1019 parent->children = this;
1020 } else {
1021 this->sibling = parent->children->sibling;
1022 parent->children->sibling = this;
1023 }
1024 }
1025
1026 static void
1027 cbq_reset(struct Qdisc *sch)
1028 {
1029 struct cbq_sched_data *q = qdisc_priv(sch);
1030 struct cbq_class *cl;
1031 int prio;
1032 unsigned int h;
1033
1034 q->activemask = 0;
1035 q->pmask = 0;
1036 q->tx_class = NULL;
1037 q->tx_borrowed = NULL;
1038 qdisc_watchdog_cancel(&q->watchdog);
1039 hrtimer_cancel(&q->delay_timer);
1040 q->toplevel = TC_CBQ_MAXLEVEL;
1041 q->now = psched_get_time();
1042
1043 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1044 q->active[prio] = NULL;
1045
1046 for (h = 0; h < q->clhash.hashsize; h++) {
1047 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1048 qdisc_reset(cl->q);
1049
1050 cl->next_alive = NULL;
1051 cl->undertime = PSCHED_PASTPERFECT;
1052 cl->avgidle = cl->maxidle;
1053 cl->deficit = cl->quantum;
1054 cl->cpriority = cl->priority;
1055 }
1056 }
1057 sch->q.qlen = 0;
1058 }
1059
1060
1061 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1062 {
1063 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1064 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1065 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1066 }
1067 if (lss->change & TCF_CBQ_LSS_EWMA)
1068 cl->ewma_log = lss->ewma_log;
1069 if (lss->change & TCF_CBQ_LSS_AVPKT)
1070 cl->avpkt = lss->avpkt;
1071 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1072 cl->minidle = -(long)lss->minidle;
1073 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1074 cl->maxidle = lss->maxidle;
1075 cl->avgidle = lss->maxidle;
1076 }
1077 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1078 cl->offtime = lss->offtime;
1079 return 0;
1080 }
1081
1082 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1083 {
1084 q->nclasses[cl->priority]--;
1085 q->quanta[cl->priority] -= cl->weight;
1086 cbq_normalize_quanta(q, cl->priority);
1087 }
1088
1089 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1090 {
1091 q->nclasses[cl->priority]++;
1092 q->quanta[cl->priority] += cl->weight;
1093 cbq_normalize_quanta(q, cl->priority);
1094 }
1095
1096 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1097 {
1098 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1099
1100 if (wrr->allot)
1101 cl->allot = wrr->allot;
1102 if (wrr->weight)
1103 cl->weight = wrr->weight;
1104 if (wrr->priority) {
1105 cl->priority = wrr->priority - 1;
1106 cl->cpriority = cl->priority;
1107 if (cl->priority >= cl->priority2)
1108 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1109 }
1110
1111 cbq_addprio(q, cl);
1112 return 0;
1113 }
1114
1115 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1116 {
1117 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1118 return 0;
1119 }
1120
1121 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1122 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1123 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1124 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1125 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1126 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1127 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1128 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1129 };
1130
1131 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1132 {
1133 struct cbq_sched_data *q = qdisc_priv(sch);
1134 struct nlattr *tb[TCA_CBQ_MAX + 1];
1135 struct tc_ratespec *r;
1136 int err;
1137
1138 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1139 if (err < 0)
1140 return err;
1141
1142 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1143 return -EINVAL;
1144
1145 r = nla_data(tb[TCA_CBQ_RATE]);
1146
1147 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1148 return -EINVAL;
1149
1150 err = qdisc_class_hash_init(&q->clhash);
1151 if (err < 0)
1152 goto put_rtab;
1153
1154 q->link.refcnt = 1;
1155 q->link.sibling = &q->link;
1156 q->link.common.classid = sch->handle;
1157 q->link.qdisc = sch;
1158 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1159 sch->handle);
1160 if (!q->link.q)
1161 q->link.q = &noop_qdisc;
1162
1163 q->link.priority = TC_CBQ_MAXPRIO - 1;
1164 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1165 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1166 q->link.allot = psched_mtu(qdisc_dev(sch));
1167 q->link.quantum = q->link.allot;
1168 q->link.weight = q->link.R_tab->rate.rate;
1169
1170 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1171 q->link.avpkt = q->link.allot/2;
1172 q->link.minidle = -0x7FFFFFFF;
1173
1174 qdisc_watchdog_init(&q->watchdog, sch);
1175 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1176 q->delay_timer.function = cbq_undelay;
1177 q->toplevel = TC_CBQ_MAXLEVEL;
1178 q->now = psched_get_time();
1179
1180 cbq_link_class(&q->link);
1181
1182 if (tb[TCA_CBQ_LSSOPT])
1183 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1184
1185 cbq_addprio(q, &q->link);
1186 return 0;
1187
1188 put_rtab:
1189 qdisc_put_rtab(q->link.R_tab);
1190 return err;
1191 }
1192
1193 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1194 {
1195 unsigned char *b = skb_tail_pointer(skb);
1196
1197 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1198 goto nla_put_failure;
1199 return skb->len;
1200
1201 nla_put_failure:
1202 nlmsg_trim(skb, b);
1203 return -1;
1204 }
1205
1206 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1207 {
1208 unsigned char *b = skb_tail_pointer(skb);
1209 struct tc_cbq_lssopt opt;
1210
1211 opt.flags = 0;
1212 if (cl->borrow == NULL)
1213 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1214 if (cl->share == NULL)
1215 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1216 opt.ewma_log = cl->ewma_log;
1217 opt.level = cl->level;
1218 opt.avpkt = cl->avpkt;
1219 opt.maxidle = cl->maxidle;
1220 opt.minidle = (u32)(-cl->minidle);
1221 opt.offtime = cl->offtime;
1222 opt.change = ~0;
1223 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1224 goto nla_put_failure;
1225 return skb->len;
1226
1227 nla_put_failure:
1228 nlmsg_trim(skb, b);
1229 return -1;
1230 }
1231
1232 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1233 {
1234 unsigned char *b = skb_tail_pointer(skb);
1235 struct tc_cbq_wrropt opt;
1236
1237 memset(&opt, 0, sizeof(opt));
1238 opt.flags = 0;
1239 opt.allot = cl->allot;
1240 opt.priority = cl->priority + 1;
1241 opt.cpriority = cl->cpriority + 1;
1242 opt.weight = cl->weight;
1243 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1244 goto nla_put_failure;
1245 return skb->len;
1246
1247 nla_put_failure:
1248 nlmsg_trim(skb, b);
1249 return -1;
1250 }
1251
1252 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1253 {
1254 unsigned char *b = skb_tail_pointer(skb);
1255 struct tc_cbq_fopt opt;
1256
1257 if (cl->split || cl->defmap) {
1258 opt.split = cl->split ? cl->split->common.classid : 0;
1259 opt.defmap = cl->defmap;
1260 opt.defchange = ~0;
1261 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1262 goto nla_put_failure;
1263 }
1264 return skb->len;
1265
1266 nla_put_failure:
1267 nlmsg_trim(skb, b);
1268 return -1;
1269 }
1270
1271 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1272 {
1273 if (cbq_dump_lss(skb, cl) < 0 ||
1274 cbq_dump_rate(skb, cl) < 0 ||
1275 cbq_dump_wrr(skb, cl) < 0 ||
1276 cbq_dump_fopt(skb, cl) < 0)
1277 return -1;
1278 return 0;
1279 }
1280
1281 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1282 {
1283 struct cbq_sched_data *q = qdisc_priv(sch);
1284 struct nlattr *nest;
1285
1286 nest = nla_nest_start(skb, TCA_OPTIONS);
1287 if (nest == NULL)
1288 goto nla_put_failure;
1289 if (cbq_dump_attr(skb, &q->link) < 0)
1290 goto nla_put_failure;
1291 return nla_nest_end(skb, nest);
1292
1293 nla_put_failure:
1294 nla_nest_cancel(skb, nest);
1295 return -1;
1296 }
1297
1298 static int
1299 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1300 {
1301 struct cbq_sched_data *q = qdisc_priv(sch);
1302
1303 q->link.xstats.avgidle = q->link.avgidle;
1304 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1305 }
1306
1307 static int
1308 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1309 struct sk_buff *skb, struct tcmsg *tcm)
1310 {
1311 struct cbq_class *cl = (struct cbq_class *)arg;
1312 struct nlattr *nest;
1313
1314 if (cl->tparent)
1315 tcm->tcm_parent = cl->tparent->common.classid;
1316 else
1317 tcm->tcm_parent = TC_H_ROOT;
1318 tcm->tcm_handle = cl->common.classid;
1319 tcm->tcm_info = cl->q->handle;
1320
1321 nest = nla_nest_start(skb, TCA_OPTIONS);
1322 if (nest == NULL)
1323 goto nla_put_failure;
1324 if (cbq_dump_attr(skb, cl) < 0)
1325 goto nla_put_failure;
1326 return nla_nest_end(skb, nest);
1327
1328 nla_put_failure:
1329 nla_nest_cancel(skb, nest);
1330 return -1;
1331 }
1332
1333 static int
1334 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1335 struct gnet_dump *d)
1336 {
1337 struct cbq_sched_data *q = qdisc_priv(sch);
1338 struct cbq_class *cl = (struct cbq_class *)arg;
1339
1340 cl->xstats.avgidle = cl->avgidle;
1341 cl->xstats.undertime = 0;
1342
1343 if (cl->undertime != PSCHED_PASTPERFECT)
1344 cl->xstats.undertime = cl->undertime - q->now;
1345
1346 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1347 d, NULL, &cl->bstats) < 0 ||
1348 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1349 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1350 return -1;
1351
1352 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1353 }
1354
1355 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1356 struct Qdisc **old)
1357 {
1358 struct cbq_class *cl = (struct cbq_class *)arg;
1359
1360 if (new == NULL) {
1361 new = qdisc_create_dflt(sch->dev_queue,
1362 &pfifo_qdisc_ops, cl->common.classid);
1363 if (new == NULL)
1364 return -ENOBUFS;
1365 }
1366
1367 *old = qdisc_replace(sch, new, &cl->q);
1368 return 0;
1369 }
1370
1371 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1372 {
1373 struct cbq_class *cl = (struct cbq_class *)arg;
1374
1375 return cl->q;
1376 }
1377
1378 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1379 {
1380 struct cbq_class *cl = (struct cbq_class *)arg;
1381
1382 if (cl->q->q.qlen == 0)
1383 cbq_deactivate_class(cl);
1384 }
1385
1386 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1387 {
1388 struct cbq_sched_data *q = qdisc_priv(sch);
1389 struct cbq_class *cl = cbq_class_lookup(q, classid);
1390
1391 if (cl) {
1392 cl->refcnt++;
1393 return (unsigned long)cl;
1394 }
1395 return 0;
1396 }
1397
1398 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1399 {
1400 struct cbq_sched_data *q = qdisc_priv(sch);
1401
1402 WARN_ON(cl->filters);
1403
1404 tcf_destroy_chain(&cl->filter_list);
1405 qdisc_destroy(cl->q);
1406 qdisc_put_rtab(cl->R_tab);
1407 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1408 if (cl != &q->link)
1409 kfree(cl);
1410 }
1411
1412 static void cbq_destroy(struct Qdisc *sch)
1413 {
1414 struct cbq_sched_data *q = qdisc_priv(sch);
1415 struct hlist_node *next;
1416 struct cbq_class *cl;
1417 unsigned int h;
1418
1419 #ifdef CONFIG_NET_CLS_ACT
1420 q->rx_class = NULL;
1421 #endif
1422 /*
1423 * Filters must be destroyed first because we don't destroy the
1424 * classes from root to leaves, which means that filters can still
1425 * be bound to classes which have been destroyed already. --TGR '04
1426 */
1427 for (h = 0; h < q->clhash.hashsize; h++) {
1428 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1429 tcf_destroy_chain(&cl->filter_list);
1430 }
1431 for (h = 0; h < q->clhash.hashsize; h++) {
1432 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1433 common.hnode)
1434 cbq_destroy_class(sch, cl);
1435 }
1436 qdisc_class_hash_destroy(&q->clhash);
1437 }
1438
1439 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1440 {
1441 struct cbq_class *cl = (struct cbq_class *)arg;
1442
1443 if (--cl->refcnt == 0) {
1444 #ifdef CONFIG_NET_CLS_ACT
1445 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1446 struct cbq_sched_data *q = qdisc_priv(sch);
1447
1448 spin_lock_bh(root_lock);
1449 if (q->rx_class == cl)
1450 q->rx_class = NULL;
1451 spin_unlock_bh(root_lock);
1452 #endif
1453
1454 cbq_destroy_class(sch, cl);
1455 }
1456 }
1457
1458 static int
1459 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1460 unsigned long *arg)
1461 {
1462 int err;
1463 struct cbq_sched_data *q = qdisc_priv(sch);
1464 struct cbq_class *cl = (struct cbq_class *)*arg;
1465 struct nlattr *opt = tca[TCA_OPTIONS];
1466 struct nlattr *tb[TCA_CBQ_MAX + 1];
1467 struct cbq_class *parent;
1468 struct qdisc_rate_table *rtab = NULL;
1469
1470 if (opt == NULL)
1471 return -EINVAL;
1472
1473 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1474 if (err < 0)
1475 return err;
1476
1477 if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1478 return -EOPNOTSUPP;
1479
1480 if (cl) {
1481 /* Check parent */
1482 if (parentid) {
1483 if (cl->tparent &&
1484 cl->tparent->common.classid != parentid)
1485 return -EINVAL;
1486 if (!cl->tparent && parentid != TC_H_ROOT)
1487 return -EINVAL;
1488 }
1489
1490 if (tb[TCA_CBQ_RATE]) {
1491 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1492 tb[TCA_CBQ_RTAB]);
1493 if (rtab == NULL)
1494 return -EINVAL;
1495 }
1496
1497 if (tca[TCA_RATE]) {
1498 err = gen_replace_estimator(&cl->bstats, NULL,
1499 &cl->rate_est,
1500 NULL,
1501 qdisc_root_sleeping_running(sch),
1502 tca[TCA_RATE]);
1503 if (err) {
1504 qdisc_put_rtab(rtab);
1505 return err;
1506 }
1507 }
1508
1509 /* Change class parameters */
1510 sch_tree_lock(sch);
1511
1512 if (cl->next_alive != NULL)
1513 cbq_deactivate_class(cl);
1514
1515 if (rtab) {
1516 qdisc_put_rtab(cl->R_tab);
1517 cl->R_tab = rtab;
1518 }
1519
1520 if (tb[TCA_CBQ_LSSOPT])
1521 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1522
1523 if (tb[TCA_CBQ_WRROPT]) {
1524 cbq_rmprio(q, cl);
1525 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1526 }
1527
1528 if (tb[TCA_CBQ_FOPT])
1529 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1530
1531 if (cl->q->q.qlen)
1532 cbq_activate_class(cl);
1533
1534 sch_tree_unlock(sch);
1535
1536 return 0;
1537 }
1538
1539 if (parentid == TC_H_ROOT)
1540 return -EINVAL;
1541
1542 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1543 tb[TCA_CBQ_LSSOPT] == NULL)
1544 return -EINVAL;
1545
1546 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1547 if (rtab == NULL)
1548 return -EINVAL;
1549
1550 if (classid) {
1551 err = -EINVAL;
1552 if (TC_H_MAJ(classid ^ sch->handle) ||
1553 cbq_class_lookup(q, classid))
1554 goto failure;
1555 } else {
1556 int i;
1557 classid = TC_H_MAKE(sch->handle, 0x8000);
1558
1559 for (i = 0; i < 0x8000; i++) {
1560 if (++q->hgenerator >= 0x8000)
1561 q->hgenerator = 1;
1562 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1563 break;
1564 }
1565 err = -ENOSR;
1566 if (i >= 0x8000)
1567 goto failure;
1568 classid = classid|q->hgenerator;
1569 }
1570
1571 parent = &q->link;
1572 if (parentid) {
1573 parent = cbq_class_lookup(q, parentid);
1574 err = -EINVAL;
1575 if (parent == NULL)
1576 goto failure;
1577 }
1578
1579 err = -ENOBUFS;
1580 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1581 if (cl == NULL)
1582 goto failure;
1583
1584 if (tca[TCA_RATE]) {
1585 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1586 NULL,
1587 qdisc_root_sleeping_running(sch),
1588 tca[TCA_RATE]);
1589 if (err) {
1590 kfree(cl);
1591 goto failure;
1592 }
1593 }
1594
1595 cl->R_tab = rtab;
1596 rtab = NULL;
1597 cl->refcnt = 1;
1598 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1599 if (!cl->q)
1600 cl->q = &noop_qdisc;
1601 cl->common.classid = classid;
1602 cl->tparent = parent;
1603 cl->qdisc = sch;
1604 cl->allot = parent->allot;
1605 cl->quantum = cl->allot;
1606 cl->weight = cl->R_tab->rate.rate;
1607
1608 sch_tree_lock(sch);
1609 cbq_link_class(cl);
1610 cl->borrow = cl->tparent;
1611 if (cl->tparent != &q->link)
1612 cl->share = cl->tparent;
1613 cbq_adjust_levels(parent);
1614 cl->minidle = -0x7FFFFFFF;
1615 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1616 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1617 if (cl->ewma_log == 0)
1618 cl->ewma_log = q->link.ewma_log;
1619 if (cl->maxidle == 0)
1620 cl->maxidle = q->link.maxidle;
1621 if (cl->avpkt == 0)
1622 cl->avpkt = q->link.avpkt;
1623 if (tb[TCA_CBQ_FOPT])
1624 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1625 sch_tree_unlock(sch);
1626
1627 qdisc_class_hash_grow(sch, &q->clhash);
1628
1629 *arg = (unsigned long)cl;
1630 return 0;
1631
1632 failure:
1633 qdisc_put_rtab(rtab);
1634 return err;
1635 }
1636
1637 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1638 {
1639 struct cbq_sched_data *q = qdisc_priv(sch);
1640 struct cbq_class *cl = (struct cbq_class *)arg;
1641 unsigned int qlen, backlog;
1642
1643 if (cl->filters || cl->children || cl == &q->link)
1644 return -EBUSY;
1645
1646 sch_tree_lock(sch);
1647
1648 qlen = cl->q->q.qlen;
1649 backlog = cl->q->qstats.backlog;
1650 qdisc_reset(cl->q);
1651 qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1652
1653 if (cl->next_alive)
1654 cbq_deactivate_class(cl);
1655
1656 if (q->tx_borrowed == cl)
1657 q->tx_borrowed = q->tx_class;
1658 if (q->tx_class == cl) {
1659 q->tx_class = NULL;
1660 q->tx_borrowed = NULL;
1661 }
1662 #ifdef CONFIG_NET_CLS_ACT
1663 if (q->rx_class == cl)
1664 q->rx_class = NULL;
1665 #endif
1666
1667 cbq_unlink_class(cl);
1668 cbq_adjust_levels(cl->tparent);
1669 cl->defmap = 0;
1670 cbq_sync_defmap(cl);
1671
1672 cbq_rmprio(q, cl);
1673 sch_tree_unlock(sch);
1674
1675 BUG_ON(--cl->refcnt == 0);
1676 /*
1677 * This shouldn't happen: we "hold" one cops->get() when called
1678 * from tc_ctl_tclass; the destroy method is done from cops->put().
1679 */
1680
1681 return 0;
1682 }
1683
1684 static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1685 unsigned long arg)
1686 {
1687 struct cbq_sched_data *q = qdisc_priv(sch);
1688 struct cbq_class *cl = (struct cbq_class *)arg;
1689
1690 if (cl == NULL)
1691 cl = &q->link;
1692
1693 return &cl->filter_list;
1694 }
1695
1696 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1697 u32 classid)
1698 {
1699 struct cbq_sched_data *q = qdisc_priv(sch);
1700 struct cbq_class *p = (struct cbq_class *)parent;
1701 struct cbq_class *cl = cbq_class_lookup(q, classid);
1702
1703 if (cl) {
1704 if (p && p->level <= cl->level)
1705 return 0;
1706 cl->filters++;
1707 return (unsigned long)cl;
1708 }
1709 return 0;
1710 }
1711
1712 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1713 {
1714 struct cbq_class *cl = (struct cbq_class *)arg;
1715
1716 cl->filters--;
1717 }
1718
1719 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1720 {
1721 struct cbq_sched_data *q = qdisc_priv(sch);
1722 struct cbq_class *cl;
1723 unsigned int h;
1724
1725 if (arg->stop)
1726 return;
1727
1728 for (h = 0; h < q->clhash.hashsize; h++) {
1729 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1730 if (arg->count < arg->skip) {
1731 arg->count++;
1732 continue;
1733 }
1734 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1735 arg->stop = 1;
1736 return;
1737 }
1738 arg->count++;
1739 }
1740 }
1741 }
1742
1743 static const struct Qdisc_class_ops cbq_class_ops = {
1744 .graft = cbq_graft,
1745 .leaf = cbq_leaf,
1746 .qlen_notify = cbq_qlen_notify,
1747 .get = cbq_get,
1748 .put = cbq_put,
1749 .change = cbq_change_class,
1750 .delete = cbq_delete,
1751 .walk = cbq_walk,
1752 .tcf_chain = cbq_find_tcf,
1753 .bind_tcf = cbq_bind_filter,
1754 .unbind_tcf = cbq_unbind_filter,
1755 .dump = cbq_dump_class,
1756 .dump_stats = cbq_dump_class_stats,
1757 };
1758
1759 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1760 .next = NULL,
1761 .cl_ops = &cbq_class_ops,
1762 .id = "cbq",
1763 .priv_size = sizeof(struct cbq_sched_data),
1764 .enqueue = cbq_enqueue,
1765 .dequeue = cbq_dequeue,
1766 .peek = qdisc_peek_dequeued,
1767 .init = cbq_init,
1768 .reset = cbq_reset,
1769 .destroy = cbq_destroy,
1770 .change = NULL,
1771 .dump = cbq_dump,
1772 .dump_stats = cbq_dump_stats,
1773 .owner = THIS_MODULE,
1774 };
1775
1776 static int __init cbq_module_init(void)
1777 {
1778 return register_qdisc(&cbq_qdisc_ops);
1779 }
1780 static void __exit cbq_module_exit(void)
1781 {
1782 unregister_qdisc(&cbq_qdisc_ops);
1783 }
1784 module_init(cbq_module_init)
1785 module_exit(cbq_module_exit)
1786 MODULE_LICENSE("GPL");