1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22
23
24 /* Class-Based Queueing (CBQ) algorithm.
25 =======================================
26
27 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28 Management Models for Packet Networks",
29 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30
31 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32
33 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34 Parameters", 1996
35
36 [4] Sally Floyd and Michael Speer, "Experimental Results
37 for Class-Based Queueing", 1998, not published.
38
39 -----------------------------------------------------------------------
40
41 Algorithm skeleton was taken from NS simulator cbq.cc.
42 If someone wants to check this code against the LBL version,
43 he should take into account that ONLY the skeleton was borrowed;
44 the implementation is different. In particular:
45
46 --- The WRR algorithm is different. Our version looks more
47 reasonable (I hope) and works when quanta are allowed to be
48 less than MTU, which is always the case when real time classes
49 have small rates. Note that the statement in [3] is
50 incomplete; delay may actually be estimated even if the class
51 per-round allotment is less than MTU. Namely, if the per-round
52 allotment is W*r_i, and r_1+...+r_k = r < 1
53
54 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55
56 In the worst case we have IntServ estimate with D = W*r+k*MTU
57 and C = MTU*r. The proof (if correct at all) is trivial.
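(One reading of the symbols above, as an editorial gloss rather than part
of the original notes: W*r_i is the per-round allotment of class i in
bytes, r = r_1+...+r_k is the total fraction of the link bandwidth B
allocated to the k classes, and [x] presumably denotes rounding up, so
the bound is expressed in units of time.)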
58
59
60 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61 interpret some places, which look like wrong translations
62 from NS. Anyone is advised to find these differences
63 and explain to me why I am wrong 8).
64
65 --- Linux has no EOI event, so we cannot estimate true class
66 idle time. The workaround is to consider the next dequeue event
67 as a sign that the previous packet has finished. This is wrong because of
68 internal device queueing, but on a permanently loaded link it is true.
69 Moreover, combined with clock integrator, this scheme looks
70 very close to an ideal solution. */
71
72 struct cbq_sched_data;
73
74
75 struct cbq_class {
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
78
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83
84 u32 defmap;
85
86 /* Link-sharing scheduler parameters */
87 long maxidle; /* Class parameters: see below. */
88 long offtime;
89 long minidle;
90 u32 avpkt;
91 struct qdisc_rate_table *R_tab;
92
93 /* General scheduler (WRR) parameters */
94 long allot;
95 long quantum; /* Allotment per WRR round */
96 long weight; /* Relative allotment: see below */
97
98 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
99 struct cbq_class *split; /* Ptr to split node */
100 struct cbq_class *share; /* Ptr to LS parent in the class tree */
101 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
102 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
103 parent otherwise */
104 struct cbq_class *sibling; /* Sibling chain */
105 struct cbq_class *children; /* Pointer to children chain */
106
107 struct Qdisc *q; /* Elementary queueing discipline */
108
109
110 /* Variables */
111 unsigned char cpriority; /* Effective priority */
112 unsigned char delayed;
113 unsigned char level; /* level of the class in hierarchy:
114 0 for leaf classes, and maximal
115 level of children + 1 for nodes.
116 */
117
118 psched_time_t last; /* Last end of service */
119 psched_time_t undertime;
120 long avgidle;
121 long deficit; /* Saved deficit for WRR */
122 psched_time_t penalized;
123 struct gnet_stats_basic_packed bstats;
124 struct gnet_stats_queue qstats;
125 struct gnet_stats_rate_est64 rate_est;
126 struct tc_cbq_xstats xstats;
127
128 struct tcf_proto __rcu *filter_list;
129
130 int refcnt;
131 int filters;
132
133 struct cbq_class *defaults[TC_PRIO_MAX + 1];
134 };
135
136 struct cbq_sched_data {
137 struct Qdisc_class_hash clhash; /* Hash table of all classes */
138 int nclasses[TC_CBQ_MAXPRIO + 1];
139 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
140
141 struct cbq_class link;
142
143 unsigned int activemask;
144 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
145 with backlog */
146
147 #ifdef CONFIG_NET_CLS_ACT
148 struct cbq_class *rx_class;
149 #endif
150 struct cbq_class *tx_class;
151 struct cbq_class *tx_borrowed;
152 int tx_len;
153 psched_time_t now; /* Cached timestamp */
154 unsigned int pmask;
155
156 struct hrtimer delay_timer;
157 struct qdisc_watchdog watchdog; /* Watchdog timer,
158 started when CBQ has
159 backlog, but cannot
160 transmit just now */
161 psched_tdiff_t wd_expires;
162 int toplevel;
163 u32 hgenerator;
164 };
165
166
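/* L2T ("length to time"): transmission time of a len-byte packet at the
 * class rate, in psched ticks, looked up in the class rate table.
 */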
167 #define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
168
169 static inline struct cbq_class *
170 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
171 {
172 struct Qdisc_class_common *clc;
173
174 clc = qdisc_class_find(&q->clhash, classid);
175 if (clc == NULL)
176 return NULL;
177 return container_of(clc, struct cbq_class, common);
178 }
179
180 #ifdef CONFIG_NET_CLS_ACT
181
182 static struct cbq_class *
183 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
184 {
185 struct cbq_class *cl;
186
187 for (cl = this->tparent; cl; cl = cl->tparent) {
188 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
189
190 if (new != NULL && new != this)
191 return new;
192 }
193 return NULL;
194 }
195
196 #endif
197
198 /* Classify packet. The procedure is pretty complicated, but
199 * it allows us to combine link sharing and priority scheduling
200 * transparently.
201 *
202 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
203 * so that it resolves to split nodes. Then packets are classified
204 * by logical priority, or a more specific classifier may be attached
205 * to the split node.
206 */
207
208 static struct cbq_class *
209 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
210 {
211 struct cbq_sched_data *q = qdisc_priv(sch);
212 struct cbq_class *head = &q->link;
213 struct cbq_class **defmap;
214 struct cbq_class *cl = NULL;
215 u32 prio = skb->priority;
216 struct tcf_proto *fl;
217 struct tcf_result res;
218
219 /*
220 * Step 1. If skb->priority points to one of our classes, use it.
221 */
222 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
223 (cl = cbq_class_lookup(q, prio)) != NULL)
224 return cl;
225
226 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
227 for (;;) {
228 int result = 0;
229 defmap = head->defaults;
230
231 fl = rcu_dereference_bh(head->filter_list);
232 /*
233 * Step 2+n. Apply classifier.
234 */
235 result = tc_classify(skb, fl, &res, true);
236 if (!fl || result < 0)
237 goto fallback;
238
239 cl = (void *)res.class;
240 if (!cl) {
241 if (TC_H_MAJ(res.classid))
242 cl = cbq_class_lookup(q, res.classid);
243 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
244 cl = defmap[TC_PRIO_BESTEFFORT];
245
246 if (cl == NULL)
247 goto fallback;
248 }
249 if (cl->level >= head->level)
250 goto fallback;
251 #ifdef CONFIG_NET_CLS_ACT
252 switch (result) {
253 case TC_ACT_QUEUED:
254 case TC_ACT_STOLEN:
255 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
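/* fall through */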
256 case TC_ACT_SHOT:
257 return NULL;
258 case TC_ACT_RECLASSIFY:
259 return cbq_reclassify(skb, cl);
260 }
261 #endif
262 if (cl->level == 0)
263 return cl;
264
265 /*
266 * Step 3+n. If classifier selected a link sharing class,
267 * apply the agency-specific classifier.
268 * Repeat this procedure until we hit a leaf node.
269 */
270 head = cl;
271 }
272
273 fallback:
274 cl = head;
275
276 /*
277 * Step 4. No success...
278 */
279 if (TC_H_MAJ(prio) == 0 &&
280 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
281 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
282 return head;
283
284 return cl;
285 }
286
287 /*
288 * A packet has just been enqueued on an empty class.
289 * cbq_activate_class adds the class to the tail of the active class list
290 * of its priority band.
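 * q->active[prio] points at the tail of a circular, singly linked
 * next_alive list; the round-robin in cbq_dequeue_prio() starts from
 * tail->next_alive, i.e. the head.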
291 */
292
293 static inline void cbq_activate_class(struct cbq_class *cl)
294 {
295 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
296 int prio = cl->cpriority;
297 struct cbq_class *cl_tail;
298
299 cl_tail = q->active[prio];
300 q->active[prio] = cl;
301
302 if (cl_tail != NULL) {
303 cl->next_alive = cl_tail->next_alive;
304 cl_tail->next_alive = cl;
305 } else {
306 cl->next_alive = cl;
307 q->activemask |= (1<<prio);
308 }
309 }
310
311 /*
312 * Unlink class from active chain.
313 * Note that this same procedure is done directly in cbq_dequeue*
314 * during the round-robin procedure.
315 */
316
317 static void cbq_deactivate_class(struct cbq_class *this)
318 {
319 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
320 int prio = this->cpriority;
321 struct cbq_class *cl;
322 struct cbq_class *cl_prev = q->active[prio];
323
324 do {
325 cl = cl_prev->next_alive;
326 if (cl == this) {
327 cl_prev->next_alive = cl->next_alive;
328 cl->next_alive = NULL;
329
330 if (cl == q->active[prio]) {
331 q->active[prio] = cl_prev;
332 if (cl == q->active[prio]) {
333 q->active[prio] = NULL;
334 q->activemask &= ~(1<<prio);
335 return;
336 }
337 }
338 return;
339 }
340 } while ((cl_prev = cl) != q->active[prio]);
341 }
342
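/* Top-level sharing: on enqueue, lower q->toplevel to the level of the
 * first class on the borrow chain that is not currently regulated
 * (its undertime has already passed); cbq_under_limit() refuses to
 * borrow from ancestors above q->toplevel.
 */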
343 static void
344 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
345 {
346 int toplevel = q->toplevel;
347
348 if (toplevel > cl->level) {
349 psched_time_t now = psched_get_time();
350
351 do {
352 if (cl->undertime < now) {
353 q->toplevel = cl->level;
354 return;
355 }
356 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
357 }
358 }
359
360 static int
361 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
362 {
363 struct cbq_sched_data *q = qdisc_priv(sch);
364 int uninitialized_var(ret);
365 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
366
367 #ifdef CONFIG_NET_CLS_ACT
368 q->rx_class = cl;
369 #endif
370 if (cl == NULL) {
371 if (ret & __NET_XMIT_BYPASS)
372 qdisc_qstats_drop(sch);
373 kfree_skb(skb);
374 return ret;
375 }
376
377 ret = qdisc_enqueue(skb, cl->q);
378 if (ret == NET_XMIT_SUCCESS) {
379 sch->q.qlen++;
380 cbq_mark_toplevel(q, cl);
381 if (!cl->next_alive)
382 cbq_activate_class(cl);
383 return ret;
384 }
385
386 if (net_xmit_drop_count(ret)) {
387 qdisc_qstats_drop(sch);
388 cbq_mark_toplevel(q, cl);
389 cl->qstats.drops++;
390 }
391 return ret;
392 }
393
394 /* Overlimit action: penalize leaf class by adding offtime */
395 static void cbq_overlimit(struct cbq_class *cl)
396 {
397 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
398 psched_tdiff_t delay = cl->undertime - q->now;
399
400 if (!cl->delayed) {
401 delay += cl->offtime;
402
403 /*
404 * The class goes to sleep, so it will have no
405 * chance to work off avgidle. Let's forgive it 8)
406 *
407 * BTW cbq-2.0 has a bug in this
408 * place; apparently they forgot to shift it by cl->ewma_log.
409 */
410 if (cl->avgidle < 0)
411 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
412 if (cl->avgidle < cl->minidle)
413 cl->avgidle = cl->minidle;
414 if (delay <= 0)
415 delay = 1;
416 cl->undertime = q->now + delay;
417
418 cl->xstats.overactions++;
419 cl->delayed = 1;
420 }
421 if (q->wd_expires == 0 || q->wd_expires > delay)
422 q->wd_expires = delay;
423
424 /* Dirty work! We must schedule wakeups based on
425 * real available rate, rather than leaf rate,
426 * which may be tiny (even zero).
427 */
428 if (q->toplevel == TC_CBQ_MAXLEVEL) {
429 struct cbq_class *b;
430 psched_tdiff_t base_delay = q->wd_expires;
431
432 for (b = cl->borrow; b; b = b->borrow) {
433 delay = b->undertime - q->now;
434 if (delay < base_delay) {
435 if (delay <= 0)
436 delay = 1;
437 base_delay = delay;
438 }
439 }
440
441 q->wd_expires = base_delay;
442 }
443 }
444
445 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
446 psched_time_t now)
447 {
448 struct cbq_class *cl;
449 struct cbq_class *cl_prev = q->active[prio];
450 psched_time_t sched = now;
451
452 if (cl_prev == NULL)
453 return 0;
454
455 do {
456 cl = cl_prev->next_alive;
457 if (now - cl->penalized > 0) {
458 cl_prev->next_alive = cl->next_alive;
459 cl->next_alive = NULL;
460 cl->cpriority = cl->priority;
461 cl->delayed = 0;
462 cbq_activate_class(cl);
463
464 if (cl == q->active[prio]) {
465 q->active[prio] = cl_prev;
466 if (cl == q->active[prio]) {
467 q->active[prio] = NULL;
468 return 0;
469 }
470 }
471
472 cl = cl_prev->next_alive;
473 } else if (sched - cl->penalized > 0)
474 sched = cl->penalized;
475 } while ((cl_prev = cl) != q->active[prio]);
476
477 return sched - now;
478 }
479
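/* Delay-timer (hrtimer) callback: put classes whose penalty interval has
 * expired back into their normal priority band, re-arm the timer for the
 * earliest remaining penalty, and kick the qdisc so dequeue runs again.
 */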
480 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
481 {
482 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
483 delay_timer);
484 struct Qdisc *sch = q->watchdog.qdisc;
485 psched_time_t now;
486 psched_tdiff_t delay = 0;
487 unsigned int pmask;
488
489 now = psched_get_time();
490
491 pmask = q->pmask;
492 q->pmask = 0;
493
494 while (pmask) {
495 int prio = ffz(~pmask);
496 psched_tdiff_t tmp;
497
498 pmask &= ~(1<<prio);
499
500 tmp = cbq_undelay_prio(q, prio, now);
501 if (tmp > 0) {
502 q->pmask |= 1<<prio;
503 if (tmp < delay || delay == 0)
504 delay = tmp;
505 }
506 }
507
508 if (delay) {
509 ktime_t time;
510
511 time = ktime_set(0, 0);
512 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
513 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
514 }
515
516 __netif_schedule(qdisc_root(sch));
517 return HRTIMER_NORESTART;
518 }
519
520 /*
521 * This is a mission-critical procedure.
522 *
523 * We "regenerate" the toplevel cutoff if the transmitting class
524 * has backlog and is not regulated. It is not part of the
525 * original CBQ description, but it looks more reasonable.
526 * Probably it is wrong; this question needs further investigation.
527 */
528
529 static inline void
530 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
531 struct cbq_class *borrowed)
532 {
533 if (cl && q->toplevel >= borrowed->level) {
534 if (cl->q->q.qlen > 1) {
535 do {
536 if (borrowed->undertime == PSCHED_PASTPERFECT) {
537 q->toplevel = borrowed->level;
538 return;
539 }
540 } while ((borrowed = borrowed->borrow) != NULL);
541 }
542 #if 0
543 /* It is not necessary now. Uncommenting it
544 will save CPU cycles, but decrease fairness.
545 */
546 q->toplevel = TC_CBQ_MAXLEVEL;
547 #endif
548 }
549 }
550
551 static void
552 cbq_update(struct cbq_sched_data *q)
553 {
554 struct cbq_class *this = q->tx_class;
555 struct cbq_class *cl = this;
556 int len = q->tx_len;
557 psched_time_t now;
558
559 q->tx_class = NULL;
560 /* Time integrator. We calculate EOS time
561 * by adding expected packet transmission time.
562 */
563 now = q->now + L2T(&q->link, len);
564
565 for ( ; cl; cl = cl->share) {
566 long avgidle = cl->avgidle;
567 long idle;
568
569 cl->bstats.packets++;
570 cl->bstats.bytes += len;
571
572 /*
573 * (now - last) is total time between packet right edges.
574 * (last_pktlen/rate) is "virtual" busy time, so that
575 *
576 * idle = (now - last) - last_pktlen/rate
577 */
578
579 idle = now - cl->last;
580 if ((unsigned long)idle > 128*1024*1024) {
581 avgidle = cl->maxidle;
582 } else {
583 idle -= L2T(cl, len);
584
585 /* true_avgidle := (1-W)*true_avgidle + W*idle,
586 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
587 * cl->avgidle == true_avgidle/W,
588 * hence:
589 */
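/* Example: with ewma_log == 5 (so W == 1/32) the scaled update below
 * becomes avgidle += idle - (avgidle >> 5), which tracks
 * true_avgidle := (31*true_avgidle + idle)/32.
 */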
590 avgidle += idle - (avgidle>>cl->ewma_log);
591 }
592
593 if (avgidle <= 0) {
594 /* Overlimit or at-limit */
595
596 if (avgidle < cl->minidle)
597 avgidle = cl->minidle;
598
599 cl->avgidle = avgidle;
600
601 /* Calculate the expected time when this class
602 * will be allowed to send.
603 * It will occur when:
604 * (1-W)*true_avgidle + W*delay = 0, i.e.
605 * idle = (1/W - 1)*(-true_avgidle)
606 * or
607 * idle = (1 - W)*(-cl->avgidle);
608 */
609 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
610
611 /*
612 * That is not all.
613 * To maintain the rate allocated to the class,
614 * we add to undertime the virtual clock time
615 * necessary to complete the transmitted packet.
616 * (len/phys_bandwidth has already passed
617 * by the moment of cbq_update.)
618 */
619
620 idle -= L2T(&q->link, len);
621 idle += L2T(cl, len);
622
623 cl->undertime = now + idle;
624 } else {
625 /* Underlimit */
626
627 cl->undertime = PSCHED_PASTPERFECT;
628 if (avgidle > cl->maxidle)
629 cl->avgidle = cl->maxidle;
630 else
631 cl->avgidle = avgidle;
632 }
633 if ((s64)(now - cl->last) > 0)
634 cl->last = now;
635 }
636
637 cbq_update_toplevel(q, this, q->tx_borrowed);
638 }
639
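/* Return cl itself if it may send now, the first underlimit ancestor it
 * may borrow from otherwise, or NULL if nothing can be sent (either the
 * borrow chain is exhausted, in which case cbq_overlimit() penalizes the
 * class, or the top-level cutoff forbids borrowing).
 */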
640 static inline struct cbq_class *
641 cbq_under_limit(struct cbq_class *cl)
642 {
643 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
644 struct cbq_class *this_cl = cl;
645
646 if (cl->tparent == NULL)
647 return cl;
648
649 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
650 cl->delayed = 0;
651 return cl;
652 }
653
654 do {
655 /* This is a very suspicious place. Currently the overlimit
656 * action is generated for non-bounded classes
657 * only if the link is completely congested.
658 * Though it agrees with the ancestor-only paradigm,
659 * it looks very stupid. In particular,
660 * it means that this chunk of code will either
661 * never be called or result in strong amplification
662 * of burstiness. Dangerous, silly, and yet
663 * no other solution exists.
664 */
665 cl = cl->borrow;
666 if (!cl) {
667 this_cl->qstats.overlimits++;
668 cbq_overlimit(this_cl);
669 return NULL;
670 }
671 if (cl->level > q->toplevel)
672 return NULL;
673 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
674
675 cl->delayed = 0;
676 return cl;
677 }
678
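/* One round of deficit round-robin over the active classes of one
 * priority band: a class may send while its deficit is positive; an
 * exhausted class has its quantum added to the deficit and the scan
 * moves on to the next backlogged class.
 */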
679 static inline struct sk_buff *
680 cbq_dequeue_prio(struct Qdisc *sch, int prio)
681 {
682 struct cbq_sched_data *q = qdisc_priv(sch);
683 struct cbq_class *cl_tail, *cl_prev, *cl;
684 struct sk_buff *skb;
685 int deficit;
686
687 cl_tail = cl_prev = q->active[prio];
688 cl = cl_prev->next_alive;
689
690 do {
691 deficit = 0;
692
693 /* Start round */
694 do {
695 struct cbq_class *borrow = cl;
696
697 if (cl->q->q.qlen &&
698 (borrow = cbq_under_limit(cl)) == NULL)
699 goto skip_class;
700
701 if (cl->deficit <= 0) {
702 /* Class exhausted its allotment per
703 * this round. Switch to the next one.
704 */
705 deficit = 1;
706 cl->deficit += cl->quantum;
707 goto next_class;
708 }
709
710 skb = cl->q->dequeue(cl->q);
711
712 /* The class did not give us any skb :-(
713 * This can occur even if cl->q->q.qlen != 0,
714 * e.g. if cl->q == "tbf"
715 */
716 if (skb == NULL)
717 goto skip_class;
718
719 cl->deficit -= qdisc_pkt_len(skb);
720 q->tx_class = cl;
721 q->tx_borrowed = borrow;
722 if (borrow != cl) {
723 #ifndef CBQ_XSTATS_BORROWS_BYTES
724 borrow->xstats.borrows++;
725 cl->xstats.borrows++;
726 #else
727 borrow->xstats.borrows += qdisc_pkt_len(skb);
728 cl->xstats.borrows += qdisc_pkt_len(skb);
729 #endif
730 }
731 q->tx_len = qdisc_pkt_len(skb);
732
733 if (cl->deficit <= 0) {
734 q->active[prio] = cl;
735 cl = cl->next_alive;
736 cl->deficit += cl->quantum;
737 }
738 return skb;
739
740 skip_class:
741 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
742 /* Class is empty or penalized.
743 * Unlink it from active chain.
744 */
745 cl_prev->next_alive = cl->next_alive;
746 cl->next_alive = NULL;
747
748 /* Did cl_tail point to it? */
749 if (cl == cl_tail) {
750 /* Repair it! */
751 cl_tail = cl_prev;
752
753 /* Was it the last class in this band? */
754 if (cl == cl_tail) {
755 /* Kill the band! */
756 q->active[prio] = NULL;
757 q->activemask &= ~(1<<prio);
758 if (cl->q->q.qlen)
759 cbq_activate_class(cl);
760 return NULL;
761 }
762
763 q->active[prio] = cl_tail;
764 }
765 if (cl->q->q.qlen)
766 cbq_activate_class(cl);
767
768 cl = cl_prev;
769 }
770
771 next_class:
772 cl_prev = cl;
773 cl = cl->next_alive;
774 } while (cl_prev != cl_tail);
775 } while (deficit);
776
777 q->active[prio] = cl_prev;
778
779 return NULL;
780 }
781
782 static inline struct sk_buff *
783 cbq_dequeue_1(struct Qdisc *sch)
784 {
785 struct cbq_sched_data *q = qdisc_priv(sch);
786 struct sk_buff *skb;
787 unsigned int activemask;
788
789 activemask = q->activemask & 0xFF;
790 while (activemask) {
791 int prio = ffz(~activemask);
792 activemask &= ~(1<<prio);
793 skb = cbq_dequeue_prio(sch, prio);
794 if (skb)
795 return skb;
796 }
797 return NULL;
798 }
799
800 static struct sk_buff *
801 cbq_dequeue(struct Qdisc *sch)
802 {
803 struct sk_buff *skb;
804 struct cbq_sched_data *q = qdisc_priv(sch);
805 psched_time_t now;
806
807 now = psched_get_time();
808
809 if (q->tx_class)
810 cbq_update(q);
811
812 q->now = now;
813
814 for (;;) {
815 q->wd_expires = 0;
816
817 skb = cbq_dequeue_1(sch);
818 if (skb) {
819 qdisc_bstats_update(sch, skb);
820 sch->q.qlen--;
821 return skb;
822 }
823
824 /* All the classes are overlimit.
825 *
826 * This is possible if:
827 *
828 * 1. Scheduler is empty.
829 * 2. Toplevel cutoff inhibited borrowing.
830 * 3. Root class is overlimit.
831 *
832 * Reset the 2nd and 3rd conditions and retry.
833 *
834 * Note that NS and cbq-2.0 are buggy: peeking
835 * an arbitrary class is appropriate for ancestor-only
836 * sharing, but not for the toplevel algorithm.
837 *
838 * Our version is better but slower, because it requires
839 * two passes; this is unavoidable with top-level sharing.
840 */
841
842 if (q->toplevel == TC_CBQ_MAXLEVEL &&
843 q->link.undertime == PSCHED_PASTPERFECT)
844 break;
845
846 q->toplevel = TC_CBQ_MAXLEVEL;
847 q->link.undertime = PSCHED_PASTPERFECT;
848 }
849
850 /* No packets in the scheduler, or nobody wants to give them to us :-(
851 * Sigh... start the watchdog timer in the latter case.
852 */
853
854 if (sch->q.qlen) {
855 qdisc_qstats_overlimit(sch);
856 if (q->wd_expires)
857 qdisc_watchdog_schedule(&q->watchdog,
858 now + q->wd_expires);
859 }
860 return NULL;
861 }
862
863 /* CBQ class maintenance routines */
864
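/* Propagate level changes up the tree: every node on the path from
 * 'this' to the root gets level = (maximum level among its children) + 1.
 */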
865 static void cbq_adjust_levels(struct cbq_class *this)
866 {
867 if (this == NULL)
868 return;
869
870 do {
871 int level = 0;
872 struct cbq_class *cl;
873
874 cl = this->children;
875 if (cl) {
876 do {
877 if (cl->level > level)
878 level = cl->level;
879 } while ((cl = cl->sibling) != this->children);
880 }
881 this->level = level + 1;
882 } while ((this = this->tparent) != NULL);
883 }
884
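/* Recompute the WRR quanta of every class in one priority band:
 * quantum = allot * weight / (average weight of the band), where the
 * average weight is q->quanta[prio] / q->nclasses[prio].
 */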
885 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
886 {
887 struct cbq_class *cl;
888 unsigned int h;
889
890 if (q->quanta[prio] == 0)
891 return;
892
893 for (h = 0; h < q->clhash.hashsize; h++) {
894 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
895 /* BUGGGG... Beware! This expression suffers from
896 * arithmetic overflows!
897 */
898 if (cl->priority == prio) {
899 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
900 q->quanta[prio];
901 }
902 if (cl->quantum <= 0 ||
903 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
904 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
905 cl->common.classid, cl->quantum);
906 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
907 }
908 }
909 }
910 }
911
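/* Rebuild split->defaults[]: drop stale entries, then for each TC_PRIO_*
 * value pick, among the classes attached to this split node whose defmap
 * covers it, the one with the lowest level.
 */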
912 static void cbq_sync_defmap(struct cbq_class *cl)
913 {
914 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
915 struct cbq_class *split = cl->split;
916 unsigned int h;
917 int i;
918
919 if (split == NULL)
920 return;
921
922 for (i = 0; i <= TC_PRIO_MAX; i++) {
923 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
924 split->defaults[i] = NULL;
925 }
926
927 for (i = 0; i <= TC_PRIO_MAX; i++) {
928 int level = split->level;
929
930 if (split->defaults[i])
931 continue;
932
933 for (h = 0; h < q->clhash.hashsize; h++) {
934 struct cbq_class *c;
935
936 hlist_for_each_entry(c, &q->clhash.hash[h],
937 common.hnode) {
938 if (c->split == split && c->level < level &&
939 c->defmap & (1<<i)) {
940 split->defaults[i] = c;
941 level = c->level;
942 }
943 }
944 }
945 }
946 }
947
948 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
949 {
950 struct cbq_class *split = NULL;
951
952 if (splitid == 0) {
953 split = cl->split;
954 if (!split)
955 return;
956 splitid = split->common.classid;
957 }
958
959 if (split == NULL || split->common.classid != splitid) {
960 for (split = cl->tparent; split; split = split->tparent)
961 if (split->common.classid == splitid)
962 break;
963 }
964
965 if (split == NULL)
966 return;
967
968 if (cl->split != split) {
969 cl->defmap = 0;
970 cbq_sync_defmap(cl);
971 cl->split = split;
972 cl->defmap = def & mask;
973 } else
974 cl->defmap = (cl->defmap & ~mask) | (def & mask);
975
976 cbq_sync_defmap(cl);
977 }
978
979 static void cbq_unlink_class(struct cbq_class *this)
980 {
981 struct cbq_class *cl, **clp;
982 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
983
984 qdisc_class_hash_remove(&q->clhash, &this->common);
985
986 if (this->tparent) {
987 clp = &this->sibling;
988 cl = *clp;
989 do {
990 if (cl == this) {
991 *clp = cl->sibling;
992 break;
993 }
994 clp = &cl->sibling;
995 } while ((cl = *clp) != this->sibling);
996
997 if (this->tparent->children == this) {
998 this->tparent->children = this->sibling;
999 if (this->sibling == this)
1000 this->tparent->children = NULL;
1001 }
1002 } else {
1003 WARN_ON(this->sibling != this);
1004 }
1005 }
1006
1007 static void cbq_link_class(struct cbq_class *this)
1008 {
1009 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1010 struct cbq_class *parent = this->tparent;
1011
1012 this->sibling = this;
1013 qdisc_class_hash_insert(&q->clhash, &this->common);
1014
1015 if (parent == NULL)
1016 return;
1017
1018 if (parent->children == NULL) {
1019 parent->children = this;
1020 } else {
1021 this->sibling = parent->children->sibling;
1022 parent->children->sibling = this;
1023 }
1024 }
1025
1026 static void
1027 cbq_reset(struct Qdisc *sch)
1028 {
1029 struct cbq_sched_data *q = qdisc_priv(sch);
1030 struct cbq_class *cl;
1031 int prio;
1032 unsigned int h;
1033
1034 q->activemask = 0;
1035 q->pmask = 0;
1036 q->tx_class = NULL;
1037 q->tx_borrowed = NULL;
1038 qdisc_watchdog_cancel(&q->watchdog);
1039 hrtimer_cancel(&q->delay_timer);
1040 q->toplevel = TC_CBQ_MAXLEVEL;
1041 q->now = psched_get_time();
1042
1043 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1044 q->active[prio] = NULL;
1045
1046 for (h = 0; h < q->clhash.hashsize; h++) {
1047 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1048 qdisc_reset(cl->q);
1049
1050 cl->next_alive = NULL;
1051 cl->undertime = PSCHED_PASTPERFECT;
1052 cl->avgidle = cl->maxidle;
1053 cl->deficit = cl->quantum;
1054 cl->cpriority = cl->priority;
1055 }
1056 }
1057 sch->q.qlen = 0;
1058 }
1059
1060
1061 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1062 {
1063 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1064 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1065 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1066 }
1067 if (lss->change & TCF_CBQ_LSS_EWMA)
1068 cl->ewma_log = lss->ewma_log;
1069 if (lss->change & TCF_CBQ_LSS_AVPKT)
1070 cl->avpkt = lss->avpkt;
1071 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1072 cl->minidle = -(long)lss->minidle;
1073 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1074 cl->maxidle = lss->maxidle;
1075 cl->avgidle = lss->maxidle;
1076 }
1077 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1078 cl->offtime = lss->offtime;
1079 return 0;
1080 }
1081
1082 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1083 {
1084 q->nclasses[cl->priority]--;
1085 q->quanta[cl->priority] -= cl->weight;
1086 cbq_normalize_quanta(q, cl->priority);
1087 }
1088
1089 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1090 {
1091 q->nclasses[cl->priority]++;
1092 q->quanta[cl->priority] += cl->weight;
1093 cbq_normalize_quanta(q, cl->priority);
1094 }
1095
1096 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1097 {
1098 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1099
1100 if (wrr->allot)
1101 cl->allot = wrr->allot;
1102 if (wrr->weight)
1103 cl->weight = wrr->weight;
1104 if (wrr->priority) {
1105 cl->priority = wrr->priority - 1;
1106 cl->cpriority = cl->priority;
1107 if (cl->priority >= cl->priority2)
1108 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1109 }
1110
1111 cbq_addprio(q, cl);
1112 return 0;
1113 }
1114
1115 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1116 {
1117 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1118 return 0;
1119 }
1120
1121 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1122 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1123 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1124 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1125 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1126 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1127 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1128 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1129 };
1130
1131 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1132 {
1133 struct cbq_sched_data *q = qdisc_priv(sch);
1134 struct nlattr *tb[TCA_CBQ_MAX + 1];
1135 struct tc_ratespec *r;
1136 int err;
1137
1138 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1139 if (err < 0)
1140 return err;
1141
1142 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1143 return -EINVAL;
1144
1145 r = nla_data(tb[TCA_CBQ_RATE]);
1146
1147 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1148 return -EINVAL;
1149
1150 err = qdisc_class_hash_init(&q->clhash);
1151 if (err < 0)
1152 goto put_rtab;
1153
1154 q->link.refcnt = 1;
1155 q->link.sibling = &q->link;
1156 q->link.common.classid = sch->handle;
1157 q->link.qdisc = sch;
1158 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1159 sch->handle);
1160 if (!q->link.q)
1161 q->link.q = &noop_qdisc;
1162
1163 q->link.priority = TC_CBQ_MAXPRIO - 1;
1164 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1165 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1166 q->link.allot = psched_mtu(qdisc_dev(sch));
1167 q->link.quantum = q->link.allot;
1168 q->link.weight = q->link.R_tab->rate.rate;
1169
1170 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1171 q->link.avpkt = q->link.allot/2;
1172 q->link.minidle = -0x7FFFFFFF;
1173
1174 qdisc_watchdog_init(&q->watchdog, sch);
1175 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1176 q->delay_timer.function = cbq_undelay;
1177 q->toplevel = TC_CBQ_MAXLEVEL;
1178 q->now = psched_get_time();
1179
1180 cbq_link_class(&q->link);
1181
1182 if (tb[TCA_CBQ_LSSOPT])
1183 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1184
1185 cbq_addprio(q, &q->link);
1186 return 0;
1187
1188 put_rtab:
1189 qdisc_put_rtab(q->link.R_tab);
1190 return err;
1191 }
1192
1193 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1194 {
1195 unsigned char *b = skb_tail_pointer(skb);
1196
1197 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1198 goto nla_put_failure;
1199 return skb->len;
1200
1201 nla_put_failure:
1202 nlmsg_trim(skb, b);
1203 return -1;
1204 }
1205
1206 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1207 {
1208 unsigned char *b = skb_tail_pointer(skb);
1209 struct tc_cbq_lssopt opt;
1210
1211 opt.flags = 0;
1212 if (cl->borrow == NULL)
1213 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1214 if (cl->share == NULL)
1215 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1216 opt.ewma_log = cl->ewma_log;
1217 opt.level = cl->level;
1218 opt.avpkt = cl->avpkt;
1219 opt.maxidle = cl->maxidle;
1220 opt.minidle = (u32)(-cl->minidle);
1221 opt.offtime = cl->offtime;
1222 opt.change = ~0;
1223 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1224 goto nla_put_failure;
1225 return skb->len;
1226
1227 nla_put_failure:
1228 nlmsg_trim(skb, b);
1229 return -1;
1230 }
1231
1232 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1233 {
1234 unsigned char *b = skb_tail_pointer(skb);
1235 struct tc_cbq_wrropt opt;
1236
1237 memset(&opt, 0, sizeof(opt));
1238 opt.flags = 0;
1239 opt.allot = cl->allot;
1240 opt.priority = cl->priority + 1;
1241 opt.cpriority = cl->cpriority + 1;
1242 opt.weight = cl->weight;
1243 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1244 goto nla_put_failure;
1245 return skb->len;
1246
1247 nla_put_failure:
1248 nlmsg_trim(skb, b);
1249 return -1;
1250 }
1251
1252 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1253 {
1254 unsigned char *b = skb_tail_pointer(skb);
1255 struct tc_cbq_fopt opt;
1256
1257 if (cl->split || cl->defmap) {
1258 opt.split = cl->split ? cl->split->common.classid : 0;
1259 opt.defmap = cl->defmap;
1260 opt.defchange = ~0;
1261 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1262 goto nla_put_failure;
1263 }
1264 return skb->len;
1265
1266 nla_put_failure:
1267 nlmsg_trim(skb, b);
1268 return -1;
1269 }
1270
1271 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1272 {
1273 if (cbq_dump_lss(skb, cl) < 0 ||
1274 cbq_dump_rate(skb, cl) < 0 ||
1275 cbq_dump_wrr(skb, cl) < 0 ||
1276 cbq_dump_fopt(skb, cl) < 0)
1277 return -1;
1278 return 0;
1279 }
1280
1281 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1282 {
1283 struct cbq_sched_data *q = qdisc_priv(sch);
1284 struct nlattr *nest;
1285
1286 nest = nla_nest_start(skb, TCA_OPTIONS);
1287 if (nest == NULL)
1288 goto nla_put_failure;
1289 if (cbq_dump_attr(skb, &q->link) < 0)
1290 goto nla_put_failure;
1291 return nla_nest_end(skb, nest);
1292
1293 nla_put_failure:
1294 nla_nest_cancel(skb, nest);
1295 return -1;
1296 }
1297
1298 static int
1299 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1300 {
1301 struct cbq_sched_data *q = qdisc_priv(sch);
1302
1303 q->link.xstats.avgidle = q->link.avgidle;
1304 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1305 }
1306
1307 static int
1308 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1309 struct sk_buff *skb, struct tcmsg *tcm)
1310 {
1311 struct cbq_class *cl = (struct cbq_class *)arg;
1312 struct nlattr *nest;
1313
1314 if (cl->tparent)
1315 tcm->tcm_parent = cl->tparent->common.classid;
1316 else
1317 tcm->tcm_parent = TC_H_ROOT;
1318 tcm->tcm_handle = cl->common.classid;
1319 tcm->tcm_info = cl->q->handle;
1320
1321 nest = nla_nest_start(skb, TCA_OPTIONS);
1322 if (nest == NULL)
1323 goto nla_put_failure;
1324 if (cbq_dump_attr(skb, cl) < 0)
1325 goto nla_put_failure;
1326 return nla_nest_end(skb, nest);
1327
1328 nla_put_failure:
1329 nla_nest_cancel(skb, nest);
1330 return -1;
1331 }
1332
1333 static int
1334 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1335 struct gnet_dump *d)
1336 {
1337 struct cbq_sched_data *q = qdisc_priv(sch);
1338 struct cbq_class *cl = (struct cbq_class *)arg;
1339
1340 cl->xstats.avgidle = cl->avgidle;
1341 cl->xstats.undertime = 0;
1342
1343 if (cl->undertime != PSCHED_PASTPERFECT)
1344 cl->xstats.undertime = cl->undertime - q->now;
1345
1346 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1347 d, NULL, &cl->bstats) < 0 ||
1348 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1349 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1350 return -1;
1351
1352 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1353 }
1354
1355 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1356 struct Qdisc **old)
1357 {
1358 struct cbq_class *cl = (struct cbq_class *)arg;
1359
1360 if (new == NULL) {
1361 new = qdisc_create_dflt(sch->dev_queue,
1362 &pfifo_qdisc_ops, cl->common.classid);
1363 if (new == NULL)
1364 return -ENOBUFS;
1365 }
1366
1367 *old = qdisc_replace(sch, new, &cl->q);
1368 return 0;
1369 }
1370
1371 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1372 {
1373 struct cbq_class *cl = (struct cbq_class *)arg;
1374
1375 return cl->q;
1376 }
1377
1378 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1379 {
1380 struct cbq_class *cl = (struct cbq_class *)arg;
1381
1382 if (cl->q->q.qlen == 0)
1383 cbq_deactivate_class(cl);
1384 }
1385
1386 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1387 {
1388 struct cbq_sched_data *q = qdisc_priv(sch);
1389 struct cbq_class *cl = cbq_class_lookup(q, classid);
1390
1391 if (cl) {
1392 cl->refcnt++;
1393 return (unsigned long)cl;
1394 }
1395 return 0;
1396 }
1397
1398 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1399 {
1400 struct cbq_sched_data *q = qdisc_priv(sch);
1401
1402 WARN_ON(cl->filters);
1403
1404 tcf_destroy_chain(&cl->filter_list);
1405 qdisc_destroy(cl->q);
1406 qdisc_put_rtab(cl->R_tab);
1407 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1408 if (cl != &q->link)
1409 kfree(cl);
1410 }
1411
1412 static void cbq_destroy(struct Qdisc *sch)
1413 {
1414 struct cbq_sched_data *q = qdisc_priv(sch);
1415 struct hlist_node *next;
1416 struct cbq_class *cl;
1417 unsigned int h;
1418
1419 #ifdef CONFIG_NET_CLS_ACT
1420 q->rx_class = NULL;
1421 #endif
1422 /*
1423 * Filters must be destroyed first because we don't destroy the
1424 * classes from root to leaves, which means that filters can still
1425 * be bound to classes which have been destroyed already. --TGR '04
1426 */
1427 for (h = 0; h < q->clhash.hashsize; h++) {
1428 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1429 tcf_destroy_chain(&cl->filter_list);
1430 }
1431 for (h = 0; h < q->clhash.hashsize; h++) {
1432 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1433 common.hnode)
1434 cbq_destroy_class(sch, cl);
1435 }
1436 qdisc_class_hash_destroy(&q->clhash);
1437 }
1438
1439 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1440 {
1441 struct cbq_class *cl = (struct cbq_class *)arg;
1442
1443 if (--cl->refcnt == 0) {
1444 #ifdef CONFIG_NET_CLS_ACT
1445 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1446 struct cbq_sched_data *q = qdisc_priv(sch);
1447
1448 spin_lock_bh(root_lock);
1449 if (q->rx_class == cl)
1450 q->rx_class = NULL;
1451 spin_unlock_bh(root_lock);
1452 #endif
1453
1454 cbq_destroy_class(sch, cl);
1455 }
1456 }
1457
1458 static int
1459 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1460 unsigned long *arg)
1461 {
1462 int err;
1463 struct cbq_sched_data *q = qdisc_priv(sch);
1464 struct cbq_class *cl = (struct cbq_class *)*arg;
1465 struct nlattr *opt = tca[TCA_OPTIONS];
1466 struct nlattr *tb[TCA_CBQ_MAX + 1];
1467 struct cbq_class *parent;
1468 struct qdisc_rate_table *rtab = NULL;
1469
1470 if (opt == NULL)
1471 return -EINVAL;
1472
1473 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1474 if (err < 0)
1475 return err;
1476
1477 if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1478 return -EOPNOTSUPP;
1479
1480 if (cl) {
1481 /* Check parent */
1482 if (parentid) {
1483 if (cl->tparent &&
1484 cl->tparent->common.classid != parentid)
1485 return -EINVAL;
1486 if (!cl->tparent && parentid != TC_H_ROOT)
1487 return -EINVAL;
1488 }
1489
1490 if (tb[TCA_CBQ_RATE]) {
1491 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1492 tb[TCA_CBQ_RTAB]);
1493 if (rtab == NULL)
1494 return -EINVAL;
1495 }
1496
1497 if (tca[TCA_RATE]) {
1498 err = gen_replace_estimator(&cl->bstats, NULL,
1499 &cl->rate_est,
1500 NULL,
1501 qdisc_root_sleeping_running(sch),
1502 tca[TCA_RATE]);
1503 if (err) {
1504 qdisc_put_rtab(rtab);
1505 return err;
1506 }
1507 }
1508
1509 /* Change class parameters */
1510 sch_tree_lock(sch);
1511
1512 if (cl->next_alive != NULL)
1513 cbq_deactivate_class(cl);
1514
1515 if (rtab) {
1516 qdisc_put_rtab(cl->R_tab);
1517 cl->R_tab = rtab;
1518 }
1519
1520 if (tb[TCA_CBQ_LSSOPT])
1521 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1522
1523 if (tb[TCA_CBQ_WRROPT]) {
1524 cbq_rmprio(q, cl);
1525 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1526 }
1527
1528 if (tb[TCA_CBQ_FOPT])
1529 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1530
1531 if (cl->q->q.qlen)
1532 cbq_activate_class(cl);
1533
1534 sch_tree_unlock(sch);
1535
1536 return 0;
1537 }
1538
1539 if (parentid == TC_H_ROOT)
1540 return -EINVAL;
1541
1542 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1543 tb[TCA_CBQ_LSSOPT] == NULL)
1544 return -EINVAL;
1545
1546 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1547 if (rtab == NULL)
1548 return -EINVAL;
1549
1550 if (classid) {
1551 err = -EINVAL;
1552 if (TC_H_MAJ(classid ^ sch->handle) ||
1553 cbq_class_lookup(q, classid))
1554 goto failure;
1555 } else {
1556 int i;
1557 classid = TC_H_MAKE(sch->handle, 0x8000);
1558
1559 for (i = 0; i < 0x8000; i++) {
1560 if (++q->hgenerator >= 0x8000)
1561 q->hgenerator = 1;
1562 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1563 break;
1564 }
1565 err = -ENOSR;
1566 if (i >= 0x8000)
1567 goto failure;
1568 classid = classid|q->hgenerator;
1569 }
1570
1571 parent = &q->link;
1572 if (parentid) {
1573 parent = cbq_class_lookup(q, parentid);
1574 err = -EINVAL;
1575 if (parent == NULL)
1576 goto failure;
1577 }
1578
1579 err = -ENOBUFS;
1580 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1581 if (cl == NULL)
1582 goto failure;
1583
1584 if (tca[TCA_RATE]) {
1585 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1586 NULL,
1587 qdisc_root_sleeping_running(sch),
1588 tca[TCA_RATE]);
1589 if (err) {
1590 kfree(cl);
1591 goto failure;
1592 }
1593 }
1594
1595 cl->R_tab = rtab;
1596 rtab = NULL;
1597 cl->refcnt = 1;
1598 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1599 if (!cl->q)
1600 cl->q = &noop_qdisc;
1601 cl->common.classid = classid;
1602 cl->tparent = parent;
1603 cl->qdisc = sch;
1604 cl->allot = parent->allot;
1605 cl->quantum = cl->allot;
1606 cl->weight = cl->R_tab->rate.rate;
1607
1608 sch_tree_lock(sch);
1609 cbq_link_class(cl);
1610 cl->borrow = cl->tparent;
1611 if (cl->tparent != &q->link)
1612 cl->share = cl->tparent;
1613 cbq_adjust_levels(parent);
1614 cl->minidle = -0x7FFFFFFF;
1615 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1616 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1617 if (cl->ewma_log == 0)
1618 cl->ewma_log = q->link.ewma_log;
1619 if (cl->maxidle == 0)
1620 cl->maxidle = q->link.maxidle;
1621 if (cl->avpkt == 0)
1622 cl->avpkt = q->link.avpkt;
1623 if (tb[TCA_CBQ_FOPT])
1624 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1625 sch_tree_unlock(sch);
1626
1627 qdisc_class_hash_grow(sch, &q->clhash);
1628
1629 *arg = (unsigned long)cl;
1630 return 0;
1631
1632 failure:
1633 qdisc_put_rtab(rtab);
1634 return err;
1635 }
1636
1637 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1638 {
1639 struct cbq_sched_data *q = qdisc_priv(sch);
1640 struct cbq_class *cl = (struct cbq_class *)arg;
1641 unsigned int qlen, backlog;
1642
1643 if (cl->filters || cl->children || cl == &q->link)
1644 return -EBUSY;
1645
1646 sch_tree_lock(sch);
1647
1648 qlen = cl->q->q.qlen;
1649 backlog = cl->q->qstats.backlog;
1650 qdisc_reset(cl->q);
1651 qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1652
1653 if (cl->next_alive)
1654 cbq_deactivate_class(cl);
1655
1656 if (q->tx_borrowed == cl)
1657 q->tx_borrowed = q->tx_class;
1658 if (q->tx_class == cl) {
1659 q->tx_class = NULL;
1660 q->tx_borrowed = NULL;
1661 }
1662 #ifdef CONFIG_NET_CLS_ACT
1663 if (q->rx_class == cl)
1664 q->rx_class = NULL;
1665 #endif
1666
1667 cbq_unlink_class(cl);
1668 cbq_adjust_levels(cl->tparent);
1669 cl->defmap = 0;
1670 cbq_sync_defmap(cl);
1671
1672 cbq_rmprio(q, cl);
1673 sch_tree_unlock(sch);
1674
1675 BUG_ON(--cl->refcnt == 0);
1676 /*
1677 * This shouldn't happen: we "hold" one cops->get() when called
1678 * from tc_ctl_tclass; the destroy method is done from cops->put().
1679 */
1680
1681 return 0;
1682 }
1683
1684 static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1685 unsigned long arg)
1686 {
1687 struct cbq_sched_data *q = qdisc_priv(sch);
1688 struct cbq_class *cl = (struct cbq_class *)arg;
1689
1690 if (cl == NULL)
1691 cl = &q->link;
1692
1693 return &cl->filter_list;
1694 }
1695
1696 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1697 u32 classid)
1698 {
1699 struct cbq_sched_data *q = qdisc_priv(sch);
1700 struct cbq_class *p = (struct cbq_class *)parent;
1701 struct cbq_class *cl = cbq_class_lookup(q, classid);
1702
1703 if (cl) {
1704 if (p && p->level <= cl->level)
1705 return 0;
1706 cl->filters++;
1707 return (unsigned long)cl;
1708 }
1709 return 0;
1710 }
1711
1712 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1713 {
1714 struct cbq_class *cl = (struct cbq_class *)arg;
1715
1716 cl->filters--;
1717 }
1718
1719 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1720 {
1721 struct cbq_sched_data *q = qdisc_priv(sch);
1722 struct cbq_class *cl;
1723 unsigned int h;
1724
1725 if (arg->stop)
1726 return;
1727
1728 for (h = 0; h < q->clhash.hashsize; h++) {
1729 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1730 if (arg->count < arg->skip) {
1731 arg->count++;
1732 continue;
1733 }
1734 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1735 arg->stop = 1;
1736 return;
1737 }
1738 arg->count++;
1739 }
1740 }
1741 }
1742
1743 static const struct Qdisc_class_ops cbq_class_ops = {
1744 .graft = cbq_graft,
1745 .leaf = cbq_leaf,
1746 .qlen_notify = cbq_qlen_notify,
1747 .get = cbq_get,
1748 .put = cbq_put,
1749 .change = cbq_change_class,
1750 .delete = cbq_delete,
1751 .walk = cbq_walk,
1752 .tcf_chain = cbq_find_tcf,
1753 .bind_tcf = cbq_bind_filter,
1754 .unbind_tcf = cbq_unbind_filter,
1755 .dump = cbq_dump_class,
1756 .dump_stats = cbq_dump_class_stats,
1757 };
1758
1759 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1760 .next = NULL,
1761 .cl_ops = &cbq_class_ops,
1762 .id = "cbq",
1763 .priv_size = sizeof(struct cbq_sched_data),
1764 .enqueue = cbq_enqueue,
1765 .dequeue = cbq_dequeue,
1766 .peek = qdisc_peek_dequeued,
1767 .init = cbq_init,
1768 .reset = cbq_reset,
1769 .destroy = cbq_destroy,
1770 .change = NULL,
1771 .dump = cbq_dump,
1772 .dump_stats = cbq_dump_stats,
1773 .owner = THIS_MODULE,
1774 };
1775
1776 static int __init cbq_module_init(void)
1777 {
1778 return register_qdisc(&cbq_qdisc_ops);
1779 }
1780 static void __exit cbq_module_exit(void)
1781 {
1782 unregister_qdisc(&cbq_qdisc_ops);
1783 }
1784 module_init(cbq_module_init)
1785 module_exit(cbq_module_exit)
1786 MODULE_LICENSE("GPL");