1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/string.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <net/netlink.h>
20 #include <net/pkt_sched.h>
21
22
23 /* Class-Based Queueing (CBQ) algorithm.
24 =======================================
25
26 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
27 Management Models for Packet Networks",
28 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
29
30 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
31
32 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
33 Parameters", 1996
34
35 [4] Sally Floyd and Michael Speer, "Experimental Results
36 for Class-Based Queueing", 1998, not published.
37
38 -----------------------------------------------------------------------
39
40 The algorithm skeleton was taken from the NS simulator's cbq.cc.
41 If someone wants to check this code against the LBL version,
42 they should take into account that ONLY the skeleton was borrowed;
43 the implementation is different. In particular:
44
45 --- The WRR algorithm is different. Our version looks more
46 reasonable (I hope) and works when quanta are allowed to be
47 less than the MTU, which is always the case when real-time classes
48 have small rates. Note that the statement of [3] is
49 incomplete: delay may actually be estimated even if the class
50 per-round allotment is less than the MTU. Namely, if the per-round
51 allotment is W*r_i, and r_1+...+r_k = r < 1
52
53 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
54
55 In the worst case we have IntServ estimate with D = W*r+k*MTU
56 and C = MTU*r. The proof (if correct at all) is trivial.
57
58
59 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
60 interpret some places, which look like wrong translations
61 from NS. Anyone is advised to find these differences
62 and explain to me why I am wrong 8).
63
64 --- Linux has no EOI event, so we cannot estimate true class
65 idle time. The workaround is to consider the next dequeue event
66 as a sign that the previous packet is finished. This is wrong because of
67 internal device queueing, but on a permanently loaded link it is true.
68 Moreover, combined with clock integrator, this scheme looks
69 very close to an ideal solution. */
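/*
 * Illustrative numeric plug-in (not part of the original file) of the WRR
 * delay bound quoted in the comment above; the numbers are arbitrary.
 *
 *	MTU = 1500 bytes, B = 125000 bytes/s (1 Mbit/s), k = 2 classes,
 *	r_1 = r_2 = 0.25 (so r = 0.5), W = 3000 bytes.
 *
 * Then W*r_i = 750 bytes, [MTU/(W*r_i)] = 2, W*r = 1500 bytes, and
 *
 *	delay_i <= (2*1500 + 1500 + 2*1500)/125000 = 7500/125000 s = 60 ms.
 */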
70
71 struct cbq_sched_data;
72
73
74 struct cbq_class
75 {
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
78
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83 unsigned char ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85 unsigned char police;
86 #endif
87
88 u32 defmap;
89
90 /* Link-sharing scheduler parameters */
91 long maxidle; /* Class parameters: see below. */
92 long offtime;
93 long minidle;
94 u32 avpkt;
95 struct qdisc_rate_table *R_tab;
96
97 /* Overlimit strategy parameters */
98 void (*overlimit)(struct cbq_class *cl);
99 psched_tdiff_t penalty;
100
101 /* General scheduler (WRR) parameters */
102 long allot;
103 long quantum; /* Allotment per WRR round */
104 long weight; /* Relative allotment: see below */
105
106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107 struct cbq_class *split; /* Ptr to split node */
108 struct cbq_class *share; /* Ptr to LS parent in the class tree */
109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111 parent otherwise */
112 struct cbq_class *sibling; /* Sibling chain */
113 struct cbq_class *children; /* Pointer to children chain */
114
115 struct Qdisc *q; /* Elementary queueing discipline */
116
117
118 /* Variables */
119 unsigned char cpriority; /* Effective priority */
120 unsigned char delayed;
121 unsigned char level; /* level of the class in hierarchy:
122 0 for leaf classes, and maximal
123 level of children + 1 for nodes.
124 */
125
126 psched_time_t last; /* Last end of service */
127 psched_time_t undertime;
128 long avgidle;
129 long deficit; /* Saved deficit for WRR */
130 psched_time_t penalized;
131 struct gnet_stats_basic bstats;
132 struct gnet_stats_queue qstats;
133 struct gnet_stats_rate_est rate_est;
134 struct tc_cbq_xstats xstats;
135
136 struct tcf_proto *filter_list;
137
138 int refcnt;
139 int filters;
140
141 struct cbq_class *defaults[TC_PRIO_MAX+1];
142 };
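/*
 * Illustrative note (not part of the original file): how the three parent-like
 * pointers above relate, as set up by cbq_set_lss() further below.  For a
 * class created with the "bounded" flag but without "isolated":
 *
 *	tparent = its parent in the class tree (always),
 *	borrow  = NULL      (TCF_CBQ_LSS_BOUNDED: the class may not borrow),
 *	share   = tparent   (no TCF_CBQ_LSS_ISOLATED: its usage is charged up
 *	                     to the parent by cbq_update()).
 */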
143
144 struct cbq_sched_data
145 {
146 struct Qdisc_class_hash clhash; /* Hash table of all classes */
147 int nclasses[TC_CBQ_MAXPRIO+1];
148 unsigned quanta[TC_CBQ_MAXPRIO+1];
149
150 struct cbq_class link;
151
152 unsigned activemask;
153 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
154 with backlog */
155
156 #ifdef CONFIG_NET_CLS_ACT
157 struct cbq_class *rx_class;
158 #endif
159 struct cbq_class *tx_class;
160 struct cbq_class *tx_borrowed;
161 int tx_len;
162 psched_time_t now; /* Cached timestamp */
163 psched_time_t now_rt; /* Cached real time */
164 unsigned pmask;
165
166 struct hrtimer delay_timer;
167 struct qdisc_watchdog watchdog; /* Watchdog timer,
168 started when CBQ has
169 backlog, but cannot
170 transmit just now */
171 psched_tdiff_t wd_expires;
172 int toplevel;
173 u32 hgenerator;
174 };
175
176
177 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
178
179 static __inline__ struct cbq_class *
180 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
181 {
182 struct Qdisc_class_common *clc;
183
184 clc = qdisc_class_find(&q->clhash, classid);
185 if (clc == NULL)
186 return NULL;
187 return container_of(clc, struct cbq_class, common);
188 }
189
190 #ifdef CONFIG_NET_CLS_ACT
191
192 static struct cbq_class *
193 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
194 {
195 struct cbq_class *cl, *new;
196
197 for (cl = this->tparent; cl; cl = cl->tparent)
198 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
199 return new;
200
201 return NULL;
202 }
203
204 #endif
205
206 /* Classify packet. The procedure is pretty complicated, but
207 it allows us to combine link sharing and priority scheduling
208 transparently.
209
210 Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
211 so that it resolves to split nodes. Then packets are classified
212 by logical priority, or a more specific classifier may be attached
213 to the split node.
214 */
215
216 static struct cbq_class *
217 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
218 {
219 struct cbq_sched_data *q = qdisc_priv(sch);
220 struct cbq_class *head = &q->link;
221 struct cbq_class **defmap;
222 struct cbq_class *cl = NULL;
223 u32 prio = skb->priority;
224 struct tcf_result res;
225
226 /*
227 * Step 1. If skb->priority points to one of our classes, use it.
228 */
229 if (TC_H_MAJ(prio^sch->handle) == 0 &&
230 (cl = cbq_class_lookup(q, prio)) != NULL)
231 return cl;
232
233 *qerr = NET_XMIT_BYPASS;
234 for (;;) {
235 int result = 0;
236 defmap = head->defaults;
237
238 /*
239 * Step 2+n. Apply classifier.
240 */
241 if (!head->filter_list ||
242 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
243 goto fallback;
244
245 if ((cl = (void*)res.class) == NULL) {
246 if (TC_H_MAJ(res.classid))
247 cl = cbq_class_lookup(q, res.classid);
248 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
249 cl = defmap[TC_PRIO_BESTEFFORT];
250
251 if (cl == NULL || cl->level >= head->level)
252 goto fallback;
253 }
254
255 #ifdef CONFIG_NET_CLS_ACT
256 switch (result) {
257 case TC_ACT_QUEUED:
258 case TC_ACT_STOLEN:
259 *qerr = NET_XMIT_SUCCESS;
260 case TC_ACT_SHOT:
261 return NULL;
262 case TC_ACT_RECLASSIFY:
263 return cbq_reclassify(skb, cl);
264 }
265 #endif
266 if (cl->level == 0)
267 return cl;
268
269 /*
270 * Step 3+n. If the classifier selected a link sharing class,
271 * apply the agency-specific classifier.
272 * Repeat this procedure until we hit a leaf node.
273 */
274 head = cl;
275 }
276
277 fallback:
278 cl = head;
279
280 /*
281 * Step 4. No success...
282 */
283 if (TC_H_MAJ(prio) == 0 &&
284 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
285 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
286 return head;
287
288 return cl;
289 }
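/*
 * Illustrative walk-through (not part of the original file) of the procedure
 * above, for a hypothetical setup: a route classifier attached to the root
 * maps traffic to an inner "agency" class A (level > 0).
 *
 *   Step 1 fails (skb->priority does not name one of our classes), so the
 *   loop starts at the root.  The root's filters return A; since A is not a
 *   leaf, head becomes A and A's own filters run next.  If they return a
 *   leaf, that leaf is used; if they match nothing, A's defaults[] indexed
 *   by skb->priority & TC_PRIO_MAX (when skb->priority carries no major
 *   handle), falling back to TC_PRIO_BESTEFFORT, decides.
 */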
290
291 /*
292 A packet has just been enqueued on an empty class.
293 cbq_activate_class adds the class to the tail of the active class list
294 of its priority band.
295 */
296
297 static __inline__ void cbq_activate_class(struct cbq_class *cl)
298 {
299 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
300 int prio = cl->cpriority;
301 struct cbq_class *cl_tail;
302
303 cl_tail = q->active[prio];
304 q->active[prio] = cl;
305
306 if (cl_tail != NULL) {
307 cl->next_alive = cl_tail->next_alive;
308 cl_tail->next_alive = cl;
309 } else {
310 cl->next_alive = cl;
311 q->activemask |= (1<<prio);
312 }
313 }
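/*
 * Illustrative sketch (not part of the original file): q->active[prio] points
 * at the TAIL of a circular, singly linked list threaded through next_alive,
 * so the head of the band is tail->next_alive.  The helper below only shows
 * the conventional way to walk the ring; its name is hypothetical.
 */
#if 0
static void cbq_walk_band_example(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *tail = q->active[prio];
	struct cbq_class *cl;

	if (tail == NULL)			/* no backlogged class in this band */
		return;

	cl = tail->next_alive;			/* head of the ring */
	do {
		/* visit cl here */
	} while ((cl = cl->next_alive) != tail->next_alive);
}
#endif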
314
315 /*
316 Unlink the class from the active chain.
317 Note that the same procedure is done directly in cbq_dequeue*
318 during the round-robin procedure.
319 */
320
321 static void cbq_deactivate_class(struct cbq_class *this)
322 {
323 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
324 int prio = this->cpriority;
325 struct cbq_class *cl;
326 struct cbq_class *cl_prev = q->active[prio];
327
328 do {
329 cl = cl_prev->next_alive;
330 if (cl == this) {
331 cl_prev->next_alive = cl->next_alive;
332 cl->next_alive = NULL;
333
334 if (cl == q->active[prio]) {
335 q->active[prio] = cl_prev;
336 if (cl == q->active[prio]) {
337 q->active[prio] = NULL;
338 q->activemask &= ~(1<<prio);
339 return;
340 }
341 }
342 return;
343 }
344 } while ((cl_prev = cl) != q->active[prio]);
345 }
346
347 static void
348 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
349 {
350 int toplevel = q->toplevel;
351
352 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
353 psched_time_t now;
354 psched_tdiff_t incr;
355
356 now = psched_get_time();
357 incr = now - q->now_rt;
358 now = q->now + incr;
359
360 do {
361 if (cl->undertime < now) {
362 q->toplevel = cl->level;
363 return;
364 }
365 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
366 }
367 }
368
369 static int
370 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
371 {
372 struct cbq_sched_data *q = qdisc_priv(sch);
373 int len = skb->len;
374 int uninitialized_var(ret);
375 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
376
377 #ifdef CONFIG_NET_CLS_ACT
378 q->rx_class = cl;
379 #endif
380 if (cl == NULL) {
381 if (ret == NET_XMIT_BYPASS)
382 sch->qstats.drops++;
383 kfree_skb(skb);
384 return ret;
385 }
386
387 #ifdef CONFIG_NET_CLS_ACT
388 cl->q->__parent = sch;
389 #endif
390 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
391 sch->q.qlen++;
392 sch->bstats.packets++;
393 sch->bstats.bytes+=len;
394 cbq_mark_toplevel(q, cl);
395 if (!cl->next_alive)
396 cbq_activate_class(cl);
397 return ret;
398 }
399
400 sch->qstats.drops++;
401 cbq_mark_toplevel(q, cl);
402 cl->qstats.drops++;
403 return ret;
404 }
405
406 static int
407 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
408 {
409 struct cbq_sched_data *q = qdisc_priv(sch);
410 struct cbq_class *cl;
411 int ret;
412
413 if ((cl = q->tx_class) == NULL) {
414 kfree_skb(skb);
415 sch->qstats.drops++;
416 return NET_XMIT_CN;
417 }
418 q->tx_class = NULL;
419
420 cbq_mark_toplevel(q, cl);
421
422 #ifdef CONFIG_NET_CLS_ACT
423 q->rx_class = cl;
424 cl->q->__parent = sch;
425 #endif
426 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
427 sch->q.qlen++;
428 sch->qstats.requeues++;
429 if (!cl->next_alive)
430 cbq_activate_class(cl);
431 return 0;
432 }
433 sch->qstats.drops++;
434 cl->qstats.drops++;
435 return ret;
436 }
437
438 /* Overlimit actions */
439
440 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
441
442 static void cbq_ovl_classic(struct cbq_class *cl)
443 {
444 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
445 psched_tdiff_t delay = cl->undertime - q->now;
446
447 if (!cl->delayed) {
448 delay += cl->offtime;
449
450 /*
451 The class goes to sleep, so it will have no
452 chance to work off its avgidle. Let's forgive it 8)
453
454 BTW cbq-2.0 has a bug in this
455 place; apparently they forgot to shift it by cl->ewma_log.
456 */
457 if (cl->avgidle < 0)
458 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
459 if (cl->avgidle < cl->minidle)
460 cl->avgidle = cl->minidle;
461 if (delay <= 0)
462 delay = 1;
463 cl->undertime = q->now + delay;
464
465 cl->xstats.overactions++;
466 cl->delayed = 1;
467 }
468 if (q->wd_expires == 0 || q->wd_expires > delay)
469 q->wd_expires = delay;
470
471 /* Dirty work! We must schedule wakeups based on
472 real available rate, rather than leaf rate,
473 which may be tiny (even zero).
474 */
475 if (q->toplevel == TC_CBQ_MAXLEVEL) {
476 struct cbq_class *b;
477 psched_tdiff_t base_delay = q->wd_expires;
478
479 for (b = cl->borrow; b; b = b->borrow) {
480 delay = b->undertime - q->now;
481 if (delay < base_delay) {
482 if (delay <= 0)
483 delay = 1;
484 base_delay = delay;
485 }
486 }
487
488 q->wd_expires = base_delay;
489 }
490 }
491
492 /* TC_CBQ_OVL_RCLASSIC: penalize, by offtime, classes in the hierarchy when
493 they go overlimit
494 */
495
496 static void cbq_ovl_rclassic(struct cbq_class *cl)
497 {
498 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
499 struct cbq_class *this = cl;
500
501 do {
502 if (cl->level > q->toplevel) {
503 cl = NULL;
504 break;
505 }
506 } while ((cl = cl->borrow) != NULL);
507
508 if (cl == NULL)
509 cl = this;
510 cbq_ovl_classic(cl);
511 }
512
513 /* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
514
515 static void cbq_ovl_delay(struct cbq_class *cl)
516 {
517 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
518 psched_tdiff_t delay = cl->undertime - q->now;
519
520 if (!cl->delayed) {
521 psched_time_t sched = q->now;
522 ktime_t expires;
523
524 delay += cl->offtime;
525 if (cl->avgidle < 0)
526 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
527 if (cl->avgidle < cl->minidle)
528 cl->avgidle = cl->minidle;
529 cl->undertime = q->now + delay;
530
531 if (delay > 0) {
532 sched += delay + cl->penalty;
533 cl->penalized = sched;
534 cl->cpriority = TC_CBQ_MAXPRIO;
535 q->pmask |= (1<<TC_CBQ_MAXPRIO);
536
537 expires = ktime_set(0, 0);
538 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
539 if (hrtimer_try_to_cancel(&q->delay_timer) &&
540 ktime_to_ns(ktime_sub(q->delay_timer.expires,
541 expires)) > 0)
542 q->delay_timer.expires = expires;
543 hrtimer_restart(&q->delay_timer);
544 cl->delayed = 1;
545 cl->xstats.overactions++;
546 return;
547 }
548 delay = 1;
549 }
550 if (q->wd_expires == 0 || q->wd_expires > delay)
551 q->wd_expires = delay;
552 }
553
554 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
555
556 static void cbq_ovl_lowprio(struct cbq_class *cl)
557 {
558 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
559
560 cl->penalized = q->now + cl->penalty;
561
562 if (cl->cpriority != cl->priority2) {
563 cl->cpriority = cl->priority2;
564 q->pmask |= (1<<cl->cpriority);
565 cl->xstats.overactions++;
566 }
567 cbq_ovl_classic(cl);
568 }
569
570 /* TC_CBQ_OVL_DROP: penalize class by dropping */
571
572 static void cbq_ovl_drop(struct cbq_class *cl)
573 {
574 if (cl->q->ops->drop)
575 if (cl->q->ops->drop(cl->q))
576 cl->qdisc->q.qlen--;
577 cl->xstats.overactions++;
578 cbq_ovl_classic(cl);
579 }
580
581 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
582 psched_time_t now)
583 {
584 struct cbq_class *cl;
585 struct cbq_class *cl_prev = q->active[prio];
586 psched_time_t sched = now;
587
588 if (cl_prev == NULL)
589 return 0;
590
591 do {
592 cl = cl_prev->next_alive;
593 if (now - cl->penalized > 0) {
594 cl_prev->next_alive = cl->next_alive;
595 cl->next_alive = NULL;
596 cl->cpriority = cl->priority;
597 cl->delayed = 0;
598 cbq_activate_class(cl);
599
600 if (cl == q->active[prio]) {
601 q->active[prio] = cl_prev;
602 if (cl == q->active[prio]) {
603 q->active[prio] = NULL;
604 return 0;
605 }
606 }
607
608 cl = cl_prev->next_alive;
609 } else if (sched - cl->penalized > 0)
610 sched = cl->penalized;
611 } while ((cl_prev = cl) != q->active[prio]);
612
613 return sched - now;
614 }
615
616 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
617 {
618 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
619 delay_timer);
620 struct Qdisc *sch = q->watchdog.qdisc;
621 psched_time_t now;
622 psched_tdiff_t delay = 0;
623 unsigned pmask;
624
625 now = psched_get_time();
626
627 pmask = q->pmask;
628 q->pmask = 0;
629
630 while (pmask) {
631 int prio = ffz(~pmask);
632 psched_tdiff_t tmp;
633
634 pmask &= ~(1<<prio);
635
636 tmp = cbq_undelay_prio(q, prio, now);
637 if (tmp > 0) {
638 q->pmask |= 1<<prio;
639 if (tmp < delay || delay == 0)
640 delay = tmp;
641 }
642 }
643
644 if (delay) {
645 ktime_t time;
646
647 time = ktime_set(0, 0);
648 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
649 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
650 }
651
652 sch->flags &= ~TCQ_F_THROTTLED;
653 netif_schedule_queue(sch->dev_queue);
654 return HRTIMER_NORESTART;
655 }
656
657 #ifdef CONFIG_NET_CLS_ACT
658 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
659 {
660 int len = skb->len;
661 struct Qdisc *sch = child->__parent;
662 struct cbq_sched_data *q = qdisc_priv(sch);
663 struct cbq_class *cl = q->rx_class;
664
665 q->rx_class = NULL;
666
667 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
668
669 cbq_mark_toplevel(q, cl);
670
671 q->rx_class = cl;
672 cl->q->__parent = sch;
673
674 if (cl->q->enqueue(skb, cl->q) == 0) {
675 sch->q.qlen++;
676 sch->bstats.packets++;
677 sch->bstats.bytes+=len;
678 if (!cl->next_alive)
679 cbq_activate_class(cl);
680 return 0;
681 }
682 sch->qstats.drops++;
683 return 0;
684 }
685
686 sch->qstats.drops++;
687 return -1;
688 }
689 #endif
690
691 /*
692 This is a mission-critical procedure.
693
694 We "regenerate" the toplevel cutoff if the transmitting class
695 has backlog and is not regulated. This is not part of the
696 original CBQ description, but it looks more reasonable.
697 It may well be wrong; the question needs further investigation.
698 */
699
700 static __inline__ void
701 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
702 struct cbq_class *borrowed)
703 {
704 if (cl && q->toplevel >= borrowed->level) {
705 if (cl->q->q.qlen > 1) {
706 do {
707 if (borrowed->undertime == PSCHED_PASTPERFECT) {
708 q->toplevel = borrowed->level;
709 return;
710 }
711 } while ((borrowed=borrowed->borrow) != NULL);
712 }
713 #if 0
714 /* It is not necessary now. Uncommenting it
715 will save CPU cycles, but decrease fairness.
716 */
717 q->toplevel = TC_CBQ_MAXLEVEL;
718 #endif
719 }
720 }
721
722 static void
723 cbq_update(struct cbq_sched_data *q)
724 {
725 struct cbq_class *this = q->tx_class;
726 struct cbq_class *cl = this;
727 int len = q->tx_len;
728
729 q->tx_class = NULL;
730
731 for ( ; cl; cl = cl->share) {
732 long avgidle = cl->avgidle;
733 long idle;
734
735 cl->bstats.packets++;
736 cl->bstats.bytes += len;
737
738 /*
739 (now - last) is the total time between packet right edges.
740 (last_pktlen/rate) is the "virtual" busy time, so that
741
742 idle = (now - last) - last_pktlen/rate
743 */
744
745 idle = q->now - cl->last;
746 if ((unsigned long)idle > 128*1024*1024) {
747 avgidle = cl->maxidle;
748 } else {
749 idle -= L2T(cl, len);
750
751 /* true_avgidle := (1-W)*true_avgidle + W*idle,
752 where W=2^{-ewma_log}. But cl->avgidle is scaled:
753 cl->avgidle == true_avgidle/W,
754 hence:
755 */
756 avgidle += idle - (avgidle>>cl->ewma_log);
757 }
758
759 if (avgidle <= 0) {
760 /* Overlimit or at-limit */
761
762 if (avgidle < cl->minidle)
763 avgidle = cl->minidle;
764
765 cl->avgidle = avgidle;
766
767 /* Calculate the expected time when this class
768 will be allowed to send.
769 That will occur when:
770 (1-W)*true_avgidle + W*delay = 0, i.e.
771 idle = (1/W - 1)*(-true_avgidle)
772 or
773 idle = (1 - W)*(-cl->avgidle);
774 */
775 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
776
777 /*
778 That is not all.
779 To maintain the rate allocated to the class,
780 we add to undertime the virtual clock time
781 needed to complete the transmitted packet.
782 (len/phys_bandwidth has already elapsed by
783 the time cbq_update is called.)
784 */
785
786 idle -= L2T(&q->link, len);
787 idle += L2T(cl, len);
788
789 cl->undertime = q->now + idle;
790 } else {
791 /* Underlimit */
792
793 cl->undertime = PSCHED_PASTPERFECT;
794 if (avgidle > cl->maxidle)
795 cl->avgidle = cl->maxidle;
796 else
797 cl->avgidle = avgidle;
798 }
799 cl->last = q->now;
800 }
801
802 cbq_update_toplevel(q, this, q->tx_borrowed);
803 }
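/*
 * Illustrative sketch (not part of the original file): the scaled EWMA step
 * performed above, in isolation.  With W = 2^(-ewma_log) and the convention
 * cl->avgidle == true_avgidle / W, the update
 *
 *	true_avgidle := (1 - W) * true_avgidle + W * idle
 *
 * reduces to the shift-only form below.  The helper name is hypothetical.
 */
#if 0
static long cbq_scaled_ewma_example(long scaled_avgidle, long idle,
				    unsigned char ewma_log)
{
	/* scaled += idle - W*scaled, where W*x is just x >> ewma_log */
	return scaled_avgidle + idle - (scaled_avgidle >> ewma_log);
}
#endif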
804
805 static __inline__ struct cbq_class *
806 cbq_under_limit(struct cbq_class *cl)
807 {
808 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
809 struct cbq_class *this_cl = cl;
810
811 if (cl->tparent == NULL)
812 return cl;
813
814 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
815 cl->delayed = 0;
816 return cl;
817 }
818
819 do {
820 /* This is a very suspicious place. Currently the overlimit
821 action is generated for non-bounded classes
822 only if the link is completely congested.
823 Though this agrees with the ancestor-only paradigm,
824 it looks very stupid. In particular,
825 it means that this chunk of code will either
826 never be called or will strongly amplify
827 burstiness. Dangerous, silly, and yet
828 no other solution exists.
829 */
830 if ((cl = cl->borrow) == NULL) {
831 this_cl->qstats.overlimits++;
832 this_cl->overlimit(this_cl);
833 return NULL;
834 }
835 if (cl->level > q->toplevel)
836 return NULL;
837 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
838
839 cl->delayed = 0;
840 return cl;
841 }
842
843 static __inline__ struct sk_buff *
844 cbq_dequeue_prio(struct Qdisc *sch, int prio)
845 {
846 struct cbq_sched_data *q = qdisc_priv(sch);
847 struct cbq_class *cl_tail, *cl_prev, *cl;
848 struct sk_buff *skb;
849 int deficit;
850
851 cl_tail = cl_prev = q->active[prio];
852 cl = cl_prev->next_alive;
853
854 do {
855 deficit = 0;
856
857 /* Start round */
858 do {
859 struct cbq_class *borrow = cl;
860
861 if (cl->q->q.qlen &&
862 (borrow = cbq_under_limit(cl)) == NULL)
863 goto skip_class;
864
865 if (cl->deficit <= 0) {
866 /* Class exhausted its allotment per
867 this round. Switch to the next one.
868 */
869 deficit = 1;
870 cl->deficit += cl->quantum;
871 goto next_class;
872 }
873
874 skb = cl->q->dequeue(cl->q);
875
876 /* The class did not give us any skb :-(
877 This can occur even if cl->q->q.qlen != 0,
878 e.g. if cl->q == "tbf"
879 */
880 if (skb == NULL)
881 goto skip_class;
882
883 cl->deficit -= skb->len;
884 q->tx_class = cl;
885 q->tx_borrowed = borrow;
886 if (borrow != cl) {
887 #ifndef CBQ_XSTATS_BORROWS_BYTES
888 borrow->xstats.borrows++;
889 cl->xstats.borrows++;
890 #else
891 borrow->xstats.borrows += skb->len;
892 cl->xstats.borrows += skb->len;
893 #endif
894 }
895 q->tx_len = skb->len;
896
897 if (cl->deficit <= 0) {
898 q->active[prio] = cl;
899 cl = cl->next_alive;
900 cl->deficit += cl->quantum;
901 }
902 return skb;
903
904 skip_class:
905 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
906 /* Class is empty or penalized.
907 Unlink it from active chain.
908 */
909 cl_prev->next_alive = cl->next_alive;
910 cl->next_alive = NULL;
911
912 /* Did cl_tail point to it? */
913 if (cl == cl_tail) {
914 /* Repair it! */
915 cl_tail = cl_prev;
916
917 /* Was it the last class in this band? */
918 if (cl == cl_tail) {
919 /* Kill the band! */
920 q->active[prio] = NULL;
921 q->activemask &= ~(1<<prio);
922 if (cl->q->q.qlen)
923 cbq_activate_class(cl);
924 return NULL;
925 }
926
927 q->active[prio] = cl_tail;
928 }
929 if (cl->q->q.qlen)
930 cbq_activate_class(cl);
931
932 cl = cl_prev;
933 }
934
935 next_class:
936 cl_prev = cl;
937 cl = cl->next_alive;
938 } while (cl_prev != cl_tail);
939 } while (deficit);
940
941 q->active[prio] = cl_prev;
942
943 return NULL;
944 }
945
946 static __inline__ struct sk_buff *
947 cbq_dequeue_1(struct Qdisc *sch)
948 {
949 struct cbq_sched_data *q = qdisc_priv(sch);
950 struct sk_buff *skb;
951 unsigned activemask;
952
953 activemask = q->activemask&0xFF;
954 while (activemask) {
955 int prio = ffz(~activemask);
956 activemask &= ~(1<<prio);
957 skb = cbq_dequeue_prio(sch, prio);
958 if (skb)
959 return skb;
960 }
961 return NULL;
962 }
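/*
 * Illustrative sketch (not part of the original file): the bit trick used in
 * cbq_dequeue_1() and cbq_undelay().  ffz() returns the index of the first
 * zero bit, so ffz(~mask) is the index of the lowest set bit of mask, i.e.
 * the best (numerically smallest) priority band that has backlog.  The
 * helper name is hypothetical.
 */
#if 0
static int cbq_best_active_prio_example(unsigned activemask)
{
	if (activemask == 0)
		return -1;			/* no band has backlog */
	return ffz(~activemask);		/* lowest set bit of activemask */
}
#endif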
963
964 static struct sk_buff *
965 cbq_dequeue(struct Qdisc *sch)
966 {
967 struct sk_buff *skb;
968 struct cbq_sched_data *q = qdisc_priv(sch);
969 psched_time_t now;
970 psched_tdiff_t incr;
971
972 now = psched_get_time();
973 incr = now - q->now_rt;
974
975 if (q->tx_class) {
976 psched_tdiff_t incr2;
977 /* Time integrator. We calculate the EOS time
978 by adding the expected packet transmission time.
979 If real time is greater, we warp the artificial clock,
980 so that:
981
982 cbq_time = max(real_time, work);
983 */
984 incr2 = L2T(&q->link, q->tx_len);
985 q->now += incr2;
986 cbq_update(q);
987 if ((incr -= incr2) < 0)
988 incr = 0;
989 }
990 q->now += incr;
991 q->now_rt = now;
992
993 for (;;) {
994 q->wd_expires = 0;
995
996 skb = cbq_dequeue_1(sch);
997 if (skb) {
998 sch->q.qlen--;
999 sch->flags &= ~TCQ_F_THROTTLED;
1000 return skb;
1001 }
1002
1003 /* All the classes are overlimit.
1004
1005 This is possible if:
1006
1007 1. The scheduler is empty.
1008 2. The toplevel cutoff inhibited borrowing.
1009 3. The root class is overlimit.
1010
1011 Reset conditions 2 and 3 and retry.
1012
1013 Note that NS and cbq-2.0 are buggy: picking
1014 an arbitrary class is appropriate for ancestor-only
1015 sharing, but not for the toplevel algorithm.
1016
1017 Our version is better but slower, because it requires
1018 two passes; that is unavoidable with top-level sharing.
1019 */
1020
1021 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1022 q->link.undertime == PSCHED_PASTPERFECT)
1023 break;
1024
1025 q->toplevel = TC_CBQ_MAXLEVEL;
1026 q->link.undertime = PSCHED_PASTPERFECT;
1027 }
1028
1029 /* No packets in the scheduler, or nobody wants to give them to us :-(
1030 Sigh... start the watchdog timer in the latter case. */
1031
1032 if (sch->q.qlen) {
1033 sch->qstats.overlimits++;
1034 if (q->wd_expires)
1035 qdisc_watchdog_schedule(&q->watchdog,
1036 now + q->wd_expires);
1037 }
1038 return NULL;
1039 }
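/*
 * Illustrative sketch (not part of the original file): the clock integrator
 * at the top of cbq_dequeue() in isolation.  The virtual clock is advanced by
 * the expected transmission time of the last packet and then, if real time
 * has run further ahead, warped up to it, so that cbq_time = max(real_time,
 * work).  All names below are hypothetical.
 */
#if 0
static psched_time_t cbq_integrate_clock_example(psched_time_t vclock,
						 psched_time_t last_real,
						 psched_time_t now_real,
						 psched_tdiff_t tx_time)
{
	psched_tdiff_t real_incr = now_real - last_real;

	vclock += tx_time;			/* expected end-of-service */
	if (real_incr > tx_time)		/* real time ran ahead of work */
		vclock += real_incr - tx_time;
	return vclock;
}
#endif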
1040
1041 /* CBQ class maintenance routines */
1042
1043 static void cbq_adjust_levels(struct cbq_class *this)
1044 {
1045 if (this == NULL)
1046 return;
1047
1048 do {
1049 int level = 0;
1050 struct cbq_class *cl;
1051
1052 if ((cl = this->children) != NULL) {
1053 do {
1054 if (cl->level > level)
1055 level = cl->level;
1056 } while ((cl = cl->sibling) != this->children);
1057 }
1058 this->level = level+1;
1059 } while ((this = this->tparent) != NULL);
1060 }
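/*
 * Illustrative example (not part of the original file): with a root class
 * that has two leaf children, cbq_adjust_levels(root) leaves the leaves at
 * level 0 and sets the root to max(children's level) + 1 = 1; adding a
 * grandchild under one of the leaves makes that class level 1 and pushes the
 * root to level 2.
 */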
1061
1062 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1063 {
1064 struct cbq_class *cl;
1065 struct hlist_node *n;
1066 unsigned int h;
1067
1068 if (q->quanta[prio] == 0)
1069 return;
1070
1071 for (h = 0; h < q->clhash.hashsize; h++) {
1072 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1073 /* BUGGGG... Beware! This expression suffers from
1074 arithmetic overflows!
1075 */
1076 if (cl->priority == prio) {
1077 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1078 q->quanta[prio];
1079 }
1080 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1081 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
1082 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1083 }
1084 }
1085 }
1086 }
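/*
 * Illustrative numeric example (not part of the original file) of the quantum
 * formula above, with arbitrary numbers: two classes in the same priority
 * band, each with allot = 1514, with weights 1 and 3, so nclasses = 2 and
 * quanta (the sum of weights) = 4.  Then
 *
 *	quantum_1 = 1*1514*2/4 =  757
 *	quantum_2 = 3*1514*2/4 = 2271
 *
 * i.e. the per-round quanta keep the 1:3 weight ratio while their sum stays
 * at nclasses * allot = 3028 bytes per WRR round.
 */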
1087
1088 static void cbq_sync_defmap(struct cbq_class *cl)
1089 {
1090 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1091 struct cbq_class *split = cl->split;
1092 unsigned h;
1093 int i;
1094
1095 if (split == NULL)
1096 return;
1097
1098 for (i=0; i<=TC_PRIO_MAX; i++) {
1099 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1100 split->defaults[i] = NULL;
1101 }
1102
1103 for (i=0; i<=TC_PRIO_MAX; i++) {
1104 int level = split->level;
1105
1106 if (split->defaults[i])
1107 continue;
1108
1109 for (h = 0; h < q->clhash.hashsize; h++) {
1110 struct hlist_node *n;
1111 struct cbq_class *c;
1112
1113 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1114 common.hnode) {
1115 if (c->split == split && c->level < level &&
1116 c->defmap&(1<<i)) {
1117 split->defaults[i] = c;
1118 level = c->level;
1119 }
1120 }
1121 }
1122 }
1123 }
1124
1125 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1126 {
1127 struct cbq_class *split = NULL;
1128
1129 if (splitid == 0) {
1130 if ((split = cl->split) == NULL)
1131 return;
1132 splitid = split->common.classid;
1133 }
1134
1135 if (split == NULL || split->common.classid != splitid) {
1136 for (split = cl->tparent; split; split = split->tparent)
1137 if (split->common.classid == splitid)
1138 break;
1139 }
1140
1141 if (split == NULL)
1142 return;
1143
1144 if (cl->split != split) {
1145 cl->defmap = 0;
1146 cbq_sync_defmap(cl);
1147 cl->split = split;
1148 cl->defmap = def&mask;
1149 } else
1150 cl->defmap = (cl->defmap&~mask)|(def&mask);
1151
1152 cbq_sync_defmap(cl);
1153 }
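/*
 * Illustrative example (not part of the original file): defmap is a bitmask
 * over the logical priorities 0..TC_PRIO_MAX.  Passing, say,
 * def = mask = (1<<TC_PRIO_BESTEFFORT) | (1<<TC_PRIO_BULK) makes this class
 * the split node's defaults[] entry for those two priorities (unless a
 * lower-level class also claims them); this is what the "split"/"defmap"
 * parameters of tc(8) ultimately feed into this function.
 */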
1154
1155 static void cbq_unlink_class(struct cbq_class *this)
1156 {
1157 struct cbq_class *cl, **clp;
1158 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1159
1160 qdisc_class_hash_remove(&q->clhash, &this->common);
1161
1162 if (this->tparent) {
1163 clp=&this->sibling;
1164 cl = *clp;
1165 do {
1166 if (cl == this) {
1167 *clp = cl->sibling;
1168 break;
1169 }
1170 clp = &cl->sibling;
1171 } while ((cl = *clp) != this->sibling);
1172
1173 if (this->tparent->children == this) {
1174 this->tparent->children = this->sibling;
1175 if (this->sibling == this)
1176 this->tparent->children = NULL;
1177 }
1178 } else {
1179 BUG_TRAP(this->sibling == this);
1180 }
1181 }
1182
1183 static void cbq_link_class(struct cbq_class *this)
1184 {
1185 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1186 struct cbq_class *parent = this->tparent;
1187
1188 this->sibling = this;
1189 qdisc_class_hash_insert(&q->clhash, &this->common);
1190
1191 if (parent == NULL)
1192 return;
1193
1194 if (parent->children == NULL) {
1195 parent->children = this;
1196 } else {
1197 this->sibling = parent->children->sibling;
1198 parent->children->sibling = this;
1199 }
1200 }
1201
1202 static unsigned int cbq_drop(struct Qdisc* sch)
1203 {
1204 struct cbq_sched_data *q = qdisc_priv(sch);
1205 struct cbq_class *cl, *cl_head;
1206 int prio;
1207 unsigned int len;
1208
1209 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1210 if ((cl_head = q->active[prio]) == NULL)
1211 continue;
1212
1213 cl = cl_head;
1214 do {
1215 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1216 sch->q.qlen--;
1217 if (!cl->q->q.qlen)
1218 cbq_deactivate_class(cl);
1219 return len;
1220 }
1221 } while ((cl = cl->next_alive) != cl_head);
1222 }
1223 return 0;
1224 }
1225
1226 static void
1227 cbq_reset(struct Qdisc* sch)
1228 {
1229 struct cbq_sched_data *q = qdisc_priv(sch);
1230 struct cbq_class *cl;
1231 struct hlist_node *n;
1232 int prio;
1233 unsigned h;
1234
1235 q->activemask = 0;
1236 q->pmask = 0;
1237 q->tx_class = NULL;
1238 q->tx_borrowed = NULL;
1239 qdisc_watchdog_cancel(&q->watchdog);
1240 hrtimer_cancel(&q->delay_timer);
1241 q->toplevel = TC_CBQ_MAXLEVEL;
1242 q->now = psched_get_time();
1243 q->now_rt = q->now;
1244
1245 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1246 q->active[prio] = NULL;
1247
1248 for (h = 0; h < q->clhash.hashsize; h++) {
1249 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1250 qdisc_reset(cl->q);
1251
1252 cl->next_alive = NULL;
1253 cl->undertime = PSCHED_PASTPERFECT;
1254 cl->avgidle = cl->maxidle;
1255 cl->deficit = cl->quantum;
1256 cl->cpriority = cl->priority;
1257 }
1258 }
1259 sch->q.qlen = 0;
1260 }
1261
1262
1263 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1264 {
1265 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1266 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1267 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1268 }
1269 if (lss->change&TCF_CBQ_LSS_EWMA)
1270 cl->ewma_log = lss->ewma_log;
1271 if (lss->change&TCF_CBQ_LSS_AVPKT)
1272 cl->avpkt = lss->avpkt;
1273 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1274 cl->minidle = -(long)lss->minidle;
1275 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1276 cl->maxidle = lss->maxidle;
1277 cl->avgidle = lss->maxidle;
1278 }
1279 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1280 cl->offtime = lss->offtime;
1281 return 0;
1282 }
1283
1284 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1285 {
1286 q->nclasses[cl->priority]--;
1287 q->quanta[cl->priority] -= cl->weight;
1288 cbq_normalize_quanta(q, cl->priority);
1289 }
1290
1291 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1292 {
1293 q->nclasses[cl->priority]++;
1294 q->quanta[cl->priority] += cl->weight;
1295 cbq_normalize_quanta(q, cl->priority);
1296 }
1297
1298 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1299 {
1300 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1301
1302 if (wrr->allot)
1303 cl->allot = wrr->allot;
1304 if (wrr->weight)
1305 cl->weight = wrr->weight;
1306 if (wrr->priority) {
1307 cl->priority = wrr->priority-1;
1308 cl->cpriority = cl->priority;
1309 if (cl->priority >= cl->priority2)
1310 cl->priority2 = TC_CBQ_MAXPRIO-1;
1311 }
1312
1313 cbq_addprio(q, cl);
1314 return 0;
1315 }
1316
1317 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1318 {
1319 switch (ovl->strategy) {
1320 case TC_CBQ_OVL_CLASSIC:
1321 cl->overlimit = cbq_ovl_classic;
1322 break;
1323 case TC_CBQ_OVL_DELAY:
1324 cl->overlimit = cbq_ovl_delay;
1325 break;
1326 case TC_CBQ_OVL_LOWPRIO:
1327 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1328 ovl->priority2-1 <= cl->priority)
1329 return -EINVAL;
1330 cl->priority2 = ovl->priority2-1;
1331 cl->overlimit = cbq_ovl_lowprio;
1332 break;
1333 case TC_CBQ_OVL_DROP:
1334 cl->overlimit = cbq_ovl_drop;
1335 break;
1336 case TC_CBQ_OVL_RCLASSIC:
1337 cl->overlimit = cbq_ovl_rclassic;
1338 break;
1339 default:
1340 return -EINVAL;
1341 }
1342 cl->penalty = ovl->penalty;
1343 return 0;
1344 }
1345
1346 #ifdef CONFIG_NET_CLS_ACT
1347 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1348 {
1349 cl->police = p->police;
1350
1351 if (cl->q->handle) {
1352 if (p->police == TC_POLICE_RECLASSIFY)
1353 cl->q->reshape_fail = cbq_reshape_fail;
1354 else
1355 cl->q->reshape_fail = NULL;
1356 }
1357 return 0;
1358 }
1359 #endif
1360
1361 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1362 {
1363 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1364 return 0;
1365 }
1366
1367 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1368 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1369 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1370 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1371 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1372 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1373 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1374 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1375 };
1376
1377 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1378 {
1379 struct cbq_sched_data *q = qdisc_priv(sch);
1380 struct nlattr *tb[TCA_CBQ_MAX + 1];
1381 struct tc_ratespec *r;
1382 int err;
1383
1384 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1385 if (err < 0)
1386 return err;
1387
1388 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1389 return -EINVAL;
1390
1391 r = nla_data(tb[TCA_CBQ_RATE]);
1392
1393 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1394 return -EINVAL;
1395
1396 err = qdisc_class_hash_init(&q->clhash);
1397 if (err < 0)
1398 goto put_rtab;
1399
1400 q->link.refcnt = 1;
1401 q->link.sibling = &q->link;
1402 q->link.common.classid = sch->handle;
1403 q->link.qdisc = sch;
1404 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1405 &pfifo_qdisc_ops,
1406 sch->handle)))
1407 q->link.q = &noop_qdisc;
1408
1409 q->link.priority = TC_CBQ_MAXPRIO-1;
1410 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1411 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1412 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1413 q->link.overlimit = cbq_ovl_classic;
1414 q->link.allot = psched_mtu(qdisc_dev(sch));
1415 q->link.quantum = q->link.allot;
1416 q->link.weight = q->link.R_tab->rate.rate;
1417
1418 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1419 q->link.avpkt = q->link.allot/2;
1420 q->link.minidle = -0x7FFFFFFF;
1421
1422 qdisc_watchdog_init(&q->watchdog, sch);
1423 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1424 q->delay_timer.function = cbq_undelay;
1425 q->toplevel = TC_CBQ_MAXLEVEL;
1426 q->now = psched_get_time();
1427 q->now_rt = q->now;
1428
1429 cbq_link_class(&q->link);
1430
1431 if (tb[TCA_CBQ_LSSOPT])
1432 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1433
1434 cbq_addprio(q, &q->link);
1435 return 0;
1436
1437 put_rtab:
1438 qdisc_put_rtab(q->link.R_tab);
1439 return err;
1440 }
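/*
 * Orientation only (not part of the original file): a minimal configuration
 * in the style of the LARTC documentation that exercises this init path;
 * exact parameter spellings and defaults depend on the installed tc(8)
 * version.
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
 *		rate 1Mbit weight 100Kbit allot 1514 prio 5 maxburst 20 \
 *		avpkt 1000 bounded
 *
 * The qdisc line corresponds to cbq_init() above (TCA_CBQ_RATE/RTAB plus the
 * LSS options); the class line is handled by cbq_change_class() further below.
 */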
1441
1442 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1443 {
1444 unsigned char *b = skb_tail_pointer(skb);
1445
1446 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1447 return skb->len;
1448
1449 nla_put_failure:
1450 nlmsg_trim(skb, b);
1451 return -1;
1452 }
1453
1454 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1455 {
1456 unsigned char *b = skb_tail_pointer(skb);
1457 struct tc_cbq_lssopt opt;
1458
1459 opt.flags = 0;
1460 if (cl->borrow == NULL)
1461 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1462 if (cl->share == NULL)
1463 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1464 opt.ewma_log = cl->ewma_log;
1465 opt.level = cl->level;
1466 opt.avpkt = cl->avpkt;
1467 opt.maxidle = cl->maxidle;
1468 opt.minidle = (u32)(-cl->minidle);
1469 opt.offtime = cl->offtime;
1470 opt.change = ~0;
1471 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1472 return skb->len;
1473
1474 nla_put_failure:
1475 nlmsg_trim(skb, b);
1476 return -1;
1477 }
1478
1479 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1480 {
1481 unsigned char *b = skb_tail_pointer(skb);
1482 struct tc_cbq_wrropt opt;
1483
1484 opt.flags = 0;
1485 opt.allot = cl->allot;
1486 opt.priority = cl->priority+1;
1487 opt.cpriority = cl->cpriority+1;
1488 opt.weight = cl->weight;
1489 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1490 return skb->len;
1491
1492 nla_put_failure:
1493 nlmsg_trim(skb, b);
1494 return -1;
1495 }
1496
1497 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1498 {
1499 unsigned char *b = skb_tail_pointer(skb);
1500 struct tc_cbq_ovl opt;
1501
1502 opt.strategy = cl->ovl_strategy;
1503 opt.priority2 = cl->priority2+1;
1504 opt.pad = 0;
1505 opt.penalty = cl->penalty;
1506 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1507 return skb->len;
1508
1509 nla_put_failure:
1510 nlmsg_trim(skb, b);
1511 return -1;
1512 }
1513
1514 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1515 {
1516 unsigned char *b = skb_tail_pointer(skb);
1517 struct tc_cbq_fopt opt;
1518
1519 if (cl->split || cl->defmap) {
1520 opt.split = cl->split ? cl->split->common.classid : 0;
1521 opt.defmap = cl->defmap;
1522 opt.defchange = ~0;
1523 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1524 }
1525 return skb->len;
1526
1527 nla_put_failure:
1528 nlmsg_trim(skb, b);
1529 return -1;
1530 }
1531
1532 #ifdef CONFIG_NET_CLS_ACT
1533 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1534 {
1535 unsigned char *b = skb_tail_pointer(skb);
1536 struct tc_cbq_police opt;
1537
1538 if (cl->police) {
1539 opt.police = cl->police;
1540 opt.__res1 = 0;
1541 opt.__res2 = 0;
1542 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1543 }
1544 return skb->len;
1545
1546 nla_put_failure:
1547 nlmsg_trim(skb, b);
1548 return -1;
1549 }
1550 #endif
1551
1552 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1553 {
1554 if (cbq_dump_lss(skb, cl) < 0 ||
1555 cbq_dump_rate(skb, cl) < 0 ||
1556 cbq_dump_wrr(skb, cl) < 0 ||
1557 cbq_dump_ovl(skb, cl) < 0 ||
1558 #ifdef CONFIG_NET_CLS_ACT
1559 cbq_dump_police(skb, cl) < 0 ||
1560 #endif
1561 cbq_dump_fopt(skb, cl) < 0)
1562 return -1;
1563 return 0;
1564 }
1565
1566 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1567 {
1568 struct cbq_sched_data *q = qdisc_priv(sch);
1569 struct nlattr *nest;
1570
1571 nest = nla_nest_start(skb, TCA_OPTIONS);
1572 if (nest == NULL)
1573 goto nla_put_failure;
1574 if (cbq_dump_attr(skb, &q->link) < 0)
1575 goto nla_put_failure;
1576 nla_nest_end(skb, nest);
1577 return skb->len;
1578
1579 nla_put_failure:
1580 nla_nest_cancel(skb, nest);
1581 return -1;
1582 }
1583
1584 static int
1585 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1586 {
1587 struct cbq_sched_data *q = qdisc_priv(sch);
1588
1589 q->link.xstats.avgidle = q->link.avgidle;
1590 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1591 }
1592
1593 static int
1594 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1595 struct sk_buff *skb, struct tcmsg *tcm)
1596 {
1597 struct cbq_class *cl = (struct cbq_class*)arg;
1598 struct nlattr *nest;
1599
1600 if (cl->tparent)
1601 tcm->tcm_parent = cl->tparent->common.classid;
1602 else
1603 tcm->tcm_parent = TC_H_ROOT;
1604 tcm->tcm_handle = cl->common.classid;
1605 tcm->tcm_info = cl->q->handle;
1606
1607 nest = nla_nest_start(skb, TCA_OPTIONS);
1608 if (nest == NULL)
1609 goto nla_put_failure;
1610 if (cbq_dump_attr(skb, cl) < 0)
1611 goto nla_put_failure;
1612 nla_nest_end(skb, nest);
1613 return skb->len;
1614
1615 nla_put_failure:
1616 nla_nest_cancel(skb, nest);
1617 return -1;
1618 }
1619
1620 static int
1621 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1622 struct gnet_dump *d)
1623 {
1624 struct cbq_sched_data *q = qdisc_priv(sch);
1625 struct cbq_class *cl = (struct cbq_class*)arg;
1626
1627 cl->qstats.qlen = cl->q->q.qlen;
1628 cl->xstats.avgidle = cl->avgidle;
1629 cl->xstats.undertime = 0;
1630
1631 if (cl->undertime != PSCHED_PASTPERFECT)
1632 cl->xstats.undertime = cl->undertime - q->now;
1633
1634 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1635 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1636 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1637 return -1;
1638
1639 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1640 }
1641
1642 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1643 struct Qdisc **old)
1644 {
1645 struct cbq_class *cl = (struct cbq_class*)arg;
1646
1647 if (cl) {
1648 if (new == NULL) {
1649 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1650 &pfifo_qdisc_ops,
1651 cl->common.classid);
1652 if (new == NULL)
1653 return -ENOBUFS;
1654 } else {
1655 #ifdef CONFIG_NET_CLS_ACT
1656 if (cl->police == TC_POLICE_RECLASSIFY)
1657 new->reshape_fail = cbq_reshape_fail;
1658 #endif
1659 }
1660 sch_tree_lock(sch);
1661 *old = xchg(&cl->q, new);
1662 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1663 qdisc_reset(*old);
1664 sch_tree_unlock(sch);
1665
1666 return 0;
1667 }
1668 return -ENOENT;
1669 }
1670
1671 static struct Qdisc *
1672 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1673 {
1674 struct cbq_class *cl = (struct cbq_class*)arg;
1675
1676 return cl ? cl->q : NULL;
1677 }
1678
1679 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1680 {
1681 struct cbq_class *cl = (struct cbq_class *)arg;
1682
1683 if (cl->q->q.qlen == 0)
1684 cbq_deactivate_class(cl);
1685 }
1686
1687 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1688 {
1689 struct cbq_sched_data *q = qdisc_priv(sch);
1690 struct cbq_class *cl = cbq_class_lookup(q, classid);
1691
1692 if (cl) {
1693 cl->refcnt++;
1694 return (unsigned long)cl;
1695 }
1696 return 0;
1697 }
1698
1699 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1700 {
1701 struct cbq_sched_data *q = qdisc_priv(sch);
1702
1703 BUG_TRAP(!cl->filters);
1704
1705 tcf_destroy_chain(&cl->filter_list);
1706 qdisc_destroy(cl->q);
1707 qdisc_put_rtab(cl->R_tab);
1708 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1709 if (cl != &q->link)
1710 kfree(cl);
1711 }
1712
1713 static void
1714 cbq_destroy(struct Qdisc* sch)
1715 {
1716 struct cbq_sched_data *q = qdisc_priv(sch);
1717 struct hlist_node *n, *next;
1718 struct cbq_class *cl;
1719 unsigned h;
1720
1721 #ifdef CONFIG_NET_CLS_ACT
1722 q->rx_class = NULL;
1723 #endif
1724 /*
1725 * Filters must be destroyed first because we don't destroy the
1726 * classes from root to leaves, which means that filters can still
1727 * be bound to classes which have already been destroyed. --TGR '04
1728 */
1729 for (h = 0; h < q->clhash.hashsize; h++) {
1730 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1731 tcf_destroy_chain(&cl->filter_list);
1732 }
1733 for (h = 0; h < q->clhash.hashsize; h++) {
1734 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1735 common.hnode)
1736 cbq_destroy_class(sch, cl);
1737 }
1738 qdisc_class_hash_destroy(&q->clhash);
1739 }
1740
1741 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1742 {
1743 struct cbq_class *cl = (struct cbq_class*)arg;
1744
1745 if (--cl->refcnt == 0) {
1746 #ifdef CONFIG_NET_CLS_ACT
1747 struct cbq_sched_data *q = qdisc_priv(sch);
1748
1749 spin_lock_bh(&sch->dev_queue->lock);
1750 if (q->rx_class == cl)
1751 q->rx_class = NULL;
1752 spin_unlock_bh(&sch->dev_queue->lock);
1753 #endif
1754
1755 cbq_destroy_class(sch, cl);
1756 }
1757 }
1758
1759 static int
1760 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1761 unsigned long *arg)
1762 {
1763 int err;
1764 struct cbq_sched_data *q = qdisc_priv(sch);
1765 struct cbq_class *cl = (struct cbq_class*)*arg;
1766 struct nlattr *opt = tca[TCA_OPTIONS];
1767 struct nlattr *tb[TCA_CBQ_MAX + 1];
1768 struct cbq_class *parent;
1769 struct qdisc_rate_table *rtab = NULL;
1770
1771 if (opt == NULL)
1772 return -EINVAL;
1773
1774 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1775 if (err < 0)
1776 return err;
1777
1778 if (cl) {
1779 /* Check parent */
1780 if (parentid) {
1781 if (cl->tparent &&
1782 cl->tparent->common.classid != parentid)
1783 return -EINVAL;
1784 if (!cl->tparent && parentid != TC_H_ROOT)
1785 return -EINVAL;
1786 }
1787
1788 if (tb[TCA_CBQ_RATE]) {
1789 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1790 if (rtab == NULL)
1791 return -EINVAL;
1792 }
1793
1794 /* Change class parameters */
1795 sch_tree_lock(sch);
1796
1797 if (cl->next_alive != NULL)
1798 cbq_deactivate_class(cl);
1799
1800 if (rtab) {
1801 rtab = xchg(&cl->R_tab, rtab);
1802 qdisc_put_rtab(rtab);
1803 }
1804
1805 if (tb[TCA_CBQ_LSSOPT])
1806 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1807
1808 if (tb[TCA_CBQ_WRROPT]) {
1809 cbq_rmprio(q, cl);
1810 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1811 }
1812
1813 if (tb[TCA_CBQ_OVL_STRATEGY])
1814 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1815
1816 #ifdef CONFIG_NET_CLS_ACT
1817 if (tb[TCA_CBQ_POLICE])
1818 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1819 #endif
1820
1821 if (tb[TCA_CBQ_FOPT])
1822 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1823
1824 if (cl->q->q.qlen)
1825 cbq_activate_class(cl);
1826
1827 sch_tree_unlock(sch);
1828
1829 if (tca[TCA_RATE])
1830 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1831 &sch->dev_queue->lock,
1832 tca[TCA_RATE]);
1833 return 0;
1834 }
1835
1836 if (parentid == TC_H_ROOT)
1837 return -EINVAL;
1838
1839 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1840 tb[TCA_CBQ_LSSOPT] == NULL)
1841 return -EINVAL;
1842
1843 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1844 if (rtab == NULL)
1845 return -EINVAL;
1846
1847 if (classid) {
1848 err = -EINVAL;
1849 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1850 goto failure;
1851 } else {
1852 int i;
1853 classid = TC_H_MAKE(sch->handle,0x8000);
1854
1855 for (i=0; i<0x8000; i++) {
1856 if (++q->hgenerator >= 0x8000)
1857 q->hgenerator = 1;
1858 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1859 break;
1860 }
1861 err = -ENOSR;
1862 if (i >= 0x8000)
1863 goto failure;
1864 classid = classid|q->hgenerator;
1865 }
1866
1867 parent = &q->link;
1868 if (parentid) {
1869 parent = cbq_class_lookup(q, parentid);
1870 err = -EINVAL;
1871 if (parent == NULL)
1872 goto failure;
1873 }
1874
1875 err = -ENOBUFS;
1876 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1877 if (cl == NULL)
1878 goto failure;
1879 cl->R_tab = rtab;
1880 rtab = NULL;
1881 cl->refcnt = 1;
1882 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1883 &pfifo_qdisc_ops, classid)))
1884 cl->q = &noop_qdisc;
1885 cl->common.classid = classid;
1886 cl->tparent = parent;
1887 cl->qdisc = sch;
1888 cl->allot = parent->allot;
1889 cl->quantum = cl->allot;
1890 cl->weight = cl->R_tab->rate.rate;
1891
1892 sch_tree_lock(sch);
1893 cbq_link_class(cl);
1894 cl->borrow = cl->tparent;
1895 if (cl->tparent != &q->link)
1896 cl->share = cl->tparent;
1897 cbq_adjust_levels(parent);
1898 cl->minidle = -0x7FFFFFFF;
1899 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1900 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1901 if (cl->ewma_log==0)
1902 cl->ewma_log = q->link.ewma_log;
1903 if (cl->maxidle==0)
1904 cl->maxidle = q->link.maxidle;
1905 if (cl->avpkt==0)
1906 cl->avpkt = q->link.avpkt;
1907 cl->overlimit = cbq_ovl_classic;
1908 if (tb[TCA_CBQ_OVL_STRATEGY])
1909 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1910 #ifdef CONFIG_NET_CLS_ACT
1911 if (tb[TCA_CBQ_POLICE])
1912 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1913 #endif
1914 if (tb[TCA_CBQ_FOPT])
1915 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1916 sch_tree_unlock(sch);
1917
1918 qdisc_class_hash_grow(sch, &q->clhash);
1919
1920 if (tca[TCA_RATE])
1921 gen_new_estimator(&cl->bstats, &cl->rate_est,
1922 &sch->dev_queue->lock, tca[TCA_RATE]);
1923
1924 *arg = (unsigned long)cl;
1925 return 0;
1926
1927 failure:
1928 qdisc_put_rtab(rtab);
1929 return err;
1930 }
1931
1932 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1933 {
1934 struct cbq_sched_data *q = qdisc_priv(sch);
1935 struct cbq_class *cl = (struct cbq_class*)arg;
1936 unsigned int qlen;
1937
1938 if (cl->filters || cl->children || cl == &q->link)
1939 return -EBUSY;
1940
1941 sch_tree_lock(sch);
1942
1943 qlen = cl->q->q.qlen;
1944 qdisc_reset(cl->q);
1945 qdisc_tree_decrease_qlen(cl->q, qlen);
1946
1947 if (cl->next_alive)
1948 cbq_deactivate_class(cl);
1949
1950 if (q->tx_borrowed == cl)
1951 q->tx_borrowed = q->tx_class;
1952 if (q->tx_class == cl) {
1953 q->tx_class = NULL;
1954 q->tx_borrowed = NULL;
1955 }
1956 #ifdef CONFIG_NET_CLS_ACT
1957 if (q->rx_class == cl)
1958 q->rx_class = NULL;
1959 #endif
1960
1961 cbq_unlink_class(cl);
1962 cbq_adjust_levels(cl->tparent);
1963 cl->defmap = 0;
1964 cbq_sync_defmap(cl);
1965
1966 cbq_rmprio(q, cl);
1967 sch_tree_unlock(sch);
1968
1969 if (--cl->refcnt == 0)
1970 cbq_destroy_class(sch, cl);
1971
1972 return 0;
1973 }
1974
1975 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1976 {
1977 struct cbq_sched_data *q = qdisc_priv(sch);
1978 struct cbq_class *cl = (struct cbq_class *)arg;
1979
1980 if (cl == NULL)
1981 cl = &q->link;
1982
1983 return &cl->filter_list;
1984 }
1985
1986 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1987 u32 classid)
1988 {
1989 struct cbq_sched_data *q = qdisc_priv(sch);
1990 struct cbq_class *p = (struct cbq_class*)parent;
1991 struct cbq_class *cl = cbq_class_lookup(q, classid);
1992
1993 if (cl) {
1994 if (p && p->level <= cl->level)
1995 return 0;
1996 cl->filters++;
1997 return (unsigned long)cl;
1998 }
1999 return 0;
2000 }
2001
2002 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2003 {
2004 struct cbq_class *cl = (struct cbq_class*)arg;
2005
2006 cl->filters--;
2007 }
2008
2009 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2010 {
2011 struct cbq_sched_data *q = qdisc_priv(sch);
2012 struct cbq_class *cl;
2013 struct hlist_node *n;
2014 unsigned h;
2015
2016 if (arg->stop)
2017 return;
2018
2019 for (h = 0; h < q->clhash.hashsize; h++) {
2020 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2021 if (arg->count < arg->skip) {
2022 arg->count++;
2023 continue;
2024 }
2025 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2026 arg->stop = 1;
2027 return;
2028 }
2029 arg->count++;
2030 }
2031 }
2032 }
2033
2034 static const struct Qdisc_class_ops cbq_class_ops = {
2035 .graft = cbq_graft,
2036 .leaf = cbq_leaf,
2037 .qlen_notify = cbq_qlen_notify,
2038 .get = cbq_get,
2039 .put = cbq_put,
2040 .change = cbq_change_class,
2041 .delete = cbq_delete,
2042 .walk = cbq_walk,
2043 .tcf_chain = cbq_find_tcf,
2044 .bind_tcf = cbq_bind_filter,
2045 .unbind_tcf = cbq_unbind_filter,
2046 .dump = cbq_dump_class,
2047 .dump_stats = cbq_dump_class_stats,
2048 };
2049
2050 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2051 .next = NULL,
2052 .cl_ops = &cbq_class_ops,
2053 .id = "cbq",
2054 .priv_size = sizeof(struct cbq_sched_data),
2055 .enqueue = cbq_enqueue,
2056 .dequeue = cbq_dequeue,
2057 .requeue = cbq_requeue,
2058 .drop = cbq_drop,
2059 .init = cbq_init,
2060 .reset = cbq_reset,
2061 .destroy = cbq_destroy,
2062 .change = NULL,
2063 .dump = cbq_dump,
2064 .dump_stats = cbq_dump_stats,
2065 .owner = THIS_MODULE,
2066 };
2067
2068 static int __init cbq_module_init(void)
2069 {
2070 return register_qdisc(&cbq_qdisc_ops);
2071 }
2072 static void __exit cbq_module_exit(void)
2073 {
2074 unregister_qdisc(&cbq_qdisc_ops);
2075 }
2076 module_init(cbq_module_init)
2077 module_exit(cbq_module_exit)
2078 MODULE_LICENSE("GPL");